package org.apache.tika.parser.html;

import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.StringWriter;
import java.io.Writer;
import java.util.ArrayList;
import java.util.regex.Pattern;
import javax.xml.transform.sax.SAXTransformerFactory;
import javax.xml.transform.sax.TransformerHandler;
import javax.xml.transform.stream.StreamResult;
import org.apache.tika.Tika;
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Geographic;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.sax.BodyContentHandler;
import org.apache.tika.sax.Link;
import org.apache.tika.sax.LinkContentHandler;
import org.apache.tika.sax.TeeContentHandler;
import org.ccil.cowan.tagsoup.HTMLSchema;
import org.ccil.cowan.tagsoup.Schema;
import org.junit.Assert;
import org.junit.Ignore;
import org.junit.Test;
import org.xml.sax.Attributes;
import org.xml.sax.ContentHandler;
import org.xml.sax.Locator;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.DefaultHandler;

/* loaded from: input_file:org/apache/tika/parser/html/HtmlParserTest.class */
public class HtmlParserTest {
    /**
     * Parses the {@code testHTML.html} sample and checks the extracted metadata, the
     * {@code href}/{@code name} attributes seen on {@code <a>} elements, and the body text.
     */
    @Test
    public void testParseAscii() throws Exception {
        final StringWriter hrefs = new StringWriter();
        final StringWriter anchorNames = new StringWriter();
        ContentHandler body = new BodyContentHandler();

        // Records the href of each <a>; when there is no href, records its name instead.
        ContentHandler anchorCollector = new DefaultHandler() {
            @Override
            public void startElement(String uri, String localName, String qName, Attributes attributes) throws SAXException {
                if (!"a".equals(localName)) {
                    return;
                }
                String href = attributes.getValue("href");
                if (href != null) {
                    hrefs.append(href);
                    return;
                }
                String name = attributes.getValue("name");
                if (name != null) {
                    anchorNames.append(name);
                }
            }
        };

        Metadata metadata = new Metadata();
        // try-with-resources closes the stream exactly once, whether parsing succeeds or throws.
        try (InputStream stream = HtmlParserTest.class.getResourceAsStream("/test-documents/testHTML.html")) {
            new HtmlParser().parse(stream, new TeeContentHandler(new ContentHandler[]{body, anchorCollector}), metadata, new ParseContext());
        }

        Assert.assertEquals("Title : Test Indexation Html", metadata.get(TikaCoreProperties.TITLE));
        Assert.assertEquals("Tika Developers", metadata.get("Author"));
        Assert.assertEquals("5", metadata.get("refresh"));
        Assert.assertEquals("51.2312", metadata.get(Geographic.LATITUDE));
        Assert.assertEquals("-5.1987", metadata.get(Geographic.LONGITUDE));
        Assert.assertEquals("http://www.apache.org/", hrefs.toString());
        Assert.assertEquals("test-anchor", anchorNames.toString());

        String text = body.toString();
        Assert.assertTrue("Did not contain expected text:Test Indexation Html", text.contains("Test Indexation Html"));
        Assert.assertTrue("Did not contain expected text:Indexation du fichier", text.contains("Indexation du fichier"));
    }

    /**
     * Checks that non-ASCII characters survive extraction from a UTF-8 XHTML document.
     * Currently ignored because the fixture file is missing.
     */
    @Test
    @Ignore("The file 'testXHTML_utf8.html' is not available fo testing")
    public void XtestParseUTF8() throws IOException, SAXException, TikaException {
        // The expected strings were mojibake (UTF-8 bytes decoded with the wrong charset).
        // They are restored here as Unicode escapes for o-umlaut, a-umlaut and a-ring so
        // the test no longer depends on the encoding used to compile this file.
        // NOTE(review): "Tilte" is kept as-is; presumably it mirrors the fixture's own text.
        String expectedTitle = "Title : Tilte with UTF-8 chars \u00f6\u00e4\u00e5";
        String expectedContent = "Content with UTF-8 chars";
        String expectedChars = "\u00e5\u00e4\u00f6";

        String text = new Tika().parseToString(HtmlParserTest.class.getResourceAsStream("/test-documents/testXHTML_utf8.html"), new Metadata());

        Assert.assertTrue("Did not contain expected text:" + expectedTitle, text.contains(expectedTitle));
        Assert.assertTrue("Did not contain expected text:" + expectedContent, text.contains(expectedContent));
        Assert.assertTrue("Did not contain expected text:" + expectedChars, text.contains(expectedChars));
    }

    /** Checks the detected content type, metadata and extracted text of the XHTML sample. */
    @Test
    public void testXhtmlParsing() throws Exception {
        Metadata meta = new Metadata();
        InputStream xhtml = HtmlParserTest.class.getResourceAsStream("/test-documents/testXHTML.html");
        String text = new Tika().parseToString(xhtml, meta);

        Assert.assertEquals("application/xhtml+xml", meta.get("Content-Type"));
        Assert.assertEquals("XHTML test document", meta.get(TikaCoreProperties.TITLE));
        Assert.assertEquals("Tika Developers", meta.get("Author"));
        Assert.assertEquals("5", meta.get("refresh"));

        // Each phrase must appear somewhere in the extracted text.
        String[] phrases = {"ability of Apache Tika", "extract content", "an XHTML document"};
        for (String phrase : phrases) {
            Assert.assertTrue(text.contains(phrase));
        }
    }

    /** Parsing a zero-length stream must produce empty body text. */
    @Test
    public void testParseEmpty() throws Exception {
        InputStream empty = new ByteArrayInputStream(new byte[0]);
        BodyContentHandler body = new BodyContentHandler();

        new HtmlParser().parse(empty, body, new Metadata(), new ParseContext());

        String extracted = body.toString();
        Assert.assertEquals("", extracted);
    }

    /** Text placed directly inside {@code <body>} must be extracted verbatim. */
    @Test
    public void testCharactersDirectlyUnderBodyElement() throws Exception {
        byte[] html = "<html><body>test</body></html>".getBytes("UTF-8");
        String text = new Tika().parseToString(new ByteArrayInputStream(html));
        Assert.assertEquals("test", text);
    }

    /**
     * Runs {@code assertRelativeLink} over a table of base-href / link-href combinations.
     */
    @Test
    public void testBaseHref() throws Exception {
        // Each row: {expected resolved href, <base href>, <a href>}.
        String[][] cases = {
            {"http://lucene.apache.org/tika/", "http://lucene.apache.org/", "tika/"},
            {"http://domain.com/?pid=1", "http://domain.com", "?pid=1"},
            {"http://domain.com/?pid=2", "http://domain.com?pid=1", "?pid=2"},
            {"http://domain.com/file.html", "http://domain.com/path/", "/file.html"},
            {"http://domain.com/path/file.html", "http://domain.com/path/", "./file.html"},
            {"http://domain.com/path/file.html", "http://domain.com/path/", "file.html"},
            {"http://domain2.com/newpath", "http://domain.com/path/to/file", "http://domain2.com/newpath"},
            {"http://domain.com/path/?pid=1", "http://domain.com/path/", "?pid=1"},
            {"http://domain.com/file?pid=1", "http://domain.com/file", "?pid=1"},
            {"http://domain.com/path/d;p?pid=1", "http://domain.com/path/d;p?q#f", "?pid=1"},
        };
        for (String[] row : cases) {
            assertRelativeLink(row[0], row[1], row[2]);
        }
    }

    /**
     * Parses a minimal document containing a {@code <base>} element and a single
     * {@code <a>} element, then asserts that exactly one href is reported and that it
     * equals the expected value.
     *
     * @param str  the href value expected in the parser's output
     * @param str2 the value placed in {@code <base href="...">}
     * @param str3 the value placed in {@code <a href="...">}
     * @throws Exception if parsing fails
     */
    private void assertRelativeLink(String str, String str2, String str3) throws Exception {
        String html = "<html><head><base href=\"" + str2 + "\"></head><body><a href=\"" + str3 + "\">test</a></body></html>";

        // Typed list instead of a raw ArrayList, so no unchecked adds or casts are needed.
        final ArrayList<String> hrefs = new ArrayList<String>();
        ContentHandler collector = new DefaultHandler() {
            @Override
            public void startElement(String uri, String localName, String qName, Attributes attributes) {
                if (qName.equals("a")) {
                    String href = attributes.getValue("", "href");
                    if (href != null) {
                        hrefs.add(href);
                    }
                }
            }
        };

        new HtmlParser().parse(new ByteArrayInputStream(html.getBytes("UTF-8")), collector, new Metadata(), new ParseContext());

        Assert.assertEquals(1, hrefs.size());
        Assert.assertEquals(str, hrefs.get(0));
    }

    /** Adjacent table cells must not be glued together in the extracted text. */
    @Test
    public void testWhitespaceBetweenTableCells() throws Exception {
        byte[] html = "<html><body><table><tr><td>a</td><td>b</td></table></body></html>".getBytes("UTF-8");
        String text = new Tika().parseToString(new ByteArrayInputStream(html));

        Assert.assertTrue(text.contains("a"));
        Assert.assertTrue(text.contains("b"));
        // Something must separate the two cell values.
        Assert.assertFalse(text.contains("ab"));
    }

    /** A charset in a {@code <meta http-equiv="content-type">} tag is reported as Content-Encoding. */
    @Test
    public void testHttpEquivCharset() throws Exception {
        String html = "<html><head><meta http-equiv=\"content-type\" content=\"text/html; charset=ISO-8859-1\" /><title>the name is ándre</title></head><body></body></html>";
        ByteArrayInputStream input = new ByteArrayInputStream(html.getBytes("ISO-8859-1"));
        Metadata meta = new Metadata();

        new HtmlParser().parse(input, new BodyContentHandler(), meta, new ParseContext());

        Assert.assertEquals("ISO-8859-1", meta.get("Content-Encoding"));
    }

    /** A charset in an HTML5-style {@code <meta charset>} tag is reported as Content-Encoding. */
    @Test
    public void testHtml5Charset() throws Exception {
        String html = "<html><head><meta charset=\"ISO-8859-15\" /><title>the name is ándre</title></head><body></body></html>";
        ByteArrayInputStream input = new ByteArrayInputStream(html.getBytes("ISO-8859-1"));
        Metadata meta = new Metadata();

        new HtmlParser().parse(input, new BodyContentHandler(), meta, new ParseContext());

        Assert.assertEquals("ISO-8859-15", meta.get("Content-Encoding"));
    }

    /** With no declared charset, a UTF-8 encoded title must still be decoded correctly. */
    @Test
    public void testDetectOfCharset() throws Exception {
        String html = "<html><head><title>Ž</title></head><body></body></html>";
        ByteArrayInputStream input = new ByteArrayInputStream(html.getBytes("UTF-8"));
        Metadata meta = new Metadata();

        new HtmlParser().parse(input, new BodyContentHandler(), meta, new ParseContext());

        Assert.assertEquals("Ž", meta.get(TikaCoreProperties.TITLE));
    }

    /**
     * Parses the same markup twice: once as UTF-8 bytes with empty metadata, and once as
     * ISO-8859-1 bytes with a Content-Type metadata entry naming that charset. Each run
     * checks the reported Content-Encoding.
     */
    @Test
    public void testUsingCharsetInContentTypeHeader() throws Exception {
        final String html = "<html><head><title>the name is ándre</title></head><body></body></html>";

        // First pass: nothing supplied up front.
        Metadata plain = new Metadata();
        new HtmlParser().parse(new ByteArrayInputStream(html.getBytes("UTF-8")), new BodyContentHandler(), plain, new ParseContext());
        Assert.assertEquals("UTF-8", plain.get("Content-Encoding"));

        // Second pass: charset supplied through the Content-Type metadata entry.
        Metadata withHeader = new Metadata();
        withHeader.set("Content-Type", "text/html; charset=ISO-8859-1");
        new HtmlParser().parse(new ByteArrayInputStream(html.getBytes("ISO-8859-1")), new BodyContentHandler(), withHeader, new ParseContext());
        Assert.assertEquals("ISO-8859-1", withHeader.get("Content-Encoding"));
    }

    /** A {@code <br>} and a closing {@code </div>} must each separate words in the extracted text. */
    @Test
    public void testLineBreak() throws Exception {
        byte[] html = "<html><body><div>foo<br>bar</div>baz</body></html>".getBytes("US-ASCII");
        String text = new Tika().parseToString(new ByteArrayInputStream(html));

        // Tokenise on any whitespace run after trimming the ends.
        String[] words = text.trim().split("\\s+");

        Assert.assertEquals(3L, words.length);
        String[] expected = {"foo", "bar", "baz"};
        for (int idx = 0; idx < expected.length; idx++) {
            Assert.assertEquals(expected[idx], words[idx]);
        }
    }

    /** A Content-Language value set before parsing must still be "en" afterwards. */
    @Test
    public void testIgnoreCharsetDetectorLanguage() throws Exception {
        String html = "<html><title>Simple Content</title><body></body></html>";
        Metadata meta = new Metadata();
        meta.add("Content-Language", "en");

        ByteArrayInputStream input = new ByteArrayInputStream(html.getBytes("UTF-8"));
        new HtmlParser().parse(input, new BodyContentHandler(), meta, new ParseContext());

        Assert.assertEquals("en", meta.get("Content-Language"));
    }

    /**
     * Checks charset extraction from malformed http-equiv content values: one with a
     * duplicated charset parameter and one with a doubled semicolon.
     */
    @Test
    public void testHttpEquivCharsetFunkyAttributes() throws Exception {
        // Variant 1: charset parameter given twice.
        String duplicated = "<html><head><meta http-equiv=\"content-type\" content=\"text/html; charset=ISO-8859-15; charset=iso-8859-15\" /><title>the name is ándre</title></head><body></body></html>";
        Metadata first = new Metadata();
        new HtmlParser().parse(new ByteArrayInputStream(duplicated.getBytes("ISO-8859-1")), new BodyContentHandler(), first, new ParseContext());
        Assert.assertEquals("ISO-8859-15", first.get("Content-Encoding"));

        // Variant 2: two semicolons before the charset parameter.
        String doubledSemicolon = "<html><head><meta http-equiv=\"content-type\" content=\"text/html;;charset=ISO-8859-15\" /><title>the name is ándre</title></head><body></body></html>";
        Metadata second = new Metadata();
        new HtmlParser().parse(new ByteArrayInputStream(doubledSemicolon.getBytes("ISO-8859-1")), new BodyContentHandler(), second, new ParseContext());
        Assert.assertEquals("ISO-8859-15", second.get("Content-Encoding"));
    }

    /**
     * Like the regular Content-Type header test, but the second run supplies the header
     * with the charset parameter placed before the media type.
     */
    @Test
    public void testUsingFunkyCharsetInContentTypeHeader() throws Exception {
        final String html = "<html><head><title>the name is ándre</title></head><body></body></html>";

        // First pass: nothing supplied up front.
        Metadata plain = new Metadata();
        new HtmlParser().parse(new ByteArrayInputStream(html.getBytes("UTF-8")), new BodyContentHandler(), plain, new ParseContext());
        Assert.assertEquals("UTF-8", plain.get("Content-Encoding"));

        // Second pass: unusually ordered Content-Type value.
        Metadata withHeader = new Metadata();
        withHeader.set("Content-Type", "charset=ISO-8859-1;text/html");
        new HtmlParser().parse(new ByteArrayInputStream(html.getBytes("ISO-8859-1")), new BodyContentHandler(), withHeader, new ParseContext());
        Assert.assertEquals("ISO-8859-1", withHeader.get("Content-Encoding"));
    }

    /**
     * Parses {@code big-preamble.html} and checks that the reported Content-Encoding is
     * {@code windows-1251}.
     */
    @Test
    public void testMetaHttpEquivWithLotsOfPreambleText() throws Exception {
        Metadata metadata = new Metadata();
        // The stream is opened here, so it is closed here as well.
        try (InputStream stream = HtmlParserTest.class.getResourceAsStream("/test-documents/big-preamble.html")) {
            new HtmlParser().parse(stream, new BodyContentHandler(), metadata, new ParseContext());
        }
        Assert.assertEquals("windows-1251", metadata.get("Content-Encoding"));
    }

    /**
     * Parses {@code boilerplate.html} through a {@code BoilerpipeContentHandler} and checks
     * that the resulting text starts and ends with the expected content and contains
     * neither "boilerplate" nor "footer".
     */
    @Test
    public void testBoilerplateRemoval() throws Exception {
        Metadata metadata = new Metadata();
        BodyContentHandler body = new BodyContentHandler();
        // The stream is opened here, so it is closed here as well.
        try (InputStream stream = HtmlParserTest.class.getResourceAsStream("/test-documents/boilerplate.html")) {
            new HtmlParser().parse(stream, new BoilerpipeContentHandler(body), metadata, new ParseContext());
        }

        String text = body.toString();
        Assert.assertTrue(text.startsWith("This is the real meat"));
        Assert.assertTrue(text.endsWith("This is the end of the text.\n"));
        Assert.assertFalse(text.contains("boilerplate"));
        Assert.assertFalse(text.contains("footer"));
    }

    /**
     * Serialises a small document back to HTML and checks that {@code <title>},
     * {@code <meta>} and {@code <link>} appear inside {@code <head>} and not inside
     * {@code <body>}, and that the output ends with the closing body and html tags.
     */
    @Test
    public void testElementOrdering() throws Exception {
        String html = "<html><head><title>Title</title><meta http-equiv=\"content-type\" content=\"text/html\"><link rel=\"next\" href=\"next.html\" /></head><body><p>Simple Content</p></body></html>";
        StringWriter out = new StringWriter();
        new HtmlParser().parse(new ByteArrayInputStream(html.getBytes("UTF-8")), makeHtmlTransformer(out), new Metadata(), new ParseContext());
        String result = out.toString();

        // Title element in the head section.
        Assert.assertTrue(Pattern.matches("(?s)<html.*<head>.*<title>Title</title>.*</head>.*$", result));

        // No meta element in the body. The pattern was previously "<meta. *</body>", which
        // only matched "<meta" + one character + spaces directly before "</body>" and so
        // could hardly ever fail; it now mirrors the <link> check below.
        Assert.assertFalse(Pattern.matches("(?s).*<body>.*<meta .*</body>.*$", result));

        // Meta element in the head section.
        Assert.assertTrue(Pattern.matches("(?s)<html.*<head>.*<meta .*</head>.*$", result));

        // No link element in the body.
        Assert.assertFalse(Pattern.matches("(?s).*<body>.*<link .*</body>.*$", result));

        // Link element in the head section.
        Assert.assertTrue(Pattern.matches("(?s)<html.*<head>.*<link .*</head>.*$", result));

        // Output ends with the body and html end tags.
        Assert.assertTrue(Pattern.matches("(?s).*</body>.*</html>$", result));
    }

    /** An {@code <img src>} must be emitted resolved against the document's base href. */
    @Test
    public void testImgUrlExtraction() throws Exception {
        String html = "<html><head><title>Title</title><base href=\"http://domain.com\" /></head><body><img src=\"image.jpg\" /></body></html>";
        StringWriter out = new StringWriter();

        new HtmlParser().parse(new ByteArrayInputStream(html.getBytes("UTF-8")), makeHtmlTransformer(out), new Metadata(), new ParseContext());

        String result = out.toString();
        Assert.assertTrue(Pattern.matches("(?s).*src=\"http://domain.com/image.jpg\".*$", result));
    }

    /** A {@code <frame src>} must be emitted resolved against the document's base href. */
    @Test
    public void testFrameSrcExtraction() throws Exception {
        String html = "<html><head><title>Title</title><base href=\"http://domain.com\" /></head><frameset><frame src=\"frame.html\" /></frameset></html>";
        StringWriter out = new StringWriter();

        new HtmlParser().parse(new ByteArrayInputStream(html.getBytes("UTF-8")), makeHtmlTransformer(out), new Metadata(), new ParseContext());

        String result = out.toString();
        Assert.assertTrue(Pattern.matches("(?s).*<frame .* src=\"http://domain.com/frame.html\"/>.*$", result));
    }

    /** An {@code <iframe src>} must be emitted resolved against the document's base href. */
    @Test
    public void testIFrameSrcExtraction() throws Exception {
        String html = "<html><head><title>Title</title><base href=\"http://domain.com\" /></head><body><iframe src =\"framed.html\" width=\"100%\" height=\"300\"><p>Your browser doesn't support iframes!</p></body></html>";
        StringWriter out = new StringWriter();

        new HtmlParser().parse(new ByteArrayInputStream(html.getBytes("UTF-8")), makeHtmlTransformer(out), new Metadata(), new ParseContext());

        String result = out.toString();
        Assert.assertTrue(Pattern.matches("(?s).*<iframe .* src=\"http://domain.com/framed.html\".*$", result));
    }

    /** An {@code <area href>} inside a {@code <map>} must be emitted resolved against the base href. */
    @Test
    public void testAreaExtraction() throws Exception {
        String html = "<html><head><title>Title</title><base href=\"http://domain.com\" /></head><body><p><map name=\"map\" id=\"map\"><area shape=\"rect\" href=\"map.html\" alt=\"\" /></map></p></body></html>";
        StringWriter out = new StringWriter();

        new HtmlParser().parse(new ByteArrayInputStream(html.getBytes("UTF-8")), makeHtmlTransformer(out), new Metadata(), new ParseContext());

        String result = out.toString();
        Assert.assertTrue(Pattern.matches("(?s).*<map .*<area .* href=\"http://domain.com/map.html\".*</map>.*$", result));
    }

    /**
     * An {@code <object data>} must be emitted resolved against the base href, with its
     * nested {@code <param>} element kept.
     */
    @Test
    public void testObjectExtraction() throws Exception {
        String html = "<html><head><title>Title</title><base href=\"http://domain.com\" /></head><body><p><object data=\"object.data\" type=\"text/html\"><param name=\"name\" value=\"value\" /></object></p></body></html>";
        StringWriter out = new StringWriter();

        new HtmlParser().parse(new ByteArrayInputStream(html.getBytes("UTF-8")), makeHtmlTransformer(out), new Metadata(), new ParseContext());

        String result = out.toString();
        boolean found = Pattern.matches("(?s).*<object data=\"http://domain.com/object.data\".*<param .* name=\"name\" value=\"value\"/>.*</object>.*$", result);
        // Include the serialised output in the failure message to ease diagnosis.
        Assert.assertTrue("<object> tag not correctly found in:\n" + result, found);
    }

    /**
     * Pre-populates metadata with a Content-Type and a null Language value, then checks
     * that the serialised output has a Content-Type {@code <meta>} tag and no Language one.
     */
    @Test
    public void testMetaTagHandling() throws Exception {
        String html = "<html><body><h1>header</h1><p>some text</p></body></html>";

        Metadata meta = new Metadata();
        meta.add("Content-Type", "text/html; charset=utf-8");
        meta.add("Language", (String) null);

        StringWriter out = new StringWriter();
        new HtmlParser().parse(new ByteArrayInputStream(html.getBytes("UTF-8")), makeHtmlTransformer(out), meta, new ParseContext());
        String result = out.toString();

        // Content-Type meta tag is present...
        Assert.assertTrue(Pattern.matches("(?s).*<meta name=\"Content-Type\" content=\"text/html; charset=UTF-8\"/>.*$", result));
        // ...while nothing is emitted for the null Language value.
        Assert.assertFalse(Pattern.matches("(?s).*<meta name=\"Language\".*$", result));
    }

    /**
     * Feeds the parser framesets wrongly nested inside {@code <body>} and checks that the
     * frames are still emitted and that no {@code <body>} start tag appears in the output.
     */
    @Test
    public void testBrokenFrameset() throws Exception {
        // Case 1: a single frameset inside a body, with a base href.
        String simple = "<html><head><title>Title</title><base href=\"http://domain.com\" /></head><body><frameset><frame src=\"frame.html\" /></frameset></body></html>";
        StringWriter simpleOut = new StringWriter();
        new HtmlParser().parse(new ByteArrayInputStream(simple.getBytes("UTF-8")), makeHtmlTransformer(simpleOut), new Metadata(), new ParseContext());
        String simpleResult = simpleOut.toString();
        Assert.assertTrue(Pattern.matches("(?s).*<frame .* src=\"http://domain.com/frame.html\"/>.*$", simpleResult));
        Assert.assertFalse(Pattern.matches("(?s).*<body>.*$", simpleResult));

        // Case 2: nested framesets including a stray </frame> end tag.
        String nested = "<html><head><title> my title </title></head><body><frameset rows=\"20,*\"><frame src=\"top.html\"></frame><frameset cols=\"20,*\"><frame src=\"left.html\"></frame><frame src=\"invalid.html\"/></frame><frame src=\"right.html\"></frame></frameset></frameset></body></html>";
        StringWriter nestedOut = new StringWriter();
        new HtmlParser().parse(new ByteArrayInputStream(nested.getBytes("UTF-8")), makeHtmlTransformer(nestedOut), new Metadata(), new ParseContext());
        String nestedResult = nestedOut.toString();

        String[] framePatterns = {
            "(?s).*<frame .* src=\"top.html\"/>.*$",
            "(?s).*<frame .* src=\"left.html\"/>.*$",
            "(?s).*<frame .* src=\"invalid.html\"/>.*$",
            "(?s).*<frame .* src=\"right.html\"/>.*$",
        };
        for (String framePattern : framePatterns) {
            Assert.assertTrue(Pattern.matches(framePattern, nestedResult));
        }
        Assert.assertFalse(Pattern.matches("(?s).*<body>.*$", nestedResult));
    }

    /**
     * Serialises {@code boilerplate.html} back to markup and checks that the html, head,
     * title and body elements are all present in the output.
     */
    @Test
    public void testBoilerplateDelegation() throws Exception {
        Metadata metadata = new Metadata();
        StringWriter out = new StringWriter();
        // The stream is opened here, so it is closed here as well.
        try (InputStream stream = HtmlParserTest.class.getResourceAsStream("/test-documents/boilerplate.html")) {
            new HtmlParser().parse(stream, makeHtmlTransformer(out), metadata, new ParseContext());
        }

        String result = out.toString();
        Assert.assertTrue(Pattern.matches("(?s).*<html xmlns=\"http://www.w3.org/1999/xhtml\">.*</html>.*$", result));
        Assert.assertTrue(Pattern.matches("(?s).*<head>.*</head>.*$", result));
        Assert.assertTrue(Pattern.matches("(?s).*<title>Title</title>.*$", result));
        Assert.assertTrue(Pattern.matches("(?s).*<body>.*</body>.*$", result));
    }

    /** A {@code <link href>} in the head must be emitted resolved against the base href. */
    @Test
    public void testLinkHrefResolution() throws Exception {
        String html = "<html><head><title>Title</title><base href=\"http://domain.com\" /><link rel=\"next\" href=\"next.html\" /></head><body></body></html>";
        StringWriter out = new StringWriter();

        new HtmlParser().parse(new ByteArrayInputStream(html.getBytes("UTF-8")), makeHtmlTransformer(out), new Metadata(), new ParseContext());

        String result = out.toString();
        Assert.assertTrue(Pattern.matches("(?s).*<head>.*<link rel=\"next\" href=\"http://domain.com/next.html\"/>.*</head>.*$", result));
    }

    /**
     * Builds a SAX content handler that serialises the events it receives to the given
     * writer, configured with output method "html", indent "no" and encoding "utf-8".
     *
     * @param writer destination for the serialised markup
     * @return a handler whose result is the given writer
     * @throws Exception if the transformer handler cannot be created
     */
    private ContentHandler makeHtmlTransformer(Writer writer) throws Exception {
        SAXTransformerFactory factory = (SAXTransformerFactory) SAXTransformerFactory.newInstance();
        TransformerHandler handler = factory.newTransformerHandler();

        javax.xml.transform.Transformer transformer = handler.getTransformer();
        transformer.setOutputProperty("method", "html");
        transformer.setOutputProperty("indent", "no");
        transformer.setOutputProperty("encoding", "utf-8");

        handler.setResult(new StreamResult(writer));
        return handler;
    }

    /**
     * Parses {@code boilerplate.html} through a {@code BoilerpipeContentHandler} with
     * markup included, then checks the serialised output for the expected structure and
     * for the absence of "boilerplate" and "footer".
     */
    @Test
    public void testBoilerplateWithMarkup() throws Exception {
        Metadata metadata = new Metadata();
        StringWriter out = new StringWriter();
        BoilerpipeContentHandler handler = new BoilerpipeContentHandler(makeHtmlTransformer(out));
        handler.setIncludeMarkup(true);
        // The stream is opened here, so it is closed here as well.
        try (InputStream stream = HtmlParserTest.class.getResourceAsStream("/test-documents/boilerplate.html")) {
            new HtmlParser().parse(stream, handler, metadata, new ParseContext());
        }

        String result = out.toString();
        Assert.assertTrue("Has empty table elements", result.contains("<body><table><tr><td><table><tr><td>"));
        Assert.assertTrue("Has empty a element", result.contains("<a shape=\"rect\" href=\"Main.php\"/>"));
        Assert.assertTrue("Has real content", result.contains("<p>This is the real meat"));
        Assert.assertTrue("Ends with appropriate HTML", result.endsWith("</p></body></html>"));
        Assert.assertFalse(result.contains("boilerplate"));
        Assert.assertFalse(result.contains("footer"));
    }

    /** Parsing {@code tika434.html} must complete and return a non-null string. */
    @Test
    public void testPushback() throws IOException, TikaException {
        InputStream document = HtmlParserTest.class.getResourceAsStream("/test-documents/tika434.html");
        String text = new Tika().parseToString(document, new Metadata());
        Assert.assertNotNull(text);
    }

    /**
     * Registers {@code IdentityHtmlMapper.INSTANCE} in the parse context and checks that
     * an empty body is serialised as {@code <body/>}.
     */
    @Test
    public void testIdentityMapper() throws Exception {
        String html = "<html><head><title>Title</title></head><body></body></html>";

        ParseContext context = new ParseContext();
        context.set(HtmlMapper.class, IdentityHtmlMapper.INSTANCE);

        Metadata meta = new Metadata();
        StringWriter out = new StringWriter();
        new HtmlParser().parse(new ByteArrayInputStream(html.getBytes("UTF-8")), makeHtmlTransformer(out), meta, context);

        String result = out.toString();
        Assert.assertTrue(Pattern.matches("(?s).*<body/>.*$", result));
    }

    /**
     * A single list item must be extracted as a tab, the item text, and two newlines.
     */
    @Test
    public void testNewlineAndIndent() throws Exception {
        String html = "<html><head><title>Title</title></head><body><ul><li>one</li></ul></body></html>";
        BodyContentHandler body = new BodyContentHandler();

        new HtmlParser().parse(new ByteArrayInputStream(html.getBytes("UTF-8")), body, new Metadata(), new ParseContext());

        // The expected value has no regex metacharacters, so a full regex match is the same
        // as string equality; assertEquals also reports expected vs. actual on failure.
        Assert.assertEquals("\tone\n\n", body.toString());
    }

    /**
     * Parses {@code boilerplate-whitespace.html} through a {@code BoilerpipeContentHandler}
     * with markup included and checks that adjacent items stay separated by a newline and
     * that the non-ASCII text is retained.
     */
    @Test
    public void testBoilerplateWhitespace() throws Exception {
        Metadata metadata = new Metadata();
        BodyContentHandler body = new BodyContentHandler();
        BoilerpipeContentHandler handler = new BoilerpipeContentHandler(body);
        handler.setIncludeMarkup(true);
        // The stream is opened here, so it is closed here as well.
        try (InputStream stream = HtmlParserTest.class.getResourceAsStream("/test-documents/boilerplate-whitespace.html")) {
            new HtmlParser().parse(stream, handler, metadata, new ParseContext());
        }

        String text = body.toString();
        Assert.assertFalse(text.contains("item_aitem_b"));
        Assert.assertTrue(text.contains("item_a\nitem_b"));
        Assert.assertTrue(text.contains("有什么需要我帮你的"));
    }

    /**
     * {@code <meta property="og:...">} tags must be copied into metadata, with repeated
     * properties stored as multiple values.
     */
    @Test
    public void testOpenGraphMetadata() throws Exception {
        String html = "<html><head><meta property=\"og:description\" content=\"some description\" /><meta property=\"og:image\" content=\"http://example.com/image1.jpg\" /><meta property=\"og:image\" content=\"http://example.com/image2.jpg\" /><title>hello</title></head><body></body></html>";
        ByteArrayInputStream input = new ByteArrayInputStream(html.getBytes("ISO-8859-1"));
        Metadata meta = new Metadata();

        new HtmlParser().parse(input, new BodyContentHandler(), meta, new ParseContext());

        Assert.assertEquals("some description", meta.get("og:description"));
        // og:image appears twice in the markup.
        Assert.assertTrue(meta.isMultiValued("og:image"));
    }

    /** Parsing {@code testUserDefinedCharset.mhtml} must complete and return a non-null string. */
    @Test
    public void testUserDefinedCharset() throws Exception {
        InputStream document = HtmlParserTest.class.getResourceAsStream("/test-documents/testUserDefinedCharset.mhtml");
        String text = new Tika().parseToString(document, new Metadata());
        Assert.assertNotNull(text);
    }

    /**
     * Parses the four {@code testHTMLNoisyMetaEncoding_N.html} fixtures and checks that
     * each yields the expected Arabic word in its extracted text.
     */
    @Test
    public void testNoisyMetaCharsetHeaders() throws Exception {
        final Tika tika = new Tika();
        for (int fileNo = 1; fileNo <= 4; fileNo++) {
            String path = "/test-documents/testHTMLNoisyMetaEncoding_" + fileNo + ".html";
            InputStream document = HtmlParserTest.class.getResourceAsStream(path);
            String text = tika.parseToString(document);
            // The message names the fixture so a failure identifies the offending file.
            Assert.assertTrue("testing: " + path, text.contains("أعرب"));
        }
    }

    /**
     * Parses the same table-inside-anchor markup twice: once with the default parse
     * context, where the first link's text is empty, and once with a customised
     * {@code HTMLSchema} in the context, where the link text contains the cell text.
     */
    @Test
    public void testCustomHtmlSchema() throws Exception {
        final String html = "<html><body><a><table><tr><td>text</tr></tr></table></a></body></html>";
        // The same Metadata instance is deliberately reused for both parses, as before.
        Metadata meta = new Metadata();

        // Default schema.
        LinkContentHandler defaultLinks = new LinkContentHandler();
        new HtmlParser().parse(new ByteArrayInputStream(html.getBytes("ISO-8859-1")), defaultLinks, meta, new ParseContext());
        Link defaultLink = (Link) defaultLinks.getLinks().get(0);
        Assert.assertEquals("", defaultLink.getText());

        // Custom schema that redefines the "a" element type.
        // NOTE(review): the numeric arguments are presumably TagSoup's content model,
        // member-of mask and flags; confirm against Schema.elementType.
        HTMLSchema schema = new HTMLSchema();
        schema.elementType("a", -1, 65535, 0);
        ParseContext customContext = new ParseContext();
        customContext.set(Schema.class, schema);

        LinkContentHandler customLinks = new LinkContentHandler();
        new HtmlParser().parse(new ByteArrayInputStream(html.getBytes("ISO-8859-1")), customLinks, meta, customContext);
        Link customLink = (Link) customLinks.getLinks().get(0);
        Assert.assertEquals("\ttext\n\n", customLink.getText());
    }

    /**
     * Checks that the parser supplies a {@link Locator} and that it reports the expected
     * position when the characters "Test Indexation Html" are delivered.
     */
    @Test
    public void testLocator() throws Exception {
        // position[0] = line number, position[1] = column number.
        final int[] position = new int[2];

        // DefaultHandler already provides no-op implementations of every ContentHandler
        // callback, so only the two callbacks of interest are overridden.
        ContentHandler handler = new DefaultHandler() {
            private Locator locator;

            @Override
            public void setDocumentLocator(Locator locator) {
                this.locator = locator;
            }

            @Override
            public void characters(char[] ch, int start, int length) throws SAXException {
                if ("Test Indexation Html".equals(new String(ch, start, length)) && this.locator != null) {
                    position[0] = this.locator.getLineNumber();
                    position[1] = this.locator.getColumnNumber();
                }
            }
        };

        // The stream is opened here, so it is closed here as well.
        try (InputStream stream = HtmlParserTest.class.getResourceAsStream("/test-documents/testHTML.html")) {
            new HtmlParser().parse(stream, handler, new Metadata(), new ParseContext());
        }

        Assert.assertEquals(24, position[0]);
        // The column is only required to be within 10 of 47.
        Assert.assertTrue(Math.abs(position[1] - 47) < 10);
    }
}
