-SAXで子タグ
<html> <body> <form> <input type="text"> <test attr="hoge">test</test> </input> </form> </body> </html>
こういうのを子タグというのであれば、取れているみたいです。
package test.org.seasar.kariyushi;

import java.io.InputStream;

import junit.framework.TestCase;

import org.xml.sax.Attributes;
import org.xml.sax.ContentHandler;
import org.xml.sax.InputSource;
import org.xml.sax.Locator;
import org.xml.sax.SAXException;

/**
 * Experiment that feeds an HTML resource through the NekoHTML SAX parser and
 * prints every SAX event it receives to standard output.
 *
 * The test case registers itself as the {@link ContentHandler}, so the order
 * of the printed lines is the order in which the parser fires its events.
 */
public class SaxParserTest extends TestCase implements ContentHandler {

    /**
     * Creates the test case.
     *
     * @param arg name of the test, passed through to {@link TestCase}
     */
    public SaxParserTest(String arg) {
        super(arg);
    }

    /**
     * Parses {@code test/org/seasar/kariyushi/test2.html} from the classpath
     * with this instance installed as the content handler.
     *
     * @throws Exception if a parser property is rejected, or reading or
     *                   parsing the resource fails
     */
    public void testSaxParse() throws Exception {
        org.cyberneko.html.parsers.SAXParser parser =
                new org.cyberneko.html.parsers.SAXParser();
        parser.setProperty(
                "http://cyberneko.org/html/properties/default-encoding",
                "Shift_JIS");
        parser.setProperty(
                "http://cyberneko.org/html/properties/names/attrs",
                "default");
        parser.setProperty(
                "http://cyberneko.org/html/properties/names/elems",
                "match");

        ClassLoader loader = getClass().getClassLoader();
        InputStream stream = loader.getResourceAsStream(
                "test/org/seasar/kariyushi/test2.html");
        // Fail with a clear message instead of an obscure parser error when
        // the resource is missing from the classpath.
        assertNotNull("test/org/seasar/kariyushi/test2.html not found", stream);

        try {
            InputSource source = new InputSource();
            source.setByteStream(stream);
            parser.setContentHandler(this);
            parser.parse(source);
        } finally {
            // Release the resource stream even when parsing throws.
            stream.close();
        }
    }

    /**
     * Prints the trimmed character data, skipping chunks that consist only of
     * whitespace.
     *
     * @param ch     buffer holding the characters
     * @param start  offset of the first character in {@code ch}
     * @param length number of characters to read
     */
    public void characters(char[] ch, int start, int length)
            throws SAXException {
        String body = new String(ch, start, length).trim();
        if (body.length() > 0) {
            System.out.println("characters: " + body);
        }
    }

    /** Prints a marker when the document ends. */
    public void endDocument() throws SAXException {
        System.out.println("endDocument");
    }

    /**
     * Prints the local name of the element being closed.
     *
     * @param namespaceURI namespace URI of the element (unused)
     * @param localName    local name of the element
     * @param qName        qualified name of the element (unused)
     */
    public void endElement(String namespaceURI, String localName,
            String qName) throws SAXException {
        System.out.println("endElement: " + localName);
    }

    /** Prints a marker when the document starts. */
    public void startDocument() throws SAXException {
        System.out.println("startDocument");
    }

    /**
     * Prints the local name of the element being opened followed by the name
     * and value of its first attribute.
     *
     * For an element without attributes, index 0 is out of range and both
     * lookups yield {@code null}, so the line ends in {@code null=null}.
     *
     * @param namespaceURI namespace URI of the element (unused)
     * @param localName    local name of the element
     * @param qName        qualified name of the element (unused)
     * @param atts         attributes attached to the element
     */
    public void startElement(String namespaceURI, String localName,
            String qName, Attributes atts) throws SAXException {
        System.out.println("startElement: " + localName + " "
                + atts.getLocalName(0) + "=" + atts.getValue(0));
    }

    /** Not used by this experiment. */
    public void startPrefixMapping(String prefix, String uri)
            throws SAXException {
    }

    /** Not used by this experiment. */
    public void endPrefixMapping(String prefix) throws SAXException {
    }

    /** Not used by this experiment. */
    public void ignorableWhitespace(char[] ch, int start, int length)
            throws SAXException {
    }

    /** Not used by this experiment. */
    public void processingInstruction(String target, String data)
            throws SAXException {
    }

    /** Not used by this experiment. */
    public void setDocumentLocator(Locator locator) {
    }

    /** Not used by this experiment. */
    public void skippedEntity(String name) throws SAXException {
    }
}
startDocument
startElement: html null=null
startElement: body null=null
startElement: form null=null
startElement: input type=text
endElement: input
startElement: test attr=hoge
characters: test
endElement: test
endElement: form
endElement: body
endElement: html
endDocument
おぉ、イベントの発生順が!!(inputのendElementが、子であるtestのstartElementより先に発生しています。)まあ、これはなんとかできるでしょう。原因は、org.cyberneko.html.HTMLTagBalancer#startElement()の中でendElement()を呼んでいるところ。たどると、org.cyberneko.html.HTMLElements#ELEMENTS_ARRAYという配列の中で、
new Element(INPUT, "INPUT", Element.EMPTY, BODY, null),
この第三引数がElement.EMPTYのため、endElementイベントを補完しちゃうんですな。