-SAXで子タグ

<html>
  <body>
    <form>
      <input type="text">
        <test attr="hoge">test</test>
      </input>
    </form>
  </body>
</html>

こういうのを子タグというのであれば、取れているみたいです。

package test.org.seasar.kariyushi;
import java.io.InputStream;
import junit.framework.TestCase;
import org.xml.sax.Attributes;
import org.xml.sax.ContentHandler;
import org.xml.sax.InputSource;
import org.xml.sax.Locator;
import org.xml.sax.SAXException;
/**
 * Exploratory test that runs an HTML resource through the NekoHTML SAX parser
 * and prints each SAX event it receives to standard output.
 *
 * <p>The test case registers itself as the parser's {@link ContentHandler}, so
 * the callback methods below are invoked by the parser during
 * {@link #testSaxParse()}. Nothing is asserted about the events themselves;
 * the printed trace is meant to be inspected by hand.
 */
public class SaxParserTest extends TestCase implements ContentHandler {
    /**
     * Creates the test case.
     *
     * @param arg test name passed to {@link TestCase}
     */
    public SaxParserTest(String arg) {
        super(arg);
    }

    /**
     * Parses {@code test/org/seasar/kariyushi/test2.html} from the class path
     * with this object as the content handler.
     *
     * @throws Exception if the resource cannot be read or parsed
     */
    public void testSaxParse() throws Exception {
        org.cyberneko.html.parsers.SAXParser parser =
            new org.cyberneko.html.parsers.SAXParser();
        parser.setProperty(
            "http://cyberneko.org/html/properties/default-encoding",
            "Shift_JIS");
        parser.setProperty(
            "http://cyberneko.org/html/properties/names/attrs",
            "default");
        parser.setProperty(
            "http://cyberneko.org/html/properties/names/elems",
            "match");
        ClassLoader loader = getClass().getClassLoader();
        InputStream stream = loader.getResourceAsStream(
            "test/org/seasar/kariyushi/test2.html");
        // Fail with a clear message instead of handing the parser a null stream.
        assertNotNull(
            "resource not found: test/org/seasar/kariyushi/test2.html",
            stream);
        try {
            InputSource source = new InputSource();
            source.setByteStream(stream);
            parser.setContentHandler(this);
            parser.parse(source);
        } finally {
            // The original never closed the stream; release it even on failure.
            stream.close();
        }
    }

    /**
     * Prints the trimmed character data, skipping chunks that are empty after
     * trimming.
     *
     * @param ch buffer holding the characters
     * @param start index of the first character in {@code ch}
     * @param length number of characters to read from {@code ch}
     */
    public void characters(char[] ch, int start, int length)
            throws SAXException {
        String body = new String(ch, start, length).trim();
        if (body.length() > 0) {
            System.out.println("characters: " + body);
        }
    }

    /** Prints a marker when the parser reports the end of the document. */
    public void endDocument() throws SAXException {
        System.out.println("endDocument");
    }

    /** Prints the local name of the element being closed. */
    public void endElement(String namespaceURI,
        String localName, String qName) throws SAXException {
        System.out.println("endElement: " + localName);
    }

    /** Prints a marker when the parser reports the start of the document. */
    public void startDocument() throws SAXException {
        System.out.println("startDocument");
    }

    /**
     * Prints the element's local name followed by its first attribute as
     * {@code name=value}.
     */
    public void startElement(String namespaceURI, String localName,
            String qName, Attributes atts) throws SAXException {
        // Attributes returns null for an out-of-range index, so an element
        // without attributes is printed as "null=null".
        System.out.println("startElement: " + localName + " "
                + atts.getLocalName(0) + "=" + atts.getValue(0));
    }

    /** Not needed for this trace; intentionally does nothing. */
    public void startPrefixMapping(String prefix, String uri)
        throws SAXException {
    }

    /** Not needed for this trace; intentionally does nothing. */
    public void endPrefixMapping(String prefix) throws SAXException {
    }

    /** Not needed for this trace; intentionally does nothing. */
    public void ignorableWhitespace(char[] ch, int start, int length)
        throws SAXException {
    }

    /** Not needed for this trace; intentionally does nothing. */
    public void processingInstruction(String target, String data)
        throws SAXException {
    }

    /** Not needed for this trace; intentionally does nothing. */
    public void setDocumentLocator(Locator locator) {
    }

    /** Not needed for this trace; intentionally does nothing. */
    public void skippedEntity(String name) throws SAXException {
    }
}

startDocument
startElement: html null=null
startElement: body null=null
startElement: form null=null
startElement: input type=text
endElement: input
startElement: test attr=hoge
characters: test
endElement: test
endElement: form
endElement: body
endElement: html
endDocument

おぉ、イベントの発生順が想定と違う!!(inputのendElementが、子であるはずのtestのstartElementより先に発生している。)まあ、これはなんとかできるでしょう。原因は、org.cyberneko.html.HTMLTagBalancer#startElement()の中でendElement()を呼んでいるところ。たどると、org.cyberneko.html.HTMLElements#ELEMENTS_ARRAYという配列の中に、次の定義があります。

new Element(INPUT, "INPUT", Element.EMPTY, BODY, null),

この第三引数がEMPTYのため、endElementイベントを補完しちゃうんですな。