Re: Detect XML document encodings with SAX

From:

Steven Simpson <ss@domain.invalid>

Newsgroups:

comp.lang.java.programmer

Date:

Sun, 25 Nov 2012 11:00:06 +0000

Message-ID:

<maa9o9-2vr.ln1@s.simpson148.btinternet.com>

On 21/11/12 14:32, Sebastian wrote:

> Does anyone have an idea why that is so? And how I could
> go about making some XML parser determine the correct encoding?

Sussed it! (Come to think of it, I feel I've sussed this before...)

The charset returned by the locator changes during parsing. At
startDocument(), it is the assumed charset, possibly based on the first
four-or-so bytes. At endDocument(), it is reset to null. On the first
call to startElement, it has the correct value. There might be an
earlier event where it is correct - I didn't investigate.

SSCCE...

import org.xml.sax.*;
import org.xml.sax.ext.*;
import org.xml.sax.helpers.*;

import java.io.*;
import java.nio.charset.*;

/**
 * Demonstrates which encoding a SAX parser's {@link Locator2} reports at
 * different points while parsing a document.
 *
 * <p>For every charset name given on the command line, {@link #main} builds a
 * small XML document in that charset, dumps its bytes, parses it with this
 * handler installed, and prints the value of {@link Locator2#getEncoding()}
 * captured at document start, at the start and end of the {@code root}
 * element, and at document end.
 */
public class SAXEncodingDetector extends DefaultHandler {
    /** Number of bytes shown per row by {@link #printBytes}. */
    private static final int BYTES_PER_ROW = 16;

    /** Text placed inside the {@code root} element of each generated document. */
    static final String MESSAGE = "L\u00f6we \u20ac";

    private String encodingAtDocumentStart;
    private String encodingAtElementStart;
    private String encodingAtElementEnd;
    private String encodingAtDocumentEnd;

    /** Locator supplied by the parser; stays null unless it is a Locator2. */
    private Locator2 locator;

    /** Character data collected while inside the {@code root} element. */
    private final StringWriter content = new StringWriter();

    private boolean inElement;

    /**
     * Writes {@code text} as XML character data, replacing everything the
     * target charset cannot represent with a hexadecimal character reference.
     *
     * <p>The text is processed one Unicode code point at a time, so a
     * character outside the Basic Multilingual Plane yields a single reference
     * (for example {@code &#x1f600;}) instead of one reference per surrogate.
     * The markup characters {@code &}, {@code <} and {@code >} are written as
     * predefined entities so the result stays well-formed.
     *
     * @param out  destination writer
     * @param enc  encoder of the charset the document will be written in;
     *             only used to ask whether a character is encodable
     * @param text text to write
     */
    static void escape(PrintWriter out, CharsetEncoder enc, CharSequence text) {
        final int len = text.length();
        int i = 0;
        while (i < len) {
            final int cp = Character.codePointAt(text, i);
            final int width = Character.charCount(cp);
            final int next = i + width;
            switch (cp) {
            case '&':
                out.print("&amp;");
                break;
            case '<':
                out.print("&lt;");
                break;
            case '>':
                out.print("&gt;");
                break;
            default:
                // A BMP character can use the cheap single-char test; a
                // surrogate pair must be tested as a sequence.
                final boolean encodable = (width == 1)
                    ? enc.canEncode((char) cp)
                    : enc.canEncode(text.subSequence(i, next));
                if (encodable) {
                    out.append(text, i, next);
                } else {
                    // NOTE(review): an unpaired surrogate still ends up here
                    // and is written as a reference to the surrogate itself,
                    // as before; XML has no valid representation for it.
                    out.printf("&#x%x;", cp);
                }
                break;
            }
            i = next;
        }
    }

    /**
     * Builds a small XML document containing {@link #MESSAGE}, encoded in the
     * named charset and declaring that charset in its XML declaration.
     *
     * @param charsetName name of the charset, as accepted by
     *                    {@link Charset#forName(String)}
     * @return the encoded bytes of the document
     * @throws UnsupportedEncodingException declared for callers; this
     *         implementation resolves the charset via {@code Charset.forName}
     */
    static byte[] createXMLBytes(String charsetName)
        throws UnsupportedEncodingException {
        Charset charset = Charset.forName(charsetName);
        CharsetEncoder encoder = charset.newEncoder();
        ByteArrayOutputStream bytesOut = new ByteArrayOutputStream();
        PrintWriter out =
            new PrintWriter(new OutputStreamWriter(bytesOut, charset));
        out.printf("<?xml version=\"1.0\" encoding=\"%s\" ?>%n", charsetName);
        out.print("<root>");
        escape(out, encoder, MESSAGE);
        out.println("</root>");
        // Closing flushes the writer chain into the byte buffer.
        out.close();
        return bytesOut.toByteArray();
    }

    /**
     * For each charset name in {@code args}: generates a document, prints a
     * dump of its bytes, parses it, and prints the encodings recorded by the
     * handler together with the text that was read back.
     *
     * @param args charset names to try
     * @throws SAXException if parsing fails
     * @throws IOException  if reading the document fails
     */
    public static void main(String[] args) throws SAXException, IOException {
        for (String inCharset : args) {
            byte[] bytes = createXMLBytes(inCharset);
            System.out.printf("%nCharset %s: (%d bytes)%n",
                              inCharset, bytes.length);
            printBytes(bytes, System.out);

            // The parser is handed raw bytes only, so it has to work out the
            // encoding from the document itself.
            XMLReader parser = XMLReaderFactory.createXMLReader();
            SAXEncodingDetector handler = new SAXEncodingDetector();
            parser.setContentHandler(handler);
            parser.parse(new InputSource(new ByteArrayInputStream(bytes)));

            System.out.printf("Charset at document start: %s%n",
                              handler.encodingAtDocumentStart);
            System.out.printf(" Charset at element start: %s%n",
                              handler.encodingAtElementStart);
            System.out.printf(" Charset at element end: %s%n",
                              handler.encodingAtElementEnd);
            System.out.printf(" Charset at document end: %s%n",
                              handler.encodingAtDocumentEnd);
            String content = handler.content.toString();
            System.out.println("Content: " + content);
            if (!content.equals(MESSAGE)) {
                System.out.println("Warning: message corrupted");
            }
        }
    }

    /**
     * Returns the encoding currently reported by the locator, or null when
     * the parser did not supply a {@link Locator2}.
     */
    private String currentEncoding() {
        return (locator != null) ? locator.getEncoding() : null;
    }

    /** Keeps the locator only if it offers the {@link Locator2} extension. */
    @Override
    public void setDocumentLocator(Locator locator) {
        if (locator instanceof Locator2) {
            this.locator = (Locator2) locator;
        }
    }

    /** Records the encoding reported when the document starts. */
    @Override
    public void startDocument() throws SAXException {
        this.encodingAtDocumentStart = currentEncoding();
    }

    /** Records the encoding reported when the document ends. */
    @Override
    public void endDocument() throws SAXException {
        this.encodingAtDocumentEnd = currentEncoding();
    }

    /**
     * On the {@code root} element, records the reported encoding and starts
     * collecting character data.
     */
    @Override
    public void startElement(String uri, String localName,
                             String qName, Attributes atts) {
        if ("root".equals(localName)) {
            this.encodingAtElementStart = currentEncoding();
            inElement = true;
        }
    }

    /**
     * On the end of the {@code root} element, records the reported encoding
     * and stops collecting character data.
     */
    @Override
    public void endElement(String uri, String localName, String qName) {
        if ("root".equals(localName)) {
            this.encodingAtElementEnd = currentEncoding();
            inElement = false;
        }
    }

    /** Appends character data to the collected content while inside {@code root}. */
    @Override
    public void characters(char[] ch, int start, int length) {
        if (inElement) {
            content.write(ch, start, length);
        }
    }

    /**
     * Prints a dump of {@code bytes}, sixteen per row: first the hexadecimal
     * values, then a two-column-wide rendering of each byte. Positions past
     * the end of the data in the last row are filled with dots.
     *
     * @param bytes data to dump
     * @param out   stream to print to
     */
    static void printBytes(byte[] bytes, PrintStream out) {
        for (int rowStart = 0; rowStart < bytes.length; rowStart += BYTES_PER_ROW) {
            final int inRow = Math.min(BYTES_PER_ROW, bytes.length - rowStart);

            // Hexadecimal columns.
            for (int col = 0; col < BYTES_PER_ROW; col++) {
                if (col < inRow) {
                    out.printf("%02X ", bytes[rowStart + col]);
                } else {
                    out.print(".. ");
                }
            }

            // Character columns; every cell is two characters wide.
            for (int col = 0; col < BYTES_PER_ROW; col++) {
                if (col >= inRow) {
                    out.print("..");
                    continue;
                }
                final int c = bytes[rowStart + col] & 0xff;
                if (c == 10) {
                    out.print("\\n");
                } else if (c == 13) {
                    out.print("\\r");
                } else if (c == 9) {
                    out.print("\\t");
                } else if (c < 32) {
                    // Remaining control bytes in caret notation (e.g. ^A).
                    out.printf("^%c", (char) (c + 64));
                } else if (c >= 127 && c <= 160) {
                    // Bytes 127..160 are shown as hex instead of as a character.
                    out.printf("%02X", c);
                } else {
                    out.printf("%c ", (char) c);
                }
            }

            out.println();
        }
    }
}

Command:

java SAXEncodingDetector US-ASCII ISO-8859-1 UTF-8 windows-1252

Output:

Charset US-ASCII: (75 bytes)
3C 3F 78 6D 6C 20 76 65 72 73 69 6F 6E 3D 22 31 < ? x m l v e r s i o n = " 1
2E 30 22 20 65 6E 63 6F 64 69 6E 67 3D 22 55 53 . 0 " e n c o d i n g = " U S
2D 41 53 43 49 49 22 20 3F 3E 0A 3C 72 6F 6F 74 - A S C I I " ? > \n< r o o t
3E 4C 26 23 78 66 36 3B 77 65 20 26 23 78 32 30 > L & # x f 6 ; w e & # x 2 0
61 63 3B 3C 2F 72 6F 6F 74 3E 0A .. .. .. .. .. a c ; < / r o o t > \n..........
Charset at document start: UTF-8
  Charset at element start: US-ASCII
    Charset at element end: US-ASCII
   Charset at document end: null
Content: L?we ?

Charset ISO-8859-1: (72 bytes)
3C 3F 78 6D 6C 20 76 65 72 73 69 6F 6E 3D 22 31 < ? x m l v e r s i o n = " 1
2E 30 22 20 65 6E 63 6F 64 69 6E 67 3D 22 49 53 . 0 " e n c o d i n g = " I S
4F 2D 38 38 35 39 2D 31 22 20 3F 3E 0A 3C 72 6F O - 8 8 5 9 - 1 " ? > \n< r o
6F 74 3E 4C F6 77 65 20 26 23 78 32 30 61 63 3B o t > L ? w e & # x 2 0 a c ;
3C 2F 72 6F 6F 74 3E 0A .. .. .. .. .. .. .. .. < / r o o t > \n................
Charset at document start: UTF-8
  Charset at element start: ISO-8859-1
    Charset at element end: ISO-8859-1
   Charset at document end: null
Content: L?we ?

Charset UTF-8: (63 bytes)
3C 3F 78 6D 6C 20 76 65 72 73 69 6F 6E 3D 22 31 < ? x m l v e r s i o n = " 1
2E 30 22 20 65 6E 63 6F 64 69 6E 67 3D 22 55 54 . 0 " e n c o d i n g = " U T
46 2D 38 22 20 3F 3E 0A 3C 72 6F 6F 74 3E 4C C3 F - 8 " ? > \n< r o o t > L ?
B6 77 65 20 E2 82 AC 3C 2F 72 6F 6F 74 3E 0A .. ? w e ? 82? < / r o o t > \n..
Charset at document start: UTF-8
  Charset at element start: UTF-8
    Charset at element end: UTF-8
   Charset at document end: null
Content: L?we ?

Charset windows-1252: (67 bytes)
3C 3F 78 6D 6C 20 76 65 72 73 69 6F 6E 3D 22 31 < ? x m l v e r s i o n = " 1
2E 30 22 20 65 6E 63 6F 64 69 6E 67 3D 22 77 69 . 0 " e n c o d i n g = " w i
6E 64 6F 77 73 2D 31 32 35 32 22 20 3F 3E 0A 3C n d o w s - 1 2 5 2 " ? > \n<
72 6F 6F 74 3E 4C F6 77 65 20 80 3C 2F 72 6F 6F r o o t > L ? w e 80< / r o o
74 3E 0A .. .. .. .. .. .. .. .. .. .. .. .. .. t > \n..........................
Charset at document start: UTF-8
  Charset at element start: windows-1252
    Charset at element end: windows-1252
   Charset at document end: null
Content: L?we ?

--
ss at comp dot lancs dot ac dot uk