/**
 *
 * Copyright © 2015 Florian Schmaus
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.jxmpp.xml.splitter;

import java.io.IOException;
import java.io.OutputStream;
import java.nio.ByteBuffer;

/**
 * Extended version of {@link XmppXmlSplitter} allowing input to be bytes or
 * {@link ByteBuffer} representing a UTF-8 encoded XML string for XMPP. Just as
 * they come from a network socket.
 * <p>
 * This class respects the byte order mark (BOM) requirement of RFC 6120 11.6
 * and treats the BOM as zero width no-break space, and not as byte order mark.
 * </p>
 * <p>
 * Bytes are decoded one UTF-8 sequence at a time. Every completed sequence is
 * converted to one or two UTF-16 code units and forwarded to the wrapped
 * {@link XmppXmlSplitter}.
 * </p>
 *
 * @author Florian Schmaus
 *
 */
public class Utf8ByteXmppXmlSplitter extends OutputStream {

    /** The character based splitter receiving the decoded UTF-16 code units. */
    private final XmppXmlSplitter xmppXmlSplitter;

    /** Holds the UTF-16 code unit(s) of the code point that was decoded last. */
    private final char[] writeBuffer = new char[2];

    /** Collects the bytes of the UTF-8 sequence that is currently being decoded (at most four). */
    private final byte[] buffer = new byte[4];

    /** Number of bytes of the current sequence that are already stored in {@link #buffer}. */
    private byte count;

    /** Total length in bytes of the current sequence, derived from its first byte. */
    private byte expectedLength;

    /**
     * Create a new splitter with the given callback.
     *
     * @param xmppElementCallback the callback invoked once a complete element has been processed.
     */
    public Utf8ByteXmppXmlSplitter(XmppElementCallback xmppElementCallback) {
        xmppXmlSplitter = new XmppXmlSplitter(xmppElementCallback);
    }

    /**
     * Write a single byte. The byte must be part of a UTF-8 String.
     * <p>
     * The byte is buffered until the UTF-8 sequence it belongs to is complete.
     * The decoded code point is then handed to the wrapped splitter as one
     * UTF-16 code unit, or as a surrogate pair for code points above U+FFFF.
     * </p>
     *
     * @param b the byte to write.
     * @throws IOException if the byte is not valid at this position of a UTF-8
     *         sequence, if the decoded value exceeds U+10FFFF, or if the wrapped
     *         splitter fails.
     */
    public void write(byte b) throws IOException {
        if (count == 0) {
            // First byte of a sequence: its high bits encode the sequence length.
            int firstByte = b & 0xff;
            if (firstByte < 0x80) {
                expectedLength = 1;
            } else if (firstByte < 0xc0) {
                // 10xxxxxx is a continuation byte and must never start a sequence.
                throw new IOException("Invalid first UTF-8 byte: " + firstByte);
            } else if (firstByte < 0xe0) {
                expectedLength = 2;
            } else if (firstByte < 0xf0) {
                expectedLength = 3;
            } else if (firstByte < 0xf8) {
                expectedLength = 4;
            } else {
                throw new IOException("Invalid first UTF-8 byte: " + firstByte);
            }
        } else if ((b & 0xc0) != 0x80) {
            // Every trailing byte has to look like 10xxxxxx. Drop the partial
            // sequence so that the decoder starts clean with the next byte.
            count = 0;
            throw new IOException("Invalid UTF-8 continuation byte: " + (b & 0xff));
        }

        buffer[count++] = b;
        if (count != expectedLength) {
            // The sequence is not complete yet.
            return;
        }

        // Reset before decoding and delegating, so that an exception thrown
        // below cannot leave the decoder stuck in the middle of a sequence.
        count = 0;

        // Extract the payload bits of the first byte ...
        int codepoint;
        switch (expectedLength) {
        case 1:
            codepoint = buffer[0] & 0x7f;
            break;
        case 2:
            codepoint = buffer[0] & 0x1f;
            break;
        case 3:
            codepoint = buffer[0] & 0x0f;
            break;
        case 4:
            // 11110xxx carries three payload bits.
            codepoint = buffer[0] & 0x07;
            break;
        default:
            throw new IllegalStateException();
        }
        // ... and append the lower six bits of every continuation byte.
        for (int i = 1; i < expectedLength; i++) {
            codepoint = (codepoint << 6) | (buffer[i] & 0x3f);
        }

        if (codepoint > Character.MAX_CODE_POINT) {
            throw new IOException("Invalid UTF-8 sequence, code point out of range: " + codepoint);
        }

        // Writes one char for BMP code points and a high/low surrogate pair
        // for supplementary code points; returns the number of chars written.
        int len = Character.toChars(codepoint, writeBuffer, 0);

        xmppXmlSplitter.write(writeBuffer, 0, len);
    }

    /**
     * Write a single byte, given as the low eight bits of {@code b}. The
     * remaining bits are ignored.
     *
     * @param b the value whose low eight bits are written.
     * @throws IOException if an error occurs.
     */
    @Override
    public void write(int b) throws IOException {
        write((byte) (b & 0xff));
    }
}