001/**
002 *
003 * Copyright © 2015 Florian Schmaus
004 *
005 * Licensed under the Apache License, Version 2.0 (the "License");
006 * you may not use this file except in compliance with the License.
007 * You may obtain a copy of the License at
008 *
009 *     http://www.apache.org/licenses/LICENSE-2.0
010 *
011 * Unless required by applicable law or agreed to in writing, software
012 * distributed under the License is distributed on an "AS IS" BASIS,
013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014 * See the License for the specific language governing permissions and
015 * limitations under the License.
016 */
017package org.jxmpp.xml.splitter;
018
019import java.io.IOException;
020import java.io.OutputStream;
021import java.nio.ByteBuffer;
022
023/**
024 * Extended version of {@link XmppXmlSplitter} allowing input to be bytes or
025 * {@link ByteBuffer} representing a UTF-8 encoded XML string for XMPP. Just as
026 * they come from a network socket.
027 * <p>
 * This class respects the byte order mark (BOM) requirement of RFC 6120 11.6
029 * and treats the BOM as zero width no-break space, and not as byte order mark.
030 * </p>
031 * 
032 * @author Florian Schmaus
033 *
034 */
035public class Utf8ByteXmppXmlSplitter extends OutputStream {
036
037        private final XmppXmlSplitter xmppXmlSplitter;
038
039        /**
040         * Create a new splitter with the given callback.
041         *
042         * @param xmppElementCallback the callback invoked once a complete element has been processed.
043         */
044        public Utf8ByteXmppXmlSplitter(XmppElementCallback xmppElementCallback) {
045                xmppXmlSplitter = new XmppXmlSplitter(xmppElementCallback);
046        }
047
048        private final char[] writeBuffer = new char[2];
049        private final byte[] buffer = new byte[6];
050        private byte count;
051        private byte expectedLength;
052
053        /**
054         * Write a single byte. The byte must be part of a UTF-8 String.
055         *
056         * @param b the byte to write.
057         * @throws IOException if an error occurs.
058         */
059        public void write(byte b) throws IOException {
060                buffer[count] = b;
061
062                if (count == 0) {
063                        int firstByte = buffer[0] & 0xff;
064                        if (firstByte < 0x80) {
065                                expectedLength = 1;
066                        } else if (firstByte < 0xe0) {
067                                expectedLength = 2;
068                        } else if (firstByte < 0xf0) {
069                                expectedLength = 3;
070                        } else if (firstByte < 0xf8) {
071                                expectedLength = 4;
072                        } else {
073                                throw new IOException("Invalid first UTF-8 byte: " + firstByte);
074                        }
075                }
076
077                if (++count == expectedLength) {
078                        int codepoint;
079                        if (expectedLength == 1) {
080                                codepoint = buffer[0] & 0x7f;
081                        } else {
082                                // The following switch-case could also be omitted. Note sure
083                                // how it would affect performance. Using switch-case means that
084                                // the bitsToMask does not need to be calculated, but the code
085                                // would be shorter if the switch-code was not here and maybe
086                                // this affects JIT'ed performance (maybe even positive).
087                                switch (expectedLength) {
088                                case 2:
089                                        codepoint = buffer[0] & 0x1f;
090                                        codepoint <<= 6 * 1;
091                                        break;
092                                case 3:
093                                        codepoint = buffer[0] & 0xf;
094                                        codepoint <<= 6 * 2;
095                                        break;
096                                case 4:
097                                        codepoint = buffer[0] & 0x6;
098                                        codepoint <<= 6 * 3;
099                                        break;
100                                default:
101                                        throw new IllegalStateException();
102                                }
103
104                                for (int i = 1; i < expectedLength; i++) {
105                                        // Get the lower 6 bits.
106                                        int bits = buffer[i] & 0x3f;
107                                        // Shift the bits to the right position.
108                                        bits <<= 6 * (expectedLength - 1 - i);
109                                        codepoint |= bits;
110                                }
111                        }
112
113                        int len;
114                        if (codepoint < 0x10000) {
115                                len = 1;
116                                writeBuffer[0] = (char) codepoint;
117                        } else {
118                                // We have to convert the codepoint into a surrogate pair.
119                                len = 2;
120                                // high surrogate: top ten bits added to 0xd800 give the first 16-bit code unit.
121                                writeBuffer[0] = (char) (0xd800 + (codepoint & 0xffa00000));
122                                // low surrogate: low ten bits added to 0xdc00 give the second 16-bit code unit.
123                                writeBuffer[1] = (char) (0xdc00 + (codepoint & 0x3ff));
124                        }
125
126                        xmppXmlSplitter.write(writeBuffer, 0, len);
127
128                        // Reset count since we are done handling this UTF-8 codepoint.
129                        count = 0;
130                }
131        }
132
133        @Override
134        public void write(int b) throws IOException {
135                write((byte) (b & 0xff));
136        }
137}