001/**
002 *
003 * Copyright © 2015-2021 Florian Schmaus
004 *
005 * Licensed under the Apache License, Version 2.0 (the "License");
006 * you may not use this file except in compliance with the License.
007 * You may obtain a copy of the License at
008 *
009 *     http://www.apache.org/licenses/LICENSE-2.0
010 *
011 * Unless required by applicable law or agreed to in writing, software
012 * distributed under the License is distributed on an "AS IS" BASIS,
013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014 * See the License for the specific language governing permissions and
015 * limitations under the License.
016 */
017package org.jxmpp.xml.splitter;
018
019import java.io.IOException;
020import java.io.Writer;
021import java.util.HashMap;
022import java.util.Map;
023
024/**
025 * A XML splitter capable of splitting XML into top-level elements.
026 * <p>
027 * Note that this class does not yet support the feature set of XML. Only the
028 * required features for XMPP are supported. XML comments and processing
029 * instructions are not supported.
030 * </p>
031 *
032 */
033public class XmlSplitter extends Writer {
034
035        enum State {
036                START,
037                TAG_LEFT_ANGLE_BRACKET,
038                TAG_RIGHT_ANGLE_BRACKET,
039                END_TAG_SOLIDUS,
040                IN_TAG_NAME,
041                IN_END_TAG,
042                AFTER_START_NAME,
043                IN_EMPTY_TAG,
044                IN_ATTRIBUTE_NAME,
045                AFTER_ATTRIBUTE_EQUALS,
046                IN_ATTRIBUTE_VALUE,
047                AFTER_COMMENT_BANG,
048                AFTER_COMMENT_DASH1,
049                AFTER_COMMENT_DASH2,
050                AFTER_COMMENT,
051                AFTER_COMMENT_CLOSING_DASH1,
052                AFTER_COMMENT_CLOSING_DASH2,
053                IN_PROCESSING_INSTRUCTION_OR_DECLARATION,
054                IN_PROCESSING_INSTRUCTION_OR_DECLARATION_PSEUDO_ATTRIBUTE_VALUE,
055                IN_PROCESSING_INSTRUCTION_OR_DECLARATION_QUESTION_MARK,
056        }
057
058        private final DeclarationCallback declarationCallback;
059        private final ProcessingInstructionCallback processingInstructionCallback;
060
061        private final XmlPrinter xmlPrinter;
062
063        protected final CompleteElementCallback completeElementCallback;
064
065        private final StringBuilder splittedPartBuffer;
066
067        private final StringBuilder tokenBuffer = new StringBuilder(256);
068        private final Map<String, String> attributes = new HashMap<>();
069
070        private int depth;
071        private String qName;
072        private String attributeName;
073        private State state = State.START;
074
075        private enum AttributeValueQuotes {
076                apos('\''),
077                quot('"'),
078                ;
079
080                final char c;
081
082                AttributeValueQuotes(char c) {
083                        this.c = c;
084                }
085        }
086
087        /**
088         * The type of quotation used for the current (or last) attribute. Note that depending on which quotation is used,
089         * the other quotation does not need to be escaped within the value. Therefore we need to remember it to reliable
090         * detect the end quotation of the value.
091         */
092        private AttributeValueQuotes attributeValueQuotes;
093
094        /**
095         * Construct a new XML splitter.
096         *
097         * @param bufferSize the initial size of the buffer.
098         * @param completeElementCallback the callback invoked once a complete element has been processed.
099         * @param declarationCallback a optional callback for the XML declaration.
100         * @param processingInstructionCallback a optional callback for Processing Instructions.
101         */
102        public XmlSplitter(int bufferSize, CompleteElementCallback completeElementCallback, DeclarationCallback declarationCallback, ProcessingInstructionCallback processingInstructionCallback) {
103                this(bufferSize, completeElementCallback, declarationCallback, processingInstructionCallback, null);
104        }
105
106        /**
107         * Construct a new XML splitter.
108         *
109         * @param bufferSize the initial size of the buffer.
110         * @param completeElementCallback the callback invoked once a complete element has been processed.
111         * @param xmlPrinter an optional {@link XmlPrinter}.
112         */
113        public XmlSplitter(int bufferSize, CompleteElementCallback completeElementCallback, XmlPrinter xmlPrinter) {
114                this(bufferSize, completeElementCallback, null, null, xmlPrinter);
115        }
116
117        /**
118         * Construct a new XML splitter.
119         *
120         * @param bufferSize the initial size of the buffer.
121         * @param completeElementCallback the callback invoked once a complete element has been processed.
122         */
123        public XmlSplitter(int bufferSize, CompleteElementCallback completeElementCallback) {
124                this(bufferSize, completeElementCallback, null, null);
125        }
126
127        /**
128         * Construct a new XML splitter.
129         *
130         * @param bufferSize the initial size of the buffer.
131         * @param completeElementCallback the callback invoked once a complete element has been processed.
132         * @param declarationCallback a optional callback for the XML declaration.
133         * @param processingInstructionCallback a optional callback for Processing Instructions.
134         * @param xmlPrinter an optional {@link XmlPrinter}.
135         */
136        public XmlSplitter(int bufferSize, CompleteElementCallback completeElementCallback,
137                        DeclarationCallback declarationCallback, ProcessingInstructionCallback processingInstructionCallback,
138                        XmlPrinter xmlPrinter) {
139                if (bufferSize < 0) {
140                        bufferSize = 128;
141                }
142                this.splittedPartBuffer = new StringBuilder(bufferSize);
143                this.completeElementCallback = completeElementCallback;
144                this.declarationCallback = declarationCallback;
145                this.processingInstructionCallback = processingInstructionCallback;
146                this.xmlPrinter = xmlPrinter;
147        }
148
149        @Override
150        public void write(char[] cbuf, int off, int len) throws IOException {
151                if (xmlPrinter != null) {
152                        xmlPrinter.onChunkStart();
153                }
154                for (int cur = off; cur < off+len; cur++) {
155                        processChar(cbuf[off+cur]);
156                }
157                if (xmlPrinter != null) {
158                        xmlPrinter.onChunkEnd();
159                }
160        }
161
162        @Override
163        public void flush() {
164        }
165
166        @Override
167        public void close() {
168        }
169
170        /**
171         * Get the size in bytes of the splitted part currently being processed.
172         * 
173         * @return the size of the current splitted part in chars.
174         */
175        public final int getCurrentSplittedPartSize() {
176                return splittedPartBuffer.length();
177        }
178
179        protected void onNextChar() throws IOException {
180        }
181
182        protected void onStartTag(String prefix, String localpart, Map<String, String> attributes) {
183        }
184
185        protected void onEndTag(String qName) {
186        }
187
188        protected final void newSplittedPart() {
189                depth = 0;
190                splittedPartBuffer.setLength(0);
191
192                assert state != State.START;
193                state = State.START;
194        }
195
196        @SuppressWarnings("fallthrough")
197        private void processChar(char c) throws IOException {
198                onNextChar();
199
200                // Append every char we see to the buffer. This helps for example XmppXmlSplitter to ensure a certain size is
201                // not exceeded. In case of XMPP, the size is usually for the top level stream element (Stanzas and Nonzas), but
202                // also other XML pseudo-elements like the Declaration or Processing Instructions's size is limited by this.
203                splittedPartBuffer.append(c);
204
205                boolean endTagFinished = false;
206                State initialState = state;
207
208                switch (state) {
209                case TAG_RIGHT_ANGLE_BRACKET:
210                        state = State.START;
211                case START:
212                        switch (c) {
213                        case '<':
214                                state = State.TAG_LEFT_ANGLE_BRACKET;
215                                break;
216                        }
217                        break;
218                case TAG_LEFT_ANGLE_BRACKET:
219                        switch (c) {
220                        case '/':
221                                state = State.END_TAG_SOLIDUS;
222                                break;
223                        case '?':
224                                state = State.IN_PROCESSING_INSTRUCTION_OR_DECLARATION;
225                                break;
226                        case '!':
227                                state = State.AFTER_COMMENT_BANG;
228                                break;
229                        default:
230                                tokenBuffer.append(c);
231                                state = State.IN_TAG_NAME;
232                                break;
233                        }
234                        break;
235                case END_TAG_SOLIDUS:
236                        // TODO: We could perform some verification here, like "c != '>'" or no space (?).
237                        state = State.IN_END_TAG;
238                        tokenBuffer.append(c);
239                        break;
240                case IN_TAG_NAME:
241                        switch (c) {
242                        // XML 1.1 § 2.3 "White Space"
243                        case ' ':
244                        case '\n':
245                        case '\r':
246                        case '\t':
247                                qName = getToken();
248                                state = State.AFTER_START_NAME;
249                                break;
250                        case '/':
251                                qName = getToken();
252                                onStartTagFinished();
253                                state = State.IN_EMPTY_TAG;
254                                break;
255                        case '>':
256                                qName = getToken();
257                                onStartTagFinished();
258                                state = State.TAG_RIGHT_ANGLE_BRACKET;
259                                break;
260                        default:
261                                tokenBuffer.append(c);
262                                break;
263                        }
264                        break;
265                case IN_END_TAG:
266                        switch (c) {
267                        case '>':
268                                endTagFinished = true;
269                                state = State.TAG_RIGHT_ANGLE_BRACKET;
270                                break;
271                        default:
272                                tokenBuffer.append(c);
273                                break;
274                        }
275                        break;
276                case AFTER_START_NAME:
277                        switch (c) {
278                        case '/':
279                                onStartTagFinished();
280                                state = State.IN_EMPTY_TAG;
281                                break;
282                        case '>':
283                                onStartTagFinished();
284                                state = State.TAG_RIGHT_ANGLE_BRACKET;
285                                break;
286                        // XML 1.1 § 2.3 "White Space"
287                        case ' ':
288                        case '\n':
289                        case '\r':
290                        case '\t':
291                                break;
292                        // Attribute Name
293                        default:
294                                tokenBuffer.append(c);
295                                state = State.IN_ATTRIBUTE_NAME;
296                                break;
297                        }
298                        break;
299                case IN_ATTRIBUTE_NAME:
300                        switch (c) {
301                        case '=':
302                                attributeName = getToken();
303                                state = State.AFTER_ATTRIBUTE_EQUALS;
304                                break;
305                        default:
306                                tokenBuffer.append(c);
307                        }
308                        break;
309                case AFTER_ATTRIBUTE_EQUALS:
310                        switch (c) {
311                        case '\'':
312                                attributeValueQuotes = AttributeValueQuotes.apos;
313                                state = State.IN_ATTRIBUTE_VALUE;
314                                break;
315                        case '\"':
316                                attributeValueQuotes = AttributeValueQuotes.quot;
317                                state = State.IN_ATTRIBUTE_VALUE;
318                                break;
319                        default:
320                                throw InvalidXmlException.InvalidAttributeDeclarationException.create(c, splittedPartBuffer);
321                        }
322                        break;
323                case IN_ATTRIBUTE_VALUE:
324                        if (c == attributeValueQuotes.c) {
325                                attributes.put(attributeName, getToken());
326                                state = State.AFTER_START_NAME;
327                        } else {
328                                tokenBuffer.append(c);
329                        }
330                        break;
331                case IN_EMPTY_TAG:
332                        switch (c) {
333                        case '>':
334                                endTagFinished = true;
335                                state = State.TAG_RIGHT_ANGLE_BRACKET;
336                                break;
337                        default:
338                                throw InvalidXmlException.InvalidEmptyTagException.create(c, splittedPartBuffer);
339                        }
340                        break;
341                case IN_PROCESSING_INSTRUCTION_OR_DECLARATION:
342                        switch (c) {
343                                case '\'':
344                                        attributeValueQuotes = AttributeValueQuotes.apos;
345                                        state = State.IN_PROCESSING_INSTRUCTION_OR_DECLARATION_PSEUDO_ATTRIBUTE_VALUE;
346                                        break;
347                                case '\"':
348                                        attributeValueQuotes = AttributeValueQuotes.quot;
349                                        state = State.IN_PROCESSING_INSTRUCTION_OR_DECLARATION_PSEUDO_ATTRIBUTE_VALUE;
350                                        break;
351                                case '?':
352                                        state = State.IN_PROCESSING_INSTRUCTION_OR_DECLARATION_QUESTION_MARK;
353                                        break;
354                        }
355                        break;
356                case IN_PROCESSING_INSTRUCTION_OR_DECLARATION_PSEUDO_ATTRIBUTE_VALUE:
357                        if (c == attributeValueQuotes.c) {
358                                state = State.IN_PROCESSING_INSTRUCTION_OR_DECLARATION;
359                        }
360                        break;
361                case IN_PROCESSING_INSTRUCTION_OR_DECLARATION_QUESTION_MARK:
362                        if (c == '>') {
363                                String processingInstructionOrDeclaration = splittedPartBuffer.toString();
364                                onProcessingInstructionOrDeclaration(processingInstructionOrDeclaration);
365                                newSplittedPart();
366                        } else {
367                                state = State.IN_PROCESSING_INSTRUCTION_OR_DECLARATION;
368                        }
369                        break;
370                case AFTER_COMMENT_BANG:
371                case AFTER_COMMENT_DASH1:
372                case AFTER_COMMENT_DASH2:
373                case AFTER_COMMENT:
374                case AFTER_COMMENT_CLOSING_DASH1:
375                case AFTER_COMMENT_CLOSING_DASH2:
376                        throw new UnsupportedOperationException();
377                }
378
379                if (xmlPrinter != null) {
380                        xmlPrinter.onNextChar(c, depth, initialState, state);
381                }
382
383                if (endTagFinished) {
384                        onEndTagFinished();
385                }
386        }
387
388        private void onStartTagFinished() {
389                // qName should already be set correctly.
390                depth++;
391                String prefix = extractPrefix(qName);
392                String localpart = extractLocalpart(qName);
393                onStartTag(prefix, localpart, attributes);
394                attributes.clear();
395        }
396
397        private void onEndTagFinished() {
398                String endTagName = getToken();
399                if (endTagName.length() == 0) {
400                        // empty element case
401                        endTagName = qName;
402                }
403                depth--;
404                if (depth == 0) {
405                        String completeElement = splittedPartBuffer.toString();
406                        splittedPartBuffer.setLength(0);
407                        if (completeElementCallback != null) {
408                                completeElementCallback.onCompleteElement(completeElement);
409                        }
410                        if (xmlPrinter != null) {
411                                xmlPrinter.onCompleteElement();
412                        }
413                }
414                onEndTag(endTagName);
415
416                assert state != State.START;
417                state = State.START;
418        }
419
420        private String getToken() {
421                String token = tokenBuffer.toString();
422                tokenBuffer.setLength(0);
423                return token;
424        }
425
426        private void onProcessingInstructionOrDeclaration(String processingInstructionOrDeclaration) {
427                if (processingInstructionOrDeclaration.startsWith("<?xml ")) {
428                        if (declarationCallback != null) {
429                                declarationCallback.onDeclaration(processingInstructionOrDeclaration);
430                        }
431                } else {
432                        if (processingInstructionCallback != null) {
433                                processingInstructionCallback.onProcessingInstruction(processingInstructionOrDeclaration);
434                        }
435                }
436        }
437
438        private static String extractPrefix(String qName) {
439                int index = qName.indexOf(':');
440                return index > -1  ? qName.substring(0, index) : qName;
441        }
442
443        private static String extractLocalpart(String qName) {
444                int index = qName.indexOf(':');
445                return index > -1 ? qName.substring(index + 1) : qName;
446        }
447}