/**
 *
 * Copyright © 2015-2021 Florian Schmaus
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.jxmpp.xml.splitter;

import java.io.IOException;
import java.io.Writer;
import java.util.HashMap;
import java.util.Map;

/**
 * An XML splitter capable of splitting XML into top-level elements.
 * <p>
 * Note that this class does not yet support the feature set of XML. Only the
 * required features for XMPP are supported. XML comments are not supported: encountering one
 * results in an {@link UnsupportedOperationException}. The XML declaration and processing
 * instructions are recognized and handed, as raw strings, to the optional callbacks.
 * </p>
 * <p>
 * The splitter is fed through the {@link Writer} API: every char written is run through a small
 * state machine, and once the element nesting depth returns to zero the buffered text is passed
 * to the {@link CompleteElementCallback}.
 * </p>
 */
public class XmlSplitter extends Writer {

    /** The states of the char-by-char XML state machine. */
    enum State {
        START,
        TAG_LEFT_ANGLE_BRACKET,
        TAG_RIGHT_ANGLE_BRACKET,
        END_TAG_SOLIDUS,
        IN_TAG_NAME,
        IN_END_TAG,
        AFTER_START_NAME,
        IN_EMPTY_TAG,
        IN_ATTRIBUTE_NAME,
        AFTER_ATTRIBUTE_EQUALS,
        IN_ATTRIBUTE_VALUE,
        AFTER_COMMENT_BANG,
        AFTER_COMMENT_DASH1,
        AFTER_COMMENT_DASH2,
        AFTER_COMMENT,
        AFTER_COMMENT_CLOSING_DASH1,
        AFTER_COMMENT_CLOSING_DASH2,
        IN_PROCESSING_INSTRUCTION_OR_DECLARATION,
        IN_PROCESSING_INSTRUCTION_OR_DECLARATION_PSEUDO_ATTRIBUTE_VALUE,
        IN_PROCESSING_INSTRUCTION_OR_DECLARATION_QUESTION_MARK,
    }

    private final DeclarationCallback declarationCallback;
    private final ProcessingInstructionCallback processingInstructionCallback;

    private final XmlPrinter xmlPrinter;

    protected final CompleteElementCallback completeElementCallback;

    /** Accumulates every char of the part (element, declaration or PI) currently being processed. */
    private final StringBuilder splittedPartBuffer;

    /** Accumulates the chars of the token (tag name, attribute name or value) currently being read. */
    private final StringBuilder tokenBuffer = new StringBuilder(256);

    /** Attributes of the start tag currently being read; cleared after each start tag. */
    private final Map<String, String> attributes = new HashMap<>();

    private int depth;
    private String qName;
    private String attributeName;
    private State state = State.START;

    private enum AttributeValueQuotes {
        apos('\''),
        quot('"'),
        ;

        final char c;

        AttributeValueQuotes(char c) {
            this.c = c;
        }
    }

    /**
     * The type of quotation used for the current (or last) attribute. Note that depending on which quotation is used,
     * the other quotation does not need to be escaped within the value. Therefore we need to remember it to reliably
     * detect the end quotation of the value.
     */
    private AttributeValueQuotes attributeValueQuotes;

    /**
     * Construct a new XML splitter.
     *
     * @param bufferSize the initial size of the buffer.
     * @param completeElementCallback the callback invoked once a complete element has been processed.
     * @param declarationCallback an optional callback for the XML declaration.
     * @param processingInstructionCallback an optional callback for Processing Instructions.
     */
    public XmlSplitter(int bufferSize, CompleteElementCallback completeElementCallback,
            DeclarationCallback declarationCallback, ProcessingInstructionCallback processingInstructionCallback) {
        this(bufferSize, completeElementCallback, declarationCallback, processingInstructionCallback, null);
    }

    /**
     * Construct a new XML splitter.
     *
     * @param bufferSize the initial size of the buffer.
     * @param completeElementCallback the callback invoked once a complete element has been processed.
     * @param xmlPrinter an optional {@link XmlPrinter}.
     */
    public XmlSplitter(int bufferSize, CompleteElementCallback completeElementCallback, XmlPrinter xmlPrinter) {
        this(bufferSize, completeElementCallback, null, null, xmlPrinter);
    }

    /**
     * Construct a new XML splitter.
     *
     * @param bufferSize the initial size of the buffer.
     * @param completeElementCallback the callback invoked once a complete element has been processed.
     */
    public XmlSplitter(int bufferSize, CompleteElementCallback completeElementCallback) {
        this(bufferSize, completeElementCallback, null, null);
    }

    /**
     * Construct a new XML splitter.
     *
     * @param bufferSize the initial size of the buffer. A negative value selects a default of 128.
     * @param completeElementCallback the callback invoked once a complete element has been processed.
     * @param declarationCallback an optional callback for the XML declaration.
     * @param processingInstructionCallback an optional callback for Processing Instructions.
     * @param xmlPrinter an optional {@link XmlPrinter}.
     */
    public XmlSplitter(int bufferSize, CompleteElementCallback completeElementCallback,
            DeclarationCallback declarationCallback, ProcessingInstructionCallback processingInstructionCallback,
            XmlPrinter xmlPrinter) {
        if (bufferSize < 0) {
            bufferSize = 128;
        }
        this.splittedPartBuffer = new StringBuilder(bufferSize);
        this.completeElementCallback = completeElementCallback;
        this.declarationCallback = declarationCallback;
        this.processingInstructionCallback = processingInstructionCallback;
        this.xmlPrinter = xmlPrinter;
    }

    /**
     * Feed {@code len} chars of {@code cbuf}, starting at index {@code off}, into the splitter. If an
     * {@link XmlPrinter} is configured, it is notified before the first and after the last char of the chunk.
     *
     * @param cbuf the array holding the chars to process.
     * @param off the index of the first char to process.
     * @param len the number of chars to process.
     * @throws IOException if processing one of the chars throws it (for example from {@link #onNextChar()}).
     */
    @Override
    public void write(char[] cbuf, int off, int len) throws IOException {
        if (xmlPrinter != null) {
            xmlPrinter.onChunkStart();
        }
        // 'cur' is already an absolute index into cbuf, so it must not be offset by 'off' a second time.
        final int end = off + len;
        for (int cur = off; cur < end; cur++) {
            processChar(cbuf[cur]);
        }
        if (xmlPrinter != null) {
            xmlPrinter.onChunkEnd();
        }
    }

    /** Does nothing: chars are processed immediately in {@link #write(char[], int, int)}. */
    @Override
    public void flush() {
    }

    /** Does nothing. */
    @Override
    public void close() {
    }

    /**
     * Get the size in chars of the splitted part currently being processed.
     *
     * @return the size of the current splitted part in chars.
     */
    public final int getCurrentSplittedPartSize() {
        return splittedPartBuffer.length();
    }

    /**
     * Hook invoked for every char, before the char is appended to the buffer and processed. The default
     * implementation does nothing.
     *
     * @throws IOException if the subclass decides to abort processing.
     */
    protected void onNextChar() throws IOException {
    }

    /**
     * Hook invoked once a start tag (including the start of an empty-element tag) has been read. The default
     * implementation does nothing.
     *
     * @param prefix the part of the qualified name before the first colon, or the whole qualified name if it
     *        contains no colon.
     * @param localpart the part of the qualified name after the first colon, or the whole qualified name if it
     *        contains no colon.
     * @param attributes the attributes of the start tag. This map is reused and cleared right after this method
     *        returns, so implementations must copy whatever they want to keep.
     */
    protected void onStartTag(String prefix, String localpart, Map<String, String> attributes) {
    }

    /**
     * Hook invoked once an end tag, or the end of an empty-element tag, has been read. The default implementation
     * does nothing.
     *
     * @param qName the name read from the end tag; for an empty-element tag, the qualified name of that tag.
     */
    protected void onEndTag(String qName) {
    }

    /**
     * Discard the current splitted part: resets the depth to zero, empties the part buffer and returns the state
     * machine to {@link State#START}.
     */
    protected final void newSplittedPart() {
        depth = 0;
        splittedPartBuffer.setLength(0);

        assert state != State.START;
        state = State.START;
    }

    /**
     * Advance the state machine by one char.
     *
     * @param c the next char of the XML stream.
     * @throws IOException if {@link #onNextChar()} throws it, or if invalid XML is detected.
     */
    @SuppressWarnings("fallthrough")
    private void processChar(char c) throws IOException {
        onNextChar();

        // Append every char we see to the buffer. This helps for example XmppXmlSplitter to ensure a certain size is
        // not exceeded. In case of XMPP, the size is usually for the top level stream element (Stanzas and Nonzas), but
        // also other XML pseudo-elements like the Declaration or Processing Instructions's size is limited by this.
        splittedPartBuffer.append(c);

        boolean endTagFinished = false;
        State initialState = state;

        switch (state) {
        case TAG_RIGHT_ANGLE_BRACKET:
            // The previous char closed a tag: we are back in character data, so handle this char like START.
            state = State.START;
            // Intentional fall through.
        case START:
            switch (c) {
            case '<':
                state = State.TAG_LEFT_ANGLE_BRACKET;
                break;
            }
            break;
        case TAG_LEFT_ANGLE_BRACKET:
            switch (c) {
            case '/':
                state = State.END_TAG_SOLIDUS;
                break;
            case '?':
                state = State.IN_PROCESSING_INSTRUCTION_OR_DECLARATION;
                break;
            case '!':
                state = State.AFTER_COMMENT_BANG;
                break;
            default:
                // First char of a start tag name.
                tokenBuffer.append(c);
                state = State.IN_TAG_NAME;
                break;
            }
            break;
        case END_TAG_SOLIDUS:
            // TODO: We could perform some verification here, like "c != '>'" or no space (?).
            state = State.IN_END_TAG;
            tokenBuffer.append(c);
            break;
        case IN_TAG_NAME:
            switch (c) {
            // XML 1.1 § 2.3 "White Space"
            case ' ':
            case '\n':
            case '\r':
            case '\t':
                qName = getToken();
                state = State.AFTER_START_NAME;
                break;
            case '/':
                qName = getToken();
                onStartTagFinished();
                state = State.IN_EMPTY_TAG;
                break;
            case '>':
                qName = getToken();
                onStartTagFinished();
                state = State.TAG_RIGHT_ANGLE_BRACKET;
                break;
            default:
                tokenBuffer.append(c);
                break;
            }
            break;
        case IN_END_TAG:
            switch (c) {
            case '>':
                endTagFinished = true;
                state = State.TAG_RIGHT_ANGLE_BRACKET;
                break;
            default:
                tokenBuffer.append(c);
                break;
            }
            break;
        case AFTER_START_NAME:
            switch (c) {
            case '/':
                onStartTagFinished();
                state = State.IN_EMPTY_TAG;
                break;
            case '>':
                onStartTagFinished();
                state = State.TAG_RIGHT_ANGLE_BRACKET;
                break;
            // XML 1.1 § 2.3 "White Space"
            case ' ':
            case '\n':
            case '\r':
            case '\t':
                break;
            // Attribute Name
            default:
                tokenBuffer.append(c);
                state = State.IN_ATTRIBUTE_NAME;
                break;
            }
            break;
        case IN_ATTRIBUTE_NAME:
            switch (c) {
            case '=':
                attributeName = getToken();
                state = State.AFTER_ATTRIBUTE_EQUALS;
                break;
            default:
                tokenBuffer.append(c);
            }
            break;
        case AFTER_ATTRIBUTE_EQUALS:
            // Remember which quote opened the value, as only that quote terminates it.
            switch (c) {
            case '\'':
                attributeValueQuotes = AttributeValueQuotes.apos;
                state = State.IN_ATTRIBUTE_VALUE;
                break;
            case '\"':
                attributeValueQuotes = AttributeValueQuotes.quot;
                state = State.IN_ATTRIBUTE_VALUE;
                break;
            default:
                throw InvalidXmlException.InvalidAttributeDeclarationException.create(c, splittedPartBuffer);
            }
            break;
        case IN_ATTRIBUTE_VALUE:
            if (c == attributeValueQuotes.c) {
                attributes.put(attributeName, getToken());
                state = State.AFTER_START_NAME;
            } else {
                tokenBuffer.append(c);
            }
            break;
        case IN_EMPTY_TAG:
            switch (c) {
            case '>':
                // An empty-element tag is its own end tag.
                endTagFinished = true;
                state = State.TAG_RIGHT_ANGLE_BRACKET;
                break;
            default:
                throw InvalidXmlException.InvalidEmptyTagException.create(c, splittedPartBuffer);
            }
            break;
        case IN_PROCESSING_INSTRUCTION_OR_DECLARATION:
            switch (c) {
            case '\'':
                attributeValueQuotes = AttributeValueQuotes.apos;
                state = State.IN_PROCESSING_INSTRUCTION_OR_DECLARATION_PSEUDO_ATTRIBUTE_VALUE;
                break;
            case '\"':
                attributeValueQuotes = AttributeValueQuotes.quot;
                state = State.IN_PROCESSING_INSTRUCTION_OR_DECLARATION_PSEUDO_ATTRIBUTE_VALUE;
                break;
            case '?':
                state = State.IN_PROCESSING_INSTRUCTION_OR_DECLARATION_QUESTION_MARK;
                break;
            }
            break;
        case IN_PROCESSING_INSTRUCTION_OR_DECLARATION_PSEUDO_ATTRIBUTE_VALUE:
            // Skip over the quoted value so that a '?' inside of it is not mistaken for the closing "?>".
            if (c == attributeValueQuotes.c) {
                state = State.IN_PROCESSING_INSTRUCTION_OR_DECLARATION;
            }
            break;
        case IN_PROCESSING_INSTRUCTION_OR_DECLARATION_QUESTION_MARK:
            if (c == '>') {
                String processingInstructionOrDeclaration = splittedPartBuffer.toString();
                onProcessingInstructionOrDeclaration(processingInstructionOrDeclaration);
                newSplittedPart();
            } else {
                // The '?' was not followed by '>', hence it did not close the PI or declaration.
                state = State.IN_PROCESSING_INSTRUCTION_OR_DECLARATION;
            }
            break;
        case AFTER_COMMENT_BANG:
        case AFTER_COMMENT_DASH1:
        case AFTER_COMMENT_DASH2:
        case AFTER_COMMENT:
        case AFTER_COMMENT_CLOSING_DASH1:
        case AFTER_COMMENT_CLOSING_DASH2:
            throw new UnsupportedOperationException("XML comments are not supported by XmlSplitter (yet). They are also not allowed in XMPP streams (cf. RFC 6120 § 11.1).");
        }

        if (xmlPrinter != null) {
            xmlPrinter.onNextChar(c, depth, initialState, state);
        }

        // Deliberately done after the printer saw the char, so that the printer is informed about the closing '>'
        // before it is informed about the completed element.
        if (endTagFinished) {
            onEndTagFinished();
        }
    }

    /**
     * Account for a finished start tag: increases the depth, reports the tag via
     * {@link #onStartTag(String, String, Map)} and then clears the collected attributes.
     */
    private void onStartTagFinished() {
        // qName should already be set correctly.
        depth++;
        String prefix = extractPrefix(qName);
        String localpart = extractLocalpart(qName);
        onStartTag(prefix, localpart, attributes);
        attributes.clear();
    }

    /**
     * Account for a finished end tag (or empty-element tag): decreases the depth and, if the depth reached zero,
     * hands the buffered element to the callbacks. Afterwards {@link #onEndTag(String)} is invoked and the state
     * machine returns to {@link State#START}.
     */
    private void onEndTagFinished() {
        String endTagName = getToken();
        if (endTagName.length() == 0) {
            // empty element case
            endTagName = qName;
        }
        depth--;
        if (depth == 0) {
            String completeElement = splittedPartBuffer.toString();
            splittedPartBuffer.setLength(0);
            if (completeElementCallback != null) {
                completeElementCallback.onCompleteElement(completeElement);
            }
            if (xmlPrinter != null) {
                xmlPrinter.onCompleteElement();
            }
        }
        onEndTag(endTagName);

        assert state != State.START;
        state = State.START;
    }

    /**
     * Return the content of the token buffer and empty it.
     *
     * @return the token collected so far, possibly the empty string.
     */
    private String getToken() {
        String token = tokenBuffer.toString();
        tokenBuffer.setLength(0);
        return token;
    }

    /**
     * Dispatch a complete processing instruction or XML declaration to the matching callback, if one is set.
     * Strings starting with {@code "<?xml "} are treated as XML declaration, everything else as processing
     * instruction.
     *
     * @param processingInstructionOrDeclaration the raw text of the processing instruction or declaration.
     */
    private void onProcessingInstructionOrDeclaration(String processingInstructionOrDeclaration) {
        if (processingInstructionOrDeclaration.startsWith("<?xml ")) {
            if (declarationCallback != null) {
                declarationCallback.onDeclaration(processingInstructionOrDeclaration);
            }
        } else {
            if (processingInstructionCallback != null) {
                processingInstructionCallback.onProcessingInstruction(processingInstructionOrDeclaration);
            }
        }
    }

    /**
     * Extract the prefix of a qualified name.
     *
     * @param qName the qualified name.
     * @return the part before the first colon, or {@code qName} itself if it contains no colon.
     */
    private static String extractPrefix(String qName) {
        int index = qName.indexOf(':');
        return index > -1 ? qName.substring(0, index) : qName;
    }

    /**
     * Extract the local part of a qualified name.
     *
     * @param qName the qualified name.
     * @return the part after the first colon, or {@code qName} itself if it contains no colon.
     */
    private static String extractLocalpart(String qName) {
        int index = qName.indexOf(':');
        return index > -1 ? qName.substring(index + 1) : qName;
    }
}