001/* 002// $Id: //open/util/resgen/src/org/eigenbase/xom/wrappers/Annotator.java#4 $ 003// Package org.eigenbase.xom is an XML Object Mapper. 004// Copyright (C) 2008-2008 The Eigenbase Project 005// Copyright (C) 2008-2008 Disruptive Tech 006// Copyright (C) 2008-2008 LucidEra, Inc. 007// 008// This library is free software; you can redistribute it and/or modify it 009// under the terms of the GNU Lesser General Public License as published by the 010// Free Software Foundation; either version 2 of the License, or (at your 011// option) any later version approved by The Eigenbase Project. 012// 013// This library is distributed in the hope that it will be useful, 014// but WITHOUT ANY WARRANTY; without even the implied warranty of 015// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 016// GNU Lesser General Public License for more details. 017// 018// You should have received a copy of the GNU Lesser General Public License 019// along with this library; if not, write to the Free Software 020// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA 021*/ 022package org.eigenbase.xom.wrappers; 023 024import org.eigenbase.xom.*; 025import org.w3c.dom.Node; 026 027import java.util.*; 028import java.io.PrintWriter; 029 030/** 031 * Quick and dirty XML parser that finds the precise start and end 032 * position of all nodes in a document. Also finds all line endings, so 033 * that character offsets can be converted to line/column positions. 034 * 035 * @author jhyde 036 * @since 13 October, 2008 037 * @version $Id: //open/util/resgen/src/org/eigenbase/xom/wrappers/Annotator.java#4 $ 038 */ 039public class Annotator { 040 private final List/*<LocInfo>*/ locInfoList = new ArrayList(); 041 private int[] lineStartPositions; 042 private final String xml; 043 private final Map/*<DOMWrapper, LocInfo>*/ wrapperLocMap = 044 new HashMap(); 045 private final Map/*<Node, LocInfo>*/ nodeLocMap = new HashMap(); 046 private int seq; // workspace for populateMap 047 048 /** 049 * Creates an Annotator. 050 * 051 * <p>For testing purposes, <code>wrapper</code> may be null. Parses the XML 052 * but does not build the mapping from location information to DOM nodes. 053 * 054 * @param xml XML source string 055 * @param def Wrapper around root DOM node 056 */ 057 Annotator(String xml, DOMWrapper def) { 058 this.xml = xml; 059 parse(xml); 060 if (def != null) { 061 seq = 0; 062 populateMap(def); 063 assert this.nodeLocMap.size() == this.wrapperLocMap.size(); 064 } 065 } 066 067 public Location getLocation(DOMWrapper wrapper) { 068 LocInfo location0 = (LocInfo) wrapperLocMap.get(wrapper); 069 if (location0 == null) { 070 location0 = (Annotator.LocInfo) 071 nodeLocMap.get(((W3CDOMWrapper) wrapper).node); 072 if (location0 == null) { 073 return null; 074 } 075 } 076 final LocInfo location = location0; 077 return new Location() { 078 public int getStartLine() { 079 return getLine(getStartPos()) + 1; 080 } 081 082 public int getStartColumn() { 083 return getCol(getStartPos()) + 1; 084 } 085 086 public int getStartPos() { 087 return location.startTagStartPos; 088 } 089 090 public int getEndLine() { 091 return getLine(getEndPos()) + 1; 092 } 093 094 public int getEndColumn() { 095 return getCol(getEndPos()) + 1; 096 } 097 098 public int getEndPos() { 099 return location.endTagEndPos >= 0 100 ? location.endTagEndPos 101 : location.startTagEndPos; 102 } 103 104 public String getText(boolean headOnly) { 105 return location.getText(headOnly); 106 } 107 108 public String toString() { 109 return location.toString(Annotator.this); 110 } 111 }; 112 } 113 114 /** 115 * Returns the list of LocInfo. For testing. 116 * 117 * @return list of LocInfo. 118 */ 119 List getLocInfoList() { 120 return locInfoList; 121 } 122 123 // enum State 124 private static final int 125 STATE_NORMAL = 0, 126 STATE_TAG = 1, 127 STATE_ENDTAG = 2, 128 STATE_QUOT = 3, 129 STATE_APOS = 4, 130 STATE_COMMENT = 5, 131 STATE_CDATA = 6; 132 133 void parse(String s) 134 { 135 final ArrayStack/*<LocInfo>*/ lockInfoStack = new ArrayStack(); 136 final List lineStartPositions = new ArrayList(); 137 int state = STATE_NORMAL; 138 final int count = s.length(); 139 int i = 0; 140 int last = 0; 141 lineStartPositions.add(new Integer(i)); 142 lockInfoStack.push(null); 143 LocInfo location = null; 144 loop: 145 while (i < count) { 146 final char c = s.charAt(i); 147 switch (c) { 148 case '<': 149 stateSwitch: 150 switch (state) { 151 case STATE_NORMAL: 152 if (i > last) { 153 // Unlike other node types, we create the LocInfo 154 // at the end of the element. No need to add the node 155 // to the stack, because we'd just remove it again. 156 LocInfo loc2 = 157 new LocInfo(locInfoList.size(), TYPE_TEXT, last); 158 loc2.endTagEndPos = i; 159 locInfoList.add(loc2); 160 } 161 if (i + 1 < count) { 162 final char c1 = s.charAt(i + 1); 163 switch (c1) { 164 case '/': 165 // ^</Tag> 166 state = STATE_ENDTAG; 167 assert location != null; 168 break stateSwitch; 169 case '?': 170 // ^<?xml ... ?> 171 location = 172 new LocInfo( 173 locInfoList.size(), 174 TYPE_PROCESSING_INSTRUCTION, i); 175 locInfoList.add(location); 176 state = STATE_TAG; 177 i += "<?".length(); 178 continue loop; 179 case '!': 180 if (s.startsWith("--", i + 2)) { 181 // ^<!-- 182 location = 183 new LocInfo( 184 locInfoList.size(), 185 TYPE_COMMENT, i); 186 locInfoList.add(location); 187 state = STATE_COMMENT; 188 i += "<!--".length(); 189 continue loop; 190 } 191 if (s.startsWith("[CDATA[", i + 2)) { 192 // ^<![CDATA[ 193 location = 194 new LocInfo( 195 locInfoList.size(), 196 TYPE_CDATA_SECTION, i); 197 locInfoList.add(location); 198 state = STATE_CDATA; 199 i += "<![CDATA[".length(); 200 continue loop; 201 } 202 break; 203 } 204 } 205 // Start of an element, 206 // ^<Tag a1=v a2=v> 207 // Don't push until we see end of the head tag <Tag ... ^> 208 state = STATE_TAG; 209 location = new LocInfo(locInfoList.size(), TYPE_ELEMENT, i); 210 locInfoList.add(location); 211 ++i; 212 continue loop; 213 } 214 break; 215 216 case '>': 217 switch (state) { 218 case STATE_TAG: 219 ++i; 220 assert location != null; 221 switch (location.type) { 222 case TYPE_PROCESSING_INSTRUCTION: 223 // <? ... ?^> 224 case TYPE_CDATA_SECTION: 225 // <![CDATA[ ... ]]^> 226 case TYPE_COMMENT: 227 // <!-- ... --^> 228 location.endTagEndPos = i; 229 location = (LocInfo) lockInfoStack.peek(); 230 break; 231 default: 232 // <Tag^> 233 location.startTagEndPos = i; 234 lockInfoStack.push(location); 235 break; 236 } 237 last = i; 238 state = STATE_NORMAL; 239 continue loop; 240 241 case STATE_ENDTAG: 242 // </Tag^> 243 ++i; 244 assert location != null; 245 location.endTagEndPos = i; 246 try { 247 location = (LocInfo) lockInfoStack.pop(); 248 } catch (IndexOutOfBoundsException e) { 249 throw new RuntimeException( 250 "i=" + i + ", xml=" + xml.substring(i) 251 + ", nodeList=" + locInfoList, 252 e); 253 } 254 last = i; 255 state = STATE_NORMAL; 256 continue loop; 257 } 258 break; 259 260 case '/': 261 switch (state) { 262 case STATE_TAG: 263 ++i; 264 if (i < count && s.charAt(i) == '>') { 265 // <Tag a1=v1 a2=v2 ^/> 266 ++i; 267 location.endTagEndPos = i; 268 // no need to pop; we never pushed when we saw '<' 269 location = (LocInfo) lockInfoStack.peek(); 270 last = i; 271 state = STATE_NORMAL; 272 } 273 continue loop; 274 } 275 break; 276 277 case ']': 278 switch (state) { 279 case STATE_CDATA: 280 if (s.startsWith("]>", i + 1)) { 281 // <![CDATA[ ... ^]]> 282 state = STATE_NORMAL; 283 i += "]]>".length(); 284 location.endTagEndPos = i; 285 location = (LocInfo) lockInfoStack.peek(); 286 last = i; 287 continue loop; 288 } 289 } 290 break; 291 292 case '-': 293 switch (state) { 294 case STATE_COMMENT: 295 if (s.startsWith("->", i + 1)) { 296 // <!-- xxxxx^--> 297 i += "-->".length(); 298 location.endTagEndPos = i; 299 last = i; 300 location = (LocInfo) lockInfoStack.peek(); 301 state = STATE_NORMAL; 302 continue loop; 303 } 304 } 305 break; 306 307 case '\r': 308 ++i; 309 if (i < count && s.charAt(i) == '\n') { 310 // only count windows line ending CR LF as one line 311 ++i; 312 } 313 lineStartPositions.add(new Integer(i)); 314 continue loop; 315 316 case '\n': 317 ++i; 318 lineStartPositions.add(new Integer(i)); 319 continue loop; 320 321 case '\'': 322 switch (state) { 323 case STATE_APOS: 324 // a='xxx^' 325 state = STATE_TAG; 326 break; 327 case STATE_TAG: 328 // a=^'xxx' 329 state = STATE_APOS; 330 break; 331 case STATE_QUOT: 332 // a="doesn^'t matter" 333 default: 334 break; 335 } 336 break; 337 338 case '"': 339 switch (state) { 340 case STATE_QUOT: 341 // a="xxx^" 342 state = STATE_TAG; 343 break; 344 case STATE_TAG: 345 // a=^"xxx" 346 state = STATE_QUOT; 347 break; 348 case STATE_APOS: 349 // a='doesn^"t matter' 350 default: 351 break; 352 } 353 break; 354 } 355 356 ++i; 357 } 358 this.lineStartPositions = new int[lineStartPositions.size()]; 359 for (int j = 0; j < lineStartPositions.size(); j++) { 360 this.lineStartPositions[j] = 361 ((Integer) lineStartPositions.get(j)).intValue(); 362 } 363 } 364 365 private void populateMap(DOMWrapper def) 366 { 367 final int defType = def.getType(); 368 LocInfo location; 369 while (true) { 370 location = (LocInfo) locInfoList.get(seq++); 371 if (defType == DOMWrapper.ELEMENT 372 && location.type == TYPE_ELEMENT) 373 { 374 break; 375 } 376 if (defType == DOMWrapper.CDATA 377 && location.type == TYPE_TEXT) 378 { 379 break; 380 } 381 if (seq >= locInfoList.size()) { 382 return; 383 } 384 } 385 wrapperLocMap.put(def, location); 386 nodeLocMap.put(((W3CDOMWrapper) def).node, location); 387 final DOMWrapper[] elementChildren = def.getElementChildren(); 388 for (int i = 0; i < elementChildren.length; i++) { 389 DOMWrapper domWrapper = elementChildren[i]; 390 populateMap(domWrapper); 391 } 392 } 393 394 /** 395 * Returns the line that a character position falls on. The first line in a 396 * document is numbered 0. 397 * 398 * @param pos Character position 399 * @return Line (starting from 0) 400 */ 401 int getLine(int pos) 402 { 403 int index = Arrays.binarySearch(lineStartPositions, pos); 404 if (index >= 0) { 405 return index; 406 } else { 407 return -2 - index; 408 } 409 } 410 411 /** 412 * Returns the column that a character position falls on. The first column 413 * in a line is numbered 0. 414 * 415 * @param pos Character position 416 * @return column (starting from 0) 417 */ 418 int getCol(int pos) 419 { 420 int index = Arrays.binarySearch(lineStartPositions, pos); 421 if (index >= 0) { 422 return 0; 423 } else { 424 index = -2 - index; 425 return pos - lineStartPositions[index]; 426 } 427 } 428 429 void list(PrintWriter pw) 430 { 431 for (int i = 0; i < locInfoList.size(); i++) { 432 LocInfo location = (LocInfo) locInfoList.get(i); 433 pw.println( 434 location.seq + ": " + location.toString(this) + " [" 435 + location.getText(xml) + "]"); 436 } 437 pw.flush(); 438 } 439 440 // enum Type 441 private static final int 442 TYPE_ELEMENT = Node.ELEMENT_NODE, 443 TYPE_PROCESSING_INSTRUCTION = Node.PROCESSING_INSTRUCTION_NODE, 444 TYPE_COMMENT = Node.COMMENT_NODE, 445 TYPE_CDATA_SECTION = Node.CDATA_SECTION_NODE, 446 TYPE_TEXT = Node.TEXT_NODE; 447 448 class LocInfo { 449 /** Sequence in document, ordered by start position (prefix order) */ 450 final int seq; 451 /** Node type, typically {@link Node#ELEMENT_NODE}. */ 452 final int startTagStartPos; 453 final int type; 454 int startTagEndPos = -1; // -1 if entity is a single tag 455 int endTagEndPos = -1; 456 457 /** 458 * Creates a LocInfo. 459 * 460 * @param seq Sequence number in document 461 * @param nodeType Node type, typically {@link Node#ELEMENT_NODE}. 462 * @param startTagStartPos Position of start of element 463 */ 464 LocInfo(int seq, int nodeType, int startTagStartPos) { 465 this.seq = seq; 466 this.type = nodeType; 467 this.startTagStartPos = startTagStartPos; 468 } 469 470 public String toString(Annotator annotator) { 471 return "line " + annotator.getLine(startTagStartPos) 472 + ", column " + annotator.getCol(startTagStartPos); 473 } 474 475 /** 476 * Returns the fragment of source XML that this node encompasses. 477 * 478 * @param xml Whole source XML 479 * @return fragment of source XML 480 */ 481 public String getText(String xml) { 482 return xml.substring( 483 startTagStartPos, 484 endTagEndPos >= 0 ? endTagEndPos 485 : xml.length()); 486 } 487 488 /** 489 * Returns the fragment of source XML corresponding to the head tag 490 * of this element, if this is an element, otherwise the whole node. 491 * 492 * @param xml Whole source XML 493 * @return fragment of source XML 494 */ 495 public String getHeadText(String xml) { 496 return xml.substring( 497 startTagStartPos, 498 startTagEndPos >= 0 ? startTagEndPos 499 : endTagEndPos >= 0 ? endTagEndPos 500 : xml.length()); 501 } 502 503 public String toString() { 504 return getHeadText(xml); 505 } 506 507 /** 508 * Returns the text of this location. Specification as for 509 * {@link org.eigenbase.xom.Location#getText(boolean)}. 510 * 511 * @param headOnly Whether to return only the head of elements 512 * @return Source text underlying a location 513 */ 514 public String getText(boolean headOnly) { 515 return xml.substring( 516 startTagStartPos, 517 headOnly && startTagEndPos >= 0 518 ? startTagEndPos 519 : endTagEndPos >= 0 520 ? endTagEndPos 521 : xml.length()); 522 } 523 } 524 525 /** 526 * Similar to {@link Stack} but based on {@link ArrayList} instead of 527 * {@link Vector}, and therefore more efficient. 528 */ 529 private static class ArrayStack extends ArrayList { 530 public final void push(Object t) 531 { 532 if (false) System.out.println(size() + " push [" + t + "]"); 533 add(t); 534 } 535 536 public final Object peek() 537 { 538 return get(size() - 1); 539 } 540 541 public final Object pop() 542 { 543 final int index = size() - 1; 544 Object t = remove(index); 545 if (false) System.out.println(size() + " pop [" + t + "]"); 546 return get(index - 1); 547 } 548 } 549} 550 551// End Annotator.java