001/*
002// $Id: //open/util/resgen/src/org/eigenbase/xom/wrappers/Annotator.java#4 $
003// Package org.eigenbase.xom is an XML Object Mapper.
004// Copyright (C) 2008-2008 The Eigenbase Project
005// Copyright (C) 2008-2008 Disruptive Tech
006// Copyright (C) 2008-2008 LucidEra, Inc.
007//
008// This library is free software; you can redistribute it and/or modify it
009// under the terms of the GNU Lesser General Public License as published by the
010// Free Software Foundation; either version 2 of the License, or (at your
011// option) any later version approved by The Eigenbase Project.
012//
013// This library is distributed in the hope that it will be useful,
014// but WITHOUT ANY WARRANTY; without even the implied warranty of
015// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
016// GNU Lesser General Public License for more details.
017//
018// You should have received a copy of the GNU Lesser General Public License
019// along with this library; if not, write to the Free Software
020// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
021*/
022package org.eigenbase.xom.wrappers;
023
024import org.eigenbase.xom.*;
025import org.w3c.dom.Node;
026
027import java.util.*;
028import java.io.PrintWriter;
029
030/**
031 * Quick and dirty XML parser that finds the precise start and end
032 * position of all nodes in a document. Also finds all line endings, so
033 * that character offsets can be converted to line/column positions.
034 *
035 * @author jhyde
036 * @since 13 October, 2008
037 * @version $Id: //open/util/resgen/src/org/eigenbase/xom/wrappers/Annotator.java#4 $
038 */
039public class Annotator {
040    private final List/*<LocInfo>*/ locInfoList = new ArrayList();
041    private int[] lineStartPositions;
042    private final String xml;
043    private final Map/*<DOMWrapper, LocInfo>*/ wrapperLocMap =
044        new HashMap();
045    private final Map/*<Node, LocInfo>*/ nodeLocMap = new HashMap();
046    private int seq; // workspace for populateMap
047
048    /**
049     * Creates an Annotator.
050     *
051     * <p>For testing purposes, <code>wrapper</code> may be null. Parses the XML
052     * but does not build the mapping from location information to DOM nodes.
053     *
054     * @param xml XML source string
055     * @param def Wrapper around root DOM node
056     */
057    Annotator(String xml, DOMWrapper def) {
058        this.xml = xml;
059        parse(xml);
060        if (def != null) {
061            seq = 0;
062            populateMap(def);
063            assert this.nodeLocMap.size() == this.wrapperLocMap.size();
064        }
065    }
066
067    public Location getLocation(DOMWrapper wrapper) {
068        LocInfo location0 = (LocInfo) wrapperLocMap.get(wrapper);
069        if (location0 == null) {
070            location0 = (Annotator.LocInfo)
071                nodeLocMap.get(((W3CDOMWrapper) wrapper).node);
072            if (location0 == null) {
073                return null;
074            }
075        }
076        final LocInfo location = location0;
077        return new Location() {
078            public int getStartLine() {
079                return getLine(getStartPos()) + 1;
080            }
081
082            public int getStartColumn() {
083                return getCol(getStartPos()) + 1;
084            }
085
086            public int getStartPos() {
087                return location.startTagStartPos;
088            }
089
090            public int getEndLine() {
091                return getLine(getEndPos()) + 1;
092            }
093
094            public int getEndColumn() {
095                return getCol(getEndPos()) + 1;
096            }
097
098            public int getEndPos() {
099                return location.endTagEndPos >= 0
100                    ? location.endTagEndPos
101                    : location.startTagEndPos;
102            }
103
104            public String getText(boolean headOnly) {
105                return location.getText(headOnly);
106            }
107
108            public String toString() {
109                return location.toString(Annotator.this);
110            }
111        };
112    }
113
114    /**
115     * Returns the list of LocInfo. For testing.
116     *
117     * @return list of LocInfo.
118     */
119    List getLocInfoList() {
120        return locInfoList;
121    }
122
123    // enum State
124    private static final int
125        STATE_NORMAL = 0,
126        STATE_TAG = 1,
127        STATE_ENDTAG = 2,
128        STATE_QUOT = 3,
129        STATE_APOS = 4,
130        STATE_COMMENT = 5,
131        STATE_CDATA = 6;
132
133    void parse(String s)
134    {
135        final ArrayStack/*<LocInfo>*/ lockInfoStack = new ArrayStack();
136        final List lineStartPositions = new ArrayList();
137        int state = STATE_NORMAL;
138        final int count = s.length();
139        int i = 0;
140        int last = 0;
141        lineStartPositions.add(new Integer(i));
142        lockInfoStack.push(null);
143        LocInfo location = null;
144        loop:
145        while (i < count) {
146            final char c = s.charAt(i);
147            switch (c) {
148            case '<':
149                stateSwitch:
150                switch (state) {
151                case STATE_NORMAL:
152                    if (i > last) {
153                        // Unlike other node types, we create the LocInfo
154                        // at the end of the element. No need to add the node
155                        // to the stack, because we'd just remove it again.
156                        LocInfo loc2 =
157                            new LocInfo(locInfoList.size(), TYPE_TEXT, last);
158                        loc2.endTagEndPos = i;
159                        locInfoList.add(loc2);
160                    }
161                    if (i + 1 < count) {
162                        final char c1 = s.charAt(i + 1);
163                        switch (c1) {
164                        case '/':
165                            // ^</Tag>
166                            state = STATE_ENDTAG;
167                            assert location != null;
168                            break stateSwitch;
169                        case '?':
170                            // ^<?xml ... ?>
171                            location =
172                                new LocInfo(
173                                    locInfoList.size(),
174                                    TYPE_PROCESSING_INSTRUCTION, i);
175                            locInfoList.add(location);
176                            state = STATE_TAG;
177                            i += "<?".length();
178                            continue loop;
179                        case '!':
180                            if (s.startsWith("--", i + 2)) {
181                                // ^<!--
182                                location =
183                                    new LocInfo(
184                                        locInfoList.size(),
185                                        TYPE_COMMENT, i);
186                                locInfoList.add(location);
187                                state = STATE_COMMENT;
188                                i += "<!--".length();
189                                continue loop;
190                            }
191                            if (s.startsWith("[CDATA[", i + 2)) {
192                                // ^<![CDATA[
193                                location =
194                                    new LocInfo(
195                                        locInfoList.size(),
196                                        TYPE_CDATA_SECTION, i);
197                                locInfoList.add(location);
198                                state = STATE_CDATA;
199                                i += "<![CDATA[".length();
200                                continue loop;
201                            }
202                            break;
203                        }
204                    }
205                    // Start of an element,
206                    // ^<Tag a1=v a2=v>
207                    // Don't push until we see end of the head tag <Tag ... ^>
208                    state = STATE_TAG;
209                    location = new LocInfo(locInfoList.size(), TYPE_ELEMENT, i);
210                    locInfoList.add(location);
211                    ++i;
212                    continue loop;
213                }
214                break;
215
216            case '>':
217                switch (state) {
218                case STATE_TAG:
219                    ++i;
220                    assert location != null;
221                    switch (location.type) {
222                    case TYPE_PROCESSING_INSTRUCTION:
223                        // <? ... ?^>
224                    case TYPE_CDATA_SECTION:
225                        // <![CDATA[ ... ]]^>
226                    case TYPE_COMMENT:
227                        // <!-- ... --^>
228                        location.endTagEndPos = i;
229                        location = (LocInfo) lockInfoStack.peek();
230                        break;
231                    default:
232                        // <Tag^>
233                        location.startTagEndPos = i;
234                        lockInfoStack.push(location);
235                        break;
236                    }
237                    last = i;
238                    state = STATE_NORMAL;
239                    continue loop;
240
241                case STATE_ENDTAG:
242                    // </Tag^>
243                    ++i;
244                    assert location != null;
245                    location.endTagEndPos = i;
246                    try {
247                        location = (LocInfo) lockInfoStack.pop();
248                    } catch (IndexOutOfBoundsException e) {
249                        throw new RuntimeException(
250                            "i=" + i + ", xml=" + xml.substring(i)
251                                + ", nodeList=" + locInfoList,
252                            e);
253                    }
254                    last = i;
255                    state = STATE_NORMAL;
256                    continue loop;
257                }
258                break;
259
260            case '/':
261                switch (state) {
262                case STATE_TAG:
263                    ++i;
264                    if (i < count && s.charAt(i) == '>') {
265                        // <Tag a1=v1 a2=v2 ^/>
266                        ++i;
267                        location.endTagEndPos = i;
268                        // no need to pop; we never pushed when we saw '<'
269                        location = (LocInfo) lockInfoStack.peek();
270                        last = i;
271                        state = STATE_NORMAL;
272                    }
273                    continue loop;
274                }
275                break;
276
277            case ']':
278                switch (state) {
279                case STATE_CDATA:
280                    if (s.startsWith("]>", i + 1)) {
281                         // <![CDATA[ ... ^]]>
282                        state = STATE_NORMAL;
283                        i += "]]>".length();
284                        location.endTagEndPos = i;
285                        location = (LocInfo) lockInfoStack.peek();
286                        last = i;
287                        continue loop;
288                    }
289                }
290                break;
291
292            case '-':
293                switch (state) {
294                case STATE_COMMENT:
295                    if (s.startsWith("->", i + 1)) {
296                        // <!-- xxxxx^-->
297                        i += "-->".length();
298                        location.endTagEndPos = i;
299                        last = i;
300                        location = (LocInfo) lockInfoStack.peek();
301                        state = STATE_NORMAL;
302                        continue loop;
303                    }
304                }
305                break;
306
307            case '\r':
308                ++i;
309                if (i < count && s.charAt(i) == '\n') {
310                    // only count windows line ending CR LF as one line
311                    ++i;
312                }
313                lineStartPositions.add(new Integer(i));
314                continue loop;
315
316            case '\n':
317                ++i;
318                lineStartPositions.add(new Integer(i));
319                continue loop;
320
321            case '\'':
322                switch (state) {
323                case STATE_APOS:
324                    // a='xxx^'
325                    state = STATE_TAG;
326                    break;
327                case STATE_TAG:
328                    // a=^'xxx'
329                    state = STATE_APOS;
330                    break;
331                case STATE_QUOT:
332                    // a="doesn^'t matter"
333                default:
334                    break;
335                }
336                break;
337
338            case '"':
339                switch (state) {
340                case STATE_QUOT:
341                    // a="xxx^"
342                    state = STATE_TAG;
343                    break;
344                case STATE_TAG:
345                    // a=^"xxx"
346                    state = STATE_QUOT;
347                    break;
348                case STATE_APOS:
349                    // a='doesn^"t matter'
350                default:
351                    break;
352                }
353                break;
354            }
355
356            ++i;
357        }
358        this.lineStartPositions = new int[lineStartPositions.size()];
359        for (int j = 0; j < lineStartPositions.size(); j++) {
360            this.lineStartPositions[j] =
361                ((Integer) lineStartPositions.get(j)).intValue();
362        }
363    }
364
365    private void populateMap(DOMWrapper def)
366    {
367        final int defType = def.getType();
368        LocInfo location;
369        while (true) {
370            location = (LocInfo) locInfoList.get(seq++);
371            if (defType == DOMWrapper.ELEMENT
372                && location.type == TYPE_ELEMENT)
373            {
374                break;
375            }
376            if (defType == DOMWrapper.CDATA
377                && location.type == TYPE_TEXT)
378            {
379                break;
380            }
381            if (seq >= locInfoList.size()) {
382                return;
383            }
384        }
385        wrapperLocMap.put(def, location);
386        nodeLocMap.put(((W3CDOMWrapper) def).node, location);
387        final DOMWrapper[] elementChildren = def.getElementChildren();
388        for (int i = 0; i < elementChildren.length; i++) {
389            DOMWrapper domWrapper = elementChildren[i];
390            populateMap(domWrapper);
391        }
392    }
393
394    /**
395     * Returns the line that a character position falls on. The first line in a
396     * document is numbered 0.
397     *
398     * @param pos Character position
399     * @return Line (starting from 0)
400     */
401    int getLine(int pos)
402    {
403        int index = Arrays.binarySearch(lineStartPositions, pos);
404        if (index >= 0) {
405            return index;
406        } else {
407            return -2 - index;
408        }
409    }
410
411    /**
412     * Returns the column that a character position falls on. The first column
413     * in a line is numbered 0.
414     *
415     * @param pos Character position
416     * @return column (starting from 0)
417     */
418    int getCol(int pos)
419    {
420        int index = Arrays.binarySearch(lineStartPositions, pos);
421        if (index >= 0) {
422            return 0;
423        } else {
424            index = -2 - index;
425            return pos - lineStartPositions[index];
426        }
427    }
428
429    void list(PrintWriter pw)
430    {
431        for (int i = 0; i < locInfoList.size(); i++) {
432            LocInfo location = (LocInfo) locInfoList.get(i);
433            pw.println(
434                location.seq + ": " + location.toString(this) + " ["
435                    + location.getText(xml) + "]");
436        }
437        pw.flush();
438    }
439
440    // enum Type
441    private static final int
442        TYPE_ELEMENT = Node.ELEMENT_NODE,
443        TYPE_PROCESSING_INSTRUCTION = Node.PROCESSING_INSTRUCTION_NODE,
444        TYPE_COMMENT = Node.COMMENT_NODE,
445        TYPE_CDATA_SECTION = Node.CDATA_SECTION_NODE,
446        TYPE_TEXT = Node.TEXT_NODE;
447
448    class LocInfo {
449        /** Sequence in document, ordered by start position (prefix order) */
450        final int seq;
451        /** Node type, typically {@link Node#ELEMENT_NODE}. */
452        final int startTagStartPos;
453        final int type;
454        int startTagEndPos = -1; // -1 if entity is a single tag
455        int endTagEndPos = -1;
456
457        /**
458         * Creates a LocInfo.
459         *
460         * @param seq Sequence number in document
461         * @param nodeType Node type, typically {@link Node#ELEMENT_NODE}.
462         * @param startTagStartPos Position of start of element
463         */
464        LocInfo(int seq, int nodeType, int startTagStartPos) {
465            this.seq = seq;
466            this.type = nodeType;
467            this.startTagStartPos = startTagStartPos;
468        }
469
470        public String toString(Annotator annotator) {
471            return "line " + annotator.getLine(startTagStartPos)
472                + ", column " + annotator.getCol(startTagStartPos);
473        }
474
475        /**
476         * Returns the fragment of source XML that this node encompasses.
477         *
478         * @param xml Whole source XML
479         * @return fragment of source XML
480         */
481        public String getText(String xml) {
482            return xml.substring(
483                startTagStartPos,
484                endTagEndPos >= 0 ? endTagEndPos
485                    : xml.length());
486        }
487
488        /**
489         * Returns the fragment of source XML corresponding to the head tag
490         * of this element, if this is an element, otherwise the whole node.
491         *
492         * @param xml Whole source XML
493         * @return fragment of source XML
494         */
495        public String getHeadText(String xml) {
496            return xml.substring(
497                startTagStartPos,
498                startTagEndPos >= 0 ? startTagEndPos
499                    : endTagEndPos >= 0 ? endTagEndPos
500                        : xml.length());
501        }
502
503        public String toString() {
504            return getHeadText(xml);
505        }
506
507        /**
508         * Returns the text of this location. Specification as for
509         * {@link org.eigenbase.xom.Location#getText(boolean)}.
510         *
511         * @param headOnly Whether to return only the head of elements
512         * @return Source text underlying a location
513         */
514        public String getText(boolean headOnly) {
515            return xml.substring(
516                startTagStartPos,
517                headOnly && startTagEndPos >= 0
518                    ? startTagEndPos
519                    : endTagEndPos >= 0
520                    ? endTagEndPos
521                    : xml.length());
522        }
523    }
524
525    /**
526     * Similar to {@link Stack} but based on {@link ArrayList} instead of
527     * {@link Vector}, and therefore more efficient.
528     */
529    private static class ArrayStack extends ArrayList {
530        public final void push(Object t)
531        {
532            if (false) System.out.println(size() + " push [" + t + "]");
533            add(t);
534        }
535
536        public final Object peek()
537        {
538            return get(size() - 1);
539        }
540
541        public final Object pop()
542        {
543            final int index = size() - 1;
544            Object t = remove(index);
545            if (false) System.out.println(size() + " pop  [" + t + "]");
546            return get(index - 1);
547        }
548    }
549}
550
551// End Annotator.java