1
2
3
4
5
6
7
8 package org.dom4j.io;
9
10 import java.io.IOException;
11 import java.io.OutputStream;
12 import java.io.StringWriter;
13 import java.io.UnsupportedEncodingException;
14 import java.io.Writer;
15 import java.util.HashSet;
16 import java.util.Iterator;
17 import java.util.Set;
18 import java.util.Stack;
19
20 import org.dom4j.Document;
21 import org.dom4j.DocumentHelper;
22 import org.dom4j.Element;
23 import org.dom4j.Entity;
24 import org.dom4j.Node;
25
26 import org.xml.sax.SAXException;
27
28 /***
29 * <p>
30 * <code>HTMLWriter</code> takes a DOM4J tree and formats it to a stream as
31 * HTML. This formatter is similar to XMLWriter but it outputs the text of CDATA
32 * and Entity sections rather than the serialised format as in XML, it has an
33 * XHTML mode, it retains whitespace in certain elements such as &lt;PRE&gt;,
34 * and it supports certain elements which have no corresponding close tag such
35 * as &lt;BR&gt; and &lt;P&gt;.
36 * </p>
37 *
38 * <p>
39 * The OutputFormat passed in to the constructor is checked for isXHTML() and
40 * isExpandEmptyElements(). See {@link OutputFormat OutputFormat} for details.
41 * Here are the rules for <b>this class </b> based on an OutputFormat, "format",
42 * passed in to the constructor: <br/><br/>
43 *
44 * <ul>
45 * <li>If an element is in {@link #getOmitElementCloseSet()
46 * getOmitElementCloseSet}, then it is treated specially:
47 *
48 * <ul>
49 * <li>It never expands, since some browsers treat this as two separate
50 * Horizontal Rules: &lt;HR&gt;&lt;/HR&gt;</li>
51 * <li>If {@link org.dom4j.io.OutputFormat#isXHTML() format.isXHTML()}, then
52 * it has a space before the closing single-tag slash, since Netscape 4.x-
53 * treats this: &lt;HR /&gt; as an element named "HR" with an attribute named
54 * "/", but that's better than when it refuses to recognize this: &lt;hr/&gt;
55 * which it thinks is an element named "HR/".</li>
56 * </ul>
57 *
58 * </li>
59 * <li>If {@link org.dom4j.io.OutputFormat#isXHTML() format.isXHTML()}, all
60 * elements must have either a close element, or be a closed single tag.</li>
61 * <li>If {@link org.dom4j.io.OutputFormat#isExpandEmptyElements()
62 * format.isExpandEmptyElements()} is true, all elements are expanded except
63 * as above.</li>
64 * </ul>
65 *
66 * <b>Examples </b>
67 * </p>
68 *
69 * <p>
70 * </p>
71 *
72 * <p>
73 * If isXHTML == true, CDATA sections look like this:
74 *
75 * <PRE>
76 *
77 * <b>&lt;myelement&gt;&lt;![CDATA[My data]]&gt;&lt;/myelement&gt; </b>
78 *
79 * </PRE>
80 *
81 * Otherwise, they look like this:
82 *
83 * <PRE>
84 *
85 * <b>&lt;myelement&gt;My data&lt;/myelement&gt; </b>
86 *
87 * </PRE>
88 *
89 * </p>
90 *
91 * <p>
92 * Basically, {@link OutputFormat#isXHTML() OutputFormat.isXHTML()} ==
93 * <code>true</code> will produce valid XML, while {@link
94 * org.dom4j.io.OutputFormat#isExpandEmptyElements()
95 * format.isExpandEmptyElements()} determines whether empty elements are
96 * expanded if isXHTML is true, excepting the special HTML single tags.
97 * </p>
98 *
99 * <p>
100 * Also, HTMLWriter handles tags whose contents should be preformatted, that is,
101 * whitespace-preserved. By default, this set includes the tags &lt;PRE&gt;,
102 * &lt;SCRIPT&gt;, &lt;STYLE&gt;, and &lt;TEXTAREA&gt;, case insensitively. It
103 * does not include &lt;IFRAME&gt;. Other tags, such as &lt;CODE&gt;,
104 * &lt;KBD&gt;, &lt;TT&gt;, &lt;VAR&gt;, are usually rendered in a different
105 * font in most browsers, but don't preserve whitespace, so they also don't
106 * appear in the default list. HTML Comments are always whitespace-preserved.
107 * However, the parser you use may store comments with linefeed-only text nodes
108 * (\n) even if your platform uses another line.separator character, and
109 * HTMLWriter outputs Comment nodes exactly as the DOM is set up by the parser.
110 * See examples and discussion here: {@link #setPreformattedTags(java.util.Set)
111 * setPreformattedTags}
112 * </p>
113 *
114 * <p>
115 * <b>Examples </b>
116 * </p>
117 * <blockquote>
118 * <p>
119 * <b>Pretty Printing </b>
120 * </p>
121 *
122 * <p>
123 * This example shows how to pretty print a string containing a valid HTML
124 * document to a string. You can also just call the static methods of this
125 * class: <br>
126 * {@link #prettyPrintHTML(String) prettyPrintHTML(String)} or <br>
127 * {@link #prettyPrintHTML(String,boolean,boolean,boolean,boolean)
128 * prettyPrintHTML(String,boolean,boolean,boolean,boolean)} or, <br>
129 * {@link #prettyPrintXHTML(String) prettyPrintXHTML(String)} for XHTML (note
130 * the X)
131 * </p>
132 *
133 * <pre>
134 * String testPrettyPrint(String html) {
135 * StringWriter sw = new StringWriter();
136 * OutputFormat format = OutputFormat.createPrettyPrint();
137 * // These are the default values for createPrettyPrint,
138 * // so you needn't set them:
139 * // format.setNewlines(true);
140 * // format.setTrimText(true);
141 * format.setXHTML(true);
142 * HTMLWriter writer = new HTMLWriter(sw, format);
143 * Document document = DocumentHelper.parseText(html);
144 * writer.write(document);
145 * writer.flush();
146 * return sw.toString();
147 * }
148 * </pre>
149 *
150 * <p>
151 * This example shows how to create a "squeezed" document, but one that will
152 * work in browsers even if the browser line length is limited. No newlines are
153 * included, no extra whitespace at all, except where it is required by
154 * {@link #setPreformattedTags(java.util.Set) setPreformattedTags}.
155 * </p>
156 *
157 * <pre>
158 * String testCrunch(String html) {
159 * StringWriter sw = new StringWriter();
160 * OutputFormat format = OutputFormat.createPrettyPrint();
161 * format.setNewlines(false);
162 * format.setTrimText(true);
163 * format.setIndent("");
164 * format.setXHTML(true);
165 * format.setExpandEmptyElements(false);
166 * format.setNewLineAfterNTags(20);
167 * org.dom4j.io.HTMLWriter writer = new HTMLWriter(sw, format);
168 * org.dom4j.Document document = DocumentHelper.parseText(html);
169 * writer.write(document);
170 * writer.flush();
171 * return sw.toString();
172 * }
173 * </pre>
174 *
175 * </blockquote>
176 *
177 * @author <a href="mailto:james.strachan@metastuff.com">James Strachan </a>
178 * @author Laramie Crocker
179 * @version $Revision: 1.21 $
180 */
181 public class HTMLWriter extends XMLWriter {
182 private static String lineSeparator = System.getProperty("line.separator");
183
184 protected static final HashSet DEFAULT_PREFORMATTED_TAGS;
185
186 static {
187
188
189 DEFAULT_PREFORMATTED_TAGS = new HashSet();
190 DEFAULT_PREFORMATTED_TAGS.add("PRE");
191 DEFAULT_PREFORMATTED_TAGS.add("SCRIPT");
192 DEFAULT_PREFORMATTED_TAGS.add("STYLE");
193 DEFAULT_PREFORMATTED_TAGS.add("TEXTAREA");
194 }
195
196 protected static final OutputFormat DEFAULT_HTML_FORMAT;
197
198 static {
199 DEFAULT_HTML_FORMAT = new OutputFormat(" ", true);
200 DEFAULT_HTML_FORMAT.setTrimText(true);
201 DEFAULT_HTML_FORMAT.setSuppressDeclaration(true);
202 }
203
204 private Stack formatStack = new Stack();
205
206 private String lastText = "";
207
208 private int tagsOuput = 0;
209
210
211 private int newLineAfterNTags = -1;
212
213 private HashSet preformattedTags = DEFAULT_PREFORMATTED_TAGS;
214
215 /***
216 * Used to store the qualified element names which should have no close
217 * element tag
218 */
219 private HashSet omitElementCloseSet;
220
221 public HTMLWriter(Writer writer) {
222 super(writer, DEFAULT_HTML_FORMAT);
223 }
224
225 public HTMLWriter(Writer writer, OutputFormat format) {
226 super(writer, format);
227 }
228
229 public HTMLWriter() throws UnsupportedEncodingException {
230 super(DEFAULT_HTML_FORMAT);
231 }
232
233 public HTMLWriter(OutputFormat format) throws UnsupportedEncodingException {
234 super(format);
235 }
236
237 public HTMLWriter(OutputStream out) throws UnsupportedEncodingException {
238 super(out, DEFAULT_HTML_FORMAT);
239 }
240
241 public HTMLWriter(OutputStream out, OutputFormat format)
242 throws UnsupportedEncodingException {
243 super(out, format);
244 }
245
246 public void startCDATA() throws SAXException {
247 }
248
249 public void endCDATA() throws SAXException {
250 }
251
252
253
254 protected void writeCDATA(String text) throws IOException {
255
256
257 if (getOutputFormat().isXHTML()) {
258 super.writeCDATA(text);
259 } else {
260 writer.write(text);
261 }
262
263 lastOutputNodeType = Node.CDATA_SECTION_NODE;
264 }
265
266 protected void writeEntity(Entity entity) throws IOException {
267 writer.write(entity.getText());
268 lastOutputNodeType = Node.ENTITY_REFERENCE_NODE;
269 }
270
271 protected void writeDeclaration() throws IOException {
272 }
273
274 protected void writeString(String text) throws IOException {
275
276
277
278
279
280
281
282
283
284
285
286
287 if (text.equals("\n")) {
288 if (!formatStack.empty()) {
289 super.writeString(lineSeparator);
290 }
291
292 return;
293 }
294
295 lastText = text;
296
297 if (formatStack.empty()) {
298 super.writeString(text.trim());
299 } else {
300 super.writeString(text);
301 }
302 }
303
304 /***
305 * Overriden method to not close certain element names to avoid wierd
306 * behaviour from browsers for versions up to 5.x
307 *
308 * @param qualifiedName
309 * DOCUMENT ME!
310 *
311 * @throws IOException
312 * DOCUMENT ME!
313 */
314 protected void writeClose(String qualifiedName) throws IOException {
315 if (!omitElementClose(qualifiedName)) {
316 super.writeClose(qualifiedName);
317 }
318 }
319
320 protected void writeEmptyElementClose(String qualifiedName)
321 throws IOException {
322 if (getOutputFormat().isXHTML()) {
323
324 if (omitElementClose(qualifiedName)) {
325
326
327
328
329
330 writer.write(" />");
331 } else {
332 super.writeEmptyElementClose(qualifiedName);
333 }
334 } else {
335
336 if (omitElementClose(qualifiedName)) {
337
338 writer.write(">");
339 } else {
340
341
342 super.writeEmptyElementClose(qualifiedName);
343 }
344 }
345 }
346
347 protected boolean omitElementClose(String qualifiedName) {
348 return internalGetOmitElementCloseSet().contains(
349 qualifiedName.toUpperCase());
350 }
351
352 private HashSet internalGetOmitElementCloseSet() {
353 if (omitElementCloseSet == null) {
354 omitElementCloseSet = new HashSet();
355 loadOmitElementCloseSet(omitElementCloseSet);
356 }
357
358 return omitElementCloseSet;
359 }
360
361
362 protected void loadOmitElementCloseSet(Set set) {
363 set.add("AREA");
364 set.add("BASE");
365 set.add("BR");
366 set.add("COL");
367 set.add("HR");
368 set.add("IMG");
369 set.add("INPUT");
370 set.add("LINK");
371 set.add("META");
372 set.add("P");
373 set.add("PARAM");
374 }
375
376
377
378 /***
379 * A clone of the Set of elements that can have their close-tags omitted. By
380 * default it should be "AREA", "BASE", "BR", "COL", "HR", "IMG", "INPUT",
381 * "LINK", "META", "P", "PARAM"
382 *
383 * @return A clone of the Set.
384 */
385 public Set getOmitElementCloseSet() {
386 return (Set) (internalGetOmitElementCloseSet().clone());
387 }
388
389 /***
390 * To use the empty set, pass an empty Set, or null:
391 *
392 * <pre>
393 *
394 *
395 * setOmitElementCloseSet(new HashSet());
396 * or
397 * setOmitElementCloseSet(null);
398 *
399 *
400 * </pre>
401 *
402 * @param newSet
403 * DOCUMENT ME!
404 */
405 public void setOmitElementCloseSet(Set newSet) {
406
407 omitElementCloseSet = new HashSet();
408
409 if (newSet != null) {
410 omitElementCloseSet = new HashSet();
411
412 Object aTag;
413 Iterator iter = newSet.iterator();
414
415 while (iter.hasNext()) {
416 aTag = iter.next();
417
418 if (aTag != null) {
419 omitElementCloseSet.add(aTag.toString().toUpperCase());
420 }
421 }
422 }
423 }
424
425 /***
426 * @see #setPreformattedTags(java.util.Set) setPreformattedTags
427 */
428 public Set getPreformattedTags() {
429 return (Set) (preformattedTags.clone());
430 }
431
432 /***
433 * <p>
434 * Override the default set, which includes PRE, SCRIPT, STYLE, and
435 * TEXTAREA, case insensitively.
436 * </p>
437 *
438 * <p>
439 * <b>Setting Preformatted Tags </b>
440 * </p>
441 *
442 * <p>
443 * Pass in a Set of Strings, one for each tag name that should be treated
444 * like a PRE tag. You may pass in null or an empty Set to assign the empty
445 * set, in which case no tags will be treated as preformatted, except that
446 * HTML Comments will continue to be preformatted. If a tag is included in
447 * the set of preformatted tags, all whitespace within the tag will be
448 * preserved, including whitespace on the same line preceding the close tag.
449 * This will generally make the close tag not line up with the start tag,
450 * but it preserves the intention of the whitespace within the tag.
451 * </p>
452 *
453 * <p>
454 * The browser considers leading whitespace before the close tag to be
455 * significant, but leading whitespace before the open tag to be
456 * insignificant. For example, if the HTML author doesn't put the close
457 * TEXTAREA tag flush to the left margin, then the TEXTAREA control in the
458 * browser will have spaces on the last line inside the control. This may be
459 * the HTML author's intent. Similarly, in a PRE, the browser treats a
460 * flushed left close PRE tag as different from a close tag with leading
461 * whitespace. Again, this must be left up to the HTML author.
462 * </p>
463 *
464 * <p>
465 * <b>Examples </b>
466 * </p>
467 * <blockquote>
468 * <p>
469 * Here is an example of how you can set the PreformattedTags list using
470 * setPreformattedTags to include IFRAME, as well as the default set, if you
471 * have an instance of this class named myHTMLWriter:
472 *
473 * <pre>
474 * Set current = myHTMLWriter.getPreformattedTags();
475 * current.add("IFRAME");
476 * myHTMLWriter.setPreformattedTags(current);
477 *
478 * //The set is now <b>PRE, SCRIPT, STYLE, TEXTAREA, IFRAME</b>
479 *
480 *
481 * </pre>
482 *
483 * Similarly, you can simply replace it with your own:
484 *
485 * <pre>
486 *
487 *
488 * HashSet newset = new HashSet();
489 * newset.add("PRE");
490 * newset.add("TEXTAREA");
491 * myHTMLWriter.setPreformattedTags(newset);
492 *
493 * //The set is now <b>{PRE, TEXTAREA}</b>
494 *
495 *
496 * </pre>
497 *
498 * You can remove all tags from the preformatted tags list, with an empty
499 * set, like this:
500 *
501 * <pre>
502 *
503 *
504 * myHTMLWriter.setPreformattedTags(new HashSet());
505 *
506 * //The set is now <b>{}</b>
507 *
508 *
509 * </pre>
510 *
511 * or with null, like this:
512 *
513 * <pre>
514 *
515 *
516 * myHTMLWriter.setPreformattedTags(null);
517 *
518 * //The set is now <b>{}</b>
519 *
520 *
521 * </pre>
522 *
523 * </p>
524 * </blockquote>
525 *
526 * @param newSet
527 * DOCUMENT ME!
528 */
529 public void setPreformattedTags(Set newSet) {
530
531
532
533
534 preformattedTags = new HashSet();
535
536 if (newSet != null) {
537 Object aTag;
538 Iterator iter = newSet.iterator();
539
540 while (iter.hasNext()) {
541 aTag = iter.next();
542
543 if (aTag != null) {
544 preformattedTags.add(aTag.toString().toUpperCase());
545 }
546 }
547 }
548 }
549
550 /***
551 * DOCUMENT ME!
552 *
553 * @param qualifiedName
554 * DOCUMENT ME!
555 *
556 * @return true if the qualifiedName passed in matched (case-insensitively)
557 * a tag in the preformattedTags set, or false if not found or if
558 * the set is empty or null.
559 *
560 * @see #setPreformattedTags(java.util.Set) setPreformattedTags
561 */
562 public boolean isPreformattedTag(String qualifiedName) {
563
564
565 return (preformattedTags != null)
566 && (preformattedTags.contains(qualifiedName.toUpperCase()));
567 }
568
569 /***
570 * This override handles any elements that should not remove whitespace,
571 * such as <PRE>, <SCRIPT>, <STYLE>, and <TEXTAREA>.
572 * Note: the close tags won't line up with the open tag, but we can't alter
573 * that. See javadoc note at setPreformattedTags.
574 *
575 * @param element
576 * DOCUMENT ME!
577 *
578 * @throws IOException
579 * When the stream could not be written to.
580 *
581 * @see #setPreformattedTags(java.util.Set) setPreformattedTags
582 */
583 protected void writeElement(Element element) throws IOException {
584 if (newLineAfterNTags == -1) {
585 lazyInitNewLinesAfterNTags();
586 }
587
588 if (newLineAfterNTags > 0) {
589 if ((tagsOuput > 0) && ((tagsOuput % newLineAfterNTags) == 0)) {
590 super.writer.write(lineSeparator);
591 }
592 }
593
594 tagsOuput++;
595
596 String qualifiedName = element.getQualifiedName();
597 String saveLastText = lastText;
598 int size = element.nodeCount();
599
600 if (isPreformattedTag(qualifiedName)) {
601 OutputFormat currentFormat = getOutputFormat();
602 boolean saveNewlines = currentFormat.isNewlines();
603 boolean saveTrimText = currentFormat.isTrimText();
604 String currentIndent = currentFormat.getIndent();
605
606
607
608 formatStack.push(new FormatState(saveNewlines, saveTrimText,
609 currentIndent));
610
611 try {
612
613
614 super.writePrintln();
615
616 if ((saveLastText.trim().length() == 0)
617 && (currentIndent != null)
618 && (currentIndent.length() > 0)) {
619
620
621
622
623
624 super.writer.write(justSpaces(saveLastText));
625 }
626
627
628
629 currentFormat.setNewlines(false);
630 currentFormat.setTrimText(false);
631 currentFormat.setIndent("");
632
633
634 super.writeElement(element);
635 } finally {
636 FormatState state = (FormatState) formatStack.pop();
637 currentFormat.setNewlines(state.isNewlines());
638 currentFormat.setTrimText(state.isTrimText());
639 currentFormat.setIndent(state.getIndent());
640 }
641 } else {
642 super.writeElement(element);
643 }
644 }
645
646 private String justSpaces(String text) {
647 int size = text.length();
648 StringBuffer res = new StringBuffer(size);
649 char c;
650
651 for (int i = 0; i < size; i++) {
652 c = text.charAt(i);
653
654 switch (c) {
655 case '\r':
656 case '\n':
657
658 continue;
659
660 default:
661 res.append(c);
662 }
663 }
664
665 return res.toString();
666 }
667
668 private void lazyInitNewLinesAfterNTags() {
669 if (getOutputFormat().isNewlines()) {
670
671 newLineAfterNTags = 0;
672 } else {
673 newLineAfterNTags = getOutputFormat().getNewLineAfterNTags();
674 }
675 }
676
677
678
679 /***
680 * Convenience method to just get a String result.
681 *
682 * @param html
683 * DOCUMENT ME!
684 *
685 * @return a pretty printed String from the source string, preserving
686 * whitespace in the defaultPreformattedTags set, and leaving the
687 * close tags off of the default omitElementCloseSet set. Use one of
688 * the write methods if you want stream output.
689 *
690 * @throws java.io.IOException
691 * @throws java.io.UnsupportedEncodingException
692 * @throws org.dom4j.DocumentException
693 */
694 public static String prettyPrintHTML(String html)
695 throws java.io.IOException, java.io.UnsupportedEncodingException,
696 org.dom4j.DocumentException {
697 return prettyPrintHTML(html, true, true, false, true);
698 }
699
700 /***
701 * Convenience method to just get a String result, but <b>As XHTML </b>.
702 *
703 * @param html
704 * DOCUMENT ME!
705 *
706 * @return a pretty printed String from the source string, preserving
707 * whitespace in the defaultPreformattedTags set, but conforming to
708 * XHTML: no close tags are omitted (though if empty, they will be
709 * converted to XHTML empty tags: <HR/> Use one of the write
710 * methods if you want stream output.
711 *
712 * @throws java.io.IOException
713 * @throws java.io.UnsupportedEncodingException
714 * @throws org.dom4j.DocumentException
715 */
716 public static String prettyPrintXHTML(String html)
717 throws java.io.IOException, java.io.UnsupportedEncodingException,
718 org.dom4j.DocumentException {
719 return prettyPrintHTML(html, true, true, true, false);
720 }
721
722 /***
723 * DOCUMENT ME!
724 *
725 * @param html
726 * DOCUMENT ME!
727 * @param newlines
728 * DOCUMENT ME!
729 * @param trim
730 * DOCUMENT ME!
731 * @param isXHTML
732 * DOCUMENT ME!
733 * @param expandEmpty
734 * DOCUMENT ME!
735 *
736 * @return a pretty printed String from the source string, preserving
737 * whitespace in the defaultPreformattedTags set, and leaving the
738 * close tags off of the default omitElementCloseSet set. This
739 * override allows you to specify various formatter options. Use one
740 * of the write methods if you want stream output.
741 *
742 * @throws java.io.IOException
743 * @throws java.io.UnsupportedEncodingException
744 * @throws org.dom4j.DocumentException
745 */
746 public static String prettyPrintHTML(String html, boolean newlines,
747 boolean trim, boolean isXHTML, boolean expandEmpty)
748 throws java.io.IOException, java.io.UnsupportedEncodingException,
749 org.dom4j.DocumentException {
750 StringWriter sw = new StringWriter();
751 OutputFormat format = OutputFormat.createPrettyPrint();
752 format.setNewlines(newlines);
753 format.setTrimText(trim);
754 format.setXHTML(isXHTML);
755 format.setExpandEmptyElements(expandEmpty);
756
757 HTMLWriter writer = new HTMLWriter(sw, format);
758 Document document = DocumentHelper.parseText(html);
759 writer.write(document);
760 writer.flush();
761
762 return sw.toString();
763 }
764
765
766
767 private class FormatState {
768 private boolean newlines = false;
769
770 private boolean trimText = false;
771
772 private String indent = "";
773
774 public FormatState(boolean newLines, boolean trimText, String indent) {
775 this.newlines = newLines;
776 this.trimText = trimText;
777 this.indent = indent;
778 }
779
780 public boolean isNewlines() {
781 return newlines;
782 }
783
784 public boolean isTrimText() {
785 return trimText;
786 }
787
788 public String getIndent() {
789 return indent;
790 }
791 }
792 }
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841