Hi, I remembered this post today and decided to create a prototype. My code is very messy, but it’s almost done (I hope). It can already print the parsed document as a “tree”.
I wrote it in Eclipse, so it looks a bit different.
Here is the Eclipse version:
Main.java
package main;
/** Command-line entry point for the HTML parser prototype. */
public class Main {
    /**
     * Tokenizes the HTML file that {@link HtmlParser#loadFile()} points at and prints the
     * resulting token list as an indented tree.
     *
     * @param args command-line arguments (not used)
     * @throws Exception propagated from {@link HtmlParser#parseHtml()}
     */
    public static void main(String[] args) throws Exception {
        HtmlParser parser = new HtmlParser();
        parser.parseHtml();

        new HtmlParserTree(parser.allTags).createTree();
        //String fullXpath = "/html/body/img";
    }
}
HtmlParser.java
package main;
import java.io.File;
import java.util.ArrayList;
import java.util.Scanner;
/**
 * Minimal hand-written HTML tokenizer.
 *
 * <p>{@link #parseHtml()} loads the file returned by {@link #loadFile()} and splits it into a flat
 * list of {@link HtmlParserType} tokens (tags, text and, optionally, comments) stored in
 * {@link #allTags}.
 *
 * <p>Inside a tag, runs of whitespace outside quoted attribute values are replaced by a single
 * {@link #splitChar}, so {@code <a  href="x y">} is handed to {@link HtmlParserType} as
 * {@code a~href="x y"}. Known limitation: a literal {@code ~} in the markup is indistinguishable
 * from that separator.
 */
public class HtmlParser {
    /** Separator inserted between a tag name and its attributes in the text passed to {@link HtmlParserType}. */
    public static final char splitChar = '~';

    /** Tokens in document order; (re)filled by {@link #parseHtml()}. */
    public ArrayList<HtmlParserType> allTags;

    /** When {@code false}, {@code <!...>} constructs (comments, DOCTYPE) are consumed but not stored. */
    private boolean includeComments = false;
    private String htmlFile = null;
    /** Index of the next character of {@link #htmlFile} that has not been consumed yet. */
    private int currCharIndex = 0;

    public HtmlParser() {
        allTags = new ArrayList<HtmlParserType>();
    }

    /**
     * Loads the HTML file and tokenizes it from the first to the last character.
     *
     * <p>Calling this method again re-reads the file and rebuilds {@link #allTags} from scratch.
     *
     * @throws Exception if the file cannot be read (propagated from {@link #loadFile()})
     */
    public void parseHtml() throws Exception {
        htmlFile = loadFile();
        currCharIndex = 0;
        allTags.clear();
        // Every helper leaves currCharIndex on the first unconsumed character, so the loop can
        // start at index 0 and cannot run past the end of the input.
        while (currCharIndex < htmlFile.length()) {
            nextChar();
        }
    }

    /** Consumes one token (whitespace, a tag/comment, or a run of text) starting at the current index. */
    private void nextChar() {
        char currChar = getCurrChar();
        if (Character.isWhitespace(currChar)) {
            // Whitespace in front of a token (indentation, line joins) is not content.
            currCharIndex++;
            return;
        }
        if (currChar == '<') {
            openTag();
            return;
        }
        // Text runs up to the next '<', or to the end of the input if there is none.
        int end = htmlFile.indexOf('<', currCharIndex);
        if (end < 0) {
            end = htmlFile.length();
        }
        String content = htmlFile.substring(currCharIndex, end);
        allTags.add(new HtmlParserType(content, HtmlParserType.Type.CONTEXT));
        currCharIndex = end;
    }

    /**
     * Consumes a tag starting at the '<' under the current index and stores it as a TAG token.
     * Delegates to {@link #openComment()} when the tag starts with {@code <!}.
     */
    private void openTag() {
        if (currCharIndex + 1 < htmlFile.length() && getNextChar() == '!') {
            openComment();
            return;
        }
        StringBuilder currTag = new StringBuilder();
        char quote = 0; // the quote character we are inside of, or 0 when outside a quoted value
        boolean lastCharWasSpace = false;
        currCharIndex++; // step over '<'
        while (currCharIndex < htmlFile.length()) {
            char currChar = getCurrChar();
            currCharIndex++;
            if (quote != 0) {
                // Inside a quoted value everything is literal, including whitespace and '>'.
                if (currChar == quote) {
                    quote = 0;
                }
                currTag.append(currChar);
                continue;
            }
            if (currChar == '>') {
                break;
            }
            if (Character.isWhitespace(currChar)) {
                // Collapse a run of whitespace into a single separator.
                if (!lastCharWasSpace) {
                    currTag.append(splitChar);
                    lastCharWasSpace = true;
                }
                continue;
            }
            if (currChar == '"' || currChar == '\'') {
                quote = currChar;
            }
            currTag.append(currChar);
            lastCharWasSpace = false;
        }
        // A stray '<' at the very end of the input (or "<>") carries no tag; skip it.
        if (currTag.length() > 0) {
            allTags.add(new HtmlParserType(currTag.toString(), HtmlParserType.Type.TAG));
        }
    }

    /**
     * Consumes a {@code <!...>} construct starting at the '<' under the current index.
     *
     * <p>Real comments ({@code <!-- ... -->}) end at {@code -->}, so a '>' inside the comment
     * does not cut it short; anything else (e.g. {@code <!DOCTYPE html>}) ends at the first '>'.
     * The stored text is everything between the opening '<' and the closing '>'.
     */
    private void openComment() {
        int close; // index of the terminating '>'
        if (htmlFile.startsWith("<!--", currCharIndex)) {
            // Searching from the first dash also accepts the degenerate forms "<!-->" and "<!--->".
            int dashes = htmlFile.indexOf("-->", currCharIndex + 2);
            close = dashes < 0 ? -1 : dashes + 2;
        } else {
            close = htmlFile.indexOf('>', currCharIndex);
        }
        if (close < 0) {
            // Unterminated: treat the rest of the input as the comment.
            close = htmlFile.length();
        }
        String currComment = htmlFile.substring(currCharIndex + 1, close);
        currCharIndex = Math.min(close + 1, htmlFile.length());
        if (!includeComments)
            return;
        allTags.add(new HtmlParserType(currComment, HtmlParserType.Type.COMMENT));
    }

    private char getCurrChar() {
        return htmlFile.charAt(currCharIndex);
    }

    private char getNextChar() {
        return htmlFile.charAt(currCharIndex + 1);
    }

    /**
     * Reads the HTML file into a single string.
     *
     * <p>Lines are joined with one space so that words or attributes on adjacent lines do not
     * fuse together.
     *
     * @return the file content with line breaks replaced by spaces
     * @throws Exception if the file does not exist or cannot be opened
     */
    public String loadFile() throws Exception {
        StringBuilder html = new StringBuilder();
        File myObj = new File("C:\\Users\\user\\eclipse-workspace\\HTMLParser\\data\\index.html");
        Scanner myReader = new Scanner(myObj);
        try {
            while (myReader.hasNextLine()) {
                html.append(myReader.nextLine()).append(' ');
            }
        } finally {
            // Close the file even when reading fails.
            myReader.close();
        }
        return html.toString();
    }
}
HtmlParserType.java
package main;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.LinkedHashMap;
import java.util.Map;
/**
 * One token produced by {@link HtmlParser}: a tag, a closing tag, a text run or a comment.
 *
 * <p>For tags, the constructor receives the tag text with the angle brackets removed and
 * whitespace between the name and the attributes replaced by {@link HtmlParser#splitChar}
 * (for example {@code meta~name="viewport"~content="width=device-width"}). Text and comment
 * tokens are stored verbatim and are never interpreted as markup.
 */
public class HtmlParserType {
    /** HTML void elements, i.e. elements that never have a closing tag. */
    private static final String[] selfClosingTags = { "area", "base", "br", "col", "embed", "hr", "img", "input",
            "link", "meta", "param", "source", "track", "wbr" };

    public enum Type {
        CONTEXT, COMMENT, TAG, CLOSINGTAG
    }

    public ArrayList<HtmlParserType> children;
    /** Attribute name to value in source order; boolean attributes map to the empty string. */
    public Map<String, String> attributes;
    public boolean isSelfClosing;
    public String completeTagText;
    public String name;
    public Type type;

    /**
     * @param te token text (tag text without angle brackets, or raw text/comment content)
     * @param t  token kind; a {@link Type#TAG} whose name starts with "/" is re-classified as
     *           {@link Type#CLOSINGTAG}
     */
    public HtmlParserType(String te, Type t) {
        attributes = new LinkedHashMap<String, String>();
        children = new ArrayList<HtmlParserType>();
        completeTagText = te;
        type = t;
        // Remove the "/" of "<br/>" first so the name is "br" and not "br/".
        stripTrailingSlash();
        name = getTagName();
        isSelfClosing = isSelfclosing();
        createArguments();
        if (isClosingTag()) {
            type = Type.CLOSINGTAG;
        }
    }

    public void printArr(String[] arr) {
        for (String s : arr) {
            System.out.println(s);
        }
    }

    /**
     * Parses the attributes of a tag into {@link #attributes}. Does nothing for text and
     * comment tokens.
     *
     * <p>Each attribute is split at its first '=' only, so '=' characters inside the value are
     * preserved. One pair of matching surrounding quotes is removed from the value. Attributes
     * without '=' (boolean attributes such as {@code async}) are stored with an empty value.
     */
    public void createArguments() {
        if (!isTagLike())
            return;
        stripTrailingSlash();
        String[] args = completeTagText.split(Character.toString(HtmlParser.splitChar));
        // index 0 is the tag name
        for (int i = 1; i < args.length; i++) {
            String arg = args[i];
            if (arg.isEmpty())
                continue;
            int eq = arg.indexOf('=');
            if (eq < 0) {
                attributes.put(arg, "");
                continue;
            }
            attributes.put(arg.substring(0, eq), unquote(arg.substring(eq + 1)));
        }
    }

    /** Removes one pair of matching single or double quotes around {@code value}, if present. */
    private static String unquote(String value) {
        if (value.length() >= 2) {
            char first = value.charAt(0);
            if ((first == '"' || first == '\'') && value.charAt(value.length() - 1) == first) {
                return value.substring(1, value.length() - 1);
            }
        }
        return value;
    }

    /** Drops the trailing "/" of an XML-style tag such as {@code br/}; a lone "/" is left alone. */
    private void stripTrailingSlash() {
        if (isTagLike() && completeTagText.length() > 1 && completeTagText.endsWith("/")) {
            completeTagText = completeTagText.substring(0, completeTagText.length() - 1);
        }
    }

    /** @return whether this token is markup (an opening or closing tag) rather than text or a comment */
    private boolean isTagLike() {
        return type == Type.TAG || type == Type.CLOSINGTAG;
    }

    /** @return the name, followed by "/" when the element is a void element */
    public String getFormattedName() {
        String r = name;
        if (isSelfClosing)
            r += "/";
        return r;
    }

    /** @return the part of {@link #completeTagText} that follows the tag name */
    public String getAttributes() {
        return completeTagText.substring(getTagName().length(), completeTagText.length());
    }

    /**
     * @return for tags, the text before the first {@link HtmlParser#splitChar}; for text and
     *         comment tokens, the complete token text
     */
    public String getTagName() {
        if (!isTagLike())
            return completeTagText;
        // indexOf instead of split(): split() returns an empty array for input such as "~".
        int sep = completeTagText.indexOf(HtmlParser.splitChar);
        return sep < 0 ? completeTagText : completeTagText.substring(0, sep);
    }

    /** @return {@code true} for a tag whose name starts with "/"; always {@code false} for text and comments */
    public boolean isClosingTag() {
        return isTagLike() && getTagName().startsWith("/");
    }

    /** @return {@code true} when this is a tag naming an HTML void element (compared case-insensitively) */
    public boolean isSelfclosing() {
        if (!isTagLike())
            return false;
        String tagName = getTagName();
        for (String candidate : selfClosingTags) {
            if (candidate.equalsIgnoreCase(tagName))
                return true;
        }
        return false;
    }

    @Override
    public String toString() {
        return getFormattedName() + "|" + type + "|" + attributes;
    }
}
**HtmlParserTree.java**
package main;
import java.util.ArrayList;
/** Prints a flat token list from {@link HtmlParser} as an indented outline. */
public class HtmlParserTree {
    HtmlParserType[] types;

    /**
     * @param t tokens in document order; copied into an array
     */
    public HtmlParserTree(ArrayList<HtmlParserType> t) {
        types = t.toArray(new HtmlParserType[t.size()]);
    }

    /**
     * Prints the token list twice, separated by blank lines: first with every token indented by
     * the depth in effect before it, then by the depth in effect after it.
     */
    public void createTree() {
        int level = 0;
        for (int i = 0; i < types.length; i++) {
            HtmlParserType node = types[i];
            printWithDepth(node.getFormattedName() + " " + level, level);
            level = updateDepth(node, level);
        }

        System.out.println("\n\n");

        level = 0;
        for (int i = 0; i < types.length; i++) {
            HtmlParserType node = types[i];
            level = updateDepth(node, level);
            printWithDepth(node.getFormattedName() + " " + level, level);
            // type.children.add(type);
        }
    }

    /**
     * @param type  the token just visited
     * @param depth nesting depth before the token
     * @return {@code depth - 1} after a closing tag, {@code depth + 1} after an opening tag
     *         that is not self-closing, otherwise {@code depth}
     */
    public int updateDepth(HtmlParserType type, int depth) {
        if (type.type == HtmlParserType.Type.CLOSINGTAG) {
            return depth - 1;
        }
        if (type.type == HtmlParserType.Type.TAG && !type.isSelfClosing) {
            return depth + 1;
        }
        return depth;
    }

    /**
     * Prints {@code text} preceded by {@code d} indentation units; a non-positive {@code d}
     * prints the text without indentation.
     */
    public void printWithDepth(String text, int d) {
        final String TAB = " ";
        StringBuilder line = new StringBuilder();
        for (int remaining = d; remaining > 0; remaining--) {
            line.append(TAB);
        }
        line.append(text);
        System.out.println(line.toString());
    }
}
Here is the Processing version:
HtmlParserExample.pde (main)
import java.util.Arrays;
import java.util.LinkedHashMap;
import java.util.Map;
import java.util.Scanner;
import java.util.ArrayList;
// Separator that HtmlParser inserts between a tag name and its attributes
// (in place of whitespace) and that HtmlParserType splits on.
final char splitChar = '~';
// Kind of token produced by HtmlParser.
enum Type {
  // text found between tags
  CONTEXT,
  // content of a "<!...>" construct
  COMMENT,
  // a tag as emitted by the parser
  TAG,
  // a tag whose name starts with "/" (assigned by HtmlParserType)
  CLOSINGTAG
}
// Runs the parser once at sketch start-up and prints the token tree to the console.
void setup() {
  // hide the window
  surface.setVisible(false);

  HtmlParser parser = new HtmlParser();
  try {
    parser.parseHtml();
  } catch (Exception e) {
    // Report the failure; whatever was tokenized before it is still printed below.
    e.printStackTrace();
  }

  new HtmlParserTree(parser.allTags).createTree();
}
// Intentionally empty: the sketch does all of its work in setup().
void draw() {
}
HtmlParser.pde
/**
 * Minimal hand-written HTML tokenizer.
 *
 * parseHtml() loads the file returned by loadFile() and splits it into a flat list of
 * HtmlParserType tokens (tags, text and, optionally, comments) stored in allTags.
 *
 * Inside a tag, runs of whitespace outside quoted attribute values are replaced by a single
 * splitChar, so <a  href="x y"> is handed to HtmlParserType as a~href="x y". Known limitation:
 * a literal '~' in the markup is indistinguishable from that separator.
 */
class HtmlParser {
  /** Tokens in document order; (re)filled by parseHtml(). */
  public ArrayList<HtmlParserType> allTags;

  /** When false, "<!...>" constructs (comments, DOCTYPE) are consumed but not stored. */
  private boolean includeComments = false;
  private String htmlFile = null;
  /** Index of the next character of htmlFile that has not been consumed yet. */
  private int currCharIndex = 0;

  HtmlParser() {
    allTags = new ArrayList<HtmlParserType>();
  }

  /**
   * Loads the HTML file and tokenizes it from the first to the last character.
   * Calling this method again re-reads the file and rebuilds allTags from scratch.
   *
   * @throws Exception if the file cannot be read (propagated from loadFile())
   */
  public void parseHtml() throws Exception {
    htmlFile = loadFile();
    currCharIndex = 0;
    allTags.clear();
    // Every helper leaves currCharIndex on the first unconsumed character, so the loop can
    // start at index 0 and cannot run past the end of the input.
    while (currCharIndex < htmlFile.length()) {
      nextChar();
    }
  }

  /** Consumes one token (whitespace, a tag/comment, or a run of text) starting at the current index. */
  private void nextChar() {
    char currChar = getCurrChar();
    if (Character.isWhitespace(currChar)) {
      // Whitespace in front of a token (indentation, line joins) is not content.
      currCharIndex++;
      return;
    }
    if (currChar == '<') {
      openTag();
      return;
    }
    // Text runs up to the next '<', or to the end of the input if there is none.
    int end = htmlFile.indexOf('<', currCharIndex);
    if (end < 0) {
      end = htmlFile.length();
    }
    String content = htmlFile.substring(currCharIndex, end);
    allTags.add(new HtmlParserType(content, Type.CONTEXT));
    currCharIndex = end;
  }

  /**
   * Consumes a tag starting at the '<' under the current index and stores it as a TAG token.
   * Delegates to openComment() when the tag starts with "<!".
   */
  private void openTag() {
    if (currCharIndex + 1 < htmlFile.length() && getNextChar() == '!') {
      openComment();
      return;
    }
    StringBuilder currTag = new StringBuilder();
    char quote = 0; // the quote character we are inside of, or 0 when outside a quoted value
    boolean lastCharWasSpace = false;
    currCharIndex++; // step over '<'
    while (currCharIndex < htmlFile.length()) {
      char currChar = getCurrChar();
      currCharIndex++;
      if (quote != 0) {
        // Inside a quoted value everything is literal, including whitespace and '>'.
        if (currChar == quote) {
          quote = 0;
        }
        currTag.append(currChar);
        continue;
      }
      if (currChar == '>') {
        break;
      }
      if (Character.isWhitespace(currChar)) {
        // Collapse a run of whitespace into a single separator.
        if (!lastCharWasSpace) {
          currTag.append(splitChar);
          lastCharWasSpace = true;
        }
        continue;
      }
      if (currChar == '"' || currChar == '\'') {
        quote = currChar;
      }
      currTag.append(currChar);
      lastCharWasSpace = false;
    }
    // A stray '<' at the very end of the input (or "<>") carries no tag; skip it.
    if (currTag.length() > 0) {
      allTags.add(new HtmlParserType(currTag.toString(), Type.TAG));
    }
  }

  /**
   * Consumes a "<!...>" construct starting at the '<' under the current index.
   *
   * Real comments ("<!-- ... -->") end at "-->", so a '>' inside the comment does not cut it
   * short; anything else (e.g. "<!DOCTYPE html>") ends at the first '>'. The stored text is
   * everything between the opening '<' and the closing '>'.
   */
  private void openComment() {
    int close; // index of the terminating '>'
    if (htmlFile.startsWith("<!--", currCharIndex)) {
      // Searching from the first dash also accepts the degenerate forms "<!-->" and "<!--->".
      int dashes = htmlFile.indexOf("-->", currCharIndex + 2);
      close = dashes < 0 ? -1 : dashes + 2;
    } else {
      close = htmlFile.indexOf('>', currCharIndex);
    }
    if (close < 0) {
      // Unterminated: treat the rest of the input as the comment.
      close = htmlFile.length();
    }
    String currComment = htmlFile.substring(currCharIndex + 1, close);
    currCharIndex = Math.min(close + 1, htmlFile.length());
    if (!includeComments)
      return;
    allTags.add(new HtmlParserType(currComment, Type.COMMENT));
  }

  private char getCurrChar() {
    return htmlFile.charAt(currCharIndex);
  }

  private char getNextChar() {
    return htmlFile.charAt(currCharIndex + 1);
  }

  /**
   * Reads data/index.html from the sketch folder into a single string.
   * Lines are joined with one space so that words or attributes on adjacent lines do not
   * fuse together.
   *
   * @return the file content with line breaks replaced by spaces
   * @throws Exception if the file does not exist or cannot be opened
   */
  public String loadFile() throws Exception {
    StringBuilder html = new StringBuilder();
    File myObj = new File(sketchPath("data/index.html"));
    Scanner myReader = new Scanner(myObj);
    try {
      while (myReader.hasNextLine()) {
        html.append(myReader.nextLine()).append(' ');
      }
    } finally {
      // Close the file even when reading fails.
      myReader.close();
    }
    return html.toString();
  }
}
HtmlParserTree.pde
/** Prints a flat token list from HtmlParser as an indented outline. */
class HtmlParserTree {
  HtmlParserType[] types;

  /**
   * @param t tokens in document order; copied into an array
   */
  HtmlParserTree(ArrayList<HtmlParserType> t) {
    types = t.toArray(new HtmlParserType[t.size()]);
  }

  /**
   * Prints the token list twice, separated by blank lines: first with every token indented by
   * the depth in effect before it, then by the depth in effect after it.
   */
  public void createTree() {
    int level = 0;
    for (int i = 0; i < types.length; i++) {
      HtmlParserType node = types[i];
      printWithDepth(node.getFormattedName() + " " + level, level);
      level = updateDepth(node, level);
    }

    System.out.println("\n\n");

    level = 0;
    for (int i = 0; i < types.length; i++) {
      HtmlParserType node = types[i];
      level = updateDepth(node, level);
      printWithDepth(node.getFormattedName() + " " + level, level);
      // type.children.add(type);
    }
  }

  /**
   * @param type  the token just visited
   * @param depth nesting depth before the token
   * @return depth - 1 after a closing tag, depth + 1 after an opening tag that is not
   *         self-closing, otherwise depth
   */
  public int updateDepth(HtmlParserType type, int depth) {
    if (type.type == Type.CLOSINGTAG) {
      return depth - 1;
    }
    if (type.type == Type.TAG && !type.isSelfClosing) {
      return depth + 1;
    }
    return depth;
  }

  /**
   * Prints text preceded by d indentation units; a non-positive d prints the text without
   * indentation.
   */
  public void printWithDepth(String text, int d) {
    final String TAB = " ";
    StringBuilder line = new StringBuilder();
    for (int remaining = d; remaining > 0; remaining--) {
      line.append(TAB);
    }
    line.append(text);
    System.out.println(line.toString());
  }
}
HtmlParserType.pde
/**
 * One token produced by HtmlParser: a tag, a closing tag, a text run or a comment.
 *
 * For tags, the constructor receives the tag text with the angle brackets removed and
 * whitespace between the name and the attributes replaced by splitChar (for example
 * meta~name="viewport"~content="width=device-width"). Text and comment tokens are stored
 * verbatim and are never interpreted as markup.
 */
class HtmlParserType {
  // HTML void elements, i.e. elements that never have a closing tag.
  // Kept as an instance field: this class is an inner class of the sketch, where static
  // members may not be accepted by every Processing/Java version.
  private final String[] selfClosingTags = { "area", "base", "br", "col", "embed", "hr", "img", "input",
    "link", "meta", "param", "source", "track", "wbr" };

  public ArrayList<HtmlParserType> children;
  /** Attribute name to value in source order; boolean attributes map to the empty string. */
  public Map<String, String> attributes;
  public boolean isSelfClosing;
  public String completeTagText;
  public String name;
  public Type type;

  /**
   * @param te token text (tag text without angle brackets, or raw text/comment content)
   * @param t  token kind; a TAG whose name starts with "/" is re-classified as CLOSINGTAG
   */
  HtmlParserType(String te, Type t) {
    attributes = new LinkedHashMap<String, String>();
    children = new ArrayList<HtmlParserType>();
    completeTagText = te;
    type = t;
    // Remove the "/" of "<br/>" first so the name is "br" and not "br/".
    stripTrailingSlash();
    name = getTagName();
    isSelfClosing = isSelfclosing();
    createArguments();
    if (isClosingTag()) {
      type = Type.CLOSINGTAG;
    }
  }

  public void printArr(String[] arr) {
    for (String s : arr) {
      System.out.println(s);
    }
  }

  /**
   * Parses the attributes of a tag into the attributes map. Does nothing for text and
   * comment tokens.
   *
   * Each attribute is split at its first '=' only, so '=' characters inside the value are
   * preserved. One pair of matching surrounding quotes is removed from the value. Attributes
   * without '=' (boolean attributes such as async) are stored with an empty value.
   */
  public void createArguments() {
    if (!isTagLike())
      return;
    stripTrailingSlash();
    String[] args = completeTagText.split(Character.toString(splitChar));
    // index 0 is the tag name
    for (int i = 1; i < args.length; i++) {
      String arg = args[i];
      if (arg.isEmpty())
        continue;
      int eq = arg.indexOf('=');
      if (eq < 0) {
        attributes.put(arg, "");
        continue;
      }
      attributes.put(arg.substring(0, eq), unquote(arg.substring(eq + 1)));
    }
  }

  /** Removes one pair of matching single or double quotes around value, if present. */
  private String unquote(String value) {
    if (value.length() >= 2) {
      char first = value.charAt(0);
      if ((first == '"' || first == '\'') && value.charAt(value.length() - 1) == first) {
        return value.substring(1, value.length() - 1);
      }
    }
    return value;
  }

  /** Drops the trailing "/" of an XML-style tag such as "br/"; a lone "/" is left alone. */
  private void stripTrailingSlash() {
    if (isTagLike() && completeTagText.length() > 1 && completeTagText.endsWith("/")) {
      completeTagText = completeTagText.substring(0, completeTagText.length() - 1);
    }
  }

  /** @return whether this token is markup (an opening or closing tag) rather than text or a comment */
  private boolean isTagLike() {
    return type == Type.TAG || type == Type.CLOSINGTAG;
  }

  /** @return the name, followed by "/" when the element is a void element */
  public String getFormattedName() {
    String r = name;
    if (isSelfClosing)
      r += "/";
    return r;
  }

  /** @return the part of completeTagText that follows the tag name */
  public String getAttributes() {
    return completeTagText.substring(getTagName().length(), completeTagText.length());
  }

  /**
   * @return for tags, the text before the first splitChar; for text and comment tokens, the
   *         complete token text
   */
  public String getTagName() {
    if (!isTagLike())
      return completeTagText;
    // indexOf instead of split(): split() returns an empty array for input such as "~".
    int sep = completeTagText.indexOf(splitChar);
    return sep < 0 ? completeTagText : completeTagText.substring(0, sep);
  }

  /** @return true for a tag whose name starts with "/"; always false for text and comments */
  public boolean isClosingTag() {
    return isTagLike() && getTagName().startsWith("/");
  }

  /** @return true when this is a tag naming an HTML void element (compared case-insensitively) */
  public boolean isSelfclosing() {
    if (!isTagLike())
      return false;
    String tagName = getTagName();
    for (String candidate : selfClosingTags) {
      if (candidate.equalsIgnoreCase(tagName))
        return true;
    }
    return false;
  }

  @Override
  public String toString() {
    return getFormattedName() + "|" + type + "|" + attributes;
  }
}