Problem with reading a html file

Flolo · September 15, 2021, 8:20pm

Hey,
I want to write a class with which you can get a certain element from a website.

For example, I give the class a link and an XPath, and as a result I want a string containing the code of the element.

link:

xpath:

/html/body/div[3]/div[3]/div[5]/div[1]/div[2]/div/a/img

desired result:

<img alt="" src="//upload.wikimedia.org/wikipedia/commons/thumb/e/e5/Votivni_obraz_Ocko_-_Ludmila.jpg/220px-Votivni_obraz_Ocko_-_Ludmila.jpg" decoding="async" width="220" height="329" class="thumbimage" srcset="//upload.wikimedia.org/wikipedia/commons/e/e5/Votivni_obraz_Ocko_-_Ludmila.jpg 1.5x" data-file-width="324" data-file-height="484">

I thought I could handle an HTML file the same way as an XML file (I have never worked with one before), since they look the same.
But it doesn’t work (it always returns null).
Here is my code:

Main

import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.InputStreamReader;
import java.io.StringReader;
import java.io.StringWriter;
import java.net.URL;
import java.net.URLConnection;
import java.nio.charset.StandardCharsets;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.transform.OutputKeys;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult;
import javax.xml.xpath.XPath;
import javax.xml.xpath.XPathConstants;
import javax.xml.xpath.XPathFactory;
import org.w3c.dom.Document;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.xml.sax.InputSource;

// Sketch entry point: runs one element lookup and then quits.
void setup() {
  size(200, 200);
  final String pageUrl = "https://de.wikipedia.org/wiki/Ludmilla_von_B%C3%B6hmen";
  final String elementPath = "/html/body/div[3]/div[3]/div[5]/div[1]/div[2]/div/a/img";
  // The constructor performs the whole download-and-lookup; the instance itself is not used afterwards.
  new GetWebsiteElement(pageUrl, elementPath);
  exit();
}
// Empty on purpose: this sketch does no drawing, all work happens in setup().
void draw() {
}
// Overrides exit() only to forward to the parent implementation; adds no behaviour of its own.
void exit() {
  super.exit();
}

The Class

class GetWebsiteElement {
  //we need the full Xpath
  String url;
  String xpath;
  String htmlCode;
  String elementCode;
  GetWebsiteElement(String url, String xpath) {
    this.url = url;
    this.xpath = xpath;
    try {
      loadElement();
    }
    catch(Exception e) {
      e.printStackTrace();
    }
  }
  void loadElement() throws Exception {

    //Download the html file--------------------------------------------------------------
   /*
   i know i could use laodStrings(url) but then it wouldn't work in normal java if i needed it there
   */
    
    URL url_ = new URL(url);
    URLConnection spoof = url_.openConnection();

    //Spoof the connection so we look like a web browser
    spoof.setRequestProperty("User-Agent", "Mozilla/4.0 (compatible; MSIE 5.5; Windows NT 5.0;    H010818)" );
    BufferedReader in = new BufferedReader(new InputStreamReader(spoof.getInputStream()));
    String strLine = "";
    String finalHTML = "";
    //Loop through every line in the source
    while ((strLine = in.readLine()) != null) {
      finalHTML += strLine + "\n";
    }

    htmlCode = finalHTML;
    String savePath = sketchPath("html.txt");
    //saveStrings(savePath, new String[]{htmlCode});

    //------------------------------------------------------------------------

    File f = new File(savePath);
    FileInputStream fileIS = new FileInputStream(f);
    DocumentBuilderFactory builderFactory = DocumentBuilderFactory.newInstance();
    DocumentBuilder builder = builderFactory.newDocumentBuilder();
    Document xmlDocument = builder.parse(fileIS);
    XPath xPath = XPathFactory.newInstance().newXPath();
    NodeList nodes = (NodeList) xPath.compile(xpath).evaluate(xmlDocument, XPathConstants.NODESET);
    for (int i = 0; i < nodes.getLength(); i++)
      println(nodes.item(i).getNodeValue());
  }
}

I would appreciate any reply :)

jafal · September 16, 2021, 12:07am

Hi

I was planning something similar: open an exchange website, watch the gold exchange rate, and show an indicator when it rises or falls.
I am going to follow your topic and base my own steps on it.

Flolo · February 20, 2022, 8:27pm

Hi I remembered this post today and decided to create a prototype. My code is very messy but well it’s almost done (I hope). It can already be output as a “tree”.
I wrote it in eclipse so it looks a bit different.

Here is the Eclipse version:

Main.java

package main;

/** Command-line entry point of the Eclipse version: tokenizes the HTML file and prints it as a tree. */
public class Main {

	/**
	 * Runs the parser and prints the resulting tag tree.
	 *
	 * @param args unused
	 * @throws Exception whatever {@code HtmlParser.parseHtml()} throws
	 */
	public static void main(String[] args) throws Exception {
		HtmlParser parser = new HtmlParser();
		parser.parseHtml();

		new HtmlParserTree(parser.allTags).createTree();

		// Placeholder kept from the original; no XPath lookup is performed yet:
		// String fullXpath = "/html/body/img";
	}

}

HtmlParser.java

package main;

import java.io.File;
import java.util.ArrayList;
import java.util.Scanner;

/**
 * Minimal hand-written HTML tokenizer.
 *
 * Splits the loaded file into a flat list of tokens (tags, text and optionally
 * comments) stored in {@link #allTags} in document order.
 */
public class HtmlParser {
	/** Separator written between the space-separated parts of a tag (name and attributes). */
	public static final char splitChar = '~';
	/** Tokens in document order; filled by {@link #parseHtml()}. */
	public ArrayList<HtmlParserType> allTags;
	/** When false, comments and other {@code <!...>} declarations (e.g. the doctype) are skipped. */
	private boolean includeComments = false;
	/** Complete file content; lines are concatenated without a separator. */
	private String htmlFile = null;
	/** Index of the next character that has not been consumed yet. */
	private int currCharIndex = 0;

	public HtmlParser() {
		allTags = new ArrayList<HtmlParserType>();
	}

	/**
	 * Loads the file and tokenizes it into {@link #allTags}.
	 *
	 * Scanning starts at the very first character, so a document that begins
	 * with a tag or a doctype is tokenized correctly.
	 *
	 * @throws Exception if the file cannot be read
	 */
	public void parseHtml() throws Exception {
		htmlFile = loadFile();

		// Every nextChar() call consumes at least one character, so this terminates.
		while (currCharIndex < htmlFile.length()) {
			nextChar();
		}
	}

	/** Consumes one token (a single space, a tag/comment, or a run of text) at the current position. */
	private void nextChar() {
		char currChar = getCurrChar();

		// Spaces between tokens are skipped one at a time; this trims leading spaces of text.
		if (currChar == ' ') {
			currCharIndex++;
			return;
		}

		if (currChar == '<') {
			openTag();
			return;
		}

		// Text runs up to the next '<' or to the end of the input, whichever comes first.
		// The text is kept verbatim (including '!').
		int end = htmlFile.indexOf('<', currCharIndex);
		if (end < 0) {
			end = htmlFile.length();
		}
		String content = htmlFile.substring(currCharIndex, end);
		allTags.add(new HtmlParserType(content, HtmlParserType.Type.CONTEXT));
		currCharIndex = end;
	}

	/**
	 * Consumes a tag starting at the current '<' and stores its content (without
	 * the angle brackets) as a TAG token. Runs of spaces outside quoted attribute
	 * values are replaced by a single {@link #splitChar}; spaces inside quotes are kept.
	 */
	private void openTag() {
		if (htmlFile.startsWith("<!", currCharIndex)) {
			// comment or declaration
			openComment();
			return;
		}

		StringBuilder currTag = new StringBuilder();
		boolean lastCharWasSpace = false;
		// Currently open quote character (" or '), or 0 when outside a quoted value.
		char quote = 0;

		int i = currCharIndex + 1;
		// The bound check lets an unterminated tag at the end of the input end the scan
		// instead of throwing.
		while (i < htmlFile.length()) {
			char currChar = htmlFile.charAt(i);
			if (currChar == '>') {
				break;
			}
			if (quote == 0) {
				if (currChar == '"' || currChar == '\'') {
					quote = currChar;
				}
			} else if (currChar == quote) {
				quote = 0;
			}

			if (currChar == ' ') {
				if (quote != 0) {
					currTag.append(' ');
				} else if (!lastCharWasSpace) {
					currTag.append(splitChar);
					lastCharWasSpace = true;
				}
			} else {
				currTag.append(currChar);
				lastCharWasSpace = false;
			}
			i++;
		}

		allTags.add(new HtmlParserType(currTag.toString(), HtmlParserType.Type.TAG));
		// Continue after the closing '>' (or at the end of the input if there was none).
		currCharIndex = Math.min(i + 1, htmlFile.length());
	}

	/**
	 * Consumes a {@code <!...>} construct starting at the current '<'.
	 * Real comments ({@code <!-- ... -->}) end at "-->", so a '>' inside the comment
	 * does not cut it short; other declarations such as the doctype end at the first '>'.
	 * The token is only stored when {@link #includeComments} is set.
	 */
	private void openComment() {
		int closingBracket;
		if (htmlFile.startsWith("<!--", currCharIndex)) {
			// Searching from the first '-' also accepts the abruptly closed forms "<!-->" and "<!--->".
			int close = htmlFile.indexOf("-->", currCharIndex + 2);
			closingBracket = (close < 0) ? -1 : close + 2;
		} else {
			closingBracket = htmlFile.indexOf('>', currCharIndex);
		}
		if (closingBracket < 0) {
			// Unterminated: the rest of the input belongs to the comment.
			closingBracket = htmlFile.length();
		}

		// Text between '<' and the final '>', e.g. "!-- note --".
		String currComment = htmlFile.substring(currCharIndex + 1, closingBracket);
		currCharIndex = Math.min(closingBracket + 1, htmlFile.length());

		if (!includeComments)
			return;
		allTags.add(new HtmlParserType(currComment, HtmlParserType.Type.COMMENT));
	}

	private char getCurrChar() {
		return htmlFile.charAt(currCharIndex);
	}

	/**
	 * Reads the HTML file into one string. Line breaks are dropped: the lines are
	 * concatenated without any separator.
	 *
	 * @return the file content
	 * @throws Exception if the file does not exist or cannot be opened
	 */
	public String loadFile() throws Exception {
		StringBuilder html = new StringBuilder();
		File myObj = new File("C:\\Users\\user\\eclipse-workspace\\HTMLParser\\data\\index.html");
		// try-with-resources closes the scanner even if reading fails.
		try (Scanner myReader = new Scanner(myObj)) {
			while (myReader.hasNextLine()) {
				html.append(myReader.nextLine());
			}
		}
		return html.toString();
	}
}

HtmlParserType.java

package main;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.LinkedHashMap;
import java.util.Map;

/**
 * One token produced by {@code HtmlParser}: a tag, a closing tag, a comment or a run of text.
 *
 * For tags, the raw text has the form {@code name~attr="value"~attr2=...}, where
 * {@code ~} is {@code HtmlParser.splitChar}. Text and comment tokens are kept
 * verbatim and are never interpreted as markup.
 */
public class HtmlParserType {
	/** HTML void elements, i.e. elements that never have a closing tag. */
	private static final String[] selfClosingTags = { "area", "base", "br", "col", "embed", "hr", "img", "input",
			"link", "meta", "param", "source", "track", "wbr" };

	public enum Type {
		CONTEXT, COMMENT, TAG, CLOSINGTAG
	}

	public ArrayList<HtmlParserType> children;
	/** Attribute name to value, in source order; the value is "" for attributes without one. */
	public Map<String, String> attributes;
	public boolean isSelfClosing;
	public String completeTagText;
	public String name;
	public Type type;

	/**
	 * @param te raw token text (tag content without the angle brackets, or plain text)
	 * @param t  token kind; a TAG whose name starts with "/" is turned into CLOSINGTAG
	 */
	public HtmlParserType(String te, Type t) {
		attributes = new LinkedHashMap<String, String>();
		children = new ArrayList<HtmlParserType>();

		completeTagText = te;
		type = t;

		// Only real tags are parsed. Text such as "/usr/bin" must not become a closing tag,
		// and text must not be altered by the attribute parsing.
		if (isMarkup()) {
			createArguments();
		}

		// Computed after createArguments() has removed a trailing '/', so "<br/>" gets
		// the name "br" and is recognised as self-closing.
		name = getTagName();
		isSelfClosing = isSelfclosing();

		if (isClosingTag()) {
			type = Type.CLOSINGTAG;
		}
	}

	public void printArr(String[] arr) {
		for (String s : arr) {
			System.out.println(s);
		}
	}

	/**
	 * Normalises {@link #completeTagText} and fills {@link #attributes} from it.
	 *
	 * Each attribute is split at its first '=' only, so '=' characters inside the
	 * value (e.g. in URLs) survive. Matching surrounding quotes are removed from the value.
	 */
	public void createArguments() {
		if (completeTagText.endsWith("/"))
			completeTagText = completeTagText.substring(0, completeTagText.length() - 1);

		// Kept unchanged from the original implementation: the boolean attributes
		// "async" and "required" are dropped before the text is split.
		completeTagText = completeTagText.replaceAll("async" + HtmlParser.splitChar, "");
		completeTagText = completeTagText.replaceAll(HtmlParser.splitChar + "required", "");

		String[] args = completeTagText.split(Character.toString(HtmlParser.splitChar));
		// index 0 is the tagname

		for (int i = 1; i < args.length; i++) {
			String arg = args[i];
			if (arg.isEmpty()) {
				continue;
			}
			int equalsAt = arg.indexOf('=');
			if (equalsAt < 0) {
				// attribute without a value, e.g. "disabled"
				attributes.put(arg, "");
			} else {
				attributes.put(arg.substring(0, equalsAt), stripQuotes(arg.substring(equalsAt + 1)));
			}
		}
	}

	/** Removes one pair of matching single or double quotes around {@code value}, if present. */
	private static String stripQuotes(String value) {
		if (value.length() >= 2) {
			char first = value.charAt(0);
			char last = value.charAt(value.length() - 1);
			if ((first == '"' || first == '\'') && first == last) {
				return value.substring(1, value.length() - 1);
			}
		}
		return value;
	}

	/** True for tokens that came from a tag (opening, self-closing or closing). */
	private boolean isMarkup() {
		return type == Type.TAG || type == Type.CLOSINGTAG;
	}

	public String getFormattedName() {
		String r = name;
		if (isSelfClosing)
			r += "/";
		return r;
	}

	/** @return everything in the raw text after the tag name (empty for text and comments) */
	public String getAttributes() {
		return completeTagText.substring(getTagName().length(), completeTagText.length());
	}

	/**
	 * @return for tags, the part before the first separator; for text and comments, the whole text.
	 *         Never throws, even when the text consists only of separator characters.
	 */
	public String getTagName() {
		if (!isMarkup()) {
			return completeTagText;
		}
		int split = completeTagText.indexOf(HtmlParser.splitChar);
		return split < 0 ? completeTagText : completeTagText.substring(0, split);
	}

	public boolean isClosingTag() {
		return isMarkup() && getTagName().startsWith("/");
	}

	/** @return true when this is a tag whose name is a void element (compared case-insensitively) */
	public boolean isSelfclosing() {
		if (!isMarkup()) {
			return false;
		}
		String tagName = getTagName();
		for (String voidElement : selfClosingTags) {
			if (voidElement.equalsIgnoreCase(tagName)) {
				return true;
			}
		}
		return false;
	}

	@Override
	public String toString() {
		String r = name;
		if (isSelfClosing)
			r += "/";

		r += "|" + type.toString() + "|" + attributes;

		return r;
	}
}
HtmlParserTree.java

package main;

import java.util.ArrayList;

/** Prints the flat token list of {@code HtmlParser} as an indented tree. */
public class HtmlParserTree {
	/** Tokens in document order, copied from the list handed to the constructor. */
	HtmlParserType[] types;

	/** @param t tokens in document order */
	public HtmlParserTree(ArrayList<HtmlParserType> t) {
		types = t.toArray(new HtmlParserType[t.size()]);
	}

	/**
	 * Prints every token twice: first indented by the depth before the token is
	 * applied, then (after a blank separator) by the depth after it is applied.
	 */
	public void createTree() {
		int level = 0;
		for (int i = 0; i < types.length; i++) {
			HtmlParserType node = types[i];
			printWithDepth(node.getFormattedName() + " " + level, level);
			level = updateDepth(node, level);
		}

		System.out.println("\n\n");

		level = 0;
		for (int i = 0; i < types.length; i++) {
			HtmlParserType node = types[i];
			level = updateDepth(node, level);
			printWithDepth(node.getFormattedName() + " " + level, level);
		}
	}

	/**
	 * @param type  token to apply
	 * @param depth nesting depth before the token
	 * @return depth after the token: one less for a closing tag, one more for a
	 *         tag that is not self-closing, unchanged otherwise
	 */
	public int updateDepth(HtmlParserType type, int depth) {
		if (type.type == HtmlParserType.Type.CLOSINGTAG) {
			return depth - 1;
		}
		if (type.type == HtmlParserType.Type.TAG && !type.isSelfClosing) {
			return depth + 1;
		}
		return depth;
	}

	/** Prints {@code text} preceded by two spaces per depth level. */
	public void printWithDepth(String text, int d) {
		StringBuilder line = new StringBuilder();
		for (int remaining = d; remaining > 0; remaining--) {
			line.append("  ");
		}
		line.append(text);
		System.out.println(line.toString());
	}

}

Here is the Processing version:

HtmlParserExample.pde (main)

import java.util.Arrays;
import java.util.LinkedHashMap;
import java.util.Map;
import java.util.Scanner;
import java.util.ArrayList;

// Separator that HtmlParser.openTag() writes between the space-separated parts of a tag;
// HtmlParserType splits the tag text on it again.
final char splitChar = '~';

// Token kinds: CONTEXT is used for text between tags, COMMENT for <!...> constructs,
// TAG for tags, and CLOSINGTAG for tags whose name starts with "/".
enum Type {
  CONTEXT, COMMENT, TAG, CLOSINGTAG
}

// Sketch entry point: tokenizes the HTML file and prints it as a tree.
void setup() {
  //hide the window
  surface.setVisible(false);

  HtmlParser parser = new HtmlParser();
  try {
    parser.parseHtml();
  }
  catch (Exception parseFailure) {
    // Report the failure and still print whatever was tokenized so far.
    parseFailure.printStackTrace();
  }
  new HtmlParserTree(parser.allTags).createTree();
}
// Empty on purpose: this sketch does no drawing, all work happens in setup().
void draw() {
}

HtmlParser.pde


/**
 * Minimal hand-written HTML tokenizer.
 *
 * Splits the loaded file into a flat list of tokens (tags, text and optionally
 * comments) stored in allTags in document order.
 */
class HtmlParser {
  /** Tokens in document order; filled by parseHtml(). */
  public ArrayList<HtmlParserType> allTags;
  /** When false, comments and other <!...> declarations (e.g. the doctype) are skipped. */
  private boolean includeComments = false;
  /** Complete file content; lines are concatenated without a separator. */
  private String htmlFile = null;
  /** Index of the next character that has not been consumed yet. */
  private int currCharIndex = 0;

  HtmlParser() {
    allTags = new ArrayList<HtmlParserType>();
  }

  /**
   * Loads the file and tokenizes it into allTags.
   *
   * Scanning starts at the very first character, so a document that begins
   * with a tag or a doctype is tokenized correctly.
   *
   * @throws Exception if the file cannot be read
   */
  public void parseHtml() throws Exception {
    htmlFile = loadFile();

    // Every nextChar() call consumes at least one character, so this terminates.
    while (currCharIndex < htmlFile.length()) {
      nextChar();
    }
  }

  /** Consumes one token (a single space, a tag/comment, or a run of text) at the current position. */
  private void nextChar() {
    char currChar = getCurrChar();

    // Spaces between tokens are skipped one at a time; this trims leading spaces of text.
    if (currChar == ' ') {
      currCharIndex++;
      return;
    }

    if (currChar == '<') {
      openTag();
      return;
    }

    // Text runs up to the next '<' or to the end of the input, whichever comes first.
    // The text is kept verbatim (including '!').
    int end = htmlFile.indexOf('<', currCharIndex);
    if (end < 0) {
      end = htmlFile.length();
    }
    String content = htmlFile.substring(currCharIndex, end);
    allTags.add(new HtmlParserType(content, Type.CONTEXT));
    currCharIndex = end;
  }

  /**
   * Consumes a tag starting at the current '<' and stores its content (without
   * the angle brackets) as a TAG token. Runs of spaces outside quoted attribute
   * values are replaced by a single splitChar; spaces inside quotes are kept.
   */
  private void openTag() {
    if (htmlFile.startsWith("<!", currCharIndex)) {
      // comment or declaration
      openComment();
      return;
    }

    StringBuilder currTag = new StringBuilder();
    boolean lastCharWasSpace = false;
    // Currently open quote character (" or '), or 0 when outside a quoted value.
    char quote = 0;

    int i = currCharIndex + 1;
    // The bound check lets an unterminated tag at the end of the input end the scan
    // instead of throwing.
    while (i < htmlFile.length()) {
      char currChar = htmlFile.charAt(i);
      if (currChar == '>') {
        break;
      }
      if (quote == 0) {
        if (currChar == '"' || currChar == '\'') {
          quote = currChar;
        }
      } else if (currChar == quote) {
        quote = 0;
      }

      if (currChar == ' ') {
        if (quote != 0) {
          currTag.append(' ');
        } else if (!lastCharWasSpace) {
          currTag.append(splitChar);
          lastCharWasSpace = true;
        }
      } else {
        currTag.append(currChar);
        lastCharWasSpace = false;
      }
      i++;
    }

    allTags.add(new HtmlParserType(currTag.toString(), Type.TAG));
    // Continue after the closing '>' (or at the end of the input if there was none).
    currCharIndex = Math.min(i + 1, htmlFile.length());
  }

  /**
   * Consumes a <!...> construct starting at the current '<'.
   * Real comments (<!-- ... -->) end at "-->", so a '>' inside the comment does
   * not cut it short; other declarations such as the doctype end at the first '>'.
   * The token is only stored when includeComments is set.
   */
  private void openComment() {
    int closingBracket;
    if (htmlFile.startsWith("<!--", currCharIndex)) {
      // Searching from the first '-' also accepts the abruptly closed forms "<!-->" and "<!--->".
      int close = htmlFile.indexOf("-->", currCharIndex + 2);
      closingBracket = (close < 0) ? -1 : close + 2;
    } else {
      closingBracket = htmlFile.indexOf('>', currCharIndex);
    }
    if (closingBracket < 0) {
      // Unterminated: the rest of the input belongs to the comment.
      closingBracket = htmlFile.length();
    }

    // Text between '<' and the final '>', e.g. "!-- note --".
    String currComment = htmlFile.substring(currCharIndex + 1, closingBracket);
    currCharIndex = Math.min(closingBracket + 1, htmlFile.length());

    if (!includeComments)
      return;
    allTags.add(new HtmlParserType(currComment, Type.COMMENT));
  }

  private char getCurrChar() {
    return htmlFile.charAt(currCharIndex);
  }

  /**
   * Reads data/index.html from the sketch folder into one string. Line breaks
   * are dropped: the lines are concatenated without any separator.
   *
   * @return the file content
   * @throws Exception if the file does not exist or cannot be opened
   */
  public String loadFile() throws Exception {
    StringBuilder html = new StringBuilder();
    File myObj = new File(sketchPath("data/index.html"));
    // try-with-resources closes the scanner even if reading fails.
    try (Scanner myReader = new Scanner(myObj)) {
      while (myReader.hasNextLine()) {
        html.append(myReader.nextLine());
      }
    }
    return html.toString();
  }
}

HtmlParserTree.pde

// Prints the flat token list produced by HtmlParser as an indented tree.
class HtmlParserTree {
  // Tokens in document order, copied from the list handed to the constructor.
  HtmlParserType[] types;

  HtmlParserTree(ArrayList<HtmlParserType> t) {
    types = t.toArray(new HtmlParserType[t.size()]);
  }

  // Prints every token twice: first indented by the depth before the token is applied,
  // then (after a blank separator) by the depth after it is applied.
  public void createTree() {
    int level = 0;
    for (int idx = 0; idx < types.length; idx++) {
      HtmlParserType token = types[idx];
      printWithDepth(token.getFormattedName() + " " + level, level);
      level = updateDepth(token, level);
    }

    System.out.println("\n\n");

    level = 0;
    for (int idx = 0; idx < types.length; idx++) {
      HtmlParserType token = types[idx];
      level = updateDepth(token, level);
      printWithDepth(token.getFormattedName() + " " + level, level);
    }
  }

  // Returns the depth after the token: one less for a closing tag, one more for a
  // tag that is not self-closing, unchanged for everything else.
  public int updateDepth(HtmlParserType type, int depth) {
    if (type.type == Type.CLOSINGTAG) {
      return depth - 1;
    }
    boolean opensElement = type.type == Type.TAG && !type.isSelfClosing;
    return opensElement ? depth + 1 : depth;
  }

  // Prints the text preceded by two spaces per depth level.
  public void printWithDepth(String text, int d) {
    StringBuilder indented = new StringBuilder();
    int written = 0;
    while (written < d) {
      indented.append("  ");
      written++;
    }
    indented.append(text);
    System.out.println(indented.toString());
  }
}

HtmlParserType.pde

/**
 * One token produced by HtmlParser: a tag, a closing tag, a comment or a run of text.
 *
 * For tags, the raw text has the form name~attr="value"~attr2=..., where ~ is the
 * sketch-level splitChar. Text and comment tokens are kept verbatim and are never
 * interpreted as markup.
 */
class HtmlParserType {
  /** HTML void elements, i.e. elements that never have a closing tag. */
  private final String[] selfClosingTags = { "area", "base", "br", "col", "embed", "hr", "img", "input",
    "link", "meta", "param", "source", "track", "wbr" };


  public ArrayList<HtmlParserType> children;
  /** Attribute name to value, in source order; the value is "" for attributes without one. */
  public Map<String, String> attributes;
  public boolean isSelfClosing;
  public String completeTagText;
  public String name;
  public Type type;

  /**
   * @param te raw token text (tag content without the angle brackets, or plain text)
   * @param t  token kind; a TAG whose name starts with "/" is turned into CLOSINGTAG
   */
  HtmlParserType(String te, Type t) {
    attributes = new LinkedHashMap<String, String>();
    children = new ArrayList<HtmlParserType>();

    completeTagText = te;
    type = t;

    // Only real tags are parsed. Text such as "/usr/bin" must not become a closing tag,
    // and text must not be altered by the attribute parsing.
    if (isMarkup()) {
      createArguments();
    }

    // Computed after createArguments() has removed a trailing '/', so "<br/>" gets
    // the name "br" and is recognised as self-closing.
    name = getTagName();
    isSelfClosing = isSelfclosing();

    if (isClosingTag()) {
      type = Type.CLOSINGTAG;
    }
  }

  public void printArr(String[] arr) {
    for (String s : arr) {
      System.out.println(s);
    }
  }

  /**
   * Normalises completeTagText and fills the attributes map from it.
   *
   * Each attribute is split at its first '=' only, so '=' characters inside the
   * value (e.g. in URLs) survive. Matching surrounding quotes are removed from the value.
   */
  public void createArguments() {
    if (completeTagText.endsWith("/"))
      completeTagText = completeTagText.substring(0, completeTagText.length() - 1);

    // Kept unchanged from the original implementation: the boolean attributes
    // "async" and "required" are dropped before the text is split.
    completeTagText = completeTagText.replaceAll("async" + splitChar, "");
    completeTagText = completeTagText.replaceAll(splitChar + "required", "");

    String[] args = completeTagText.split(Character.toString(splitChar));
    // index 0 is the tagname

    for (int i = 1; i < args.length; i++) {
      String arg = args[i];
      if (arg.isEmpty()) {
        continue;
      }
      int equalsAt = arg.indexOf('=');
      if (equalsAt < 0) {
        // attribute without a value, e.g. "disabled"
        attributes.put(arg, "");
      } else {
        attributes.put(arg.substring(0, equalsAt), stripQuotes(arg.substring(equalsAt + 1)));
      }
    }
  }

  /** Removes one pair of matching single or double quotes around the value, if present. */
  private String stripQuotes(String value) {
    if (value.length() >= 2) {
      char first = value.charAt(0);
      char last = value.charAt(value.length() - 1);
      if ((first == '"' || first == '\'') && first == last) {
        return value.substring(1, value.length() - 1);
      }
    }
    return value;
  }

  /** True for tokens that came from a tag (opening, self-closing or closing). */
  private boolean isMarkup() {
    return type == Type.TAG || type == Type.CLOSINGTAG;
  }

  public String getFormattedName() {
    String r = name;
    if (isSelfClosing)
      r += "/";
    return r;
  }

  /** @return everything in the raw text after the tag name (empty for text and comments) */
  public String getAttributes() {
    return completeTagText.substring(getTagName().length(), completeTagText.length());
  }

  /**
   * @return for tags, the part before the first separator; for text and comments, the whole text.
   *         Never throws, even when the text consists only of separator characters.
   */
  public String getTagName() {
    if (!isMarkup()) {
      return completeTagText;
    }
    int split = completeTagText.indexOf(splitChar);
    return split < 0 ? completeTagText : completeTagText.substring(0, split);
  }

  public boolean isClosingTag() {
    return isMarkup() && getTagName().startsWith("/");
  }

  /** @return true when this is a tag whose name is a void element (compared case-insensitively) */
  public boolean isSelfclosing() {
    if (!isMarkup()) {
      return false;
    }
    String tagName = getTagName();
    for (String voidElement : selfClosingTags) {
      if (voidElement.equalsIgnoreCase(tagName)) {
        return true;
      }
    }
    return false;
  }

  @Override
  public String toString() {
    String r = name;
    if (isSelfClosing)
      r += "/";

    r += "|" + type.toString() + "|" + attributes;

    return r;
  }
}

Topic		Replies	Views
Getting HTML and its info Coding Questions	2	338	April 23, 2019
Parsing HTML with Jsoup Libraries	4	1313	March 25, 2019
Unable to Resolve XML URL (Even though it is valid) Coding Questions	10	1021	June 5, 2019
Web loadStrings Coding Questions	2	319	February 19, 2020
[SOLVED] Accessing html elements defined into .html file? Coding Questions	2	2253	August 21, 2018

Problem with reading a html file

Related topics