Flickr Popular Tags page parsing using Java – update

In my last post on parsing Flickr popular tags, I discussed converting the Flickr page HTML to XML (using Tidy on the command line) and then using the Java XML APIs to get a Document that I could parse.

But I was able to find some sample code showing how to use Tidy (jTidy) to parse HTML and return a Document directly, so I have changed my code as follows…

import java.io.*;
import javax.xml.parsers.*;
import org.w3c.dom.*;
import org.xml.sax.*;
import org.w3c.tidy.Tidy;

public class FParser {

	// Name of the saved Flickr "popular tags" HTML page that startParse reads.
	private static final String PAGE_FILE = "2008-4-29_14-30.html";

	// Main program
	// Much of the XML parsing code comes from
	// http://www.exampledepot.com/egs/org.w3c.dom/pkg.html
	public static void main(String args[]) {
		// Parse the saved page located in the current working directory
		startParse(".");
	}

	// Parses the saved Flickr popular-tags page found in directory `dir` and
	// prints one "tag: font-size" line to stdout for every <a> inside the
	// <p id="TagCloud"> element.
	// A null or empty `dir` means the current working directory.
	// If the page cannot be read, the error is reported and nothing is printed.
	public static void startParse(String dir) {
		// new File("", child) would resolve against the system default
		// directory (the root on Unix), so treat an empty dir like null.
		File page = (dir == null || dir.length() == 0)
				? new File(PAGE_FILE)
				: new File(dir, PAGE_FILE);

		Document doc = loadHtml(page);
		if (doc == null) {
			// Loading failed and was already reported; nothing to walk.
			return;
		}
		printTagCloud(doc);
	}

	// Runs jTidy over `page` and returns the resulting DOM, or null when the
	// file cannot be opened or read (the stack trace is printed in that case).
	private static Document loadHtml(File page) {
		Tidy tidy = new Tidy(); // obtain a new Tidy instance
		tidy.setQuiet(true);
		tidy.setShowWarnings(false);

		InputStream in = null;
		try {
			in = new FileInputStream(page);
			// Passing null as the output stream: only the DOM is wanted
			return tidy.parseDOM(in, null);
		} catch (IOException e) {
			e.printStackTrace();
			return null;
		} finally {
			// Always release the file handle, whether or not parsing succeeded
			if (in != null) {
				try {
					in.close();
				} catch (IOException e) {
					e.printStackTrace();
				}
			}
		}
	}

	// Finds every <p> whose id attribute is "TagCloud" and prints, for each <a>
	// nested inside it, the tag name and its font size as "tag: size".
	private static void printTagCloud(Document doc) {
		// The DOM is searched by tag name, then filtered on the id attribute
		NodeList paragraphs = doc.getElementsByTagName("p");
		for (int i = 0; i < paragraphs.getLength(); i++) {
			Element paragraph = (Element) paragraphs.item(i);
			if (!"TagCloud".equals(paragraph.getAttribute("id"))) {
				continue;
			}

			// Get each of the tag cloud's <a> elements
			NodeList links = paragraph.getElementsByTagName("a");
			for (int j = 0; j < links.getLength(); j++) {
				Element link = (Element) links.item(j);
				// Get the tag by stripping the path prefix and any remaining
				// slashes from the href attribute
				String sHref = link.getAttribute("href")
						.replace("/photos/tags/", "")
						.replace("/", "");
				// Get the weight by stripping the CSS property name and unit
				// from the style attribute, leaving the font size number
				String sStyle = link.getAttribute("style")
						.replace("font-size: ", "")
						.replace("px;", "");
				System.out.println(sHref + ": " + sStyle);
			}
		}
	}

	// Parses an XML file and returns a DOM document.
	// If validating is true, the contents is validated against the DTD
	// specified in the file.
	// Returns null (after printing the stack trace) when the file cannot be
	// read, is not well-formed XML, or the parser cannot be configured.
	public static Document parseXmlFile(String filename, boolean validating) {
		try {
			// Create a builder factory
			DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
			factory.setValidating(validating);

			// Create the builder and parse the file
			return factory.newDocumentBuilder().parse(new File(filename));
		} catch (SAXException e) {
			// A parsing error occurred; the xml input is not valid
			e.printStackTrace();
		} catch (ParserConfigurationException e) {
			// The factory could not produce a builder with these settings
			e.printStackTrace();
		} catch (IOException e) {
			// The file could not be read
			e.printStackTrace();
		}
		return null;
	}
}

0 Responses to “Flickr Popular Tags page parsing using Java – update”



  1. No Comments Yet

Leave a Reply