In my last post on parsing Flickr popular tags, I discussed converting the Flickr page HTML to XML (using Tidy on the command line) and then using the Java XML APIs to get a Document that I could parse.
However, I was able to find some sample code showing how to use Tidy (JTidy) to parse HTML directly and return a Document, so I have changed my code as follows:
import java.io.*;
import javax.xml.parsers.*;
import org.w3c.dom.*;
import org.xml.sax.*;
import org.w3c.tidy.Tidy;
/**
 * Extracts the popular-tag cloud from a saved Flickr "popular tags" HTML page.
 *
 * <p>The page is parsed straight from HTML into a DOM {@link Document} with JTidy,
 * then every {@code <a>} inside the {@code <p id="TagCloud">} element is printed as
 * {@code tag: fontSize}.
 *
 * <p>Much of the XML parsing code comes from
 * http://www.exampledepot.com/egs/org.w3c.dom/pkg.html
 */
public class FParser {

    /** File name of the saved Flickr page that {@link #startParse(String)} reads. */
    private static final String TAGS_PAGE = "2008-4-29_14-30.html";

    /**
     * Program entry point: parses the saved page found in the current directory.
     *
     * @param args command-line arguments (unused)
     */
    public static void main(String args[]) {
        startParse(".");
    }

    /**
     * Parses the saved Flickr page located in {@code dir} and prints one
     * {@code tag: weight} line per tag-cloud link to standard output.
     *
     * <p>If the page cannot be read or parsed, the error is reported on standard
     * error and nothing is printed (previously this ended in a
     * {@code NullPointerException}).
     *
     * @param dir directory containing the saved page; {@code null} means the
     *            file name is resolved against the current working directory
     */
    public static void startParse(String dir) {
        // File(parent, child) with a null parent behaves like File(child).
        Document doc = loadHtml(new File(dir, TAGS_PAGE));
        if (doc == null) {
            return;
        }
        printTagCloud(doc);
    }

    /** Parses an HTML file into a DOM document with JTidy; returns null on failure. */
    private static Document loadHtml(File htmlFile) {
        Tidy tidy = new Tidy(); // obtain a new Tidy instance
        tidy.setQuiet(true);
        tidy.setShowWarnings(false);

        InputStream in = null;
        try {
            in = new FileInputStream(htmlFile);
            // No output stream: we only want the DOM, not the tidied markup.
            return tidy.parseDOM(in, null);
        } catch (Exception e) {
            e.printStackTrace();
            return null;
        } finally {
            // Always release the file handle, whether or not parsing succeeded.
            if (in != null) {
                try {
                    in.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
        }
    }

    /** Prints "tag: weight" for every link inside each &lt;p id="TagCloud"&gt; element. */
    private static void printTagCloud(Document doc) {
        NodeList paragraphs = doc.getElementsByTagName("p");
        for (int i = 0; i < paragraphs.getLength(); i++) {
            Element paragraph = (Element) paragraphs.item(i);
            // We want the <p> whose id is "TagCloud"
            if (!"TagCloud".equals(paragraph.getAttribute("id"))) {
                continue;
            }
            // Each <a> element in the cloud represents one tag
            NodeList links = paragraph.getElementsByTagName("a");
            for (int j = 0; j < links.getLength(); j++) {
                Element link = (Element) links.item(j);
                // The tag name is what remains of the href once the path prefix
                // and all slashes are removed (literal replacement, not regex).
                String tag = link.getAttribute("href")
                        .replace("/photos/tags/", "")
                        .replace("/", "");
                // The weight is the pixel value of the inline font-size style.
                String weight = link.getAttribute("style")
                        .replace("font-size: ", "")
                        .replace("px;", "");
                System.out.println(tag + ": " + weight);
            }
        }
    }

    /**
     * Parses an XML file and returns a DOM document.
     *
     * @param filename   path of the XML file to parse
     * @param validating if {@code true}, the contents are validated against the DTD
     *                   specified in the file
     * @return the parsed document, or {@code null} if the file could not be read
     *         or parsed (the exception's stack trace is printed)
     */
    public static Document parseXmlFile(String filename, boolean validating) {
        try {
            // Create a builder factory
            DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
            factory.setValidating(validating);
            // Create the builder and parse the file
            return factory.newDocumentBuilder().parse(new File(filename));
        } catch (SAXException e) {
            // A parsing error occurred; the xml input is not valid
            e.printStackTrace();
        } catch (ParserConfigurationException e) {
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        }
        return null;
    }
}
0 Responses to “Flickr Popular Tags page parsing using Java – update”