September 15, 2002
@ 12:58 AM

using System.Xml.XPath;
using System.Xml;
using System;
using System.IO;
using System.Diagnostics;
using System.Net;
using System.Text;

/// <summary>
/// This class converts a Kuro5hin diary to RSS 0.91 or RSS 1.0 feeds.
/// The diary page is downloaded to a temp file, converted to XHTML with the external
/// "tidy" program, loaded as XML and then transformed into an RSS document.
/// </summary>

class K5Diary2RSS{

  private const string XhtmlNs   = "http://www.w3.org/1999/xhtml";
  private const string RdfNs     = "http://www.w3.org/1999/02/22-rdf-syntax-ns#";
  private const string RssNs     = "http://purl.org/rss/1.0/";
  private const string K5BaseUrl = "http://www.kuro5hin.org";

  // XPath locating the <font> element that holds each diary entry title.
  private const string TitleXPath = "//xhtml:font[@color='#000000']";

  // XPath (relative to a title node) locating the <font> element holding the entry body.
  private const string DescXPath =
    "./following::*[local-name() = 'font' and @size='2' and @color='#333333']";

  // Maximum time in milliseconds to wait for tidy to finish.
  private const int TidyTimeoutMs = 60000;


  ///<summary>
  ///Helper function for building one error message from an exception and all of its
  ///nested inner exceptions. The messages are appended in order, outermost first,
  ///with no separator between them.
  ///</summary>
  ///<param name="e">The exception (may be null)</param>
  ///<param name="errStr">The string to prepend to the exception messages (may be null)</param>
  ///<returns>errStr followed by the message of e and of each inner exception;
  ///errStr unchanged if e is null</returns>
  public static string PrintError(Exception e, string errStr){

    string message = errStr;

    for(Exception current = e; current != null; current = current.InnerException){
      message = message + current.Message;
    }

    return message;
  }

  /// <summary>
  /// Uses HTML Tidy available at http://tidy.sourceforge.net/ to convert the specified page
  /// to XHTML. The file is rewritten in place.
  /// </summary>
  /// <param name="htmlFile">Path of the HTML file to convert</param>
  public static void TidyPage(string htmlFile){

    Process tidyProc = new Process();

    try{
      tidyProc.StartInfo.FileName = "tidy";
      // -asxhtml: output XHTML, -i: indent, -m: modify the input file in place.
      // The file name is quoted so that paths containing spaces stay a single argument.
      tidyProc.StartInfo.Arguments = "-asxhtml -im \"" + htmlFile + "\"";
      tidyProc.StartInfo.UseShellExecute = false;

      tidyProc.Start();

      //wait no longer than 60 seconds for tidy to convert the page
      if(!tidyProc.WaitForExit(TidyTimeoutMs)){

        // Stop tidy so it is not still rewriting the file while the caller reads it.
        try{
          tidyProc.Kill();
        }catch(InvalidOperationException){
          // tidy exited between the timeout and the Kill call; nothing left to stop.
        }

        Console.WriteLine("WARNING: tidy did not finish within {0} ms and was stopped",
                          TidyTimeoutMs);
      }

    }finally{
      // release handles used by process, even if Start() failed
      tidyProc.Close();
    }
  }

  ///<summary>
  ///Retrieves a Kuro5hin diary page from the URL and writes it to the provided output file.
  ///An existing output file is overwritten. The response is read as UTF-8.
  ///</summary>
  ///<param name="url">URL to the Kuro5hin Diary</param>
  ///<param name="outfile">Output file to write the page to.</param>
  public static void GetPage(string url, string outfile){

    Console.WriteLine("Connecting to {0}", url);

    /* Fetch the K5 diary page from the WWW */
    HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url);

    // The using blocks release the response, the reader and the output file even
    // when reading or writing fails part way through.
    using(HttpWebResponse response = (HttpWebResponse) request.GetResponse())
    using(StreamReader readStream = new StreamReader(response.GetResponseStream(), Encoding.UTF8))
    // File.Create truncates an existing file, so no stale bytes survive past the new content.
    using(StreamWriter writeStream = new StreamWriter(File.Create(outfile))){

      char[] buffer = new char[256];
      int count;

      // Copy the response to the file 256 characters at a time.
      while((count = readStream.Read(buffer, 0, buffer.Length)) > 0){
        writeStream.Write(buffer, 0, count);
      }
    }
  }

  /// <summary>
  /// Selects the elements holding the diary entry titles from the K5 XHTML page.
  /// </summary>
  private static XmlNodeList SelectTitleNodes(XmlDocument doc){

    //create prefix<->namespace mappings
    XmlNamespaceManager nsMgr = new XmlNamespaceManager(doc.NameTable);
    nsMgr.AddNamespace("xhtml", XhtmlNs);

    return doc.SelectNodes(TitleXPath, nsMgr);
  }

  /// <summary>
  /// Returns the href attribute of the title node's parent, or null if there is none.
  /// </summary>
  private static string GetEntryHref(XmlNode titleNode){

    XmlNode parent = titleNode.ParentNode;

    if(parent == null || parent.Attributes == null){
      return null;
    }

    XmlAttribute href = parent.Attributes["href"];
    return (href == null) ? null : href.Value;
  }

  /// <summary>
  /// Returns the markup of the entry body that follows the title node, or an empty
  /// string if no body element is found.
  /// </summary>
  private static string GetEntryDescriptionXml(XmlNode titleNode){

    XmlNode body = titleNode.SelectSingleNode(DescXPath);
    return (body == null) ? "" : body.InnerXml;
  }

  /// <summary>
  /// Creates an element, optionally sets its text (escaped by the DOM), and appends it
  /// to the parent.
  /// </summary>
  private static XmlElement AppendElement(XmlNode parent, string prefix, string localName,
                                          string ns, string text){

    XmlElement child = parent.OwnerDocument.CreateElement(prefix, localName, ns);

    if(text != null){
      child.InnerText = text;
    }

    parent.AppendChild(child);
    return child;
  }

  /// <summary>
  /// Creates an attribute in the RDF namespace and attaches it to the element.
  /// </summary>
  private static void SetRdfAttribute(XmlElement element, string localName, string value){

    XmlAttribute attr = element.OwnerDocument.CreateAttribute("rdf", localName, RdfNs);
    attr.Value = value;
    element.Attributes.Append(attr);
  }

  /// <summary>
  /// Converts a K5 diary file as XHTML to RSS 0.91
  /// </summary>
  /// <param name="doc">The K5 Diary XHTML document</param>
  /// <param name="link">The link to the K5 diary.</param>
  /// <param name="title">The title of the diary</param>
  /// <returns>The RSS file as an XmlDocument object</returns>
  public static XmlDocument K5Xhtml2Rss091(XmlDocument doc, string link, string title){

    // The skeleton contains no caller-supplied text; title and link are filled in
    // through the DOM so that characters such as '&' and '<' are escaped.
    XmlDocument rss = new XmlDocument();
    rss.LoadXml("<rss version=\"0.91\">\n<channel><title/>\n" +
        "<link/>\n<description/>\n<language>en</language></channel>\n</rss>\n");

    XmlNode channel = rss.SelectSingleNode("/rss/channel");
    channel["title"].InnerText       = (title == null) ? "" : title;
    channel["link"].InnerText        = (link == null) ? "" : link;
    channel["description"].InnerText = title + " : The Kuro5hin Diary";

    //Grab all the titles then use those to create <item>
    foreach (XmlNode node in SelectTitleNodes(doc)){

      string href = GetEntryHref(node);

      // A matching <font> whose parent carries no href is not a diary entry title.
      if(href == null){
        continue;
      }

      string diaryLink = K5BaseUrl + href;

      XmlElement item = AppendElement(channel, "", "item", "", null);
      AppendElement(item, "", "title", "", node.InnerText);
      AppendElement(item, "", "link", "", diaryLink);

      // The entry body is copied as markup, as the original string-built version did.
      AppendElement(item, "", "description", "", null).InnerXml = GetEntryDescriptionXml(node);
    }

    return rss;
  }

  /// <summary>
  /// Converts a K5 diary file as XHTML to RSS 1.0
  /// </summary>
  /// <param name="doc">The K5 Diary XHTML document</param>
  /// <param name="link">The link to the K5 diary.</param>
  /// <param name="title">The title of the diary</param>
  /// <returns>The RSS file as an XmlDocument object</returns>
  public static XmlDocument K5Xhtml2Rss10(XmlDocument doc, string link, string title){

    // The skeleton contains no caller-supplied text; title and link are filled in
    // through the DOM so that characters such as '&' and '<' are escaped.
    XmlDocument rss = new XmlDocument();
    rss.LoadXml("<rdf:RDF xmlns:rdf=\"" + RdfNs + "\" " +
        "xmlns:rss=\"" + RssNs + "\">\n" +
        "<rss:channel rdf:about=\"http://www.25hoursaday.com/rss10.xml\">" +
        "<rss:title/>\n<rss:link/>\n<rss:description/>\n" +
        "<rss:items>\n<rdf:Seq/>\n</rss:items>\n</rss:channel>\n" +
        "</rdf:RDF>");

    XmlElement channel = rss.DocumentElement["channel", RssNs];
    XmlElement seq     = channel["items", RssNs]["Seq", RdfNs];

    channel["title", RssNs].InnerText       = (title == null) ? "" : title;
    channel["link", RssNs].InnerText        = (link == null) ? "" : link;
    channel["description", RssNs].InnerText = title + " : The Kuro5hin Diary";

    //Grab all the titles then use those to create <item>
    foreach (XmlNode node in SelectTitleNodes(doc)){

      string href = GetEntryHref(node);

      // A matching <font> whose parent carries no href is not a diary entry title.
      if(href == null){
        continue;
      }

      string diaryLink = K5BaseUrl + href;

      // Nodes are appended through the DOM, so seq and channel remain the live nodes
      // of the document for every entry.
      XmlElement li = rss.CreateElement("rdf", "li", RdfNs);
      SetRdfAttribute(li, "resource", diaryLink);
      seq.AppendChild(li);

      // NOTE(review): items are kept inside <rss:channel> as in the original code; the
      // RSS 1.0 specification appears to place them under <rdf:RDF> - confirm before changing.
      XmlElement item = AppendElement(channel, "rss", "item", RssNs, null);
      SetRdfAttribute(item, "about", diaryLink);
      AppendElement(item, "rss", "title", RssNs, node.InnerText);
      AppendElement(item, "rss", "link", RssNs, diaryLink);

      // The entry body is copied as markup, as the original string-built version did.
      AppendElement(item, "rss", "description", RssNs, null).InnerXml = GetEntryDescriptionXml(node);
    }

    return rss;
  }

  /// <summary>
  /// Deletes the temp file if it exists. Failures are reported on the console and
  /// otherwise ignored, because cleanup must not hide the outcome of the conversion.
  /// </summary>
  private static void DeleteTempFile(string fileName){

    try{
      if(File.Exists(fileName)){
        File.SetAttributes(fileName, FileAttributes.Normal);
        File.Delete(fileName);
      }
    }catch(IOException ioe){
      Console.WriteLine("WARNING: Could not delete temp file " + fileName + ": " + ioe.Message);
    }catch(UnauthorizedAccessException uae){
      Console.WriteLine("WARNING: Could not delete temp file " + fileName + ": " + uae.Message);
    }
  }

  /// <summary>
  /// Where the magic happens.
  /// </summary>
  /// <param name="args">Command line parameters: diary URL, RSS version ("0.91" or "1.0"),
  /// diary title and output file</param>
  public static void Main(string[] args){

    if(args.Length != 4){
      Console.WriteLine("Usage: K5Diary2RSS <K5-diary-url> <0.91 or 1.0> <title> <outfile>");
      return;
    }

    string rssVersion = args[1];
    bool isRss091 = rssVersion.Equals("0.91");

    // Reject an unsupported version before any download or temp file is made.
    if(!isRss091 && !rssVersion.Equals("1.0")){
      Console.WriteLine("\n\n*** VERSION " + rssVersion + " IS AN UNSUPPORTED RSS VERSION***");
      return;
    }

    //used for naming temp files; only the low 32 bits of the tick count are kept
    uint now = unchecked((uint) DateTime.Now.Ticks);
    string fileName = now + ".html";

    try{

      GetPage(args[0], fileName);

      Console.WriteLine("Diary page retrieved from the web and saved as temp file[{0}.html]", now);

      /* Convert diary page to XML [requires HTML Tidy] */
      TidyPage(fileName);

      //Load the file.
      XmlDocument doc = new XmlDocument();
      doc.Load(fileName);

      /*  Convert XHTML file to RSS */
      XmlDocument rss = isRss091 ? K5Xhtml2Rss091(doc, args[0], args[2])
                                 : K5Xhtml2Rss10(doc, args[0], args[2]);

      rss.Save(args[3]);

    }catch(XmlException xmle){
      Console.WriteLine("ERROR: XML Parse error occured because " + PrintError(xmle, null));
    }catch(FileNotFoundException fnfe){
      Console.WriteLine("ERROR: " + PrintError(fnfe, null));
    }catch(WebException we){
      Console.WriteLine("ERROR: Could not retrieve the diary page because " + PrintError(we, null));
    }catch(XPathException xe){
      Console.WriteLine("ERROR: The following error occured while querying the document: " + PrintError(xe, null));
    }catch(Exception e){
      Console.WriteLine("UNEXPECTED ERROR: " + PrintError(e, null));
      Console.WriteLine(e.StackTrace);
    }finally{
      /* Delete temp file on success and on every failure path */
      DeleteTempFile(fileName);
    }
  }

}
 

 

Categories: