#! /usr/bin/python

from sgmllib import SGMLParser
import sys
import re

class BaseHTMLProcessor(SGMLParser):
    """SGML parser that collects descriptive metadata about an HTML page.

    While a document is fed to the parser, the ``start_*``/``end_*`` tag
    handlers fill ``self.meta`` with:

    * ``favicon``     - href of the ``<link rel="icon">`` tag, as written
    * ``title``       - text found inside ``<title>``
    * ``keywords``    - content of ``<meta name="keywords">``
    * ``language``    - first two characters of the declared language
    * ``author``      - content of ``<meta name="author">``
    * ``description`` - list of text fragments: the ``<meta>`` descriptions,
      or text taken from the first two paragraphs when the page does not
      declare one before them

    Handlers receive ``attrs`` as a list of ``(name, value)`` pairs.
    NOTE(review): attribute names are assumed to arrive lower-cased, as the
    sgmllib documentation describes.
    """

    def reset(self):
        """Reset the parser and clear all collected metadata.

        There is no ``__init__`` here, so this is the only place the
        instance state is created; SGMLParser is expected to call it from
        its constructor.
        """
        SGMLParser.reset(self)
        self.meta = {
            "favicon": "",
            "title": "",
            "keywords": "",
            "language": "",
            "author": "",
            "description": [],
        }
        self.title = 0      # 1 while inside <title>
        self.in_p = 0       # 1 while inside <p>
        self.nb_p = 0       # number of <p> tags opened so far
        self.meta_desc = 0  # 1 once a <meta> description has been recorded

    def start_html(self, attrs):
        """Record the page language from the ``lang``/``xml:lang`` attribute."""
        # The previous test (k == 'xml:lang' or 'lang') was always true, so
        # any attribute value of <html> could end up as the language.
        langs = [v for k, v in attrs if k in ('xml:lang', 'lang')]
        if langs:
            self.meta["language"] = langs.pop()[:2]

    def start_meta(self, attrs):
        """Record description, author, keywords and language ``<meta>`` tags.

        The tag is identified by its ``name`` attribute, falling back to
        ``http-equiv`` (used for ``Content-Language``); the comparison is
        case-insensitive. Tags without a ``content`` attribute are ignored.
        """
        attributes = dict(attrs)
        content = attributes.get('content')
        if content is None:
            return
        name = (attributes.get('name')
                or attributes.get('http-equiv')
                or '').lower()
        if name == 'description':
            self.meta["description"].append(content)
            # Stops handle_data() from adding paragraph text afterwards.
            self.meta_desc = 1
        elif name == 'author':
            self.meta["author"] = content
        elif name == 'keywords':
            self.meta["keywords"] = content
        elif name == 'content-language':
            self.meta["language"] = content[:2]

    def start_title(self, attrs):
        """Mark that following character data belongs to the title."""
        self.title = 1

    def end_title(self):
        """Mark the end of the title."""
        self.title = 0

    def start_link(self, attrs):
        """Record the favicon href from ``<link rel="[shortcut ]icon">``."""
        attributes = dict(attrs)
        # rel is a space-separated token list; "shortcut icon" and "icon"
        # both contain the "icon" token.
        rel_tokens = (attributes.get('rel') or '').lower().split()
        href = attributes.get('href')
        if 'icon' in rel_tokens and href:
            self.meta["favicon"] = href

    def start_p(self, attrs):
        """Enter a paragraph and count it."""
        self.in_p = 1
        self.nb_p += 1

    def end_p(self):
        """Leave the current paragraph."""
        self.in_p = 0

    def handle_data(self, text):
        """Collect title text and, as a fallback, early paragraph text."""
        if self.title:
            # Character data may be delivered in several chunks; append so
            # earlier chunks of the title are not lost.
            self.meta["title"] += text
        if (self.in_p and self.nb_p < 3 and not self.meta_desc
                and re.search(r'\w', text)):
            # Only the first two paragraphs are used, and only while no
            # <meta> description has been seen.
            self.meta["description"].append(text)

    def output(self):
        """Return the metadata dictionary collected so far.

        ``description`` is left as a list of text fragments; callers join
        them when rendering.
        """
        return self.meta

def getInfos(url):
     print ">>> %s" % url
     print "\tDownloading ... ",
     import urllib
     sock = urllib.urlopen(url)
     print "OK!"
     htmlSource = sock.read()
     out = {"URL":"", "size":"" }
     if sock.info().__contains__('Content-length'):
        out["size"] = humanize(sock.info().get('Content-length'))
     out["URL"] = sock.geturl()
     sock.close()
     print "\tParsing ... ",
     parser = BaseHTMLProcessor()
     parser.feed(htmlSource)
     parser.close()
     print "OK!\n"
     out =  dict(out.items() + parser.output().items())
     if out["favicon"][5:7] != "//":
       if out["favicon"][:1] == "/" and out["URL"][-1:] == "/":
         out["favicon"] = "%s%s" % (out["URL"][0:-1], out["favicon"])
       elif out["favicon"][:1] == "/":
         out["favicon"] = "%s%s" % (out["URL"], out["favicon"])
       elif out["favicon"][:2] == ".." and out["URL"][-1:] != "/":
         out["favicon"] = "%s/%s" % (out["URL"], out["favicon"])
       elif out["favicon"][:2] == ".." and out["URL"][-1:] == "/":
         out["favicon"] = "%s%s" % (out["URL"], out["favicon"])
       else:
         out["favicon"] = "%s/%s" % (out["URL"], out["favicon"])
     return out

def humanize(size):
    """Return a byte count as a short string such as ``"12k"`` or ``"3m"``.

    ``size`` may be an int or a numeric string; ``int(size)`` raises
    ``ValueError`` for anything else. The value is divided by 1024 (rounding
    down) until it drops below 1024, and the matching suffix from
    ``k``/``m``/``g``/``t`` is appended. Values too large for ``t`` are still
    expressed in ``t`` instead of being reported as 0.

    Sizes below 1024 bytes come out as ``"0k"``. Negative sizes return the
    integer ``0``, as before.
    """
    suffixes = ('k', 'm', 'g', 't')
    size = int(size)

    if size < 0:
        return 0

    for suffix in suffixes:
        # Explicit floor division: the result must not depend on the
        # interpreter's meaning of "/".
        size //= 1024
        if size < 1024:
            break

    # When the loop runs to completion, suffix is the largest unit.
    return "%s%s" % (size, suffix)


def genHTML(infos):
    """Render one page-information dictionary as an HTML ``<li>`` element.

    ``infos`` must provide the keys ``language``, ``favicon``, ``URL``,
    ``title``, ``description`` (a string or a sequence of strings),
    ``size`` and ``author``. Keywords are not rendered.

    All values are HTML-escaped before insertion because they originate
    from downloaded pages. The description is cut to 200 characters
    (before escaping) and followed by `` ... `` when longer.

    Returns the markup as a string.
    """
    from xml.sax.saxutils import escape

    def esc(value):
        # Attribute values below are single-quoted, so quotes must be
        # escaped in addition to &, < and >.
        return escape(value, {"'": "&#39;", '"': "&quot;"})

    image_suffixes = ('ico', 'gif', 'png', 'jpg', 'jpeg')
    url = esc(infos["URL"])
    description = "".join(infos["description"])

    parts = ["<li"]
    if infos["language"]:
        parts.append(" lang='%s'" % esc(infos["language"]))
    parts.append(">")

    # endswith() with a tuple also matches the four-letter "jpeg", which a
    # three-character slice never could; lower() accepts e.g. ".ICO".
    if infos["favicon"].lower().endswith(image_suffixes):
        parts.append("<img height='16px' width='16px' src='%s' />"
                     % esc(infos["favicon"]))
    else:
        parts.append("<img src='/pix/nofavicon.png' />")

    parts.append("<a href='%s' target='_blank'><b>%s</b></a><br />"
                 % (url, esc(infos["title"])))

    if len(description) > 200:
        # Truncate first so an escaped entity is never cut in half.
        parts.append("%s ... <br />" % esc(description[:200]))
    elif description:
        parts.append("%s<br />" % esc(description))

    parts.append(url)
    if infos["size"]:
        parts.append(" - %s" % esc(infos["size"]))
    parts.append("<br />\n\n")

    if infos["author"]:
        parts.append("By %s." % esc(infos["author"]))

    parts.append("</li>")
    return "".join(parts)


if __name__ == '__main__':
    # Read one URL per line from standard input, write an HTML list
    # describing each page to links.html, then open it in the browser.
    import webbrowser

    outfile = "links.html"
    f = open(outfile, 'wb')
    try:
        f.write("<ul>")
        for arg in sys.stdin.readlines():
            arg = arg.strip()
            if not arg:
                # Blank lines are not URLs.
                continue
            try:
                infos = getInfos(arg)
            except IOError:
                # A page that cannot be fetched should not abort the whole
                # run; report it and carry on with the next URL.
                sys.stderr.write("Skipping %s: %s\n"
                                 % (arg, sys.exc_info()[1]))
                continue
            if infos:
                f.write(genHTML(infos))
        f.write("</ul>")
    finally:
        # Always flush and close the output, even after an error.
        f.close()
    webbrowser.open_new(outfile)
