#!/sw/bin/python2.6
#
# Author: Adayapalam Appaiah Kumaraswamy
# Date: July 10, 2004
# File Name: cgilink.cgi
# Program Description: This program checks the links on a single HTML
#                      web page and displays whether they are working
#                      or unavailable.
#
# Python Version: Python 2.2.3
# Contact: ee03b091@ee.iitm.ac.in
# Categories: "CGI scripts", "Miscellaneous"

"""
CGI Link Checker 0.1
Author: Adayapalam Appaiah Kumaraswamy
Date: 10 Jul 2004

License:
This software is in the PUBLIC DOMAIN.  You can use it for whatever you
like, but the author will not be responsible for any direct or indirect
damages caused by the use of this software.

This is a simple program I wrote to check my links on a single HTML
page.  It does _not_ recurse into the linked pages and check those too.

Limitations: if the URL to be checked is a directory, like
http://foo.com/bar, you must specify it as http://foo.com/bar/, with
the trailing '/'.

Kumar
"""

# EDIT THIS VARIABLE
# if you have placed the accompanying files on the web server in a
# directory other than /cgilink/.  Make sure that it starts and ends
# with a slash (/) if it is on the same server.
# e.g. /mystuff/cgilinkfiles/
# or   http://myserver/mystuff/cgilinkfiles/
SERVER_INSTALL_DIR = "http://localhost/~ajh/scripts/cgilink/"

import cgi, cgitb
cgitb.enable()
import urllib2, sys, re, urllib, os.path, sgmllib, time
import HTMLParser

starttime = time.time()


class LinkParser(HTMLParser.HTMLParser):
    """Collect (URL, kind) pairs for <a>/<link>, <img> and <object> tags."""

    def __init__(self, base):
        HTMLParser.HTMLParser.__init__(self)
        self.base = base
        self.nolinks = 0
        self.noimgs = 0
        self.noobjs = 0
        self.files = []

    def handle_starttag(self, tag, attrs):
        if tag in ("link", "a"):
            for i in attrs:
                if i[0] == "href":
                    element = (urllib.basejoin(self.base, i[1]), "Link")
                    if element not in self.files:
                        self.files.append(element)
                    self.nolinks += 1
        elif tag == "img":
            for i in attrs:
                if i[0] == "src":
                    element = (urllib.basejoin(self.base, i[1]), "Image")
                    if element not in self.files:
                        self.files.append(element)
                    self.noimgs += 1
        elif tag == "object":
            for i in attrs:
                if i[0] == "data":
                    element = (urllib.basejoin(self.base, i[1]), "Object")
                    if element not in self.files:
                        self.files.append(element)
                    self.noobjs += 1
        elif tag == "base":
            # an explicit <base href="..."> overrides the page's own URL
            for i in attrs:
                if i[0] == "href":
                    self.base = i[1]

    def getlist(self):
        HTMLParser.HTMLParser.close(self)
        return self.files


def get_response(urlstr):
    """Return "" if urlstr can be opened, otherwise an error description."""
    try:
        conn = urllib2.urlopen(urlstr)
    except urllib2.HTTPError, herr:
        return "HTTP Error: " + str(herr)
    except urllib2.URLError, err:
        if err.reason[0] == -3:
            return "URL Error: " + str(err.reason[1])
        else:
            return "URL not supported: " + str(err.reason)
    conn.close()
    return ""
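
# A quick sketch of how the two pieces above can be exercised on their own
# (illustrative only; the URL and markup are made-up examples, and these
# lines are not executed as part of the CGI flow):
#
#   parser = LinkParser("http://example.com/dir/")
#   parser.feed('<a href="page.html">a page</a> <img src="../pic.png" />')
#   print parser.getlist()
#   # -> [('http://example.com/dir/page.html', 'Link'),
#   #     ('http://example.com/pic.png', 'Image')]
#   print get_response("http://example.com/") or "reachable"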

def print_all(data, base=None):
    good = 0
    bad = 0
    unsupp = 0
    linkparser = LinkParser(base)
    linkparser.feed(data)
    files = linkparser.getlist()
    print """<p>The following links were tested:</p>
<ul>"""
    # status icons (tick2, cross, ques) ship with the accompanying files;
    # the .png extension is assumed here
    imgstr = '<img src="' + SERVER_INSTALL_DIR + '%s.png" alt="%s" />'
    flushcount = 0
    for file in files:
        x = get_response(file[0])
        print "<li>"
        linkstr = '%s: <a class="%s" href="%s">%s</a>'
        real_link = file[0]
        real_ltext = file[0].replace("&", "&amp;")
        if not x:
            print linkstr % (file[1], "oklink", real_link, real_ltext)
            print imgstr % ("tick2", "OK")
            good += 1
        elif x[:1] == "H":
            print linkstr % (file[1], "badlink", real_link, real_ltext)
            print imgstr % ("cross", "Error")
            print x
            bad += 1
        elif x[:1] == "U":
            print linkstr % (file[1], "urlerror", real_link, real_ltext)
            print imgstr % ("ques", "Unsupported")
            print x
            unsupp += 1
        print "</li>"
        # flush every third link so the results appear progressively
        if flushcount == 2:
            flushcount = 0
            sys.stdout.flush()
        else:
            flushcount += 1
    print "</ul>"
    conclusion = "<p>Tested "
    bold = "<b>%d</b>"
    conclusion += bold % linkparser.noobjs + " objects, "
    conclusion += bold % linkparser.noimgs + " images and "
    conclusion += bold % linkparser.nolinks + " links/hyperlinks. "
    conclusion += "Of these, " + bold % good + " appear to be valid references, " \
                  + bold % unsupp + " were not available or unsupported and " \
                  + bold % bad + " appear to be bad/unavailable references.</p>"
    print conclusion
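
# Tip: the script can be exercised without a web server, because
# cgi.FieldStorage() falls back to the QUERY_STRING environment variable
# (or the first command-line argument) when no REQUEST_METHOD is set:
#
#   QUERY_STRING="url=http://www.python.org/" python cgilink.cgi
#
# (the URL above is only an example); the result page goes to stdout.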

form = cgi.FieldStorage()

print "Content-type: text/html"
print
print """<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
 "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html>
<head>"""
# stylesheet and icon from the accompanying files (filenames assumed here)
print '<link rel="stylesheet" type="text/css" href="%scgilink.css" />' % SERVER_INSTALL_DIR
print '<link rel="shortcut icon" href="%sfavicon.ico" />' % SERVER_INSTALL_DIR
print """<title>Link check result</title>
</head>
<body>"""

if not form.has_key('url'):
    print "<p>Error, enter a valid URL!</p>"
    print "</body></html>"
    sys.exit(0)
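
# In normal use the "url" parameter comes from the accompanying front-end
# page, i.e. a request such as (hypothetical host and path):
#
#   http://yourserver/cgi-bin/cgilink.cgi?url=http://www.python.org/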

heading = '<h1>Results: checking <a href="%s">%s</a></h1>'
urlname = form.getfirst('url')
# normalise the scheme before the URL is displayed or fetched
if not urlname.startswith("http://"):
    urlname = "http://" + urlname
print heading % (urlname, urlname)
print "<hr />"

try:
    url = urllib.urlopen(urlname)
except IOError, (ierr, strierr):
    print "<p>Error opening " + urlname + ": " + strierr[1] + "! \
Please check the URL you have entered.</p>"
    print "</body></html>"
    sys.exit(0)

data = url.read()
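
# url.geturl() is passed as the base below so that relative links resolve
# against the address urllib actually fetched, even if the request above
# was redirected.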
try:
    print_all(data, base=url.geturl())
except Exception, err:
    print "<p>There was an error parsing the page specified: " + str(err) + ". "
    print 'Please have it checked using the <a href="http://validator.w3.org/">W3C Validator</a> service '
    print 'and retry using this link checker.</p>'

stoptime = time.time()
print '<p>Test performed in ' + "%.2f" % (stoptime - starttime) + " seconds.</p>"

print '<hr />'
# validation and "powered by" badges; the badge images ship with the
# accompanying files (the filenames used here are assumed)
print """<p>
<a href="http://validator.w3.org/check/referer"><img
src="%svalid-xhtml10.png" alt="Valid XHTML 1.0!" /></a>""" % SERVER_INSTALL_DIR
print '<a href="http://jigsaw.w3.org/css-validator/check/referer"><img src="%svalid-css.png" alt="Valid CSS!" /></a>' % SERVER_INSTALL_DIR
print '<a href="http://www.python.org/"><img src="%spython-powered.png" alt="Python Powered" /></a>' % SERVER_INSTALL_DIR
print '</p>'
print "</body></html>"
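
# For reference, a minimal front-end form (hypothetical; the real one is
# among the accompanying files) could look like:
#
#   <form action="cgilink.cgi" method="get">
#     URL: <input type="text" name="url" size="40" />
#     <input type="submit" value="Check links" />
#   </form>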