import re from sgmllib import SGMLParser from urltypes import * from macros import * class ParseTag(object): """ Class representing a tag which is parsed by the HTML parser(s) """ def __init__(self, tag, tagdict, pattern=None, enabled=True): # Tag is the name of the tag (element) which will be parsed. # Tagdict is a dictionary which contains the attributes # of the tag which we are interested as keys and the type # of URL the value of the attribute will be saved as, as # the value. If there are more than one type of URL for this # attribute key, then the value is a list. # For example valid tagdicts are {'href': [URL_TYPE_ANY, URL_TYPE_ANCHOR] }, # {'codebase': URL_TYPE_JAPPLET_CODEBASE, 'code': URL_TYPE_JAPPLET'}. self.tag = tag self.tagdict = tagdict self.enabled = enabled self.pattern = pattern def disable(self): """ Disable parsing of this tag """ self.enabled = False def enable(self): """ Enable parsing of this tag """ self.enabled = True def isEnabled(self): """ Is this tag enabled ? """ return self.enabled def setPattern(self, pattern): self.pattern = pattern def __eq__(self, item): return self.tag.lower() == item.lower() class HarvestManSimpleParser(SGMLParser): """ An HTML/XHTML parser derived from SGMLParser """ # query_re = re.compile(r'[-.:_a-zA-Z0-9]*\?[-.:_a-zA-Z0-9]*=[-.a:_-zA-Z0-9]*', re.UNICODE) # A more lenient form of query regular expression query_re = re.compile(r'([^&=\?]*\?)([^&=\?]*=[^&=\?])*', re.UNICODE) skip_re = re.compile(r'(javascript:)|(mailto:)|(news:)') # Junk URLs obtained by parsing HTML of web-directory pages # i.e pages with title "Index of...". The filtering is done after # looking at the title of the page. index_page_re = re.compile(r'(\?[a-zA-Z0-9]=[a-zA-Z0-9])') features = [ ParseTag('a', {'href': URL_TYPE_ANY}), ParseTag('base', {'href' : URL_TYPE_BASE}), ParseTag('frame', {'src' : URL_TYPE_FRAME}), ParseTag('img', {'src': URL_TYPE_IMAGE}), ParseTag('form', {'action': URL_TYPE_FORM}), ParseTag('link', {'href': URL_TYPE_ANY}), ParseTag('body', {'background' : URL_TYPE_IMAGE}), ParseTag('script', {'src': URL_TYPE_JAVASCRIPT}), ParseTag('applet', {'codebase': URL_TYPE_JAPPLET_CODEBASE, 'code' : URL_TYPE_JAPPLET}), ParseTag('area', {'href': URL_TYPE_ANY}), ParseTag('meta', {'CONTENT': URL_TYPE_ANY, 'content': URL_TYPE_ANY}), ParseTag('embed', {'src': URL_TYPE_ANY}), ParseTag('object', {'data': URL_TYPE_ANY}), ParseTag('option', {'value': URL_TYPE_ANY}, enabled=False) ] handled_rel_types = ( URL_TYPE_STYLESHEET, ) def __init__(self): self.url = None self.links = [] self.linkpos = {} self.images = [] # Keywords self.keywords = [] # Description of page self.description = '' # Title of page self.title = '' self.title_flag = True # Fix for links self.base_href = False # Base url for above self.base = None # anchor links flag self._anchors = True # For META robots tag self.can_index = True self.can_follow = True # Current tag self._tag = '' SGMLParser.__init__(self) # Type self.typ = 0 def save_anchors(self, value): """ Set the save anchor links flag """ # Warning: If you set this to true, anchor links on # webpages will be saved as separate files. self._anchors = value def enable_feature(self, tag): """ Enable the given tag feature if it is disabled """ if tag in self.features: parsetag = self.features[self.features.index(tag)] parsetag.enable() def disable_feature(self, tag): """ Disable the given tag feature if it is enabled """ if tag in self.features: parsetag = self.features[self.features.index(tag)] parsetag.disable() def filter_link(self, link): """ Function to filter links, we decide here whether to handle certain kinds of links """ if not link: return LINK_EMPTY # ignore javascript links (From 1.2 version javascript # links of the form .js are fetched, but we still ignore # the actual javascript actions since there is no # javascript engine.) llink = link.lower() # Skip javascript, mailto, news and directory special tags. if self.skip_re.match(llink): return LINK_FILTERED # If this is a web-directory Index page, then check for # match with junk URLs of such index pages if self.title.lower().startswith('index of'): if self.index_page_re.match(llink): # print 'Filtering link',llink return LINK_FILTERED # Check if we're accepting query style URLs if not objects.config.getquerylinks and self.query_re.search(llink): debug('Query filtering link',link) return LINK_FILTERED return LINK_NOT_FILTERED def handle_anchor_links(self, link): """ Handle links of the form html#...""" # if anchor tag, then get rid of anchor #... # and only add the webpage link if not link: return LINK_EMPTY # Need to do this here also self.check_add_link(URL_TYPE_ANCHOR, link) # No point in getting #anchor sort of links # since typically they point to anchors in the # same page index = link.rfind('.html#') if index != -1: newhref = link[:(index + 5)] self.check_add_link(URL_TYPE_WEBPAGE, newhref) return ANCHOR_LINK_FOUND else: index = link.rfind('.htm#') if index != -1: newhref = link[:(index + 4)] self.check_add_link(URL_TYPE_WEBPAGE, newhref) return ANCHOR_LINK_FOUND return ANCHOR_LINK_NOT_FOUND def unknown_starttag(self, tag, attrs): """ This method gives you the tag in the html page along with its attributes as a list of tuples """ # Raise event for anybody interested in catching a tagparse event... if objects.eventmgr and objects.eventmgr.raise_event('beforetag', self.url, None, tag=tag, attrs=attrs)==False: # Don't parse this tag.. return # Set as current tag self._tag = tag # print self._tag, attrs if not attrs: return isBaseTag = not self.base and tag == 'base' # print 'Base=>',isBaseTag if tag in self.features: d = CaselessDict(attrs) parsetag = self.features[self.features.index(tag)] # Don't do anything if the feature is disabled if not parsetag.isEnabled(): return tagdict = parsetag.tagdict link = '' for key, typ in tagdict.items(): # If there is a tag # set self.base_href if isBaseTag and key=='href': self.base_href = True try: self.base = d[key] except: self.base_href = False continue # if the link already has a value, skip # (except for applet tags) if tag != 'applet': if link: continue if tag == 'link': try: # Fix - only reset typ if it is one # of the valid handled rel types. foundtyp = d['rel'].lower() if foundtyp in self.handled_rel_types: typ = getTypeClass(foundtyp) except KeyError: pass try: if tag == 'meta': # Handle meta tag for refresh foundtyp = d.get('http-equiv','').lower() if foundtyp.lower() == 'refresh': link = d.get(key,'') if not link: continue # This will be of the form of either # a time-gap (CONTENT="600") or a time-gap # with a URL (CONTENT="0; URL=") items = link.split(';') if len(items)==1: # Only a time-gap, skip it continue elif len(items)==2: # Second one should be a URL reqd = items[1] # print 'Reqd=>',reqd if (reqd.find('URL') != -1 or reqd.find('url') != -1) and reqd.find('=') != -1: link = reqd.split('=')[1].strip() # print 'Link=>',link else: continue else: # Handle robots meta tag name = d.get('name','').lower() if name=='robots': robots = d.get('content','').lower() # Split to ',' contents = [item.strip() for item in robots.split(',')] # Check for nofollow self.can_follow = not ('nofollow' in contents) # Check for noindex self.can_index = not ('noindex' in contents) elif name=='keywords': self.keywords = d.get('content','').split(',') # Trim the keywords list self.keywords = [word.lower().strip() for word in self.keywords] elif name=='description': self.description = d.get('content','').strip() else: continue elif tag != 'applet': link = d[key] else: link += d[key] if key == 'codebase': if link: if link[-1] != '/': link += '/' continue except KeyError: continue # see if this link is to be filtered if self.filter_link(link) != LINK_NOT_FILTERED: # print 'Filtered link',link continue # anchor links in a page should not be saved # index = link.find('#') # Make sure not to wrongly categorize '#' in query strings # as anchor URLs. if link.find('#') != -1 and not self.query_re.search(link): # print 'Is an anchor link',link self.handle_anchor_links(link) else: # append to private list of links self.check_add_link(typ, link) def unknown_endtag(self, tag): self._tag = '' if tag=='title': self.title_flag = False self.title = self.title.strip() def handle_data(self, data): if self._tag.lower()=='title' and self.title_flag: self.title += data def check_add_link(self, typ, link): """ To avoid adding duplicate links """ f = False if typ == 'image': if not (typ, link) in self.images: self.images.append((typ, link)) elif not (typ, link) in self.links: # print 'Adding link ', link, typ pos = self.getpos() self.links.append((typ, link)) self.linkpos[(typ,link)] = (pos[0],pos[1]) def add_tag_info(self, taginfo): """ Add new tag information to this object. This can be used to change the behavior of this class at runtime by adding new tags """ # The taginfo object should be a dictionary # of the form { tagtype : (elementname, elementype) } # egs: { 'body' : ('background', 'img) } if type(taginfo) != dict: raise AttributeError, "Attribute type mismatch, taginfo should be a dictionary!" # get the key of the dictionary key = (taginfo.keys())[0] if len(taginfo[key]) != 2: raise ValueError, 'Value mismatch, size of tag tuple should be 2' # get the value tuple tagelname, tageltype = taginfo[key] # see if this is an already existing tagtype if key in self.handled.keys: _values = self.handled[key] f=0 for index in xrange(len(_values)): # if the elementname is also # the same, just replace it. v = _values[index] elname, eltype = v if elname == tagelname: f=1 _values[index] = (tagelname, tageltype) break # new element, add it to list if f==0: _values.append((tagelname, tageltype)) return else: # new key, directly modify dictionary elements = [] elements.append((tagelname, tageltype)) self.handled[key] = elements def reset(self): SGMLParser.reset(self) self.url = None self.base = None self.links = [] self.images = [] self.base_href = False self.base_url = '' self.can_index = True self.can_follow = True self.title = '' self.title_flag = True self.description = '' self.keywords = [] def base_url_defined(self): """ Return whether this url had a base url of the form defined """ return self.base_href def get_base_url(self): return self.base def set_url(self, url): """ Set the URL whose data is about to be parsed """ self.url = url class HarvestManSGMLOpParser(HarvestManSimpleParser): """ A parser based on effbot's sgmlop """ def __init__(self): # This module should be built already! import sgmlop self.parser = sgmlop.SGMLParser() self.parser.register(self) HarvestManSimpleParser.__init__(self) # Type self.typ = 1 def finish_starttag(self, tag, attrs): self.unknown_starttag(tag, attrs) def finish_endtag(self, tag): self.unknown_endtag(tag) def feed(self, data): self.parser.feed(data) class HarvestManCSSParser(object): """ Class to parse stylesheets and extract URLs """ # Regexp to parse stylesheet imports importcss1 = re.compile(r'(\@import\s+\"?)(?!url)([\w.-:/]+)(\"?)', re.MULTILINE|re.LOCALE|re.UNICODE) importcss2 = re.compile(r'(\@import\s+url\(\"?)([\w.-:/]+)(\"?\))', re.MULTILINE|re.LOCALE|re.UNICODE) # Regexp to parse URLs inside CSS files cssurl = re.compile(r'(url\()([^\)]+)(\))', re.LOCALE|re.UNICODE) def __init__(self): # Any imported stylesheet URLs self.csslinks = [] # All URLs including above self.links = [] def feed(self, data): self._parse(data) def _parse(self, data): """ Parse stylesheet data and extract imported css links, if any """ # Return is a list of imported css links. # This subroutine uses the specification mentioned at # http://www.w3.org/TR/REC-CSS2/cascade.html#at-import # for doing stylesheet imports. # This takes care of @import "style.css" and # @import url("style.css") and url(...) syntax. # Media types specified if any, are ignored. # Matches for @import "style.css" l1 = self.importcss1.findall(data) # Matches for @import url("style.css") l2 = self.importcss2.findall(data) # Matches for url(...) l3 = self.cssurl.findall(data) for item in (l1+l2): if not item: continue url = item[1].replace("'",'').replace('"','') self.csslinks.append(url) self.links.append(url) for item in l3: if not item: continue url = item[1].replace("'",'').replace('"','') if url not in self.links: self.links.append(url)