import html.parser
import re

# Ordered (compiled pattern, replacement) pairs applied to every entry.
# Order matters: later rules rely on the whitespace normalisation and the
# quote/brace rewriting performed by earlier ones.
_ENTRY_RULES = tuple(
    (re.compile(pattern), replacement)
    for pattern, replacement in (
        # Collapse whitespace runs and tidy spacing around punctuation.
        (r'\s+', r' '),
        (r'(\S)\(', r'\1 ('),
        (r'\)(\S)', r') \1'),
        (r'\(\s+', r'('),
        (r'\s+\)', r')'),
        (r'\s+:', r':'),
        (r'\s+,', r','),
        # Put the signed amount first and bracket the Stability/Military word.
        (r'(Stability|Military) ([-+]\d+)', r'\2 [\1]'),
        (r'([-+]\d+) (Stability|Military)', r'\1 [\2]'),
        (r'(Progress Cards on Game Board) for Round \d+', r'\1'),
        # Drop a lone opening quote that has no matching closing quote.
        (r"(?:^|(?<=\W))'([-\w\s]+)(?:(?=[^-'\w\s])|$)", r'\1'),
        # Turn single-quoted names into {name}; the first two variants cope
        # with an apostrophe (plural / singular possessive) inside the name.
        (r"(?:^|(?<=\W))'([^']+(?:s' )[^']*)'(?:(?=\W)|$)", r'{\1}'),
        (r"(?:^|(?<=\W))'([^']*(?:\S's)[^']*)'(?:(?=\W)|$)", r'{\1}'),
        (r"(?:^|(?<=\W))'([^']+)'(?:(?=\W)|$)", r'{\1}'),
        # Remove a [bracketed] token that directly follows a {name}.
        (r'({[^}]*})\s*\[[^]]*\]', r'\1'),
    )
)

# Extra rules applied only while inside the header div; each one starts a new
# output line at a field boundary.
_HEADER_RULES = tuple(
    (re.compile(pattern), replacement)
    for pattern, replacement in (
        (r'(Nations Game ID=\d+), ', r'\1\n'),
        (r'\s*Game finished - ', r'\n'),
        (r'\s*(Players:)', r'\n\1'),
        (r',?\s*(round:)', r'\n\1'),
    )
)

# The word "cards" followed by one or more [bracketed] tokens.
_CARD_LIST_RE = re.compile(r'cards\s+((?:\[[^]]*\]\s*)+)')
# Reduces an image path to its file name without the extension.
_IMAGE_NAME_RE = re.compile(r'^.*/([^/]*)\.[^.]*')
_TOKEN_PREFIX_RE = re.compile(r'^Token_')


class page_parser(html.parser.HTMLParser):
    """Extract a plain-text log from an HTML page.

    Text and images are collected only inside the ``<div>`` elements whose
    ``id`` is ``nations-gameheader`` or ``nations-log``.  Every ``<div>`` or
    ``<li>`` boundary ends the current entry, which is cleaned up and written
    to the log as one line.
    """

    STATE_OTHER = 0   # outside the header/log divs
    STATE_HEADER = 1  # inside <div id="nations-gameheader">
    STATE_LOG = 2     # inside <div id="nations-log">

    def reset(self):
        """Reset the base parser and clear the collected state.

        ``HTMLParser.__init__`` calls ``reset()``, so ``state``, ``log`` and
        ``entry`` exist as soon as the instance is constructed.
        """
        super().reset()
        self.state = self.STATE_OTHER
        self.log = ''
        self.entry = ''

    @staticmethod
    def _format_card_list(card_list):
        """Convert ``[Card_Name]`` tokens into ``{Card Name}`` form."""
        card_list = card_list.replace('[', '{').replace(']', '}')
        card_list = card_list.replace('_', ' ')
        # "_s_" in a name stands for a singular possessive ("X's ").
        card_list = card_list.replace(' s ', "'s ")
        # A doubled underscore after a trailing "s" (two spaces by now) stands
        # for a plural possessive ("Xs' ").  Matching a single space here would
        # also hit the "'s " produced above and every word ending in "s",
        # yielding e.g. "Solomon's' Temple".
        # NOTE(review): the two-space marker is inferred from that breakage;
        # confirm against real card image names.
        plural_marker = 's' + ' ' * 2
        return card_list.replace(plural_marker, "s' ")

    def clean_up_entry(self):
        """Normalise ``self.entry`` in place before it is added to the log."""
        entry = self.entry
        for pattern, replacement in _ENTRY_RULES:
            entry = pattern.sub(replacement, entry)

        match = _CARD_LIST_RE.search(entry)
        if match is not None:
            # The word "cards" itself is dropped; only the names remain.
            entry = (entry[:match.start()]
                     + self._format_card_list(match.group(1))
                     + entry[match.end():])

        if self.state == self.STATE_HEADER:
            for pattern, replacement in _HEADER_RULES:
                entry = pattern.sub(replacement, entry)

        self.entry = entry

    def add_entry(self):
        """Clean the pending entry, append it to the log and start a new one.

        The log is kept newline-terminated; an empty entry adds nothing.
        """
        if self.log and not self.log.endswith('\n'):
            self.log += '\n'
        self.clean_up_entry()
        self.log += self.entry
        if self.log and not self.log.endswith('\n'):
            self.log += '\n'
        self.entry = ''

    def handle_starttag(self, tag, attrs):
        """Track div state and turn ``<img>`` tags into ``[Name]`` tokens."""
        if tag in ('div', 'li'):
            self.add_entry()

        if tag == 'div':
            for name, value in attrs:
                if name != 'id':
                    continue
                if value == 'nations-gameheader':
                    self.state = self.STATE_HEADER
                elif value == 'nations-log':
                    self.state = self.STATE_LOG

        if self.state == self.STATE_OTHER or tag != 'img':
            return

        for name, value in attrs:
            # HTMLParser reports a value-less attribute (<img src>) as None;
            # there is no image name to extract from it.
            if name != 'src' or value is None:
                continue
            image_name = _IMAGE_NAME_RE.sub(r'\1', value)
            if image_name.startswith('Disc_'):
                # Disc images are not shown in the log.
                return
            image_name = _TOKEN_PREFIX_RE.sub(r'', image_name)
            if image_name.startswith('Meeple'):
                image_name = 'Worker'
            if image_name == 'Heritage':
                image_name = 'Books'
            if self.entry:
                self.entry += ' '
            self.entry += '[' + image_name + ']'

    def handle_endtag(self, tag):
        """Finish the current entry; any closing ``</div>`` leaves the section."""
        if tag in ('div', 'li'):
            self.add_entry()
        if tag == 'div':
            self.state = self.STATE_OTHER

    def handle_data(self, data):
        """Append stripped text to the pending entry.

        Text is ignored outside the header/log divs and when it is blank,
        except that text starting with ``Game finished`` is always kept.
        """
        ignorable = self.state == self.STATE_OTHER or not data.strip()
        if ignorable and not data.startswith('Game finished'):
            return
        if self.entry:
            self.entry += ' '
        self.entry += data.strip()

    def parse_page(self, page: str) -> str:
        """Parse one complete HTML page and return its extracted log text.

        The parser is reset before and after, so an instance can be reused
        for several pages without data leaking between them.
        """
        self.reset()
        self.feed(page)
        # Force processing of anything the base parser is still buffering,
        # then flush an entry left pending by a missing closing tag.
        self.close()
        self.add_entry()
        log = self.log
        self.reset()
        return log


def parse_page(page: str) -> str:
    """Return the log text extracted from ``page`` using a fresh parser."""
    return page_parser().parse_page(page)