import requests
import json
import wikitextparser as wtp
from lxml import etree
from lxml import html


# References:
# https://gist.github.com/scionoftech/0f35d5e231be2cf46823d774023268b6
# https://www.mediawiki.org/wiki/API:Main_page
class MyWikiAPI:
    """Thin client for the English Wikipedia Action API.

    Provides helpers to locate and fetch page sections, extract table rows
    from wikitext, and scrape the monthly Featured Article candidate logs.
    All HTTP calls identify themselves via ``user_agent_headers`` as
    requested by the Wikimedia API usage policy.
    """

    WIKI_BASE_URL = "https://en.wikipedia.org/w/api.php"
    WIKI_FEATURED_URL = "https://en.wikipedia.org/wiki/Wikipedia:Featured_article_candidates/Featured_log"

    # Wikimedia asks API clients to send a descriptive User-Agent.
    # (HTTP header names are case-insensitive, so "user-Agent" is accepted.)
    user_agent_headers = {"user-Agent": "AgentsCourseAssignment/1.0 (https://huggingface.co/spaces/krzsam/Agents-Course-Assignment)"}

    def __init__(self):
        print("***KS*** Initializing Wiki API")

    def __find_section_on_page__(self, page_title, section_title):
        """Return the API section index of the first section on *page_title*
        whose heading text equals *section_title*, or None if not found.

        Fix: the original loop had no ``break``, so with duplicate headings
        it returned the LAST match; it also raised KeyError when the API
        returned an error payload with no 'parse' key.
        """
        response = requests.get(
            self.WIKI_BASE_URL,
            headers=self.user_agent_headers,
            params={
                'action': 'parse',
                'format': 'json',
                'page': page_title,
                'prop': 'sections',
            }).json()
        # Error responses carry an 'error' key instead of 'parse' — treat as no sections.
        sections = response.get('parse', {}).get('sections', [])
        for sec in sections:
            if sec["line"] == section_title:
                # First match wins; stop scanning.
                return sec["index"]
        return None

    def __get_page_section_content__(self, page_title, section_id, format="wikitext"):
        """Fetch one section of *page_title* in the given render *format*.

        *format* (kept despite shadowing the builtin — it is part of the
        method's keyword interface) is an API 'prop' value such as
        "wikitext" or "text"; the rendered content lives under the
        response key of the same name, in its "*" field.
        """
        response = requests.get(
            self.WIKI_BASE_URL,
            headers=self.user_agent_headers,
            params={
                'action': 'parse',
                'format': 'json',
                'page': page_title,
                'prop': format,
                'section': section_id
            }).json()
        return response["parse"][format]["*"]

    def __get_featured_log__(self, month, year):
        """Return the raw HTML of the Featured_log page for *month*/*year*
        (e.g. month="January", year=2024 -> .../Featured_log/January_2024)."""
        featured_url = f"{self.WIKI_FEATURED_URL}/{month}_{year}"
        print(f"Getting content for: {featured_url}")
        response = requests.get(
            featured_url,
            headers=self.user_agent_headers,
        ).text
        return response

    def __process_featured_log__(self, html_content):
        """Extract one text blob per featured-article entry from the log HTML.

        Each entry is an <h3>-level heading div; the entry text is the
        concatenated text of the two elements that follow it.

        Fix: the original called ``getnext()`` without None checks and
        crashed with AttributeError when a heading was the last element.
        """
        tree = html.fromstring(html_content)
        # Class used by MediaWiki's current skin for h3 heading wrappers.
        article_heading = "mw-heading mw-heading3"
        elements = tree.xpath(f"//div[@class='{article_heading}']")
        element_texts = []
        for element in elements:
            n1 = element.getnext()
            if n1 is None:
                continue  # heading with no following content — nothing to collect
            n1_text = " ".join(n1.itertext())
            n2 = n1.getnext()
            # Second sibling is optional; fall back to the first alone.
            n2_text = " ".join(n2.itertext()) if n2 is not None else ""
            element_text = f"{n1_text} {n2_text}"
            element_texts.append(element_text)
        return element_texts

    def __is_int__(self, s):
        """Return True if *s* parses as a base-10 int, else False."""
        try:
            int(s)
        except ValueError:
            return False
        else:
            return True

    def get_category(self, category, year):
        """Print the pages for *category*/*year*.

        NOTE(review): ``__get_category_pages__`` is not defined anywhere in
        this file — calling this method raises AttributeError. Either the
        helper was lost or lives in an unseen part of the project; confirm
        before relying on this method.
        """
        ret = self.__get_category_pages__(category, year)
        print(f"Got category: {category}\n{ret}")

    def get_page_section(self, page_title, section_title):
        """Return the wikitext of the named section of *page_title*,
        or "" when the section does not exist."""
        section_id = self.__find_section_on_page__(page_title, section_title)
        _ret = ""
        if section_id is not None:
            _ret = self.__get_page_section_content__(page_title, section_id)
        # print(f"***KS*** Got page section for {page_title} / {section_title} \n{_ret}")
        return _ret

    def filter_section_and_table(self, section_content, sub_section_name, year_start, year_end):
        """Parse *section_content* wikitext, find the first subsection whose
        title contains *sub_section_name*, and return the rows of its first
        table whose first cell is a year in [year_start, year_end].

        Fix: the original loop had no ``break``, so it kept the LAST
        matching subsection and printed once per match.
        """
        parsed = wtp.parse(section_content)
        sections = parsed.sections
        section_found = None
        for sec in sections:
            if sec.title is not None and sec.title.find(sub_section_name) >= 0:
                section_found = sec
                print(f"Found matching subsection: {section_found.title}")
                break  # first matching subsection wins
        rows_collected = []
        if section_found is not None and section_found.tables is not None and len(section_found.tables) > 0:
            table_data = section_found.tables[0].data()
            for row in table_data:
                # Keep only data rows whose first column is a year in range
                # (skips header rows, whose first cell is not an int).
                if self.__is_int__(row[0]) and year_start <= int(row[0]) <= year_end:
                    rows_collected.append(row)
        return rows_collected

    def get_featured_articles(self, month, year):
        """Return the list of featured-article entry texts for *month*/*year*.

        See https://en.wikipedia.org/wiki/Wikipedia:Featured_article_candidates/Featured_log
        — all articles with all discussions about them.
        """
        full_html = self.__get_featured_log__(month, year)
        ret = self.__process_featured_log__(full_html)
        # print(f"Featured for {month} {year}:\n{ret}")
        return ret