| import requests |
| import json |
| import wikitextparser as wtp |
| from lxml import etree |
| from lxml import html |
|
|
| |
| |
|
|
class MyWikiAPI:
    """Thin client for the English Wikipedia action API plus a scraper for the
    Featured-article-candidates log pages.

    All HTTP calls go through ``requests`` with a custom User-Agent header, as
    required by the Wikimedia API etiquette. Double-underscore method names are
    kept for backward compatibility (trailing underscores prevent name
    mangling, so they remain externally callable).
    """

    # Action API endpoint used for structured (JSON) queries.
    WIKI_BASE_URL = "https://en.wikipedia.org/w/api.php"
    # Base page of the monthly featured-article logs; month/year is appended.
    WIKI_FEATURED_URL = "https://en.wikipedia.org/wiki/Wikipedia:Featured_article_candidates/Featured_log"
    # Header names are case-insensitive in HTTP (and in requests), so the
    # lowercase "user-Agent" key works identically to "User-Agent".
    user_agent_headers = {"user-Agent":
                          "AgentsCourseAssignment/1.0 (https://huggingface.co/spaces/krzsam/Agents-Course-Assignment)"}

    def __init__(self):
        # No state to initialize; the print is a startup trace marker.
        print("***KS*** Initializing Wiki API")

    def __find_section_on_page__(self, page_title, section_title):
        """Return the section index (a string) of the section whose heading
        text equals *section_title* on *page_title*, or None if not found.

        If several sections share the title, the LAST match wins (original
        behavior preserved). Returns None when the API reports an error
        (e.g. the page does not exist) instead of raising KeyError.
        """
        response = requests.get(
            self.WIKI_BASE_URL,
            headers=self.user_agent_headers,
            params={
                'action': 'parse',
                'format': 'json',
                'page': page_title,
                'prop': 'sections',
            }).json()
        # On error the API returns {"error": ...} with no "parse" key.
        sections = response.get('parse', {}).get('sections', [])
        section_ix_found = None
        for sec in sections:
            if sec["line"] == section_title:
                section_ix_found = sec["index"]
        return section_ix_found

    def __get_page_section_content__(self, page_title, section_id, format="wikitext"):
        """Fetch one section of a page via action=parse.

        *format* is the API "prop" to request ("wikitext" or "text"); the name
        shadows the builtin but is kept for signature compatibility.
        Returns the section content string, or "" on an API error.
        """
        response = requests.get(
            self.WIKI_BASE_URL,
            headers=self.user_agent_headers,
            params={
                'action': 'parse',
                'format': 'json',
                'page': page_title,
                'prop': format,
                'section': section_id
            }).json()
        # "*" holds the payload in the legacy JSON format of action=parse.
        return response.get("parse", {}).get(format, {}).get("*", "")

    def __get_featured_log__(self, month, year):
        """Download the raw HTML of the featured log for *month*/*year*
        (e.g. "January", 2024 -> .../Featured_log/January_2024)."""
        featured_url = f"{self.WIKI_FEATURED_URL}/{month}_{year}"
        print(f"Getting content for: {featured_url}")
        response = requests.get(
            featured_url,
            headers=self.user_agent_headers,
        ).text
        return response

    def __process_featured_log__(self, html_content):
        """Extract one text blob per article entry from a featured-log page.

        Each entry is a <div class="mw-heading mw-heading3"> heading; the text
        of the two elements that follow it is concatenated. Headings whose
        following siblings are missing (end of document) are skipped instead
        of crashing.
        """
        tree = html.fromstring(html_content)
        article_heading = "mw-heading mw-heading3"
        elements = tree.xpath(f"//div[@class='{article_heading}']")
        element_texts = []
        for element in elements:
            n1 = element.getnext()
            if n1 is None:
                continue  # heading with no following content
            n1_text = " ".join(n1.itertext())

            n2 = n1.getnext()
            if n2 is None:
                element_texts.append(n1_text)
                continue
            n2_text = " ".join(n2.itertext())
            element_texts.append(f"{n1_text} {n2_text}")

        return element_texts

    def __is_int__(self, s):
        """Return True if *s* can be converted with int(), else False.

        Also catches TypeError so non-string inputs (e.g. None, lists coming
        out of a parsed table cell) yield False instead of raising.
        """
        try:
            int(s)
        except (ValueError, TypeError):
            return False
        else:
            return True

    def get_category(self, category, year):
        """Fetch and print the pages of *category* for *year*, returning them.

        NOTE(review): __get_category_pages__ is not defined in this class —
        calling this method raises AttributeError unless the helper is added
        (or provided by a subclass). Confirm where it is meant to live.
        """
        ret = self.__get_category_pages__(category, year)
        print(f"Got category: {category}\n{ret}")
        return ret  # was computed but never returned

    def get_page_section(self, page_title, section_title):
        """Return the wikitext of *section_title* on *page_title*,
        or "" when the section is not found."""
        section_id = self.__find_section_on_page__(page_title, section_title)
        _ret = ""
        if section_id is not None:
            _ret = self.__get_page_section_content__(page_title, section_id)

        return _ret

    def filter_section_and_table(self, section_content, sub_section_name, year_start, year_end):
        """Within *section_content* (wikitext), find the last subsection whose
        title contains *sub_section_name*, then return the rows of its first
        table whose first cell is an integer year in [year_start, year_end].

        Returns [] when no subsection or table matches.
        """
        parsed = wtp.parse(section_content)
        sections = parsed.sections
        section_found = None
        for sec in sections:
            if sec.title is not None and sec.title.find(sub_section_name) >= 0:
                section_found = sec

        # Guard: the original printed unconditionally and crashed with
        # AttributeError when no subsection matched.
        if section_found is not None:
            print(f"Found matching subsection: {section_found.title}")

        rows_collected = []
        if section_found is not None and section_found.tables is not None and len(section_found.tables) > 0:
            table_data = section_found.tables[0].data()
            for row in table_data:
                if self.__is_int__(row[0]) and year_start <= int(row[0]) <= year_end:
                    rows_collected.append(row)

        return rows_collected

    def get_featured_articles(self, month, year):
        """Return the list of featured-article entry texts for *month*/*year*
        by downloading and scraping the monthly featured log page."""
        full_html = self.__get_featured_log__(month, year)

        ret = self.__process_featured_log__(full_html)

        return ret