import requests
import wikitextparser as wtp
from lxml import html


class MyWikiAPI:
    """Small helper around the MediaWiki Action API and the featured-article log pages on en.wikipedia.org."""

    WIKI_BASE_URL = "https://en.wikipedia.org/w/api.php"
    WIKI_FEATURED_URL = "https://en.wikipedia.org/wiki/Wikipedia:Featured_article_candidates/Featured_log"
    # Wikimedia asks API clients to identify themselves with a descriptive User-Agent and a contact point.
    user_agent_headers = {"User-Agent":
                          "AgentsCourseAssignment/1.0 (https://huggingface.co/spaces/krzsam/Agents-Course-Assignment)"}

    def __init__(self):
        print("***KS*** Initializing Wiki API")

    def __find_section_on_page__(self, page_title, section_title):
        """Return the index of the section titled section_title on page_title, or None if absent."""
        response = requests.get(
            self.WIKI_BASE_URL,
            headers=self.user_agent_headers,
            params={
                'action': 'parse',
                'format': 'json',
                'page': page_title,
                'prop': 'sections',
            }).json()
        sections = response['parse']['sections']
        section_ix_found = None
        for sec in sections:
            # 'line' holds the heading as displayed; 'index' is the id that action=parse accepts back.
            if sec["line"] == section_title:
                section_ix_found = sec["index"]
                break  # stop at the first matching heading
        return section_ix_found
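
    # For reference, each entry of response['parse']['sections'] from the MediaWiki API
    # looks roughly like the sketch below (field set abbreviated, values illustrative):
    #
    #   {"toclevel": 1, "level": "2", "line": "History", "number": "1",
    #    "index": "1", "anchor": "History"}
    #
    # Note that 'index' is a string; that is fine here because it is passed straight
    # back to the API as the 'section' parameter.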

    def __get_page_section_content__(self, page_title, section_id, format="wikitext"):
        """Fetch one section of a page via action=parse; format selects the 'prop' to request."""
        response = requests.get(
            self.WIKI_BASE_URL,
            headers=self.user_agent_headers,
            params={
                'action': 'parse',
                'format': 'json',
                'page': page_title,
                'prop': format,
                'section': section_id
            }).json()
        # The API echoes the requested prop as a key whose '*' entry holds the content.
        return response["parse"][format]["*"]
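
    # 'wikitext' returns the raw markup of the section; 'text' would return rendered
    # HTML instead. Whatever is passed as 'format' must be a valid value for the API's
    # 'prop' parameter, because it is also used to index the JSON response above.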

    def __get_featured_log__(self, month, year):
        """Download the raw HTML of the featured-article log page for the given month and year."""
        featured_url = f"{self.WIKI_FEATURED_URL}/{month}_{year}"
        print(f"Getting content for: {featured_url}")
        response = requests.get(
            featured_url,
            headers=self.user_agent_headers,
        ).text
        return response

    def __process_featured_log__(self, html_content):
        """Extract one text blurb per promoted article from the featured-log HTML."""
        tree = html.fromstring(html_content)
        # Each article entry starts with a level-3 heading div in current Wikipedia markup.
        article_heading = "mw-heading mw-heading3"
        elements = tree.xpath(f"//div[@class='{article_heading}']")
        element_texts = []
        for element in elements:
            # The two sibling elements that follow the heading carry the nomination text.
            n1 = element.getnext()
            n1_text = " ".join(n1.itertext())

            n2 = n1.getnext()
            n2_text = " ".join(n2.itertext())
            element_text = f"{n1_text} {n2_text}"

            element_texts.append(element_text)

        return element_texts
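
    # The extraction above assumes the log page renders each entry in roughly this
    # shape (illustrative, and liable to change if Wikipedia's markup changes):
    #
    #   <div class="mw-heading mw-heading3"><h3>Article title</h3></div>
    #   <p>Nominator's blurb ...</p>
    #   <p>Promotion details ...</p>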

    def __is_int__(self, s):
        try:
            int(s)
        except (ValueError, TypeError):
            # TypeError covers None cells coming back from wikitextparser tables.
            return False
        else:
            return True

    def get_category(self, category, year):
        # __get_category_pages__ is expected to be defined elsewhere in this class.
        ret = self.__get_category_pages__(category, year)
        print(f"Got category: {category}\n{ret}")
        return ret

    def get_page_section(self, page_title, section_title):
        """Return the wikitext of the named section, or an empty string if the section is missing."""
        section_id = self.__find_section_on_page__(page_title, section_title)
        _ret = ""
        if section_id is not None:
            _ret = self.__get_page_section_content__(page_title, section_id)

        return _ret

    def filter_section_and_table(self, section_content, sub_section_name, year_start, year_end):
        """Find the subsection whose title contains sub_section_name and return the rows
        of its first table whose first column is a year within [year_start, year_end]."""
        parsed = wtp.parse(section_content)
        sections = parsed.sections
        section_found = None
        for sec in sections:
            if sec.title is not None and sec.title.find(sub_section_name) >= 0:
                section_found = sec

        if section_found is not None:
            print(f"Found matching subsection: {section_found.title}")

        rows_collected = []
        if section_found is not None and len(section_found.tables) > 0:
            table_data = section_found.tables[0].data()
            for row in table_data:
                # Keep only data rows whose first cell is a year in the requested range.
                if self.__is_int__(row[0]) and year_start <= int(row[0]) <= year_end:
                    rows_collected.append(row)

        return rows_collected
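
    # wikitextparser's Table.data() returns the table as a list of rows (lists of
    # cell strings), header row included; the integer test above skips the header
    # row automatically because its first cell is not a number.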

    def get_featured_articles(self, month, year):
        """Return blurbs for the articles promoted in the given month and year;
        month is a full English month name, e.g. 'January', to match the log URL."""
        full_html = self.__get_featured_log__(month, year)
        ret = self.__process_featured_log__(full_html)
        return ret
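

# A minimal usage sketch, assuming network access to en.wikipedia.org. The page and
# section titles below are illustrative examples, not values required by this class.
if __name__ == "__main__":
    api = MyWikiAPI()

    # Fetch the wikitext of one named section of a page.
    section_text = api.get_page_section("Python (programming language)", "History")
    print(section_text[:200])

    # List the blurbs of articles promoted to Featured status in a given month.
    for blurb in api.get_featured_articles("January", 2024):
        print(blurb)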