# NOTE: removed non-Python scrape residue that preceded this module
# (a "File size: 4,441 Bytes" banner, a row of git-blame commit hashes,
# and a line-number gutter) — none of it was valid Python source.
import requests
import json
import wikitextparser as wtp
from lxml import etree
from lxml import html
# https://gist.github.com/scionoftech/0f35d5e231be2cf46823d774023268b6
# https://www.mediawiki.org/wiki/API:Main_page
class MyWikiAPI:
    """Helper for querying English Wikipedia.

    Wraps two data sources:
      * the MediaWiki action API (section lookup / section content), and
      * the "Featured article candidates" monthly log pages (HTML scraping).

    See https://www.mediawiki.org/wiki/API:Main_page.
    """

    # MediaWiki action API endpoint.
    WIKI_BASE_URL = "https://en.wikipedia.org/w/api.php"
    # Base URL of the monthly featured-article logs; "/{Month}_{Year}" is appended.
    WIKI_FEATURED_URL = "https://en.wikipedia.org/wiki/Wikipedia:Featured_article_candidates/Featured_log"
    # Identify the client per Wikimedia's User-Agent policy.  HTTP header
    # names are case-insensitive, so the lowercase "user-" prefix is fine.
    user_agent_headers = {"user-Agent":
                              "AgentsCourseAssignment/1.0 (https://huggingface.co/spaces/krzsam/Agents-Course-Assignment)"}

    def __init__(self):
        # No state to set up; the trace line matches the project's logging style.
        print("***KS*** Initializing Wiki API")

    def __find_section_on_page__(self, page_title, section_title):
        """Return the API section index of *section_title* on *page_title*.

        Returns None when no section with that heading exists.  If several
        sections share the same heading, the index of the LAST match wins
        (the loop intentionally does not break — original behaviour kept).
        """
        response = requests.get(
            self.WIKI_BASE_URL,
            headers=self.user_agent_headers,
            params={
                'action': 'parse',
                'format': 'json',
                'page': page_title,
                'prop': 'sections',
            }).json()
        sections = response['parse']['sections']
        section_ix_found = None
        for sec in sections:
            if sec["line"] == section_title:
                section_ix_found = sec["index"]
        return section_ix_found

    def __get_page_section_content__(self, page_title, section_id, format="wikitext"):
        """Fetch one section of a page in the requested parse format.

        ``format`` doubles as the API ``prop`` value and the response key
        (e.g. "wikitext" or "text").  The name shadows the builtin but is
        kept unchanged so keyword callers keep working.
        """
        response = requests.get(
            self.WIKI_BASE_URL,
            headers=self.user_agent_headers,
            params={
                'action': 'parse',
                'format': 'json',
                'page': page_title,
                'prop': format,
                'section': section_id
            }).json()
        return response["parse"][format]["*"]

    def __get_featured_log__(self, month, year):
        """Download the raw HTML of the featured log page for month/year,
        e.g. ("June", 2005)."""
        featured_url = f"{self.WIKI_FEATURED_URL}/{month}_{year}"
        print(f"Getting content for: {featured_url}")
        response = requests.get(
            featured_url,
            headers=self.user_agent_headers,
        ).text
        return response

    def __process_featured_log__(self, html_content):
        """Extract one text blob per nominated article from a log page.

        Each article is introduced by a <div class="mw-heading mw-heading3">;
        the two sibling elements following it carry the nomination text.
        Missing siblings are tolerated instead of raising AttributeError.
        """
        tree = html.fromstring(html_content)
        article_heading = "mw-heading mw-heading3"
        elements = tree.xpath(f"//div[@class='{article_heading}']")
        element_texts = []
        for element in elements:
            n1 = element.getnext()
            if n1 is None:
                # Heading at the very end of the document: nothing to collect.
                continue
            n1_text = " ".join(n1.itertext())
            n2 = n1.getnext()
            # The second sibling may be absent; treat it as empty text.
            n2_text = " ".join(n2.itertext()) if n2 is not None else ""
            element_texts.append(f"{n1_text} {n2_text}")
        return element_texts

    def __is_int__(self, s):
        """Return True when ``int(s)`` parses, False on ValueError.

        A non-convertible type (e.g. None) still raises TypeError, matching
        the original behaviour.
        """
        try:
            int(s)
        except ValueError:
            return False
        else:
            return True

    def get_category(self, category, year):
        """Fetch and print the pages of *category* for *year*.

        NOTE(review): ``__get_category_pages__`` is not defined anywhere in
        the visible source, so this call raises AttributeError at runtime —
        confirm the helper exists elsewhere in the project.
        """
        ret = self.__get_category_pages__(category, year)
        print(f"Got category: {category}\n{ret}")
        # Returning the result is new and backward-compatible (was None).
        return ret

    def get_page_section(self, page_title, section_title):
        """Return the wikitext of *section_title* on *page_title*, or ""
        when no such section exists."""
        section_id = self.__find_section_on_page__(page_title, section_title)
        if section_id is None:
            return ""
        return self.__get_page_section_content__(page_title, section_id)

    def filter_section_and_table(self, section_content, sub_section_name, year_start, year_end):
        """Filter a wikitext section down to table rows in a year range.

        Finds the last subsection whose title contains *sub_section_name*
        and returns the rows of its first table whose first cell parses as
        an integer year within [year_start, year_end] (inclusive).  Returns
        an empty list when no subsection or table matches.
        """
        parsed = wtp.parse(section_content)
        section_found = None
        for sec in parsed.sections:
            if sec.title is not None and sec.title.find(sub_section_name) >= 0:
                section_found = sec
                print(f"Found matching subsection: {section_found.title}")
        rows_collected = []
        if section_found is not None and section_found.tables:
            table_data = section_found.tables[0].data()
            for row in table_data:
                if self.__is_int__(row[0]) and year_start <= int(row[0]) <= year_end:
                    rows_collected.append(row)
        return rows_collected

    def get_featured_articles(self, month, year):
        """Return one text entry per article (with its discussion) from the
        featured-article log for the given month/year.

        See https://en.wikipedia.org/wiki/Wikipedia:Featured_article_candidates/Featured_log
        """
        full_html = self.__get_featured_log__(month, year)
        return self.__process_featured_log__(full_html)