| import requests |
| import json |
| import wikitextparser as wtp |
| from lxml import etree |
| from lxml import html |
|
|
| |
| |
|
|
class MyWikiAPI:
    """Thin client for the English Wikipedia action API plus a scraper for the
    Featured-article-candidates log pages.

    All HTTP calls go through ``requests`` with a custom User-Agent header, as
    required by the Wikimedia API etiquette. Double-underscore method names are
    kept for backward compatibility (trailing underscores prevent name
    mangling, so they remain externally callable).
    """

    # Action API endpoint used for structured (JSON) queries.
    WIKI_BASE_URL = "https://en.wikipedia.org/w/api.php"
    # Base page of the monthly featured-article logs; month/year is appended.
    WIKI_FEATURED_URL = "https://en.wikipedia.org/wiki/Wikipedia:Featured_article_candidates/Featured_log"
    # Header names are case-insensitive in HTTP (and in requests), so the
    # lowercase "user-Agent" key works identically to "User-Agent".
    user_agent_headers = {"user-Agent":
                          "AgentsCourseAssignment/1.0 (https://huggingface.co/spaces/krzsam/Agents-Course-Assignment)"}

    def __init__(self):
        # No state to initialize; the print is a startup trace marker.
        print("***KS*** Initializing Wiki API")

    def __find_section_on_page__(self, page_title, section_title):
        """Return the section index (a string) of the section whose heading
        text equals *section_title* on *page_title*, or None if not found.

        If several sections share the title, the LAST match wins (original
        behavior preserved). Returns None when the API reports an error
        (e.g. the page does not exist) instead of raising KeyError.
        """
        response = requests.get(
            self.WIKI_BASE_URL,
            headers=self.user_agent_headers,
            params={
                'action': 'parse',
                'format': 'json',
                'page': page_title,
                'prop': 'sections',
            }).json()
        # On error the API returns {"error": ...} with no "parse" key.
        sections = response.get('parse', {}).get('sections', [])
        section_ix_found = None
        for sec in sections:
            if sec["line"] == section_title:
                section_ix_found = sec["index"]
        return section_ix_found

    def __get_page_section_content__(self, page_title, section_id, format="wikitext"):
        """Fetch one section of a page via action=parse.

        *format* is the API "prop" to request ("wikitext" or "text"); the name
        shadows the builtin but is kept for signature compatibility.
        Returns the section content string, or "" on an API error.
        """
        response = requests.get(
            self.WIKI_BASE_URL,
            headers=self.user_agent_headers,
            params={
                'action': 'parse',
                'format': 'json',
                'page': page_title,
                'prop': format,
                'section': section_id
            }).json()
        # "*" holds the payload in the legacy JSON format of action=parse.
        return response.get("parse", {}).get(format, {}).get("*", "")

    def __get_featured_log__(self, month, year):
        """Download the raw HTML of the featured log for *month*/*year*
        (e.g. "January", 2024 -> .../Featured_log/January_2024)."""
        featured_url = f"{self.WIKI_FEATURED_URL}/{month}_{year}"
        print(f"Getting content for: {featured_url}")
        response = requests.get(
            featured_url,
            headers=self.user_agent_headers,
        ).text
        return response

    def __process_featured_log__(self, html_content):
        """Extract one text blob per article entry from a featured-log page.

        Each entry is a <div class="mw-heading mw-heading3"> heading; the text
        of the two elements that follow it is concatenated. Headings whose
        following siblings are missing (end of document) are skipped instead
        of crashing.
        """
        tree = html.fromstring(html_content)
        article_heading = "mw-heading mw-heading3"
        elements = tree.xpath(f"//div[@class='{article_heading}']")
        element_texts = []
        for element in elements:
            n1 = element.getnext()
            if n1 is None:
                continue  # heading with no following content
            n1_text = " ".join(n1.itertext())

            n2 = n1.getnext()
            if n2 is None:
                element_texts.append(n1_text)
                continue
            n2_text = " ".join(n2.itertext())
            element_texts.append(f"{n1_text} {n2_text}")

        return element_texts

    def __is_int__(self, s):
        """Return True if *s* can be converted with int(), else False.

        Also catches TypeError so non-string inputs (e.g. None, lists coming
        out of a parsed table cell) yield False instead of raising.
        """
        try:
            int(s)
        except (ValueError, TypeError):
            return False
        else:
            return True

    def get_category(self, category, year):
        """Fetch and print the pages of *category* for *year*, returning them.

        NOTE(review): __get_category_pages__ is not defined in this class —
        calling this method raises AttributeError unless the helper is added
        (or provided by a subclass). Confirm where it is meant to live.
        """
        ret = self.__get_category_pages__(category, year)
        print(f"Got category: {category}\n{ret}")
        return ret  # was computed but never returned

    def get_page_section(self, page_title, section_title):
        """Return the wikitext of *section_title* on *page_title*,
        or "" when the section is not found."""
        section_id = self.__find_section_on_page__(page_title, section_title)
        _ret = ""
        if section_id is not None:
            _ret = self.__get_page_section_content__(page_title, section_id)

        return _ret

    def filter_section_and_table(self, section_content, sub_section_name, year_start, year_end):
        """Within *section_content* (wikitext), find the last subsection whose
        title contains *sub_section_name*, then return the rows of its first
        table whose first cell is an integer year in [year_start, year_end].

        Returns [] when no subsection or table matches.
        """
        parsed = wtp.parse(section_content)
        sections = parsed.sections
        section_found = None
        for sec in sections:
            if sec.title is not None and sec.title.find(sub_section_name) >= 0:
                section_found = sec

        # Guard: the original printed unconditionally and crashed with
        # AttributeError when no subsection matched.
        if section_found is not None:
            print(f"Found matching subsection: {section_found.title}")

        rows_collected = []
        if section_found is not None and section_found.tables is not None and len(section_found.tables) > 0:
            table_data = section_found.tables[0].data()
            for row in table_data:
                if self.__is_int__(row[0]) and year_start <= int(row[0]) <= year_end:
                    rows_collected.append(row)

        return rows_collected

    def get_featured_articles(self, month, year):
        """Return the list of featured-article entry texts for *month*/*year*
        by downloading and scraping the monthly featured log page."""
        full_html = self.__get_featured_log__(month, year)

        ret = self.__process_featured_log__(full_html)

        return ret