# NOTE: removed non-Python scrape residue that preceded this module
# (a "File size: 4,441 Bytes" banner, a row of git-blame commit hashes,
# and a line-number gutter) — none of it was valid Python source.
import requests
import json
import wikitextparser as wtp
from lxml import etree
from lxml import html
# https://gist.github.com/scionoftech/0f35d5e231be2cf46823d774023268b6
# https://www.mediawiki.org/wiki/API:Main_page
class MyWikiAPI:
    """Helper for querying English Wikipedia.

    Wraps two data sources:
      * the MediaWiki action API (section lookup / section content), and
      * the "Featured article candidates" monthly log pages (HTML scraping).

    See https://www.mediawiki.org/wiki/API:Main_page.
    """

    # MediaWiki action API endpoint.
    WIKI_BASE_URL = "https://en.wikipedia.org/w/api.php"
    # Base URL of the monthly featured-article logs; "/{Month}_{Year}" is appended.
    WIKI_FEATURED_URL = "https://en.wikipedia.org/wiki/Wikipedia:Featured_article_candidates/Featured_log"
    # Identify the client per Wikimedia's User-Agent policy.  HTTP header
    # names are case-insensitive, so the lowercase "user-" prefix is fine.
    user_agent_headers = {"user-Agent":
                              "AgentsCourseAssignment/1.0 (https://huggingface.co/spaces/krzsam/Agents-Course-Assignment)"}

    def __init__(self):
        # No state to set up; the trace line matches the project's logging style.
        print("***KS*** Initializing Wiki API")

    def __find_section_on_page__(self, page_title, section_title):
        """Return the API section index of *section_title* on *page_title*.

        Returns None when no section with that heading exists.  If several
        sections share the same heading, the index of the LAST match wins
        (the loop intentionally does not break — original behaviour kept).
        """
        response = requests.get(
            self.WIKI_BASE_URL,
            headers=self.user_agent_headers,
            params={
                'action': 'parse',
                'format': 'json',
                'page': page_title,
                'prop': 'sections',
            }).json()
        sections = response['parse']['sections']
        section_ix_found = None
        for sec in sections:
            if sec["line"] == section_title:
                section_ix_found = sec["index"]
        return section_ix_found

    def __get_page_section_content__(self, page_title, section_id, format="wikitext"):
        """Fetch one section of a page in the requested parse format.

        ``format`` doubles as the API ``prop`` value and the response key
        (e.g. "wikitext" or "text").  The name shadows the builtin but is
        kept unchanged so keyword callers keep working.
        """
        response = requests.get(
            self.WIKI_BASE_URL,
            headers=self.user_agent_headers,
            params={
                'action': 'parse',
                'format': 'json',
                'page': page_title,
                'prop': format,
                'section': section_id
            }).json()
        return response["parse"][format]["*"]

    def __get_featured_log__(self, month, year):
        """Download the raw HTML of the featured log page for month/year,
        e.g. ("June", 2005)."""
        featured_url = f"{self.WIKI_FEATURED_URL}/{month}_{year}"
        print(f"Getting content for: {featured_url}")
        response = requests.get(
            featured_url,
            headers=self.user_agent_headers,
        ).text
        return response

    def __process_featured_log__(self, html_content):
        """Extract one text blob per nominated article from a log page.

        Each article is introduced by a <div class="mw-heading mw-heading3">;
        the two sibling elements following it carry the nomination text.
        Missing siblings are tolerated instead of raising AttributeError.
        """
        tree = html.fromstring(html_content)
        article_heading = "mw-heading mw-heading3"
        elements = tree.xpath(f"//div[@class='{article_heading}']")
        element_texts = []
        for element in elements:
            n1 = element.getnext()
            if n1 is None:
                # Heading at the very end of the document: nothing to collect.
                continue
            n1_text = " ".join(n1.itertext())
            n2 = n1.getnext()
            # The second sibling may be absent; treat it as empty text.
            n2_text = " ".join(n2.itertext()) if n2 is not None else ""
            element_texts.append(f"{n1_text} {n2_text}")
        return element_texts

    def __is_int__(self, s):
        """Return True when ``int(s)`` parses, False on ValueError.

        A non-convertible type (e.g. None) still raises TypeError, matching
        the original behaviour.
        """
        try:
            int(s)
        except ValueError:
            return False
        else:
            return True

    def get_category(self, category, year):
        """Fetch and print the pages of *category* for *year*.

        NOTE(review): ``__get_category_pages__`` is not defined anywhere in
        the visible source, so this call raises AttributeError at runtime —
        confirm the helper exists elsewhere in the project.
        """
        ret = self.__get_category_pages__(category, year)
        print(f"Got category: {category}\n{ret}")
        # Returning the result is new and backward-compatible (was None).
        return ret

    def get_page_section(self, page_title, section_title):
        """Return the wikitext of *section_title* on *page_title*, or ""
        when no such section exists."""
        section_id = self.__find_section_on_page__(page_title, section_title)
        if section_id is None:
            return ""
        return self.__get_page_section_content__(page_title, section_id)

    def filter_section_and_table(self, section_content, sub_section_name, year_start, year_end):
        """Filter a wikitext section down to table rows in a year range.

        Finds the last subsection whose title contains *sub_section_name*
        and returns the rows of its first table whose first cell parses as
        an integer year within [year_start, year_end] (inclusive).  Returns
        an empty list when no subsection or table matches.
        """
        parsed = wtp.parse(section_content)
        section_found = None
        for sec in parsed.sections:
            if sec.title is not None and sec.title.find(sub_section_name) >= 0:
                section_found = sec
                print(f"Found matching subsection: {section_found.title}")
        rows_collected = []
        if section_found is not None and section_found.tables:
            table_data = section_found.tables[0].data()
            for row in table_data:
                if self.__is_int__(row[0]) and year_start <= int(row[0]) <= year_end:
                    rows_collected.append(row)
        return rows_collected

    def get_featured_articles(self, month, year):
        """Return one text entry per article (with its discussion) from the
        featured-article log for the given month/year.

        See https://en.wikipedia.org/wiki/Wikipedia:Featured_article_candidates/Featured_log
        """
        full_html = self.__get_featured_log__(month, year)
        return self.__process_featured_log__(full_html)