File size: 4,441 Bytes
834b7c1
 
 
a573bfb
 
834b7c1
 
 
 
 
 
a573bfb
834b7c1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a573bfb
 
 
6d959c6
a573bfb
6d959c6
a573bfb
 
6d959c6
a573bfb
 
 
 
e2523de
a573bfb
e2523de
 
 
 
 
 
 
 
 
 
6d959c6
 
 
 
 
 
 
 
 
 
 
 
 
834b7c1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6d959c6
834b7c1
 
6d959c6
 
 
 
 
 
 
 
a573bfb
 
e2523de
 
 
 
834b7c1
a573bfb
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
import requests
import json
import wikitextparser as wtp
from lxml import etree
from lxml import html

# https://gist.github.com/scionoftech/0f35d5e231be2cf46823d774023268b6
# https://www.mediawiki.org/wiki/API:Main_page

class MyWikiAPI:
    """Thin client for the English Wikipedia MediaWiki API.

    Supports locating a named section on a page, fetching a section's
    wikitext, filtering a wikitext table by a year range, and scraping the
    Featured Article candidates log pages.

    References:
        https://www.mediawiki.org/wiki/API:Main_page
        https://gist.github.com/scionoftech/0f35d5e231be2cf46823d774023268b6
    """

    WIKI_BASE_URL = "https://en.wikipedia.org/w/api.php"
    WIKI_FEATURED_URL = "https://en.wikipedia.org/wiki/Wikipedia:Featured_article_candidates/Featured_log"
    # Wikipedia asks API clients to identify themselves via User-Agent.
    # HTTP header names are case-insensitive, so "user-Agent" works as-is.
    user_agent_headers = {"user-Agent":
        "AgentsCourseAssignment/1.0 (https://huggingface.co/spaces/krzsam/Agents-Course-Assignment)"}

    def __init__(self):
        # f-prefix removed: the string has no placeholders.
        print("***KS*** Initializing Wiki API")

    def __find_section_on_page__(self, page_title, section_title):
        """Return the MediaWiki section index whose heading text equals
        *section_title* on page *page_title*, or None when not found.

        Note: when a page has duplicate headings, the LAST match wins
        (original behavior, deliberately preserved).
        """
        response = requests.get(
            self.WIKI_BASE_URL,
            headers=self.user_agent_headers,
            params={
                'action': 'parse',
                'format': 'json',
                'page': page_title,
                'prop': 'sections',
            }).json()
        sections = response['parse']['sections']
        section_ix_found = None
        for sec in sections:
            if sec["line"] == section_title:
                section_ix_found = sec["index"]
        return section_ix_found

    def __get_page_section_content__(self, page_title, section_id, format="wikitext"):
        """Fetch one section of a page via action=parse.

        :param page_title: page to parse
        :param section_id: section index as returned by __find_section_on_page__
        :param format: 'prop' value for the API (e.g. "wikitext", "text");
            it is also the key under which the API nests the result
        :return: the raw section content string (the "*" field)
        """
        response = requests.get(
            self.WIKI_BASE_URL,
            headers=self.user_agent_headers,
            params={
                'action': 'parse',
                'format': 'json',
                'page': page_title,
                'prop': format,
                'section': section_id
            }).json()
        return response["parse"][format]["*"]

    def __get_featured_log__(self, month, year):
        """Download the raw HTML of the Featured log page for *month*/*year*
        (e.g. .../Featured_log/January_2024)."""
        featured_url = f"{self.WIKI_FEATURED_URL}/{month}_{year}"
        print(f"Getting content for: {featured_url}")
        response = requests.get(
            featured_url,
            headers=self.user_agent_headers,
            ).text
        return response

    def __process_featured_log__(self, html_content):
        """Extract article entries from a Featured-log HTML page.

        Each article is introduced by a <div class="mw-heading mw-heading3">;
        the text of the next two sibling elements is concatenated to form
        one entry string.

        :param html_content: full HTML of a Featured-log page
        :return: list of entry strings, one per article heading
        """
        tree = html.fromstring(html_content)
        article_heading = "mw-heading mw-heading3"
        elements = tree.xpath(f"//div[@class='{article_heading}']")
        element_texts = []
        for element in elements:
            n1 = element.getnext()
            n1_text = " ".join(n1.itertext())

            n2 = n1.getnext()
            n2_text = " ".join(n2.itertext())
            element_text = f"{n1_text} {n2_text}"

            element_texts.append(element_text)

        return element_texts

    def __is_int__(self, s):
        """Return True when *s* parses as an int (used to detect year cells).

        TypeError is caught in addition to ValueError so that non-string,
        non-numeric cells (e.g. None from an empty table cell) yield False
        instead of crashing.
        """
        try:
            int(s)
        except (TypeError, ValueError):
            return False
        else:
            return True

    def get_category(self, category, year):
        """Fetch and return the pages of *category* for *year*.

        NOTE(review): __get_category_pages__ is not defined in the visible
        part of this file — confirm it exists elsewhere before calling this.
        """
        ret = self.__get_category_pages__(category, year)
        print(f"Got category: {category}\n{ret}")
        # Bug fix: the result was computed and printed but never returned.
        return ret

    def get_page_section(self, page_title, section_title):
        """Return the wikitext of the section titled *section_title* on
        *page_title*, or "" when the section does not exist."""
        section_id = self.__find_section_on_page__(page_title, section_title)
        _ret = ""
        if section_id is not None:
            _ret = self.__get_page_section_content__(page_title, section_id)
        # print(f"***KS*** Got page section for {page_title} / {section_title} \n{_ret}")
        return _ret

    def filter_section_and_table(self, section_content, sub_section_name, year_start, year_end):
        """From wikitext *section_content*, find the last subsection whose
        title contains *sub_section_name* and return the rows of its first
        table whose first cell is a year within [year_start, year_end].

        :return: list of matching table rows; [] when no subsection matches
            or the subsection has no tables
        """
        parsed = wtp.parse(section_content)
        sections = parsed.sections
        section_found = None
        for sec in sections:
            if sec.title is not None and sec.title.find(sub_section_name) >= 0:
                section_found = sec

        # Bug fix: the original printed section_found.title before the None
        # check, raising AttributeError whenever no subsection matched.
        if section_found is None:
            return []

        print(f"Found matching subsection: {section_found.title}")

        rows_collected = []
        if section_found.tables is not None and len(section_found.tables) > 0:
            table_data = section_found.tables[0].data()
            for row in table_data:
                if self.__is_int__(row[0]) and year_start <= int(row[0]) <= year_end:
                    rows_collected.append(row)

        return rows_collected

    def get_featured_articles(self, month, year):
        """Return the list of Featured-article entries logged for
        *month*/*year* (see __process_featured_log__ for the entry format)."""
        # https://en.wikipedia.org/wiki/Wikipedia:Featured_article_candidates/Featured_log
        # all articles with all discussions about them
        full_html = self.__get_featured_log__(month, year)

        ret = self.__process_featured_log__(full_html)

        #print(f"Featured for {month} {year}:\n{ret}")
        return ret