import re
from datetime import datetime, timedelta, timezone
from html.parser import HTMLParser
from typing import Optional

import requests


def run_get_request(params: dict):
    """
    Utility function to run GET request against Wikipedia API
    """
    base_url = "https://en.wikipedia.org/w/api.php"

    # Wikipedia's API expects a descriptive User-Agent header
    headers = {
        "User-Agent": f"NoteworthyDifferences/1.0 (j3ffdick@gmail.com) requests/{requests.__version__}"
    }

    response = requests.get(base_url, params=params, headers=headers)
    # Handle HTTP errors
    response.raise_for_status()

    try:
        json_data = response.json()
    except ValueError:
        raise ValueError(f"Unable to parse response as JSON: {response.text[:200]}")

    return json_data

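# Hedged sketch of calling run_get_request directly; any en.wikipedia.org
# api.php parameters should work (live network access assumed):
#
#   data = run_get_request({
#       "action": "query",
#       "titles": "David_Szalay",
#       "prop": "revisions",
#       "rvprop": "ids|timestamp",
#       "format": "json",
#   })
#   # data["query"]["pages"] maps page ids to page info dicts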

def extract_revision_info(json_data, revnum=0, limit_revnum=True):
    """
    Utility function to extract page revision info from JSON data returned from an API call

    Args:
        json_data: JSON response from a "query" request with prop=revisions
        revnum: How many revisions before the current one (0 for current)
        limit_revnum: If True, clamp revnum to the oldest revision in json_data

    Examples:
        title = 'David_Szalay'
        json_data = get_previous_revisions(title, revisions=100)
        extract_revision_info(json_data)       # Current revision
        extract_revision_info(json_data, 10)   # 10th revision before current
        extract_revision_info(json_data, 100)  # 100th revision before current
    """
    # Extract page and revision info
    pages = json_data["query"]["pages"]
    page_id = list(pages.keys())[0]

    try:
        if limit_revnum:
            # Clamp revnum to the earliest available revision before current
            revnum = min(revnum, len(pages[page_id]["revisions"]) - 1)
        # Get the specified revision
        revision = pages[page_id]["revisions"][revnum]
        # Remove the parentid key because we don't use it
        _ = revision.pop("parentid", None)
        # Add the actual revision number
        revision["revnum"] = revnum
        return revision
    except (KeyError, IndexError):
        # Page or revision not found, return empty info
        return {"revid": None, "timestamp": None, "revnum": None}


def get_revision_from_age(title: str, age_days: int = 0) -> dict:
    """
    Get the revision info of a Wikipedia article closest to the given age in days.

    Args:
        title: Wikipedia article title (e.g., 'David_Szalay')
        age_days: Age of the article revision in days (0 for current)

    Returns:
        Dictionary containing:
        - 'revid': Revision id of the article revision (None if not found)
        - 'timestamp': Timestamp of the article revision
        - 'revnum': Revisions behind current (0, since only one revision is requested)
    """

    # Get the target date (timezone-aware; datetime.utcnow() is deprecated)
    target_date = datetime.now(timezone.utc) - timedelta(days=age_days)

    # Get the revision closest to the target date
    params = {
        "action": "query",
        "titles": title,
        "prop": "revisions",
        "rvlimit": 1,
        "rvdir": "older",
        "rvstart": target_date.strftime("%Y-%m-%dT%H:%M:%SZ"),
        "rvprop": "ids|timestamp",
        "format": "json",
    }

    # Run GET request
    json_data = run_get_request(params)

    # Return revision info
    return extract_revision_info(json_data)

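# A quick hedged sketch of the intended use (requires network access; the
# title is the same example used in the docstrings above):
#
#   month_old = get_revision_from_age("David_Szalay", age_days=30)
#   # -> {"revid": ..., "timestamp": "...", "revnum": 0}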

def get_previous_revisions(title: str, revisions: int = 0) -> dict:
    """
    Get revision info for a Wikipedia article going back a certain number of revisions from the current one.

    Args:
        title: Wikipedia article title (e.g., 'David_Szalay')
        revisions: How many revisions before the current one to include (0 for current only, must be between 0 and 499)

    Returns:
        Raw JSON response containing the revision list; pass it to
        extract_revision_info() to pull out a single revision's
        'revid' and 'timestamp'

    Note:
        In the Wikipedia API, rvlimit is how many revisions will be returned and must be between 1 and 500.
        rvlimit = 1 returns a single revision: the current one.
        rvlimit = 101 returns the current revision and the 100 before it.
        This is why we use rvlimit = revisions + 1.
    """

    # Request the current revision plus the most recent revisions before it
    params = {
        "action": "query",
        "prop": "revisions",
        "titles": title,
        "rvlimit": revisions + 1,
        "rvdir": "older",
        "rvprop": "ids|timestamp",
        "format": "json",
    }

    # Run GET request
    json_data = run_get_request(params)

    # Return info for all revisions
    return json_data


def get_wikipedia_introduction(revid: int) -> Optional[str]:
    """
    Retrieve the introduction of a Wikipedia article.

    Args:
        revid: Revision id of the article

    Returns:
        Text of the introduction, or None if the revision is missing or deleted

    Example:
        # Get intro from current article revision
        revision_info = get_revision_from_age("David_Szalay")
        get_wikipedia_introduction(revision_info["revid"])
    """

    # Return None for missing revid
    if not revid:
        return None

    # Get the content of this specific revision
    params = {"action": "parse", "oldid": revid, "prop": "text", "format": "json"}

    json_data = run_get_request(params)

    # Sometimes a revision is deleted and can't be viewed
    # E.g. revid = '1276494621' for Turin
    try:
        html_content = json_data["parse"]["text"]["*"]
    except KeyError:
        return None

    # Extract introduction (text before first section heading)
    # Remove everything from the first <h2> tag onwards
    intro_html = re.split(r"<h2", html_content, maxsplit=1)[0]

    # Extract text from paragraphs, excluding certain elements
    class IntroParser(HTMLParser):
        def __init__(self):
            super().__init__()
            self.text = []
            self.in_p = False
            self.skip = False

        def handle_starttag(self, tag, attrs):
            if tag == "p":
                self.in_p = True
            # Skip certain elements
            if tag in ["style", "script", "table", "div"]:
                attrs_dict = dict(attrs)
                # Skip infoboxes, navboxes, etc.
                if "class" in attrs_dict:
                    if any(
                        x in attrs_dict["class"]
                        for x in ["infobox", "navbox", "metadata", "toc"]
                    ):
                        self.skip = True
                if tag in ["style", "script"]:
                    self.skip = True

        def handle_endtag(self, tag):
            if tag == "p":
                if self.in_p and self.text and not self.text[-1].endswith("\n\n"):
                    self.text.append("\n\n")
                self.in_p = False
            if tag in ["style", "script", "table", "div"]:
                self.skip = False

        def handle_data(self, data):
            if self.in_p and not self.skip:
                # *Don't* clean up whitespace here - it makes run-on words
                # text = " ".join(data.split())
                text = data
                if text:
                    self.text.append(text)

    parser = IntroParser()
    parser.feed(intro_html)

    # Join and clean up the text
    introduction = "".join(parser.text).strip()

    # Remove multiple newlines
    introduction = re.sub(r"\n{3,}", "\n\n", introduction)

    # Remove empty paragraphs
    paragraphs = [p.strip() for p in introduction.split("\n\n") if p.strip()]
    introduction = "\n\n".join(paragraphs)

    return introduction

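# Hedged example of fetching a historical introduction (network access
# assumed; the title is illustrative): combine the age-based lookup with
# the intro extraction above.
#
#   old_rev = get_revision_from_age("David_Szalay", age_days=90)
#   old_intro = get_wikipedia_introduction(old_rev["revid"])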

def get_revisions_behind(title: str, revid: int) -> int:
    """
    Get the number of revisions a given revid is behind the current revision of the page.

    Args:
        title: Wikipedia article title (e.g., 'David_Szalay')
        revid: Revision ID of the page

    Returns:
        Integer representing the number of revisions back (0 if it's the current revision)

    Example:
        # Get how many revisions behind a specific revid is
        revisions_behind = get_revisions_behind("David_Szalay", 123456789)
    """

    # Search through revisions going back from current
    # We'll paginate through results if needed
    revision_count = 0
    continue_token = None

    # Run the loop twice to search up to 1000 revisions behind
    for _ in range(2):
        params = {
            "action": "query",
            "titles": title,
            "prop": "revisions",
            "rvlimit": 500,  # API limit per request
            "rvdir": "older",
            "rvprop": "ids",
            "format": "json",
        }

        if continue_token:
            params["rvcontinue"] = continue_token

        try:
            json_data = run_get_request(params)
            pages = json_data["query"]["pages"]
            page_id = list(pages.keys())[0]

            if page_id == "-1":
                raise ValueError(f"Page not found for revid {revid}")

            revisions = pages[page_id]["revisions"]

            # Find the index of the given revid in the current batch of revisions
            for i, revision in enumerate(revisions):
                if revision["revid"] == revid:
                    return revision_count + i

            # Update the count of revisions we've checked
            revision_count += len(revisions)

            # Check if there are more revisions to search
            continue_token = json_data.get("continue", {}).get("rvcontinue")

            if not continue_token:
                # Reached the end of revisions but didn't find the revid
                raise ValueError(
                    f"Revid {revid} not found in the revision history of the page. "
                    f"It may be from a different page or may have been deleted."
                )

        except ValueError:
            # Re-raise ValueError exceptions
            raise
        except Exception as e:
            raise ValueError(f"Error searching for revid {revid}: {e}")

    # The revid was not found within the revisions we searched; signal this
    # by returning the number of revisions checked as a negative number
    negative_revision_count = -revision_count
    return negative_revision_count

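# Hedged usage note: a negative return value means "deeper than we searched",
# so callers can treat it as a lower bound (some_revid is hypothetical):
#
#   behind = get_revisions_behind("Turin", some_revid)
#   if behind < 0:
#       print(f"At least {-behind} revisions behind")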

def get_random_wikipedia_title() -> Optional[str]:
    """
    Get the title of a random Wikipedia article in the main namespace.

    Returns:
        Article title, or None if the request fails
    """
    params = {
        "action": "query",
        "list": "random",
        "rnnamespace": 0,
        "rnlimit": 1,
        "format": "json",
    }

    try:
        json_data = run_get_request(params)

        # Extract the title
        title = json_data["query"]["random"][0]["title"]
        return title

    except requests.RequestException as e:
        print(f"Error fetching random Wikipedia title: {e}")
        return None
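

if __name__ == "__main__":
    # Hedged smoke test: exercises the helpers end to end against the live
    # Wikipedia API (network access assumed); uses a random article so no
    # particular page has to exist.
    article = get_random_wikipedia_title()
    if article:
        current = get_revision_from_age(article)
        print(f"{article}: current revid {current['revid']} ({current['timestamp']})")

        if current["revid"]:
            intro = get_wikipedia_introduction(current["revid"])
            if intro:
                print(intro[:300])

            # The current revision should be 0 revisions behind itself
            print("Revisions behind:", get_revisions_behind(article, current["revid"]))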