Spaces:
Running
Running
| import requests | |
| from datetime import datetime, timedelta | |
| from typing import Dict, Optional | |
| import re | |
def run_get_request(params: dict, timeout: float = 30.0):
    """
    Utility function to run a GET request against the English Wikipedia API.

    Args:
        params: Query-string parameters for the API call (e.g. action, prop, format).
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, requests waits indefinitely on a stalled connection.

    Returns:
        The decoded JSON body of the response.

    Raises:
        requests.RequestException: On connection errors, timeouts, or a
            4xx/5xx status (raised by ``raise_for_status``).
        ValueError: If the response body cannot be parsed as JSON.
    """
    base_url = "https://en.wikipedia.org/w/api.php"
    # We need to supply headers for the request to work
    headers = {
        "User-Agent": f"NoteworthyDifferences/1.0 (j3ffdick@gmail.com) requests/{requests.__version__}"
    }
    response = requests.get(
        base_url, params=params, headers=headers, timeout=timeout
    )
    # Handle HTTP errors
    response.raise_for_status()
    try:
        return response.json()
    except ValueError as err:
        # JSON decoding failures are ValueError subclasses; keep the original
        # cause attached so the underlying parse error is not lost.
        raise ValueError(f"Unable to parse response: {response}") from err
def extract_revision_info(json_data, revnum=0, limit_revnum=True):
    """
    Utility function to extract page revision info from JSON data returned from API call

    Args:
        json_data: Parsed API response; must contain json_data["query"]["pages"]
        revnum: revision before current (0 for current)
        limit_revnum: if True, cap revnum at the earliest revision present in json_data

    Returns:
        A new dict holding the selected revision's fields (without 'parentid')
        plus 'revnum', the index that was actually used. The input json_data is
        left unmodified. If the page has no revisions or revnum is out of range,
        returns {"revid": None, "timestamp": None, "revnum": None}.

    Examples:
        title = 'David_Szalay'
        json_data = get_previous_revisions(title, revisions = 100)
        extract_revision_info(json_data) # Current revision
        extract_revision_info(json_data, 10) # 10th revision before current
        extract_revision_info(json_data, 100) # 100th revision before current
    """
    # Extract page and revision info
    pages = json_data["query"]["pages"]
    page_id = list(pages)[0]
    try:
        revisions = pages[page_id]["revisions"]
        if limit_revnum:
            # Limit revnum to earliest available revision before current
            revnum = min(revnum, len(revisions) - 1)
        # Copy the specified revision so the caller's json_data is not mutated
        revision = dict(revisions[revnum])
    except (KeyError, IndexError, TypeError):
        # Page or revision not found, return placeholder dict
        return {"revid": None, "timestamp": None, "revnum": None}
    # Remove the parentid key because we don't use it
    revision.pop("parentid", None)
    # Add the actual revision number
    revision["revnum"] = revnum
    return revision
def get_revision_from_age(title: str, age_days: int = 0) -> dict:
    """
    Get the revision info of a Wikipedia article as of a given age in days.

    The API is asked for a single revision, listing from the target date
    towards older revisions, so the result is the first revision at or
    before (now - age_days).

    Args:
        title: Wikipedia article title (e.g., 'David_Szalay')
        age_days: Age of the article revision in days (0 for current)

    Returns:
        Dictionary produced by extract_revision_info, containing:
        - 'revid': Revision id of the article revision
        - 'timestamp': Timestamp of the article revision
        - 'revnum': Index of the revision within the response (0 here)
        All three values are None when no revision is found.
    """
    # Local import: the module header only imports datetime and timedelta
    from datetime import timezone

    # Get the target date as an aware UTC datetime
    # (datetime.utcnow() is deprecated and returns a naive value)
    target_date = datetime.now(timezone.utc) - timedelta(days=age_days)
    # Get the revision closest to the target date
    params = {
        "action": "query",
        "titles": title,
        "prop": "revisions",
        "rvlimit": 1,
        "rvdir": "older",
        "rvstart": target_date.strftime("%Y-%m-%dT%H:%M:%SZ"),
        "rvprop": "ids|timestamp",
        "format": "json",
    }
    # Run GET request
    json_data = run_get_request(params)
    # Return revision info
    return extract_revision_info(json_data)
def get_previous_revisions(title: str, revisions: int = 0) -> dict:
    """
    Fetch the current revision of a Wikipedia article plus a number of revisions before it.

    Args:
        title: Wikipedia article title (e.g., 'David_Szalay')
        revisions: How many revisions before the current one to request
            (0 for current only, must be between 0 and 499)

    Returns:
        The parsed JSON response from the API, unmodified. Pass it to
        extract_revision_info to pick out the 'revid' and 'timestamp' of
        one particular revision.

    Note:
        In the Wikipedia API, rvlimit is how many revisions will be returned and must be between 1 and 500
        rvlimit = 1 returns a single revision: the current one
        rvlimit = 101 returns the 100 most recent revisions and the current one
        This is why we use rvlimit = revisions + 1
    """
    # Request the newest (revisions + 1) revisions, ids and timestamps only
    params = {
        "action": "query",
        "prop": "revisions",
        "titles": title,
        "rvlimit": revisions + 1,
        "rvdir": "older",
        "rvprop": "ids|timestamp",
        "format": "json",
    }
    # Run GET request and return info for all revisions
    return run_get_request(params)
def get_wikipedia_introduction(revid: int) -> Optional[str]:
    """
    Retrieve the introduction of a Wikipedia article.

    Args:
        revid: Revision id of the article

    Returns:
        Text of the introduction (paragraphs separated by blank lines), or
        None if revid is missing or the revision content is unavailable.

    Example:
        # Get intro from current article revision
        revision_info = get_revision_from_age("David_Szalay")
        get_wikipedia_introduction(revision_info["revid"])
    """
    # Return None for missing revid
    if not revid:
        return None
    # Get the content of this specific revision
    params = {"action": "parse", "oldid": revid, "prop": "text", "format": "json"}
    json_data = run_get_request(params)
    # Sometimes a revision is deleted and can't be viewed
    # E.g. revid = '1276494621' for Turin
    try:
        html_content = json_data["parse"]["text"]["*"]
    except (KeyError, TypeError):
        return None
    return _intro_text_from_html(html_content)


def _intro_text_from_html(html_content: str) -> str:
    """Return the plain text of the <p> elements that precede the first <h2>."""
    from html.parser import HTMLParser

    # Tags that can open a region to ignore, and the class-name fragments
    # that mark a div/table as non-prose (infoboxes, navboxes, etc.)
    skip_tags = ("style", "script", "table", "div")
    skip_classes = ("infobox", "navbox", "metadata", "toc")

    class IntroParser(HTMLParser):
        def __init__(self):
            super().__init__()
            self.text = []
            self.in_p = False
            # Nesting depth inside a skipped element. A counter (rather than
            # a flag) keeps a nested </div> from ending the skip too early.
            self.skip_depth = 0

        def handle_starttag(self, tag, attrs):
            if tag == "p":
                self.in_p = True
            if tag not in skip_tags:
                return
            if self.skip_depth:
                # Already skipping: track nesting so the matching end tag
                # of the outer element is the one that ends the skip
                self.skip_depth += 1
                return
            # A value-less class attribute is reported as None
            css_class = dict(attrs).get("class") or ""
            if tag in ("style", "script") or any(
                x in css_class for x in skip_classes
            ):
                self.skip_depth = 1

        def handle_endtag(self, tag):
            if tag == "p":
                if self.in_p and self.text and not self.text[-1].endswith("\n\n"):
                    self.text.append("\n\n")
                self.in_p = False
            elif tag in skip_tags and self.skip_depth:
                self.skip_depth -= 1

        def handle_data(self, data):
            # *Don't* clean up whitespace here - it makes run-on words
            if self.in_p and not self.skip_depth and data:
                self.text.append(data)

    # Extract introduction (text before first section heading):
    # drop everything from the first <h2> tag onwards
    intro_html = re.split(r"<h2", html_content, maxsplit=1)[0]
    parser = IntroParser()
    parser.feed(intro_html)
    # Join and clean up the text
    introduction = "".join(parser.text).strip()
    # Remove multiple newlines
    introduction = re.sub(r"\n{3,}", "\n\n", introduction)
    # Remove empty paragraphs
    paragraphs = [p.strip() for p in introduction.split("\n\n") if p.strip()]
    return "\n\n".join(paragraphs)
def get_revisions_behind(title: str, revid: int, max_batches: int = 2) -> int:
    """
    Get the number of revisions a given revid is behind the current revision of the page.

    The page history is read newest-first in batches of 500 revisions.

    Args:
        title: Wikipedia article title (e.g., 'David_Szalay')
        revid: Revision ID to locate in the history of that page
        max_batches: Maximum number of 500-revision batches to search
            (default 2, i.e. up to 1000 revisions behind)

    Returns:
        Integer representing the number of revisions back (0 if it's the
        current revision). If the revid is not found within the searched
        batches and older history remains, returns the number of revisions
        checked as a negative number (e.g. -1000).

    Raises:
        ValueError: If the page does not exist, the whole history was searched
            without finding the revid, or the request/response handling failed.

    Example:
        # Get how many revisions behind a specific revid is
        revisions_behind = get_revisions_behind("David_Szalay", 123456789)
    """
    # Search through revisions going back from current,
    # paginating through results if needed
    revision_count = 0
    continue_token = None
    for _ in range(max_batches):
        params = {
            "action": "query",
            "titles": title,
            "prop": "revisions",
            "rvlimit": 500,  # API limit per request
            "rvdir": "older",
            "rvprop": "ids",
            "format": "json",
        }
        if continue_token:
            params["rvcontinue"] = continue_token
        try:
            json_data = run_get_request(params)
            pages = json_data["query"]["pages"]
            page_id = list(pages.keys())[0]
            if page_id == "-1":
                raise ValueError(f"Page not found for revid {revid}")
            revisions = pages[page_id]["revisions"]
            # Find the index of the given revid in the current batch of revisions
            for offset, revision in enumerate(revisions):
                if revision["revid"] == revid:
                    return revision_count + offset
            # Update the count of revisions we've checked
            revision_count += len(revisions)
            # Check if there are more revisions to search
            continue_token = json_data.get("continue", {}).get("rvcontinue")
            if not continue_token:
                # Reached the end of revisions but didn't find the revid
                raise ValueError(
                    f"Revid {revid} not found in the revision history of the page. "
                    f"It may be from a different page or may have been deleted."
                )
        except ValueError:
            # Re-raise ValueError exceptions unchanged
            raise
        except Exception as e:
            # Normalise every other failure to ValueError, keeping the cause
            raise ValueError(f"Error searching for revid {revid}: {e}") from e
    # Searched every allowed batch without a match: report how many
    # revisions were checked, as a negative number
    return -revision_count
def get_random_wikipedia_title() -> Optional[str]:
    """
    Get the title of one random Wikipedia page from namespace 0.

    Returns:
        The page title, or None if the request failed with a
        requests.RequestException (the error is printed). Other errors,
        such as an unexpected response structure, are not caught here.
    """
    params = {
        "action": "query",
        "list": "random",
        "rnnamespace": 0,
        "rnlimit": 1,
        "format": "json",
    }
    try:
        json_data = run_get_request(params)
    except requests.RequestException as e:
        print(f"Error fetching random Wikipedia title: {e}")
        return None
    # Extract the title
    return json_data["query"]["random"][0]["title"]