import re
from datetime import datetime, timedelta, timezone
from html.parser import HTMLParser
from typing import Optional

import requests
def run_get_request(params: dict):
"""
Utility function to run GET request against Wikipedia API
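    Example:
        # Sketch: fetch site metadata; any valid API params dict works here
        run_get_request({"action": "query", "meta": "siteinfo", "format": "json"})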
"""
base_url = "https://en.wikipedia.org/w/api.php"
# We need to supply headers for the request to work
headers = {
"User-Agent": f"NoteworthyDifferences/1.0 (j3ffdick@gmail.com) requests/{requests.__version__}"
}
response = requests.get(base_url, params=params, headers=headers)
# Handle HTTP errors
response.raise_for_status()
    try:
        json_data = response.json()
    except ValueError as e:
        # response.json() raises ValueError (JSONDecodeError) on non-JSON bodies
        raise ValueError(f"Unable to parse response as JSON: {response.text[:200]}") from e
    return json_data
def extract_revision_info(json_data, revnum=0, limit_revnum=True):
"""
Utility function to extract page revision info from JSON data returned from API call
    Args:
        json_data: JSON returned by the API (e.g., from get_previous_revisions)
        revnum: How many revisions before the current one (0 for current)
        limit_revnum: If True, clamp revnum to the earliest revision available in json_data
    Examples:
        title = 'David_Szalay'
        json_data = get_previous_revisions(title, revisions=100)
        extract_revision_info(json_data)  # Current revision
        extract_revision_info(json_data, 10)  # 10th revision before current
        extract_revision_info(json_data, 100)  # 100th revision before current
"""
# Extract page and revision info
pages = json_data["query"]["pages"]
page_id = list(pages.keys())[0]
try:
if limit_revnum:
# Limit revnum to earliest available revision before current
            revnum = min(revnum, len(pages[page_id]["revisions"]) - 1)
# Get the specified revision
revision = pages[page_id]["revisions"][revnum]
# Remove the parentid key because we don't use it
_ = revision.pop("parentid", None)
# Add the actual revision number
revision["revnum"] = revnum
return revision
    except (KeyError, IndexError):
        # Page or revision not found; return placeholder values
        return {"revid": None, "timestamp": None, "revnum": None}
def get_revision_from_age(title: str, age_days: int = 0) -> dict:
"""
Get the revision info of a Wikipedia article closest to the age in days.
Args:
title: Wikipedia article title (e.g., 'David_Szalay')
age_days: Age of the article revision in days (0 for current)
    Returns:
        Dictionary containing:
        - 'revid': Revision id of the article revision
        - 'timestamp': Timestamp of the article revision
        - 'revnum': Revision number added by extract_revision_info (0 here)
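    Example:
        # Sketch: revision closest to 30 days old (assumes the article exists)
        get_revision_from_age("David_Szalay", age_days=30)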
"""
    # Get the target date (timezone-aware; datetime.utcnow() is deprecated)
    target_date = datetime.now(timezone.utc) - timedelta(days=age_days)
# Get the revision closest to the target date
params = {
"action": "query",
"titles": title,
"prop": "revisions",
"rvlimit": 1,
"rvdir": "older",
"rvstart": target_date.isoformat() + "Z",
"rvprop": "ids|timestamp",
"format": "json",
}
# Run GET request
json_data = run_get_request(params)
# Return revision info
return extract_revision_info(json_data)
def get_previous_revisions(title: str, revisions: int = 0) -> dict:
    """
    Get revision info for a Wikipedia article going back a given number of revisions from the current one.
    Args:
        title: Wikipedia article title (e.g., 'David_Szalay')
        revisions: How many revisions before the current one to include (0 for current only, must be between 0 and 499)
    Returns:
        Raw API JSON with ids and timestamps for the current revision and the `revisions`
        revisions before it; pass the result to extract_revision_info to pull out a single
        revision's 'revid' and 'timestamp'
    Note:
        In the Wikipedia API, rvlimit is the number of revisions returned and must be between 1 and 500.
        rvlimit = 1 returns a single revision: the current one.
        rvlimit = 101 returns the current revision and the 100 revisions before it.
        This is why we use rvlimit = revisions + 1.
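    Example:
        # Sketch: current revision plus the two before it (rvlimit = 3)
        json_data = get_previous_revisions("David_Szalay", revisions=2)
        extract_revision_info(json_data, 2)  # revision two before current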
"""
# Get the revision closest to the target date
params = {
"action": "query",
"prop": "revisions",
"titles": title,
"rvlimit": revisions + 1,
"rvdir": "older",
"rvprop": "ids|timestamp",
"format": "json",
}
# Run GET request
json_data = run_get_request(params)
# Return info for all revisions
return json_data
def get_wikipedia_introduction(revid: int) -> Optional[str]:
"""
Retrieve the introduction of a Wikipedia article.
Args:
revid: Revision id of the article
    Returns:
        Text of the introduction, or None if the revision is missing or can't be parsed
Example:
# Get intro from current article revision
revision_info = get_revision_from_age("David_Szalay")
get_wikipedia_introduction(revision_info["revid"])
"""
# Return None for missing revid
if not revid:
return None
# Get the content of this specific revision
params = {"action": "parse", "oldid": revid, "prop": "text", "format": "json"}
json_data = run_get_request(params)
# Sometimes a revision is deleted and can't be viewed
# E.g. revid = '1276494621' for Turin
try:
html_content = json_data["parse"]["text"]["*"]
    except KeyError:
return None
# Extract introduction (text before first section heading)
# Remove everything from the first <h2> tag onwards
intro_html = re.split(r"<h2", html_content, maxsplit=1)[0]
# Extract text from paragraphs, excluding certain elements
class IntroParser(HTMLParser):
def __init__(self):
super().__init__()
self.text = []
self.in_p = False
self.skip = False
def handle_starttag(self, tag, attrs):
if tag == "p":
self.in_p = True
# Skip certain elements
if tag in ["style", "script", "table", "div"]:
attrs_dict = dict(attrs)
# Skip infoboxes, navboxes, etc.
if "class" in attrs_dict:
if any(
x in attrs_dict["class"]
for x in ["infobox", "navbox", "metadata", "toc"]
):
self.skip = True
if tag in ["style", "script"]:
self.skip = True
def handle_endtag(self, tag):
if tag == "p":
if self.in_p and self.text and not self.text[-1].endswith("\n\n"):
self.text.append("\n\n")
self.in_p = False
if tag in ["style", "script", "table", "div"]:
self.skip = False
def handle_data(self, data):
if self.in_p and not self.skip:
# *Don't* clean up whitespace here - it makes run-on words
# text = " ".join(data.split())
text = data
if text:
self.text.append(text)
parser = IntroParser()
parser.feed(intro_html)
# Join and clean up the text
introduction = "".join(parser.text).strip()
# Remove multiple newlines
introduction = re.sub(r"\n{3,}", "\n\n", introduction)
# Remove empty paragraphs
paragraphs = [p.strip() for p in introduction.split("\n\n") if p.strip()]
introduction = "\n\n".join(paragraphs)
return introduction
def get_revisions_behind(title: str, revid: int) -> int:
"""
Get the number of revisions a given revid is behind the current revision of the page.
    Args:
        title: Wikipedia article title (e.g., 'David_Szalay')
        revid: Revision ID of the page
    Returns:
        Number of revisions back (0 if revid is the current revision); if the revid
        isn't found in the first 1000 revisions of a longer history, the number of
        revisions searched is returned as a negative count
    Example:
        # Get how many revisions behind a specific revid is
        revisions_behind = get_revisions_behind("David_Szalay", 123456789)
"""
# Search through revisions going back from current
# We'll paginate through results if needed
revision_count = 0
continue_token = None
# Run the loop twice to get up to 1000 revisions behind
    for _ in range(2):
params = {
"action": "query",
"titles": title,
"prop": "revisions",
"rvlimit": 500, # API limit per request
"rvdir": "older",
"rvprop": "ids",
"format": "json",
}
if continue_token:
params["rvcontinue"] = continue_token
try:
json_data = run_get_request(params)
pages = json_data["query"]["pages"]
page_id = list(pages.keys())[0]
if page_id == "-1":
raise ValueError(f"Page not found for revid {revid}")
revisions = pages[page_id]["revisions"]
# Find the index of the given revid in the current batch of revisions
for i, revision in enumerate(revisions):
if revision["revid"] == revid:
return revision_count + i
# Update the count of revisions we've checked
revision_count += len(revisions)
# Check if there are more revisions to search
continue_token = json_data.get("continue", {}).get("rvcontinue")
if not continue_token:
# Reached the end of revisions but didn't find the revid
raise ValueError(
f"Revid {revid} not found in the revision history of the page. "
f"It may be from a different page or may have been deleted."
)
except ValueError:
# Re-raise ValueError exceptions
raise
except Exception as e:
raise ValueError(f"Error searching for revid {revid}: {e}")
    # If both batches (up to 1000 revisions) were searched without finding the revid,
    # return the number of revisions searched as a negative count to signal this
    return -revision_count
def get_random_wikipedia_title() -> Optional[str]:
    """
    Get the title of a random Wikipedia article (main namespace), or None on error.
    """
params = {
"action": "query",
"list": "random",
"rnnamespace": 0,
"rnlimit": 1,
"format": "json",
}
try:
json_data = run_get_request(params)
# Extract the title
title = json_data["query"]["random"][0]["title"]
return title
    except (requests.RequestException, ValueError, KeyError) as e:
        print(f"Error fetching random Wikipedia title: {e}")
        return None
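

if __name__ == "__main__":
    # Minimal usage sketch (assumes network access to en.wikipedia.org;
    # 'David_Szalay' is the example title already used in the docstrings)
    title = get_random_wikipedia_title() or "David_Szalay"
    current = get_revision_from_age(title)
    print(f"{title}: current revid {current['revid']} at {current['timestamp']}")
    # Introduction text for the current revision (may be None for deleted revisions)
    intro = get_wikipedia_introduction(current["revid"])
    print(intro[:300] if intro else "No introduction available")
    # The current revision should be 0 revisions behind itself
    print(get_revisions_behind(title, current["revid"]))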