# wiki_data_fetcher.py — part of the noteworthy-differences project (author: jedick)
# Commit 3c02ce2: Get earliest available revision for a specified revision number
import re
from datetime import datetime, timedelta
from datetime import timezone
from typing import Dict, Optional

import requests
def run_get_request(params: dict) -> dict:
    """
    Run a GET request against the English Wikipedia API and return parsed JSON.

    Args:
        params: Query parameters forwarded to the MediaWiki API endpoint

    Returns:
        The decoded JSON response as a dictionary

    Raises:
        requests.HTTPError: If the server responds with an error status
        ValueError: If the response body is not valid JSON
    """
    base_url = "https://en.wikipedia.org/w/api.php"
    # We need to supply headers for the request to work
    headers = {
        "User-Agent": f"NoteworthyDifferences/1.0 (j3ffdick@gmail.com) requests/{requests.__version__}"
    }
    response = requests.get(base_url, params=params, headers=headers)
    # Handle HTTP errors
    response.raise_for_status()
    try:
        json_data = response.json()
    except ValueError as err:
        # requests raises a ValueError subclass on malformed JSON;
        # chain the original exception so the root cause is visible
        raise ValueError(f"Unable to parse response: {response}") from err
    return json_data
def extract_revision_info(json_data, revnum=0, limit_revnum=True):
    """
    Extract one page revision's info from JSON data returned by an API call.

    Args:
        json_data: Parsed JSON from a query with prop=revisions
        revnum: Which revision before the current one (0 for current)
        limit_revnum: If True, clamp revnum to the earliest available revision

    Returns:
        Dictionary with 'revid', 'timestamp', and 'revnum' keys; all values
        are None when the page or revision cannot be found.

    Examples:
        title = 'David_Szalay'
        json_data = get_previous_revisions(title, revisions=100)
        extract_revision_info(json_data)       # Current revision
        extract_revision_info(json_data, 10)   # 10th revision before current
        extract_revision_info(json_data, 100)  # 100th revision before current
    """
    # Extract page and revision info
    pages = json_data["query"]["pages"]
    page_id = list(pages.keys())[0]
    try:
        if limit_revnum:
            # Limit revnum to earliest available revision before current
            revnum = min(revnum, len(pages[page_id]["revisions"]) - 1)
        # Get the specified revision
        revision = pages[page_id]["revisions"][revnum]
        # Remove the parentid key because we don't use it
        revision.pop("parentid", None)
        # Record the revision number actually used (may differ after clamping)
        revision["revnum"] = revnum
        return revision
    except (KeyError, IndexError, TypeError):
        # Page or revision not found, return placeholder dict
        return {"revid": None, "timestamp": None, "revnum": None}
def get_revision_from_age(title: str, age_days: int = 0) -> Dict[str, str]:
    """
    Get the revision info of a Wikipedia article closest to the age in days.

    Args:
        title: Wikipedia article title (e.g., 'David_Szalay')
        age_days: Age of the article revision in days (0 for current)

    Returns:
        Dictionary containing:
            - 'revid': Revision id of the article revision
            - 'timestamp': Timestamp of the article revision
            - 'revnum': Always 0 (first revision in the single-item response)
    """
    # Get the target date; use an aware datetime because datetime.utcnow()
    # is deprecated (Python 3.12+) and returns a naive object
    target_date = datetime.now(timezone.utc) - timedelta(days=age_days)
    # Get the revision closest to the target date
    params = {
        "action": "query",
        "titles": title,
        "prop": "revisions",
        "rvlimit": 1,
        # Walk backwards in time starting at the target date
        "rvdir": "older",
        # MediaWiki expects ISO 8601 with a trailing 'Z' for UTC
        "rvstart": target_date.strftime("%Y-%m-%dT%H:%M:%SZ"),
        "rvprop": "ids|timestamp",
        "format": "json",
    }
    # Run GET request
    json_data = run_get_request(params)
    # Return revision info
    return extract_revision_info(json_data)
def get_previous_revisions(title: str, revisions: int = 0) -> dict:
    """
    Get raw API JSON covering a Wikipedia article's recent revisions,
    from the current one going backwards.

    Args:
        title: Wikipedia article title (e.g., 'David_Szalay')
        revisions: How many revisions before current to include
            (0 for current only; must be between 0 and 499)

    Returns:
        The raw JSON response from the API containing up to revisions + 1
        revision entries. Pass this to extract_revision_info() to pull out
        a single revision's 'revid' and 'timestamp'.

    Note:
        In the Wikipedia API, rvlimit is how many revisions will be returned and must be between 1 and 500
        rvlimit = 1 returns a single revision: the current one
        rvlimit = 101 returns the 100 most recent revisions and the current one
        This is why we use rvlimit = revisions + 1
    """
    # Request the current revision plus the requested number of older ones
    params = {
        "action": "query",
        "prop": "revisions",
        "titles": title,
        "rvlimit": revisions + 1,
        "rvdir": "older",
        "rvprop": "ids|timestamp",
        "format": "json",
    }
    # Run GET request
    json_data = run_get_request(params)
    # Return the full JSON so callers can inspect any of the returned revisions
    return json_data
def get_wikipedia_introduction(revid: int) -> Optional[str]:
    """
    Retrieve the introduction of a Wikipedia article revision.

    Args:
        revid: Revision id of the article

    Returns:
        Text of the introduction (paragraphs separated by blank lines), or
        None when revid is missing or the revision content is unavailable

    Example:
        # Get intro from current article revision
        revision_info = get_revision_from_age("David_Szalay")
        get_wikipedia_introduction(revision_info["revid"])
    """
    # Return None for missing revid
    if not revid:
        return None
    # Get the content of this specific revision
    params = {"action": "parse", "oldid": revid, "prop": "text", "format": "json"}
    json_data = run_get_request(params)
    # Sometimes a revision is deleted and can't be viewed
    # E.g. revid = '1276494621' for Turin
    try:
        html_content = json_data["parse"]["text"]["*"]
    except (KeyError, TypeError):
        return None
    # Extract introduction (text before first section heading):
    # remove everything from the first <h2> tag onwards
    intro_html = re.split(r"<h2", html_content, maxsplit=1)[0]
    # Extract text from paragraphs, excluding certain elements
    from html.parser import HTMLParser

    class IntroParser(HTMLParser):
        """Collect text inside <p> tags, skipping infoboxes, navboxes, styles, and scripts."""

        def __init__(self):
            super().__init__()
            self.text = []    # accumulated text fragments
            self.in_p = False  # currently inside a <p> element
            self.skip = False  # currently inside an element we want to ignore

        def handle_starttag(self, tag, attrs):
            if tag == "p":
                self.in_p = True
            # Skip certain elements
            if tag in ["style", "script", "table", "div"]:
                attrs_dict = dict(attrs)
                # Skip infoboxes, navboxes, etc.
                if "class" in attrs_dict:
                    if any(
                        x in attrs_dict["class"]
                        for x in ["infobox", "navbox", "metadata", "toc"]
                    ):
                        self.skip = True
                # Style and script content is never article text
                if tag in ["style", "script"]:
                    self.skip = True

        def handle_endtag(self, tag):
            if tag == "p":
                # Separate paragraphs with a blank line
                if self.in_p and self.text and not self.text[-1].endswith("\n\n"):
                    self.text.append("\n\n")
                self.in_p = False
            if tag in ["style", "script", "table", "div"]:
                self.skip = False

        def handle_data(self, data):
            if self.in_p and not self.skip:
                # *Don't* clean up whitespace here - it makes run-on words
                if data:
                    self.text.append(data)

    parser = IntroParser()
    parser.feed(intro_html)
    # Join and clean up the text
    introduction = "".join(parser.text).strip()
    # Remove multiple newlines
    introduction = re.sub(r"\n{3,}", "\n\n", introduction)
    # Remove empty paragraphs
    paragraphs = [p.strip() for p in introduction.split("\n\n") if p.strip()]
    introduction = "\n\n".join(paragraphs)
    return introduction
def get_revisions_behind(title: str, revid: int) -> int:
    """
    Get the number of revisions a given revid is behind the current revision of the page.

    Args:
        title: Wikipedia article title (e.g., 'David_Szalay')
        revid: Revision ID of the page

    Returns:
        Integer number of revisions back (0 if it's the current revision).
        If the revid is not found within the first 1000 revisions while more
        remain, the count of revisions searched is returned as a negative number.

    Raises:
        ValueError: If the page is not found, the revid is absent from the
            complete revision history, or the API request fails

    Example:
        # Get how many revisions behind a specific revid is
        revisions_behind = get_revisions_behind("Turin", 123456789)
    """
    # Search through revisions going back from current,
    # paginating through results if needed
    revision_count = 0
    continue_token = None
    # Run the loop twice to get up to 1000 revisions behind
    for _ in range(2):
        params = {
            "action": "query",
            "titles": title,
            "prop": "revisions",
            "rvlimit": 500,  # API limit per request
            "rvdir": "older",
            "rvprop": "ids",
            "format": "json",
        }
        if continue_token:
            params["rvcontinue"] = continue_token
        try:
            json_data = run_get_request(params)
            pages = json_data["query"]["pages"]
            page_id = list(pages.keys())[0]
            # The API reports missing pages under page id "-1"
            if page_id == "-1":
                raise ValueError(f"Page not found for revid {revid}")
            revisions = pages[page_id]["revisions"]
            # Find the index of the given revid in the current batch of revisions
            # (named 'offset' so it doesn't shadow the outer loop variable)
            for offset, revision in enumerate(revisions):
                if revision["revid"] == revid:
                    return revision_count + offset
            # Update the count of revisions we've checked
            revision_count += len(revisions)
            # Check if there are more revisions to search
            continue_token = json_data.get("continue", {}).get("rvcontinue")
            if not continue_token:
                # Reached the end of revisions but didn't find the revid
                raise ValueError(
                    f"Revid {revid} not found in the revision history of the page. "
                    f"It may be from a different page or may have been deleted."
                )
        except ValueError:
            # Re-raise ValueError exceptions
            raise
        except Exception as e:
            # Wrap unexpected failures, keeping the original exception chained
            raise ValueError(f"Error searching for revid {revid}: {e}") from e
    # If we looped without returning, report the searched count as a negative number
    return -revision_count
def get_random_wikipedia_title() -> Optional[str]:
    """
    Get the title of a random Wikipedia article.

    Returns:
        The article title, or None if the request fails
    """
    params = {
        "action": "query",
        "list": "random",
        "rnnamespace": 0,  # main/article namespace only
        "rnlimit": 1,
        "format": "json",
    }
    try:
        json_data = run_get_request(params)
        # Extract the title
        title = json_data["query"]["random"][0]["title"]
        return title
    except requests.RequestException as e:
        print(f"Error fetching random Wikipedia title: {e}")
        return None