Spaces:
Running
Running
jedick
committed on
Commit
·
3c02ce2
1
Parent(s):
9ef70ce
Get earliest available revision for a specified revision number
Browse files
- app.py +9 -8
- collect_data.py +2 -2
- wiki_data_fetcher.py +13 -18
app.py
CHANGED
|
@@ -41,7 +41,7 @@ def fetch_current_revision(title: str):
|
|
| 41 |
try:
|
| 42 |
# Get current revision (revision 0)
|
| 43 |
json_data = get_previous_revisions(title, revisions=0)
|
| 44 |
-
revision_info = extract_revision_info(json_data,
|
| 45 |
|
| 46 |
if not revision_info.get("revid"):
|
| 47 |
error_msg = f"Error: Could not find Wikipedia page '{title}'. Please check the title."
|
|
@@ -76,8 +76,8 @@ def fetch_previous_revision(title: str, unit: str, number: int, new_revision: st
|
|
| 76 |
|
| 77 |
Args:
|
| 78 |
title: Wikipedia article title
|
| 79 |
-
unit: "
|
| 80 |
-
number: Number of
|
| 81 |
|
| 82 |
Returns:
|
| 83 |
Tuple of (introduction, timestamp)
|
|
@@ -92,7 +92,7 @@ def fetch_previous_revision(title: str, unit: str, number: int, new_revision: st
|
|
| 92 |
# Get previous revision based on unit
|
| 93 |
if unit == "revisions":
|
| 94 |
json_data = get_previous_revisions(title, revisions=number)
|
| 95 |
-
revision_info = extract_revision_info(json_data,
|
| 96 |
else: # unit == "days"
|
| 97 |
revision_info = get_revision_from_age(title, age_days=number)
|
| 98 |
|
|
@@ -112,7 +112,7 @@ def fetch_previous_revision(title: str, unit: str, number: int, new_revision: st
|
|
| 112 |
|
| 113 |
# Get revisions_behind
|
| 114 |
if unit == "revisions":
|
| 115 |
-
revisions_behind =
|
| 116 |
else:
|
| 117 |
revisions_behind = get_revisions_behind(title, revid)
|
| 118 |
# For a negative number, replace the negative sign with ">"
|
|
@@ -283,7 +283,7 @@ with gr.Blocks(title="Noteworthy Differences") as demo:
|
|
| 283 |
with gr.Row():
|
| 284 |
gr.Markdown(
|
| 285 |
"""
|
| 286 |
-
Compare current and old revisions of a Wikipedia article - you choose the number of
|
| 287 |
Two classifier models (with heuristic and few-shot prompts) and a judge predict the noteworthiness of the differences.<br>
|
| 288 |
The judge was aligned with human preferences as described in the
|
| 289 |
[GitHub repository](https://github.com/jedick/noteworthy-differences).
|
|
@@ -294,9 +294,9 @@ with gr.Blocks(title="Noteworthy Differences") as demo:
|
|
| 294 |
title_input = gr.Textbox(
|
| 295 |
label="Wikipedia Page Title", placeholder="e.g., Albert Einstein", value=""
|
| 296 |
)
|
| 297 |
-
number_input = gr.Number(label="Number", value=
|
| 298 |
unit_dropdown = gr.Dropdown(
|
| 299 |
-
choices=["
|
| 300 |
)
|
| 301 |
judge_mode_dropdown = gr.Dropdown(
|
| 302 |
choices=["unaligned", "aligned-fewshot", "aligned-heuristic"],
|
|
@@ -316,6 +316,7 @@ with gr.Blocks(title="Noteworthy Differences") as demo:
|
|
| 316 |
"""#### Query Instructions
|
| 317 |
- Page title is case sensitive; use underscores or spaces
|
| 318 |
- Specify any number of days or up to 499 revisions behind
|
|
|
|
| 319 |
- Only article introductions are downloaded
|
| 320 |
"""
|
| 321 |
)
|
|
|
|
| 41 |
try:
|
| 42 |
# Get current revision (revision 0)
|
| 43 |
json_data = get_previous_revisions(title, revisions=0)
|
| 44 |
+
revision_info = extract_revision_info(json_data, revnum=0)
|
| 45 |
|
| 46 |
if not revision_info.get("revid"):
|
| 47 |
error_msg = f"Error: Could not find Wikipedia page '{title}'. Please check the title."
|
|
|
|
| 76 |
|
| 77 |
Args:
|
| 78 |
title: Wikipedia article title
|
| 79 |
+
unit: "revisions" or "days"
|
| 80 |
+
number: Number of revisions or days behind
|
| 81 |
|
| 82 |
Returns:
|
| 83 |
Tuple of (introduction, timestamp)
|
|
|
|
| 92 |
# Get previous revision based on unit
|
| 93 |
if unit == "revisions":
|
| 94 |
json_data = get_previous_revisions(title, revisions=number)
|
| 95 |
+
revision_info = extract_revision_info(json_data, revnum=number)
|
| 96 |
else: # unit == "days"
|
| 97 |
revision_info = get_revision_from_age(title, age_days=number)
|
| 98 |
|
|
|
|
| 112 |
|
| 113 |
# Get revisions_behind
|
| 114 |
if unit == "revisions":
|
| 115 |
+
revisions_behind = revision_info["revnum"]
|
| 116 |
else:
|
| 117 |
revisions_behind = get_revisions_behind(title, revid)
|
| 118 |
# For a negative number, replace the negative sign with ">"
|
|
|
|
| 283 |
with gr.Row():
|
| 284 |
gr.Markdown(
|
| 285 |
"""
|
| 286 |
+
Compare current and old revisions of a Wikipedia article - you choose the number of revisions or days behind.<br>
|
| 287 |
Two classifier models (with heuristic and few-shot prompts) and a judge predict the noteworthiness of the differences.<br>
|
| 288 |
The judge was aligned with human preferences as described in the
|
| 289 |
[GitHub repository](https://github.com/jedick/noteworthy-differences).
|
|
|
|
| 294 |
title_input = gr.Textbox(
|
| 295 |
label="Wikipedia Page Title", placeholder="e.g., Albert Einstein", value=""
|
| 296 |
)
|
| 297 |
+
number_input = gr.Number(label="Number", value=50, minimum=0, precision=0)
|
| 298 |
unit_dropdown = gr.Dropdown(
|
| 299 |
+
choices=["revisions", "days"], value="revisions", label="Unit"
|
| 300 |
)
|
| 301 |
judge_mode_dropdown = gr.Dropdown(
|
| 302 |
choices=["unaligned", "aligned-fewshot", "aligned-heuristic"],
|
|
|
|
| 316 |
"""#### Query Instructions
|
| 317 |
- Page title is case sensitive; use underscores or spaces
|
| 318 |
- Specify any number of days or up to 499 revisions behind
|
| 319 |
+
- The closest available revision is retrieved
|
| 320 |
- Only article introductions are downloaded
|
| 321 |
"""
|
| 322 |
)
|
collect_data.py
CHANGED
|
@@ -31,12 +31,12 @@ if __name__ == "__main__":
|
|
| 31 |
ts_0.append(info_0["timestamp"])
|
| 32 |
intro_0.append(get_wikipedia_introduction(info_0["revid"]))
|
| 33 |
# Append data for 10th revision before current
|
| 34 |
-
info_10 = extract_revision_info(json_data, 10)
|
| 35 |
revid_10.append(info_10["revid"])
|
| 36 |
ts_10.append(info_10["timestamp"])
|
| 37 |
intro_10.append(get_wikipedia_introduction(info_10["revid"]))
|
| 38 |
# Append data for 100th revision before current
|
| 39 |
-
info_100 = extract_revision_info(json_data, 100)
|
| 40 |
revid_100.append(info_100["revid"])
|
| 41 |
ts_100.append(info_100["timestamp"])
|
| 42 |
intro_100.append(get_wikipedia_introduction(info_100["revid"]))
|
|
|
|
| 31 |
ts_0.append(info_0["timestamp"])
|
| 32 |
intro_0.append(get_wikipedia_introduction(info_0["revid"]))
|
| 33 |
# Append data for 10th revision before current
|
| 34 |
+
info_10 = extract_revision_info(json_data, 10, limit_revnum=False)
|
| 35 |
revid_10.append(info_10["revid"])
|
| 36 |
ts_10.append(info_10["timestamp"])
|
| 37 |
intro_10.append(get_wikipedia_introduction(info_10["revid"]))
|
| 38 |
# Append data for 100th revision before current
|
| 39 |
+
info_100 = extract_revision_info(json_data, 100, limit_revnum=False)
|
| 40 |
revid_100.append(info_100["revid"])
|
| 41 |
ts_100.append(info_100["timestamp"])
|
| 42 |
intro_100.append(get_wikipedia_introduction(info_100["revid"]))
|
wiki_data_fetcher.py
CHANGED
|
@@ -27,12 +27,12 @@ def run_get_request(params: dict):
|
|
| 27 |
return json_data
|
| 28 |
|
| 29 |
|
| 30 |
-
def extract_revision_info(json_data, revision=0):
|
| 31 |
"""
|
| 32 |
Utility function to extract page revision info from JSON data returned from API call
|
| 33 |
|
| 34 |
Args:
|
| 35 |
-
|
| 36 |
|
| 37 |
Examples:
|
| 38 |
title = 'David_Szalay'
|
|
@@ -45,25 +45,20 @@ def extract_revision_info(json_data, revision=0):
|
|
| 45 |
pages = json_data["query"]["pages"]
|
| 46 |
page_id = list(pages.keys())[0]
|
| 47 |
|
| 48 |
-
if page_id == "-1":
|
| 49 |
-
# Page not found, return empty dict
|
| 50 |
-
return {"revid": None, "timestamp": None}
|
| 51 |
-
|
| 52 |
try:
|
|
|
|
|
|
|
|
|
|
| 53 |
# Get the specified revision
|
| 54 |
-
revision = pages[page_id]["revisions"][
|
| 55 |
-
|
| 56 |
-
|
|
|
|
|
|
|
|
|
|
| 57 |
except:
|
| 58 |
-
#
|
| 59 |
-
return {"revid": None, "timestamp": None}
|
| 60 |
-
|
| 61 |
-
# NOTUSED: Create permanent URL
|
| 62 |
-
# permanent_url = f"https://en.wikipedia.org/w/index.php?title={title}&oldid={revid}"
|
| 63 |
-
|
| 64 |
-
# Remove the parentid key because we don't use it
|
| 65 |
-
_ = revision.pop("parentid", None)
|
| 66 |
-
return revision
|
| 67 |
|
| 68 |
|
| 69 |
def get_revision_from_age(title: str, age_days: int = 0) -> Dict[str, str]:
|
|
|
|
| 27 |
return json_data
|
| 28 |
|
| 29 |
|
| 30 |
+
def extract_revision_info(json_data, revnum=0, limit_revnum=True):
|
| 31 |
"""
|
| 32 |
Utility function to extract page revision info from JSON data returned from API call
|
| 33 |
|
| 34 |
Args:
|
| 35 |
+
revnum: revision before current
|
| 36 |
|
| 37 |
Examples:
|
| 38 |
title = 'David_Szalay'
|
|
|
|
| 45 |
pages = json_data["query"]["pages"]
|
| 46 |
page_id = list(pages.keys())[0]
|
| 47 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 48 |
try:
|
| 49 |
+
if limit_revnum:
|
| 50 |
+
# Limit revnum to earliest available revision before current
|
| 51 |
+
revnum = min([revnum, len(pages[page_id]["revisions"]) - 1])
|
| 52 |
# Get the specified revision
|
| 53 |
+
revision = pages[page_id]["revisions"][revnum]
|
| 54 |
+
# Remove the parentid key because we don't use it
|
| 55 |
+
_ = revision.pop("parentid", None)
|
| 56 |
+
# Add the actual revision number
|
| 57 |
+
revision["revnum"] = revnum
|
| 58 |
+
return revision
|
| 59 |
except:
|
| 60 |
+
# Page or revision not found, return empty dict
|
| 61 |
+
return {"revid": None, "timestamp": None, "revnum": None}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 62 |
|
| 63 |
|
| 64 |
def get_revision_from_age(title: str, age_days: int = 0) -> Dict[str, str]:
|