jedick committed on
Commit
3c02ce2
·
1 Parent(s): 9ef70ce

Get earliest available revision for a specified revision number

Browse files
Files changed (3) hide show
  1. app.py +9 -8
  2. collect_data.py +2 -2
  3. wiki_data_fetcher.py +13 -18
app.py CHANGED
@@ -41,7 +41,7 @@ def fetch_current_revision(title: str):
41
  try:
42
  # Get current revision (revision 0)
43
  json_data = get_previous_revisions(title, revisions=0)
44
- revision_info = extract_revision_info(json_data, revision=0)
45
 
46
  if not revision_info.get("revid"):
47
  error_msg = f"Error: Could not find Wikipedia page '{title}'. Please check the title."
@@ -76,8 +76,8 @@ def fetch_previous_revision(title: str, unit: str, number: int, new_revision: st
76
 
77
  Args:
78
  title: Wikipedia article title
79
- unit: "days" or "revisions"
80
- number: Number of days or revisions behind
81
 
82
  Returns:
83
  Tuple of (introduction, timestamp)
@@ -92,7 +92,7 @@ def fetch_previous_revision(title: str, unit: str, number: int, new_revision: st
92
  # Get previous revision based on unit
93
  if unit == "revisions":
94
  json_data = get_previous_revisions(title, revisions=number)
95
- revision_info = extract_revision_info(json_data, revision=number)
96
  else: # unit == "days"
97
  revision_info = get_revision_from_age(title, age_days=number)
98
 
@@ -112,7 +112,7 @@ def fetch_previous_revision(title: str, unit: str, number: int, new_revision: st
112
 
113
  # Get revisions_behind
114
  if unit == "revisions":
115
- revisions_behind = number
116
  else:
117
  revisions_behind = get_revisions_behind(title, revid)
118
  # For a negative number, replace the negative sign with ">"
@@ -283,7 +283,7 @@ with gr.Blocks(title="Noteworthy Differences") as demo:
283
  with gr.Row():
284
  gr.Markdown(
285
  """
286
- Compare current and old revisions of a Wikipedia article - you choose the number of days or revisions behind.<br>
287
  Two classifier models (with heuristic and few-shot prompts) and a judge predict the noteworthiness of the differences.<br>
288
  The judge was aligned with human preferences as described in the
289
  [GitHub repository](https://github.com/jedick/noteworthy-differences).
@@ -294,9 +294,9 @@ with gr.Blocks(title="Noteworthy Differences") as demo:
294
  title_input = gr.Textbox(
295
  label="Wikipedia Page Title", placeholder="e.g., Albert Einstein", value=""
296
  )
297
- number_input = gr.Number(label="Number", value=100, minimum=0, precision=0)
298
  unit_dropdown = gr.Dropdown(
299
- choices=["days", "revisions"], value="days", label="Unit"
300
  )
301
  judge_mode_dropdown = gr.Dropdown(
302
  choices=["unaligned", "aligned-fewshot", "aligned-heuristic"],
@@ -316,6 +316,7 @@ with gr.Blocks(title="Noteworthy Differences") as demo:
316
  """#### Query Instructions
317
  - Page title is case sensitive; use underscores or spaces
318
  - Specify any number of days or up to 499 revisions behind
 
319
  - Only article introductions are downloaded
320
  """
321
  )
 
41
  try:
42
  # Get current revision (revision 0)
43
  json_data = get_previous_revisions(title, revisions=0)
44
+ revision_info = extract_revision_info(json_data, revnum=0)
45
 
46
  if not revision_info.get("revid"):
47
  error_msg = f"Error: Could not find Wikipedia page '{title}'. Please check the title."
 
76
 
77
  Args:
78
  title: Wikipedia article title
79
+ unit: "revisions" or "days"
80
+ number: Number of revisions or days behind
81
 
82
  Returns:
83
  Tuple of (introduction, timestamp)
 
92
  # Get previous revision based on unit
93
  if unit == "revisions":
94
  json_data = get_previous_revisions(title, revisions=number)
95
+ revision_info = extract_revision_info(json_data, revnum=number)
96
  else: # unit == "days"
97
  revision_info = get_revision_from_age(title, age_days=number)
98
 
 
112
 
113
  # Get revisions_behind
114
  if unit == "revisions":
115
+ revisions_behind = revision_info["revnum"]
116
  else:
117
  revisions_behind = get_revisions_behind(title, revid)
118
  # For a negative number, replace the negative sign with ">"
 
283
  with gr.Row():
284
  gr.Markdown(
285
  """
286
+ Compare current and old revisions of a Wikipedia article - you choose the number of revisions or days behind.<br>
287
  Two classifier models (with heuristic and few-shot prompts) and a judge predict the noteworthiness of the differences.<br>
288
  The judge was aligned with human preferences as described in the
289
  [GitHub repository](https://github.com/jedick/noteworthy-differences).
 
294
  title_input = gr.Textbox(
295
  label="Wikipedia Page Title", placeholder="e.g., Albert Einstein", value=""
296
  )
297
+ number_input = gr.Number(label="Number", value=50, minimum=0, precision=0)
298
  unit_dropdown = gr.Dropdown(
299
+ choices=["revisions", "days"], value="revisions", label="Unit"
300
  )
301
  judge_mode_dropdown = gr.Dropdown(
302
  choices=["unaligned", "aligned-fewshot", "aligned-heuristic"],
 
316
  """#### Query Instructions
317
  - Page title is case sensitive; use underscores or spaces
318
  - Specify any number of days or up to 499 revisions behind
319
+ - The closest available revision is retrieved
320
  - Only article introductions are downloaded
321
  """
322
  )
collect_data.py CHANGED
@@ -31,12 +31,12 @@ if __name__ == "__main__":
31
  ts_0.append(info_0["timestamp"])
32
  intro_0.append(get_wikipedia_introduction(info_0["revid"]))
33
  # Append data for 10th revision before current
34
- info_10 = extract_revision_info(json_data, 10)
35
  revid_10.append(info_10["revid"])
36
  ts_10.append(info_10["timestamp"])
37
  intro_10.append(get_wikipedia_introduction(info_10["revid"]))
38
  # Append data for 100th revision before current
39
- info_100 = extract_revision_info(json_data, 100)
40
  revid_100.append(info_100["revid"])
41
  ts_100.append(info_100["timestamp"])
42
  intro_100.append(get_wikipedia_introduction(info_100["revid"]))
 
31
  ts_0.append(info_0["timestamp"])
32
  intro_0.append(get_wikipedia_introduction(info_0["revid"]))
33
  # Append data for 10th revision before current
34
+ info_10 = extract_revision_info(json_data, 10, limit_revnum=False)
35
  revid_10.append(info_10["revid"])
36
  ts_10.append(info_10["timestamp"])
37
  intro_10.append(get_wikipedia_introduction(info_10["revid"]))
38
  # Append data for 100th revision before current
39
+ info_100 = extract_revision_info(json_data, 100, limit_revnum=False)
40
  revid_100.append(info_100["revid"])
41
  ts_100.append(info_100["timestamp"])
42
  intro_100.append(get_wikipedia_introduction(info_100["revid"]))
wiki_data_fetcher.py CHANGED
@@ -27,12 +27,12 @@ def run_get_request(params: dict):
27
  return json_data
28
 
29
 
30
- def extract_revision_info(json_data, revision=0):
31
  """
32
  Utility function to extract page revision info from JSON data returned from API call
33
 
34
  Args:
35
- revision: revision before current
36
 
37
  Examples:
38
  title = 'David_Szalay'
@@ -45,25 +45,20 @@ def extract_revision_info(json_data, revision=0):
45
  pages = json_data["query"]["pages"]
46
  page_id = list(pages.keys())[0]
47
 
48
- if page_id == "-1":
49
- # Page not found, return empty dict
50
- return {"revid": None, "timestamp": None}
51
-
52
  try:
 
 
 
53
  # Get the specified revision
54
- revision = pages[page_id]["revisions"][revision]
55
- revid = revision["revid"]
56
- timestamp = revision["timestamp"]
 
 
 
57
  except:
58
- # Revision not found, return empty dict
59
- return {"revid": None, "timestamp": None}
60
-
61
- # NOTUSED: Create permanent URL
62
- # permanent_url = f"https://en.wikipedia.org/w/index.php?title={title}&oldid={revid}"
63
-
64
- # Remove the parentid key because we don't use it
65
- _ = revision.pop("parentid", None)
66
- return revision
67
 
68
 
69
  def get_revision_from_age(title: str, age_days: int = 0) -> Dict[str, str]:
 
27
  return json_data
28
 
29
 
30
+ def extract_revision_info(json_data, revnum=0, limit_revnum=True):
31
  """
32
  Utility function to extract page revision info from JSON data returned from API call
33
 
34
  Args:
35
+ revnum: revision before current
36
 
37
  Examples:
38
  title = 'David_Szalay'
 
45
  pages = json_data["query"]["pages"]
46
  page_id = list(pages.keys())[0]
47
 
 
 
 
 
48
  try:
49
+ if limit_revnum:
50
+ # Limit revnum to earliest available revision before current
51
+ revnum = min([revnum, len(pages[page_id]["revisions"]) - 1])
52
  # Get the specified revision
53
+ revision = pages[page_id]["revisions"][revnum]
54
+ # Remove the parentid key because we don't use it
55
+ _ = revision.pop("parentid", None)
56
+ # Add the actual revision number
57
+ revision["revnum"] = revnum
58
+ return revision
59
  except:
60
+ # Page or revision not found, return empty dict
61
+ return {"revid": None, "timestamp": None, "revnum": None}
 
 
 
 
 
 
 
62
 
63
 
64
  def get_revision_from_age(title: str, age_days: int = 0) -> Dict[str, str]: