Commit 48c27bb · Initial commit
jedick committed
1 Parent(s): fcf306b

Files changed:
- README.md +1 -1
- app.py +478 -0
- collect_data.py +68 -0
- create_examples.py +69 -0
- data/alignment_fewshot.txt +26 -0
- data/alignment_heuristic.txt +55 -0
- judge_disagreements.py +59 -0
- models.py +132 -0
- prompts.py +91 -0
- requirements.txt +8 -0
- retry_with_backoff.py +42 -0
- test_models.py +131 -0
- test_workflows.py +30 -0
- wiki_data_fetcher.py +339 -0
- workflows.py +23 -0
README.md
CHANGED
@@ -1,6 +1,6 @@
 ---
 title: Noteworthy Differences
-emoji:
+emoji: π
 colorFrom: red
 colorTo: pink
 sdk: gradio
app.py
ADDED
@@ -0,0 +1,478 @@
import gradio as gr
from wiki_data_fetcher import (
    get_previous_revisions,
    get_revision_from_age,
    get_wikipedia_introduction,
    extract_revision_info,
    get_revisions_behind,
    get_random_wikipedia_title,
)
from models import classifier, judge
import logfire
from dotenv import load_dotenv

# Load API keys
load_dotenv()
# Setup logging with Logfire
logfire.configure()

# If running a standalone Gradio app via `demo.launch()` within a script,
# Logfire's auto-instrumentation for FastAPI is often automatically handled
# if installed. If mounting within a separate FastAPI app, use:
# logfire.instrument_fastapi(app)


@logfire.instrument("Step 1: Fetch current revision")
def fetch_current_revision(title: str):
    """
    Fetch current revision of a Wikipedia article and return its introduction.

    Args:
        title: Wikipedia article title

    Returns:
        Tuple of (introduction, timestamp)
    """
    if not title or not title.strip():
        error_msg = "Please enter a Wikipedia page title."
        raise gr.Error(error_msg, print_exception=False)
        return None, None

    try:
        # Get current revision (revision 0)
        json_data = get_previous_revisions(title, revisions=0)
        revision_info = extract_revision_info(json_data, revision=0)

        if not revision_info.get("revid"):
            error_msg = f"Error: Could not find Wikipedia page '{title}'. Please check the title."
            raise gr.Error(error_msg, print_exception=False)
            return None, None

        revid = revision_info["revid"]
        timestamp = revision_info["timestamp"]

        # Get introduction
        introduction = get_wikipedia_introduction(revid)

        if introduction is None:
            introduction = f"Error: Could not retrieve introduction for current revision (revid: {revid})"

        # Format timestamp for display
        timestamp = f"**Timestamp:** {timestamp}" if timestamp else ""

        # Return introduction text and timestamp
        return introduction, timestamp

    except Exception as e:
        error_msg = f"Error occurred: {str(e)}"
        raise gr.Error(error_msg, print_exception=False)
        return None, None


@logfire.instrument("Step 2: Fetch previous revision")
def fetch_previous_revision(title: str, unit: str, number: int, new_revision: str):
    """
    Fetch previous revision of a Wikipedia article and return its introduction.

    Args:
        title: Wikipedia article title
        unit: "days" or "revisions"
        number: Number of days or revisions behind

    Returns:
        Tuple of (introduction, timestamp)
    """

    # If we get here with an empty new revision, then an error should have been raised
    # in fetch_current_revision, so just return empty values without raising another error
    if not new_revision:
        return None, None

    try:
        # Get previous revision based on unit
        if unit == "revisions":
            json_data = get_previous_revisions(title, revisions=number)
            revision_info = extract_revision_info(json_data, revision=number)
        else:  # unit == "days"
            revision_info = get_revision_from_age(title, age_days=number)

        if not revision_info.get("revid"):
            error_msg = f"Error: Could not find revision {number} {'revisions' if unit == 'revisions' else 'days'} behind for '{title}'."
            raise gr.Error(error_msg, print_exception=False)
            return None, None

        revid = revision_info["revid"]
        timestamp = revision_info["timestamp"]

        # Get introduction
        introduction = get_wikipedia_introduction(revid)

        if introduction is None:
            introduction = f"Error: Could not retrieve introduction for previous revision (revid: {revid})"

        # Get revisions_behind
        if unit == "revisions":
            revisions_behind = number
        else:
            revisions_behind = get_revisions_behind(title, revid)
            # For a negative number, replace the negative sign with ">"
            if revisions_behind < 0:
                revisions_behind = str(revisions_behind).replace("-", ">")

        # Format timestamp for display
        timestamp = (
            f"**Timestamp:** {timestamp}, {revisions_behind} revisions behind"
            if timestamp
            else ""
        )

        # Return introduction text and timestamp
        return introduction, timestamp

    except Exception as e:
        error_msg = f"Error occurred: {str(e)}"
        raise gr.Error(error_msg, print_exception=False)
        return None, None


def run_classifier(old_revision: str, new_revision: str, prompt_style: str):
    """
    Run a classification model on the revisions.

    Args:
        old_revision: Old revision text
        new_revision: New revision text
        prompt_style: heuristic or few-shot

    Returns:
        Tuple of (noteworthy, rationale) (bool, str)
    """

    # Values to return if there is an error
    noteworthy, rationale = None, None
    if not old_revision or not new_revision:
        return noteworthy, rationale

    try:
        # Run classifier model
        result = classifier(old_revision, new_revision, prompt_style=prompt_style)
        if result:
            noteworthy = result.get("noteworthy", None)
            rationale = result.get("rationale", "")
        else:
            error_msg = f"Error: Could not get {prompt_style} model result"
            raise gr.Error(error_msg, print_exception=False)

    except Exception as e:
        error_msg = f"Error running model: {str(e)}"
        raise gr.Error(error_msg, print_exception=False)

    return noteworthy, rationale


@logfire.instrument("Step 3a: Run heuristic classifier")
def run_heuristic_classifier(old_revision: str, new_revision: str):
    return run_classifier(old_revision, new_revision, prompt_style="heuristic")


@logfire.instrument("Step 3b: Run few-shot classifier")
def run_fewshot_classifier(old_revision: str, new_revision: str):
    return run_classifier(old_revision, new_revision, prompt_style="few-shot")


@logfire.instrument("Step 4: Run judge")
def run_judge(
    old_revision: str,
    new_revision: str,
    heuristic_rationale,
    fewshot_rationale,
    judge_mode: str,
):
    """
    Run classification models and judge on the revisions.

    Args:
        old_revision: Old revision text
        new_revision: New revision text
        heuristic_rationale: Heuristic model's rationale
        fewshot_rationale: Few-shot model's rationale
        judge_mode: Mode for judge function ("unaligned", "aligned-fewshot", "aligned-heuristic")

    Returns:
        Tuple of (noteworthy, reasoning) (bool, str)
    """

    # Values to return if there is an error
    noteworthy, reasoning = None, None
    if (
        not old_revision
        or not new_revision
        or not heuristic_rationale
        or not fewshot_rationale
    ):
        return noteworthy, reasoning

    try:
        # Run judge
        result = judge(
            old_revision,
            new_revision,
            heuristic_rationale,
            fewshot_rationale,
            mode=judge_mode,
        )
        if result:
            noteworthy = result.get("noteworthy", "")
            reasoning = result.get("reasoning", "")
        else:
            error_msg = "Error: Could not get judge's result"
            raise gr.Error(error_msg, print_exception=False)

    except Exception as e:
        error_msg = f"Error running judge: {str(e)}"
        raise gr.Error(error_msg, print_exception=False)

    return noteworthy, reasoning


def format_noteworthy(noteworthy, reasoning):
    """
    Format judge's noteworthy label as text.
    """
    if not reasoning:
        # If the reasoning is empty, return nothing
        return None
    else:
        # Format noteworthy boolean as text
        return str(noteworthy)


def compute_confidence(
    heuristic_noteworthy,
    fewshot_noteworthy,
    judge_noteworthy,
    heuristic_rationale,
    fewshot_rationale,
    judge_reasoning,
):
    """
    Compute a confidence label using the noteworthy booleans.
    """
    # Return None if any of the rationales or reasoning is missing.
    if not heuristic_rationale or not fewshot_rationale or not judge_reasoning:
        return None
    if heuristic_noteworthy == fewshot_noteworthy == judge_noteworthy:
        # Classifiers and judge all agree
        return "High"
    elif heuristic_noteworthy != fewshot_noteworthy:
        # Classifiers disagree, judge decides
        return "Moderate"
    else:
        # Classifiers agree, judge vetoes
        return "Questionable"


# Setup theme without background image
theme = gr.Theme.from_hub("NoCrypt/miku")
theme.set(body_background_fill="#FFFFFF", body_background_fill_dark="#000000")

# Create Gradio interface
with gr.Blocks(theme=theme, title="Noteworthy Differences") as demo:
    with gr.Row():
        with gr.Column(scale=2):
            gr.Markdown(
                """
                # Noteworthy Differences
                Compare the current revision of a Wikipedia article (introduction only) with an old revision (number of days or revisions behind).<br>
                Two classifier models, with relatively short heuristic and few-shot prompts, and a judge predict the noteworthiness of the differences.<br>
                The judge has a longer prompt for AI alignment, also in heuristic or few-shot styles, produced as described in the
                [GitHub repository](https://github.com/jedick/noteworthy-differences).
                """
            )
        with gr.Column(scale=1):
            gr.Markdown(
                """#### Confidence Key
                - **High:** heuristic = few-shot = judge
                - **Moderate:** heuristic ≠ few-shot, judge decides
                - **Questionable:** heuristic = few-shot, judge vetoes
                """
            )

    with gr.Row():
        title_input = gr.Textbox(
            label="Wikipedia Page Title", placeholder="e.g., Albert Einstein", value=""
        )
        number_input = gr.Number(label="Number", value=100, minimum=0, precision=0)
        unit_dropdown = gr.Dropdown(
            choices=["days", "revisions"], value="days", label="Unit"
        )
        judge_mode_dropdown = gr.Dropdown(
            choices=["unaligned", "aligned-fewshot", "aligned-heuristic"],
            value="aligned-heuristic",
            label="Judge Mode",
        )
        with gr.Column():
            random_btn = gr.Button("Get Random Page Title")
            submit_btn = gr.Button("Fetch Revisions and Run Model", variant="primary")

    with gr.Row():
        with gr.Column():
            gr.Markdown("### Old Revision")
            old_timestamp = gr.Markdown("")
            old_revision = gr.Textbox(label="", lines=15, max_lines=30, container=False)
            gr.Markdown(
                """
                - Page title is case-sensitive; use underscores or spaces.
                - Specify any number of days or up to 499 revisions behind.
                """
            )

        with gr.Column():
            gr.Markdown("### Current Revision")
            new_timestamp = gr.Markdown("")
            new_revision = gr.Textbox(label="", lines=15, max_lines=30, container=False)

        with gr.Column():
            gr.Markdown("### Model Output")
            heuristic_rationale = gr.Textbox(
                label="Heuristic Model's Rationale",
                lines=2,
                max_lines=7,
            )
            fewshot_rationale = gr.Textbox(
                label="Few-shot Model's Rationale",
                lines=2,
                max_lines=7,
            )
            judge_reasoning = gr.Textbox(
                label="Judge's Reasoning",
                lines=2,
                max_lines=7,
            )
            with gr.Row(variant="default"):
                noteworthy_text = gr.Textbox(
                    label="Noteworthy Differences",
                    lines=1,
                    interactive=False,
                )
                confidence = gr.Textbox(
                    label="Confidence",
                    lines=1,
                    interactive=False,
                )
                rerun_btn = gr.Button("Rerun Model", variant="primary")

    # Hidden checkboxes to store boolean values
    heuristic_noteworthy = gr.Checkbox(visible=False)
    fewshot_noteworthy = gr.Checkbox(visible=False)
    judge_noteworthy = gr.Checkbox(visible=False)

    random_btn.click(
        fn=get_random_wikipedia_title,
        inputs=None,
        outputs=[title_input],
    )

    gr.on(
        # Press Enter in textbox or use button to submit
        triggers=[title_input.submit, submit_btn.click],
        # Clear the new_revision and new_timestamp values before proceeding.
        # The empty values will propagate to the other components (through function return values) if there is an error.
        fn=lambda: (gr.update(value=""), gr.update(value="")),
        inputs=None,
        outputs=[new_revision, new_timestamp],
        api_name=False,
    ).then(
        fn=fetch_current_revision,
        inputs=[title_input],
        outputs=[new_revision, new_timestamp],
        api_name=False,
    ).then(
        fn=fetch_previous_revision,
        inputs=[title_input, unit_dropdown, number_input, new_revision],
        outputs=[old_revision, old_timestamp],
        api_name=False,
    ).then(
        fn=run_heuristic_classifier,
        inputs=[old_revision, new_revision],
        outputs=[heuristic_noteworthy, heuristic_rationale],
        api_name=False,
    ).then(
        fn=run_fewshot_classifier,
        inputs=[old_revision, new_revision],
        outputs=[fewshot_noteworthy, fewshot_rationale],
        api_name=False,
    ).then(
        fn=run_judge,
        inputs=[
            old_revision,
            new_revision,
            heuristic_rationale,
            fewshot_rationale,
            judge_mode_dropdown,
        ],
        outputs=[judge_noteworthy, judge_reasoning],
        api_name=False,
    ).then(
        fn=format_noteworthy,
        inputs=[judge_noteworthy, judge_reasoning],
        outputs=[noteworthy_text],
        api_name=False,
    ).then(
        fn=compute_confidence,
        inputs=[
            heuristic_noteworthy,
            fewshot_noteworthy,
            judge_noteworthy,
            heuristic_rationale,
            fewshot_rationale,
            judge_reasoning,
        ],
        outputs=[confidence],
        api_name=False,
    )

    # Rerun model when rerun button is clicked
    gr.on(
        triggers=[rerun_btn.click],
        fn=run_heuristic_classifier,
        inputs=[old_revision, new_revision],
        outputs=[heuristic_noteworthy, heuristic_rationale],
        api_name=False,
    ).then(
        fn=run_fewshot_classifier,
        inputs=[old_revision, new_revision],
        outputs=[fewshot_noteworthy, fewshot_rationale],
        api_name=False,
    ).then(
        fn=run_judge,
        inputs=[
            old_revision,
            new_revision,
            heuristic_rationale,
            fewshot_rationale,
            judge_mode_dropdown,
        ],
        outputs=[judge_noteworthy, judge_reasoning],
        api_name=False,
    ).then(
        fn=format_noteworthy,
        inputs=[judge_noteworthy, judge_reasoning],
        outputs=[noteworthy_text],
        api_name=False,
    ).then(
        fn=compute_confidence,
        inputs=[
            heuristic_noteworthy,
            fewshot_noteworthy,
            judge_noteworthy,
            heuristic_rationale,
            fewshot_rationale,
            judge_reasoning,
        ],
        outputs=[confidence],
        api_name=False,
    )

if __name__ == "__main__":
    demo.launch()
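The comments at the top of app.py mention mounting within a separate FastAPI app. A minimal sketch of that setup, not part of this commit (the wrapper module name and the "/gradio" mount path are assumptions for illustration):

```python
# app_fastapi.py -- hypothetical wrapper, not included in this commit
import gradio as gr
import logfire
from fastapi import FastAPI

from app import demo  # the gr.Blocks object defined above

app = FastAPI()
logfire.configure()
logfire.instrument_fastapi(app)  # explicit instrumentation when mounting
# Serve the Gradio UI inside the FastAPI app (mount path is an assumption)
app = gr.mount_gradio_app(app, demo, path="/gradio")
```

Run with any ASGI server (for example uvicorn) pointing at this module's `app` object.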
collect_data.py
ADDED
@@ -0,0 +1,68 @@
import time
import csv
from wiki_data_fetcher import (
    get_previous_revisions,
    extract_revision_info,
    get_wikipedia_introduction,
)

title = []
revid_0, revid_10, revid_100 = [], [], []
ts_0, ts_10, ts_100 = [], [], []
intro_0, intro_10, intro_100 = [], [], []


if __name__ == "__main__":

    # Open the file in read mode
    with open("data/wikipedia_titles.txt", "r") as file:
        # Iterate through each line in the file
        for line in file:
            # Get title from each line without trailing newline characters
            this_title = line.strip()
            print(this_title)
            # Append title
            title.append(this_title)
            # Get info for most recent 100 revisions
            json_data = get_previous_revisions(this_title, revisions=100)
            # Append data for current revision
            info_0 = extract_revision_info(json_data, 0)
            revid_0.append(info_0["revid"])
            ts_0.append(info_0["timestamp"])
            intro_0.append(get_wikipedia_introduction(info_0["revid"]))
            # Append data for 10th revision before current
            info_10 = extract_revision_info(json_data, 10)
            revid_10.append(info_10["revid"])
            ts_10.append(info_10["timestamp"])
            intro_10.append(get_wikipedia_introduction(info_10["revid"]))
            # Append data for 100th revision before current
            info_100 = extract_revision_info(json_data, 100)
            revid_100.append(info_100["revid"])
            ts_100.append(info_100["timestamp"])
            intro_100.append(get_wikipedia_introduction(info_100["revid"]))

            # Write the CSV in each loop in case we need to restart after an error
            # Combine the lists
            # fmt: off
            export_data = zip(
                title, revid_0, revid_10, revid_100,
                ts_0, ts_10, ts_100, intro_0, intro_10, intro_100,
            )
            column_names = [
                "title", "revid_0", "revid_10", "revid_100",
                "ts_0", "ts_10", "ts_100",
                "intro_0", "intro_10", "intro_100",
            ]
            # fmt: on

            with open(
                "data/wikipedia_introductions.csv", "w", newline="", encoding="utf-8"
            ) as myfile:
                wr = csv.writer(myfile)
                # Write a header row
                wr.writerow(column_names)
                # Write the combined data rows
                wr.writerows(export_data)

            # Rate limit our API calls
            time.sleep(5)
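collect_data.py expects data/wikipedia_titles.txt (one article title per line); that file is not part of this commit. Rewriting the CSV inside the loop and sleeping 5 seconds between articles are deliberate choices: an interrupted run keeps the rows collected so far, and the sleep rate-limits the Wikipedia API calls.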
create_examples.py
ADDED
@@ -0,0 +1,69 @@
import pandas as pd
from models import classifier


def run_classifier(row):
    """
    Run the model on one row of data from 'data/wikipedia_introductions.csv'.
    The model is run up to four times: two prompt styles (heuristic and few-shot)
    and two revision intervals (from 10th and 100th previous revisions to current).

    Usage:

        df = pd.read_csv("data/wikipedia_introductions.csv")
        row = df.iloc[38]
        run_classifier(row)
    """

    # Initialize output dict
    output = {}

    output["heuristic_10"] = classifier(row["intro_10"], row["intro_0"], "heuristic")
    output["few-shot_10"] = classifier(row["intro_10"], row["intro_0"], "few-shot")
    output["heuristic_100"] = classifier(row["intro_100"], row["intro_0"], "heuristic")
    output["few-shot_100"] = classifier(row["intro_100"], row["intro_0"], "few-shot")

    return output


if __name__ == "__main__":

    """
    Run the classifier on all rows from 'data/wikipedia_introductions.csv' and save results in 'data/examples.csv'.
    """

    # Read the data
    df = pd.read_csv("data/wikipedia_introductions.csv")

    # For reference: Find row indices with at least one missing value
    # missing_rows = df.index[df.isnull().any(axis=1)].tolist()
    # print("\nRow indices with missing values:", missing_rows)

    # Initialize output data frame
    df_out = None

    for index, row in df.iterrows():
        # Print the title to see progress
        print(row["title"])
        # Run classifier
        output = run_classifier(row)
        print(output)
        # Create column names and row for data frame
        column_names = [
            outer_k + "_" + inner_k
            for outer_k in output.keys()
            for inner_k in output[outer_k].keys()
        ]
        row_values = [
            inner_v for outer_k in output.keys() for inner_v in output[outer_k].values()
        ]
        # Add title to output
        column_names = ["title"] + column_names
        row_values = [row["title"]] + row_values
        df_row = pd.DataFrame([row_values], columns=column_names)
        if df_out is None:
            df_out = df_row
        else:
            df_out = pd.concat([df_out, df_row])
        # Write CSV in every loop to avoid data loss if errors occur
        df_out.to_csv("data/examples.csv", index=False, encoding="utf-8")
data/alignment_fewshot.txt
ADDED
@@ -0,0 +1,26 @@
Model 1: The differences are minor grammatical adjustments and the addition of a more specific locality (Çağırkan) within the already stated Kaman District, which provides deeper detail but does not alter the fundamental understanding of the museum's location. Model 2: The new revision provides a more precise location for the museum by adding "Çağırkan," which enhances the factual accuracy and completeness of the article. Human: The addition of a city name adds detail but doesn't change the main content. Noteworthy: False.
Model 1: The differences are not noteworthy because the change only involves the removal of a single word ("lucky"), which is a minor word choice correction and does not alter the facts, people, places, dates, or analysis of the event. Model 2: The new revision removes the inappropriate word "lucky" when referring to the people killed, correcting the tone and factual accuracy of the sentence, which is a noteworthy change. Human: The word "lucky" is vandalism that radically alters the tone and destroys the neutrality of the article. Noteworthy: True.
Model 1: The new revision adds "juvenile detention" and changes "sexual affairs" to "sexual awakening" as key life events for the protagonist István, which constitutes a change to major events in his life story. Model 2: The new revision adds minor details about the protagonist's journey and makes a few stylistic changes, but these do not substantially alter the overall summary or critical reception of the novel. Human: The change from "sexual affairs" to "sexual awakening" is a nuance in interpretation that doesn't warrant a noteworthy label. Noteworthy: False.
Model 1: The differences are not noteworthy as they only involve an update to the population data with a more recent projection, which constitutes a deeper analysis of a topic without changing the overall conclusion or major facts about the department. Model 2: The new revision updates the population data for Cundinamarca by adding 2023 projections, which is a significant improvement in accuracy and currency for an informational article. Human: The population projections for 2023 make the article five years more up-to-date, which is a significant change. Noteworthy: True.
Model 1: The differences are not noteworthy because they primarily consist of deeper analysis and minor clarifications, such as adding pronunciation, specifying the French origin, and detailing seating configurations, rather than changing the fundamental definition or conclusion about what a coupe is. Model 2: The new revision significantly expands the definition of a 'coupe' by adding pronunciation, clarifying the typical door count, detailing seating configurations, and distinguishing it from a 'sports coupe', which provides a more comprehensive understanding of the term. Human: The changes are noteworthy for extending the definition of a coupe to include four seats and addition of different pronunciations in the US and elsewhere. Noteworthy: True.
Model 1: The differences are not noteworthy as they involve minor rephrasing, a slight broadening of scope for roof variations, and the removal of "cargo space" as a listed drawback, none of which substantially alters the overall conclusion or understanding of convertibles. Model 2: The new revision removes "cargo space" as a stated potential drawback of convertibles and rephrases the description of reduced structural rigidity, altering the completeness of the information regarding convertible disadvantages. Human: There are small changes like "models" vs "eras and manufacturers" and not mentioning cargo space in the new revision, but these don't alter crucial information. Noteworthy: False.
Model 1: The differences are noteworthy because they include changes to biographical details such as the addition of a middle name, the removal of a birthplace, and a change in nationality from English to British. Model 2: The new revision makes minor changes to the subject's full name, nationality, and birth details, which do not substantially affect the biographical overview. Human: The change from the birth city (Sheffield, South Yorkshire) to "British" is a matter of reporting depth rather than an actual change to the facts and therefore is not notable for most readers. Noteworthy: False.
Model 1: The differences between the revisions are not noteworthy as they primarily consist of minor grammatical and wording changes, the removal of a specific historical detail about aluminium's use in the World Wars, and the addition of a sentence providing deeper analysis about its density, none of which alter the main conclusions or understanding of the topic. Model 2: The new revision is noteworthy because it adds a scientific detail explaining that aluminium's low standard atomic weight contributes to its low density, which is responsible for many of its uses, thus enhancing the reader's understanding of its properties and applications. Human: The new revision adds that "aluminium" is the preferred IUPAC name and also gives a reason for the relatively low density compared to other metals, both of which contribute to a more complete overview. Noteworthy: True.
Model 1: The differences between the revisions are not noteworthy as they primarily involve minor rephrasing, changes in structure, and deeper analysis of the engine assembly process (such as the Model 2: The new revision provides more specific and detailed information regarding the assembly process of the different engine variants, clarifying the 'one man, one engine' principle for AMG engines versus machine assembly for mainstream Mercedes-Benz models. Human: The new revision omits the detail about two blocks sharing a single crankshaft, but this is implied by the M133 ancestry, and also has less detail about where specific variants are manufactured, none of which are highly notable changes. Noteworthy: False.
Model 1: The differences are noteworthy because the new revision adds Bayden Barber's birth year, which is a change to a date/personal detail. Model 2: The new revision adds a birth year and corrects a spelling of a Māori term, which are minor factual updates that do not substantially change the biographical overview. Human: Birth year is important information for a biographical entry. Noteworthy: True.
Model 1: The differences are not noteworthy as they primarily consist of minor word choice changes, rephrasing, removal of descriptive details, and a clarification regarding Yorkshire's current administrative status and the specific dynasty of the House of York, none of which alter the overall meaning or conclusion about Yorkshire's history and identity. Model 2: The new revision clarifies that Yorkshire is no longer used for administration, adds important historical context about the "Plantagenet" royal House of York, and refines some geographical descriptions, leading to a more precise and updated overview of the region. Human: The mention of "no longer being used for administration" is a notable change in the new revision; text clean-up (less repetition of compass directions), and a name in the emblem's origin (Plantagenet) are changes that would not be counted as noteworthy. Noteworthy: True.
Model 1: The differences are not noteworthy because they consist of minor statistical updates (death toll), small factual additions (Lincoln as Republican, Lincoln giving Grant command), and a more explicit but not substantially different analysis of the Confederacy's motivation for secession related to slavery, which does not change the overall conclusion that slavery was the central cause of the war. Model 2: The new revision explicitly states that the Confederacy was formed to preserve African American slavery and clarifies that Lincoln gave Grant command of Union armies, providing a more precise and complete account of the war's origins and leadership. Human: These revisions introduce the article with quite different settings of the conflict ("expand in the western territories" vs "the election of Abraham Lincoln and the abolitionist movement in the North"), resulting in a noteworthy difference in historical outlook. Noteworthy: True.
Model 1: The new revision adds a specific, technical detail about the duration of 1908, which provides a deeper analysis of the topic without changing the overall conclusion or introducing new people, places, dates, or major events. Model 2: The new revision adds a significant factual detail, stating that 1908 was the longest year in either the Julian or Gregorian calendars, measured in Terrestrial Time. Human: The fact that 1908 was the longest year in either calendar would be a piece of trivia in other contexts, but on the Wikipedia page for the year itself is a notable mention of a record-breaking characteristic. Noteworthy: True.
Model 1: The new revision only adds "mass population displacement" as an additional contributing factor to the population decline, which is a minor detail that does not alter the overall analysis or conclusion of the article. Model 2: The new revision adds "mass population displacement" as a contributing factor to the sharp decline in the Congolese population, which provides a more complete understanding of the atrocities' impact. Human: The addition of mass population displacement as a cause of population decline is a noteworthy change in the new revision. Noteworthy: True.
Model 1: The differences are not noteworthy as they consist of minor word choice changes like "locals" to "Indigenous people," grammatical adjustments, and the addition of the French name for the state, none of which alter the core facts, analysis, or conclusion of the article. Model 2: The new revision improves factual accuracy by adding the French name of the state and uses more precise and respectful terminology by changing "locals" to "Indigenous people" when referring to the inhabitants. Human: There are minor changes in wording (e.g. "went there" vs "visited it" and "local" vs "Indigenous") that do not substantially alter the information presented. Noteworthy: False.
Model 1: The new revision adds 'and photographer' to the description of Howard H. Baker Jr.'s professions, which is a factual detail that offers a slightly deeper understanding of his overall life but does not change the main narrative or conclusions about his political and diplomatic career. Model 2: The new revision adds "and photographer" to Howard Henry Baker Jr.'s listed professions, which is a noteworthy addition to his biographical overview. Human: The new revision adds an occupation (photographer) that is notable for making a more complete biographical entry. Noteworthy: True.
Model 1: The differences are not noteworthy as they involve minor word choice changes, the addition of a colloquial nickname for the city ("La Nevera"), and the removal of a general descriptive phrase ("one of the largest cities in the world"), none of which substantially alter the main facts, analysis, or conclusion about Bogotá. Model 2: The new revision removes the claim that Bogotá is "one of the largest cities in the world" and adds a new nickname, which significantly alters the description of the city's global standing and cultural information. Human: The new revision adds a nickname ("La Nevera" ("The Fridge")) that makes the changes noteworthy; the omission of minor trivia ("one of the largest cities in the world") is not a notable change. Noteworthy: True.
Model 1: The differences are not noteworthy because they involve minor rephrasing and the addition of a geographical detail (located along the North Pacific Ocean) that does not change the overall understanding or conclusion about Kushiro. Model 2: The new revision is noteworthy because it adds specific geographical information about Kushiro, stating that it is "Located along the coast of the North Pacific Ocean", which provides a more complete description of the city's location. Human: The new revision adds coastal location, an important characteristic that is not shared by all cities in Japan and that cannot be inferred from the old revision. Noteworthy: True.
Model 1: The differences are not noteworthy as they involve minor grammatical corrections, stylistic changes, and the addition of a specific financial detail (cost cap) which provides deeper analysis without changing the overall conclusion about Formula One's financial landscape. Model 2: The new revision adds important information about the current F1 cost cap, providing a more complete financial overview of the sport beyond just the average team running costs, and also corrects a minor typo. Human: There is a small typo in the old revision (okFIA) that is not significant; however, the cost cap added to the new revision is a notable enhancement despite the poor grammar in the edit. Noteworthy: False.
Model 1: The changes primarily involve adding alternative names for the Concordat of Worms and clarifying terminology (e.g., "temporalities"), while removing some extraneous initial text, none of which alter the core facts or conclusions about the agreement. Model 2: The new revision is noteworthy because it removes irrelevant headings, adds crucial alternative names and translations for the Concordat of Worms, and refines the terminology regarding the lands associated with episcopal sees, thereby significantly enhancing the article's clarity, comprehensiveness, and accuracy. Human: The addition of Latin and German names is a small but useful addition to the new revision. Noteworthy: True.
Model 1: The differences include a minor clarification by adding "(cubes)" after "voxels" and an additional sentence about Minecraft being a cultural touchstone for Generation Z, which offers deeper analysis of the game's impact but does not alter the overall conclusion of the article. Model 2: The new revision adds a noteworthy point about Minecraft being a cultural touchstone for Generation Z, which enhances the understanding of the game's societal impact and popularization. Human: The reference to Generation Z in the new revision places the article in a firmer context. Noteworthy: True.
Model 1: The differences are not noteworthy as they primarily consist of minor grammatical restructuring, slight changes in terminology (e.g., "Native Americans" to "Indigenous Americans"), and the addition of specific Cherokee script for a name, none of which alter the core factual information or analysis of the article's subject. Model 2: The new revision is noteworthy because it provides the actual Cherokee script and romanized version of Wilma Mankiller's name, replacing a generic template, and updates the terminology from "Native Americans" to "Indigenous Americans" in one instance, making the biographical overview more complete and current. Human: The addition of Cherokee and romanized names gives greater recognition to the cultural background of the subject of this article. Noteworthy: True.
Model 1: The differences are not noteworthy because the new revision offers deeper analysis by clarifying specific human factors like the destruction of natural topsoil and adding context about an existing agricultural recession, but these details do not change the fundamental conclusions about the Dust Bowl's causes or effects. Model 2: The new revision provides a more detailed explanation of the human-made causes of the Dust Bowl, highlights its exacerbation of an existing agricultural recession, and includes more specific details about related cultural works, offering a more comprehensive account. Human: The new revision goes into greater depth on causes (destruction of natural topsoil by settlers) and consequences (exacerbated an already existing drought condition) but does not substantially change the main message. Noteworthy: False.
Model 1: The differences are not noteworthy because the new revision primarily adds a parenthetical explanation for the cool wet season's role in soil moisture and removes a sentence about arid-zone agriculture development, without fundamentally changing the definition or analysis of dryland farming. Model 2: The new revision adds an important clarification about the role of the cool wet season in charging the soil with moisture for dryland farming and removes a sentence about arid-zone agriculture being developed, which significantly enhances the explanation of the agricultural technique. Human: The phrase about "which charges the soil with virtually all the moisture that the crops will receive prior to harvest" in the new revision is a clarification rather than a significant change. Noteworthy: False.
Model 1: The differences are not noteworthy because they consist of minor additions, such as the descriptor 'landlocked' and the nickname 'Gateway to the West,' and a minor word change from 'makes up' to 'forms' for the eastern border, none of which alter the core information or analysis of the article. Model 2: The new revision adds the significant nickname "Gateway to the West" to the state's description, providing a more complete overview of how Missouri is historically and culturally recognized. Human: The new revision adds landlocked, but this can be inferred by the lack of an aquatic border, and changes the order of nicknames, which is a minor change. Noteworthy: False.
Model 1: The differences are not noteworthy because the new revision only adds a more recent population estimate for 2024, which is a deeper analysis of the topic without changing the conclusion about Tracy's status. Model 2: The new revision includes an updated population estimate for 2024, providing more current information about the city's demographics. Human: Addition of population estimate in the new revision indicates the growth of the city in recent years. Noteworthy: True.
data/alignment_heuristic.txt
ADDED
@@ -0,0 +1,55 @@
## Noteworthy Changes (True)

**Factual Additions/Corrections:**
- Adding significant biographical details (birth year, occupations, cultural names)
- Updating population data to be significantly more current (5+ years)
- Adding record-breaking characteristics or superlatives relevant to the article subject
- Inserting important geographical features (coastal location, landlocked status when not obvious)
- Including cultural/historical context that wasn't inferable (nicknames with meaning, generational impact)

**Tone & Neutrality:**
- Removing vandalism or inappropriate language that destroys article neutrality
- Correcting words that radically alter tone (e.g., "lucky" when referring to deaths)

**Substantive Content Changes:**
- Altering the framing of historical events (different causes or contexts presented)
- Adding causes or factors to complex events (mass displacement, additional contributing factors)
- Expanding definitions to include significant variations (four-door coupes, multiple pronunciations)
- Adding technical/scientific explanations for key properties or phenomena

**Cultural Recognition:**
- Including indigenous language names/scripts that honor cultural background
- Adding alternative official names (Latin, other languages) for historical agreements or places

## Not Noteworthy Changes (False)

**Minor Details:**
- Grammatical adjustments, rephrasing, or structural reorganization
- Adding locality specificity within already-stated regions (sub-district within district)
- Removing minor trivia that doesn't affect core understanding
- Adding nicknames without significant cultural/historical weight
- Word choice refinements that don't change meaning ("locals" → "Indigenous people" as terminology update only)

**Inferable Information:**
- Details implied by other information (e.g., "landlocked" when no coastal borders mentioned)
- Ancestry relationships that explain omitted technical details

**Depth vs. Substance:**
- Deeper analysis that doesn't change fundamental conclusions
- Additional technical details that provide nuance but not new understanding
- Clarifications of already-conveyed information
- Minor statistical updates that don't represent significant temporal gaps

**Administrative/Format Changes:**
- Order changes in lists
- Removal of extraneous text or typos (unless they substantially affected meaning)
- Template replacements with equivalent content

## Key Principles

1. **Inferability Test**: If the new information could be reasonably inferred from the old revision, it's likely not noteworthy
2. **Completeness vs. Depth**: Adding information that makes an entry more complete (new occupation, birth year) is noteworthy; adding depth to existing information usually isn't
3. **Context Matters**: The same change can be noteworthy or not depending on article context (longest year trivia is noteworthy on a year's own page)
4. **Temporal Significance**: Updates spanning 5+ years are noteworthy; minor year-to-year updates typically aren't
5. **Framing Changes**: Alterations to how events/subjects are presented or understood are noteworthy; rewordings that preserve meaning aren't
6. **Cultural Respect**: Additions recognizing cultural identity, indigenous languages, or heritage are noteworthy
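Both alignment files above are spliced verbatim into the judge prompt. A minimal sketch of that templating step, mirroring what models.py does below (the file path and the {{alignment_text}} placeholder come from this commit):

```python
from prompts import judge_prompt  # contains an {{alignment_text}} placeholder

with open("data/alignment_heuristic.txt", "r") as file:
    alignment_text = file.read()

prompt = judge_prompt.replace("{{alignment_text}}", alignment_text)
```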
judge_disagreements.py
ADDED
@@ -0,0 +1,59 @@
import sys
import pandas as pd
from models import judge

if __name__ == "__main__":

    """
    Run the judge on all rows from 'data/disagreements_for_AI.csv' and save results in 'data/AI_judgments_unaligned.csv'.
    """

    # Read the data
    df = pd.read_csv("data/disagreements_for_AI.csv")

    # Add empty columns for AI judgments
    df["noteworthy"] = None
    df["reasoning"] = None

    # We run the unaligned judge unless the script is called with --aligned-fewshot or --aligned-heuristic
    mode = "unaligned"
    outfile = "data/AI_judgments_unaligned.csv"
    # Check if an argument was passed
    if len(sys.argv) > 1:
        # sys.argv[0] is the script name, sys.argv[1] is the first argument
        argument = sys.argv[1]
        if argument == "--aligned-fewshot":
            mode = "aligned-fewshot"
            outfile = "data/AI_judgments_fewshot.csv"
        elif argument == "--aligned-heuristic":
            mode = "aligned-heuristic"
            outfile = "data/AI_judgments_heuristic.csv"
        else:
            raise ValueError(f"Unknown argument: {argument}")

    print(f"Saving judgments to {outfile}")

    for index, row in df.iterrows():
        # Change this if needed (to restart after errors)
        if index < 0:
            continue
        else:
            # Print the title to see progress
            print(row["title"])
            # Run judge
            try:
                output = judge(
                    df.iloc[index]["old_revision"],
                    df.iloc[index]["new_revision"],
                    df.iloc[index]["heuristic_rationale"],
                    df.iloc[index]["few-shot_rationale"],
                    mode=mode,
                )
            except Exception:
                output = {"noteworthy": None, "reasoning": None}
            print(output)
            # Update data frame
            df.at[index, "noteworthy"] = output["noteworthy"]
            df.at[index, "reasoning"] = output["reasoning"]
            # Write CSV in every loop to avoid data loss if errors occur
            df.to_csv(outfile, index=False, encoding="utf-8")
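Usage note: `python judge_disagreements.py` runs the unaligned judge, while the `--aligned-fewshot` and `--aligned-heuristic` flags switch both the prompt mode and the output CSV, as handled in the argument check above. The input file data/disagreements_for_AI.csv is expected to exist but is not included in this commit.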
models.py
ADDED
@@ -0,0 +1,132 @@
| 1 |
+
# Noteworthy Differences:
|
| 2 |
+
# Classification of noteworthy differences between revisions of Wikipedia articles: an AI alignment project
|
| 3 |
+
# 20251114 jmd version 1
|
| 4 |
+
|
| 5 |
+
from google import genai
|
| 6 |
+
from google.genai import types
|
| 7 |
+
from pydantic import BaseModel
|
| 8 |
+
from dotenv import load_dotenv
|
| 9 |
+
import json
|
| 10 |
+
import os
|
| 11 |
+
import pandas as pd
|
| 12 |
+
from prompts import analyzer_prompts, judge_prompt
|
| 13 |
+
from retry_with_backoff import retry_with_backoff
|
| 14 |
+
import logfire
|
| 15 |
+
|
| 16 |
+
# Load API keys
|
| 17 |
+
load_dotenv()
|
| 18 |
+
|
| 19 |
+
# Setup Logfire
|
| 20 |
+
logfire.configure()
|
| 21 |
+
|
| 22 |
+
# This wraps Google Gen AI client calls
|
| 23 |
+
# to capture prompts, responses, and metadata
|
| 24 |
+
logfire.instrument_google_genai()
|
| 25 |
+
|
| 26 |
+
# Initialize the Gemini LLM
|
| 27 |
+
client = genai.Client()
|
| 28 |
+
|
| 29 |
+
|
| 30 |
+
@retry_with_backoff()
|
| 31 |
+
def classifier(old_revision, new_revision, prompt_style):
|
| 32 |
+
"""
|
| 33 |
+
Classify noteworthy differences between revisions of a Wikipedia article
|
| 34 |
+
|
| 35 |
+
Args:
|
| 36 |
+
old_revision: Old revision of article
|
| 37 |
+
new_revision: New revision of article
|
| 38 |
+
|
| 39 |
+
Returns:
|
| 40 |
+
noteworthy: True if the differences are noteworthy; False if not
|
| 41 |
+
rationale: One-sentence rational for the classification
|
| 42 |
+
"""
|
| 43 |
+
|
| 44 |
+
# Return None for missing revisions
|
| 45 |
+
if not pd.notna(old_revision) or not pd.notna(new_revision):
|
| 46 |
+
return {"noteworthy": None, "rationale": None}
|
| 47 |
+
|
| 48 |
+
# Get prompt template for given style
|
| 49 |
+
prompt_template = analyzer_prompts[prompt_style]
|
| 50 |
+
|
| 51 |
+
# Add article revisions to prompt
|
| 52 |
+
prompt = prompt_template.replace("{{old_revision}}", old_revision).replace(
|
| 53 |
+
"{{new_revision}}", new_revision
|
| 54 |
+
)
|
| 55 |
+
|
| 56 |
+
# Define response schema
|
| 57 |
+
class Response(BaseModel):
|
| 58 |
+
noteworthy: bool
|
| 59 |
+
rationale: str
|
| 60 |
+
|
| 61 |
+
# Generate response
|
| 62 |
+
response = client.models.generate_content(
|
| 63 |
+
model="gemini-2.5-flash",
|
| 64 |
+
contents=prompt,
|
| 65 |
+
config=types.GenerateContentConfig(
|
| 66 |
+
response_mime_type="application/json",
|
| 67 |
+
response_schema=Response.model_json_schema(),
|
| 68 |
+
),
|
| 69 |
+
)
|
| 70 |
+
|
| 71 |
+
return json.loads(response.text)
|
| 72 |
+
|
| 73 |
+
|
| 74 |
+
@retry_with_backoff()
|
| 75 |
+
def judge(old_revision, new_revision, rationale_1, rationale_2, mode="unaligned"):
|
| 76 |
+
"""
|
| 77 |
+
AI judge to settle disagreements between classification models
|
| 78 |
+
|
| 79 |
+
Args:
|
| 80 |
+
old_revision: Old revision of article
|
| 81 |
+
new_revision: New revision of article
|
| 82 |
+
rationale_1: Rationale provided by model 1 (i.e., heuristic prompt)
|
| 83 |
+
rationale_2: Rationale provided by model 2 (i.e., few-shot prompt)
|
| 84 |
+
mode: Prompt mode: unaligned, aligned-fewshot, or aligned-heuristic
|
| 85 |
+
|
| 86 |
+
Returns:
|
| 87 |
+
noteworthy: True if the differences are noteworthy; False if not
|
| 88 |
+
reasoning: One-sentence reason for the judgment
|
| 89 |
+
"""
|
| 90 |
+
|
| 91 |
+
prompt = judge_prompt
|
| 92 |
+
# Add article revisions to prompt
|
| 93 |
+
prompt = prompt.replace("{{old_revision}}", old_revision).replace(
|
| 94 |
+
"{{new_revision}}", new_revision
|
| 95 |
+
)
|
| 96 |
+
# Add rationales to prompt
|
| 97 |
+
prompt = prompt.replace("{{model_1_rationale}}", rationale_1).replace(
|
| 98 |
+
"{{model_2_rationale}}", rationale_2
|
| 99 |
+
)
|
| 100 |
+
|
| 101 |
+
# Optionally add alignment text to prompt
|
| 102 |
+
if mode == "unaligned":
|
| 103 |
+
alignment_text = ""
|
| 104 |
+
elif mode == "aligned-fewshot":
|
| 105 |
+
with open("data/alignment_fewshot.txt", "r") as file:
|
| 106 |
+
lines = file.readlines()
|
| 107 |
+
alignment_text = "".join(lines)
|
| 108 |
+
elif mode == "aligned-heuristic":
|
| 109 |
+
with open("data/alignment_heuristic.txt", "r") as file:
|
| 110 |
+
lines = file.readlines()
|
| 111 |
+
alignment_text = "".join(lines)
|
| 112 |
+
else:
|
| 113 |
+
raise ValueError(f"Unknown mode: {mode}")
|
| 114 |
+
|
| 115 |
+
prompt = prompt.replace("{{alignment_text}}", alignment_text)
|
| 116 |
+
|
| 117 |
+
# Define response schema
|
| 118 |
+
class Response(BaseModel):
|
| 119 |
+
noteworthy: bool
|
| 120 |
+
reasoning: str
|
| 121 |
+
|
| 122 |
+
# Generate response
|
| 123 |
+
response = client.models.generate_content(
|
| 124 |
+
model="gemini-2.5-flash",
|
| 125 |
+
contents=prompt,
|
| 126 |
+
config=types.GenerateContentConfig(
|
| 127 |
+
response_mime_type="application/json",
|
| 128 |
+
response_schema=Response.model_json_schema(),
|
| 129 |
+
),
|
| 130 |
+
)
|
| 131 |
+
|
| 132 |
+
return json.loads(response.text)
|
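
A minimal usage sketch (not part of the commit) showing how classifier and judge are meant to be called; it assumes GEMINI_API_KEY is available via .env so genai.Client() can authenticate, and uses mode="unaligned" so the data/alignment_*.txt files are not required:

from models import classifier, judge

old = "Henry Purcell was an English composer of Baroque music."
new = "Henry Purcell was an English composer and organist of the middle Baroque era."

# Each call returns a dict parsed from the model's JSON response
heuristic = classifier(old, new, "heuristic")  # {"noteworthy": ..., "rationale": ...}
few_shot = classifier(old, new, "few-shot")
verdict = judge(old, new, heuristic["rationale"], few_shot["rationale"], mode="unaligned")
print(verdict["noteworthy"], verdict["reasoning"])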

prompts.py
ADDED
@@ -0,0 +1,91 @@
+skeleton = """
+You are a reading assistant tasked with finding noteworthy differences between revisions of a Wikipedia article.
+Decide if the differences between the old and new revisions are noteworthy.
+
+{{instructions}}
+
+Return a JSON-formatted response with keys for:
+- 'noteworthy' (True if differences between revisions are noteworthy or False if they are not)
+- 'rationale' (one sentence explaining why the differences are or are not noteworthy, including a summary of the differences)
+
+<old_revision>
+{{old_revision}}
+</old_revision>
+
+<new_revision>
+{{new_revision}}
+</new_revision>
+"""
+
+analyzer_prompts = {
+    "heuristic": skeleton.replace(
+        "{{instructions}}",
+        """
+Noteworthy differences are characterized by:
+- Different people or places mentioned
+- Changes to dates or major events
+- Different analysis of a topic leading to a substantially different conclusion
+
+These are differences that are not noteworthy:
+- Changes to grammar or minor word choice
+- Different structure but same meaning
+- Deeper analysis of a topic that does not change the conclusion
+""",
+    ),
+    "few-shot": skeleton.replace(
+        "{{instructions}}",
+        """
+Example of noteworthy differences:
+
+Old revision: David Szalay (/ˈsɒlɔɪ/; born 1974 in Montreal, Canada) is a Canadian born-Hungarian-British writer. His sixth novel, Flesh, won the 2025 Booker Prize.[1]
+
+New revision: David Szalay (/ˈsɒlɔɪ/ SOL-oy; born January 1974) is a Canadian-born Hungarian-British writer. His novels All That Man Is[1] and Turbulence[2] are noted for their unique narrative structure, being collections of intertwined short stories. All That Man Is was shortlisted for the 2016 Man Booker Prize and won the 2016 Gordon Burn Prize. His sixth novel, Flesh,[3] won the 2025 Booker Prize.[4][5][6]
+
+Rationale: The new revision provides more information about the author's work leading to a more complete biographical overview.
+
+Example of differences that are not noteworthy:
+
+Old revision: David Szalay (/ˈsɒlɔɪ/ SOL-oy; born January 1974) is a Canadian-born Hungarian-British writer. His novels All That Man Is and Turbulence are noted for their unique narrative structure, being collections of intertwined short stories. All That Man Is was shortlisted for the 2016 Man Booker Prize and won the 2016 Gordon Burn Prize. His sixth novel, Flesh, featured a more traditional narrative but was noted for its ability for readers to connect with its protagonist in spite of its sparse prose and dialogue. Flesh won the 2025 Booker Prize.
+
+New revision: David Szalay (/ˈsɒlɔɪ/ SOL-oy; born January 1974) is a Canadian-born Hungarian-British writer. His novels All That Man Is[1] and Turbulence[2] are noted for their unique narrative structure, being collections of intertwined short stories. All That Man Is was shortlisted for the 2016 Man Booker Prize and won the 2016 Gordon Burn Prize. His sixth novel, Flesh,[3] won the 2025 Booker Prize.[4][5][6]
+
+Rationale: The old revision analyzes a book in more depth but does not substantially affect the biographical overview.
+""",
+    ),
+}
+
+judge_prompt = """
+You are a judge tasked with using the output of two classification models together with human preferences to make a final decision.
+The models were asked to provide rationales about whether noteworthy differences exist between old and new revisions of a Wikipedia article.
+
+If the models disagree:
+Use the rationales and article revisions to make an informed judgment about which model is correct.
+
+If the models agree:
+You may veto the models and change the label only if there would be strong human preference to do so.
+
+In both cases, align your response to human preferences (if available) and state how this affects your reasoning.
+Use the examples (if available) to infer patterns of human preference that can be generalized to the topics and situations in any article.
+
+{{alignment_text}}
+
+Return a JSON-formatted response with keys for:
+- 'noteworthy' (True if differences between revisions are noteworthy or False if they are not)
+- 'reasoning' (one sentence explaining how you made the judgment)
+
+<old_revision>
+{{old_revision}}
+</old_revision>
+
+<new_revision>
+{{new_revision}}
+</new_revision>
+
+<model_1_rationale>
+{{model_1_rationale}}
+</model_1_rationale>
+
+<model_2_rationale>
+{{model_2_rationale}}
+</model_2_rationale>
+"""

requirements.txt
ADDED
@@ -0,0 +1,8 @@
+google-genai
+pydantic
+pandas
+dotenv
+gradio
+requests
+logfire
+opentelemetry-instrumentation-google-genai

retry_with_backoff.py
ADDED
@@ -0,0 +1,42 @@
+import time
+import functools
+import random
+
+
+def retry_with_backoff(
+    max_retries=5, base_delay=2, backoff_factor=2, exceptions=(Exception,)
+):
+    """
+    Decorator to retry a function with exponential backoff.
+
+    Args:
+        max_retries (int): Maximum number of retries before giving up.
+        base_delay (float): Initial delay in seconds before retrying.
+        backoff_factor (float): Multiplier for delay after each failure.
+        exceptions (tuple): Exception types to catch and retry on.
+    """
+
+    def decorator(func):
+        @functools.wraps(func)
+        def wrapper(*args, **kwargs):
+            delay = base_delay
+            attempt = 0
+            while attempt < max_retries:
+                try:
+                    # Pass args and kwargs
+                    return func(*args, **kwargs)
+                except exceptions as e:
+                    attempt += 1
+                    if attempt >= max_retries:
+                        # Raise the last exception if max retries reached
+                        raise
+                    print(
+                        f"[Retry {attempt}/{max_retries}] Error: {e}. Retrying in {delay:.2f}s..."
+                    )
+                    time.sleep(delay)
+                    # Exponential backoff with jitter
+                    delay *= backoff_factor + random.uniform(0, 1)
+
+        return wrapper
+
+    return decorator
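
A minimal usage sketch for the decorator (not part of the commit); the flaky function is a hypothetical stand-in that fails often enough to exercise the retry loop:

import random
from retry_with_backoff import retry_with_backoff

@retry_with_backoff(max_retries=3, base_delay=1)
def flaky():
    # Fails about half the time so a retry is usually observed
    if random.random() < 0.5:
        raise RuntimeError("transient failure")
    return "ok"

print(flaky())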

test_models.py
ADDED
@@ -0,0 +1,131 @@
+from models import classifier, judge
+from dotenv import load_dotenv
+import logfire
+
+
+# Load API keys
+load_dotenv()
+# Setup Logfire
+# We need send_to_logfire=True to capture traces under Pytest
+# https://logfire.pydantic.dev/docs/reference/advanced/testing/
+logfire.configure(send_to_logfire=True)
+
+
+def classifier_logic(i):
+    """
+    Return scenario flags for heuristic/few-shot classifier outputs.
+
+    Args:
+        i: Current iteration (for logging)
+
+    """
+
+    old_revision = """Henry Purcell (/ˈpɜːrsəl/, rare: /pərˈsɛl/;[n 1] c. 10 September 1659[n 2] – 21 November 1695) was an English composer of Baroque music. He composed more than 100 songs, a tragic opera Dido and Aeneas, and wrote incidental music to a version of Shakespeare's A Midsummer Night's Dream called The Fairy Queen."""
+
+    new_revision = """Henry Purcell (/ˈpɜːrsəl/, rare: /pərˈsɛl/;[n 1] c. 10 September 1659[n 2] – 21 November 1695) was an English composer and organist of the middle Baroque era. He composed more than 100 songs, a tragic opera Dido and Aeneas, and wrote incidental music to a version of Shakespeare's A Midsummer Night's Dream called The Fairy Queen."""
+
+    with logfire.span("classifier_logic {i}", i=i):
+        # Run classifier models
+        heuristic = classifier(old_revision, new_revision, "heuristic")
+        few_shot = classifier(old_revision, new_revision, "few-shot")
+        heuristic_true = heuristic["noteworthy"] is True
+        few_shot_true = few_shot["noteworthy"] is True
+
+        only_heuristic_true = heuristic_true and not few_shot_true
+        only_few_shot_true = few_shot_true and not heuristic_true
+        both_true = heuristic_true and few_shot_true
+        both_false = (heuristic_true is False) and (few_shot_true is False)
+
+        return (
+            only_heuristic_true,
+            only_few_shot_true,
+            both_true,
+            both_false,
+        )
+
+
+def judge_logic(i):
+    """
+    Return scenario flags for judge outputs.
+
+    Args:
+        i: Current iteration (for logging)
+
+    """
+
+    old_revision = """Kaman-Kalehöyük Archaeological Museum (Turkish: Kaman-Kalehöyük Arkeoloji Müzesi) is an archaeological museum in Kaman District of Kırşehir Province in Turkey. It exhibits artifacts of seven civilizations excavated in the nearby multi-period mound Kaman-Kalehöyük. It was opened in 2010. A Japanese garden is next to the museum building.[1][2]"""
+
+    new_revision = """The Kaman-Kalehöyük Archaeological Museum (Turkish: Kaman-Kalehöyük Arkeoloji Müzesi) is an archaeological museum in Çağırkan, Kaman District, Kırşehir Province, Turkey. It exhibits artifacts of seven civilizations excavated in the nearby multi-period mound Kaman-Kalehöyük. It opened in 2010. A Japanese garden is next to the museum building.[1][2]"""
+
+    with logfire.span("judge_logic {i}", i=i):
+        heuristic = classifier(old_revision, new_revision, "heuristic")
+        few_shot = classifier(old_revision, new_revision, "few-shot")
+        judge_few_shot = judge(
+            old_revision,
+            new_revision,
+            heuristic["rationale"],
+            few_shot["rationale"],
+            mode="aligned-fewshot",
+        )
+        judge_heuristic = judge(
+            old_revision,
+            new_revision,
+            heuristic["rationale"],
+            few_shot["rationale"],
+            mode="aligned-heuristic",
+        )
+
+        # Test condition is True if aligned judges both give False
+        judge_condition = (
+            judge_few_shot["noteworthy"] == False and judge_heuristic["noteworthy"] == False
+        )
+
+        return judge_condition
+
+
+# pytest -vv test_models.py::test_classifier
+def test_classifier():
+    """Run classifier logic 5 times and compare outcomes."""
+    tries = 5
+    with logfire.span("test_classifier"):
+        outcomes = [classifier_logic(i) for i in range(tries)]
+
+        only_heuristic_true = sum(result[0] for result in outcomes)
+        only_few_shot_true = sum(result[1] for result in outcomes)
+        both_true = sum(result[2] for result in outcomes)
+        both_false = sum(result[3] for result in outcomes)
+
+        heuristic_true_count = only_heuristic_true + both_true
+        few_shot_true_count = only_few_shot_true + both_true
+        disagree_count = only_heuristic_true + only_few_shot_true
+        agree_count = both_true + both_false
+
+        few_shot_more_often = few_shot_true_count > heuristic_true_count
+        disagree_more_than_agree = disagree_count > agree_count
+
+        if not few_shot_more_often:
+            print(
+                "Few-shot classifier did not return True more often than the heuristic classifier."
+            )
+        if not disagree_more_than_agree:
+            print("Classifiers did not disagree more often than they agreed.")
+
+        assert few_shot_more_often and disagree_more_than_agree
+
+
+# pytest -vv test_models.py::test_judge
+def test_judge():
+    """Run judge logic up to 5 times"""
+    current_try = 0
+    max_trys = 5
+    with logfire.span("test_judge"):
+        while current_try < max_trys:
+            result = judge_logic(current_try)
+            current_try += 1
+            if result is True:
+                print(f"Try {current_try} succeeded")
+                break
+            else:
+                print(f"Try {current_try} failed")
+        # The assert for pytest
+        assert result is True

test_workflows.py
ADDED
@@ -0,0 +1,30 @@
+from workflows import llm_workflow
+
+
+def llm_workflow_logic():
+
+    old_revision = """Kaman-Kalehöyük Archaeological Museum (Turkish: Kaman-Kalehöyük Arkeoloji Müzesi) is an archaeological museum in Kaman District of Kırşehir Province in Turkey. It exhibits artifacts of seven civilizations excavated in the nearby multi-period mound Kaman-Kalehöyük. It was opened in 2010. A Japanese garden is next to the museum building.[1][2]"""
+
+    new_revision = """The Kaman-Kalehöyük Archaeological Museum (Turkish: Kaman-Kalehöyük Arkeoloji Müzesi) is an archaeological museum in Çağırkan, Kaman District, Kırşehir Province, Turkey. It exhibits artifacts of seven civilizations excavated in the nearby multi-period mound Kaman-Kalehöyük. It opened in 2010. A Japanese garden is next to the museum building.[1][2]"""
+
+    response = llm_workflow(old_revision, new_revision, "aligned-fewshot")
+
+    # The judge should respond with noteworthy: False regardless of the classifier models' responses
+    return response["judge"]["noteworthy"] is False
+
+
+# pytest -vv test_workflows.py::test_llm_workflow
+def test_llm_workflow():
+    """Run LLM workflow logic up to 5 times"""
+    current_try = 0
+    max_trys = 5
+    while current_try < max_trys:
+        current_try += 1
+        result = llm_workflow_logic()
+        if result is True:
+            print(f"Try {current_try} succeeded")
+            break
+        else:
+            print(f"Try {current_try} failed")
+    # The actual test for pytest
+    assert result is True

wiki_data_fetcher.py
ADDED
@@ -0,0 +1,339 @@
+import requests
+from datetime import datetime, timedelta
+from typing import Dict, Optional
+import re
+
+
+def run_get_request(params: dict):
+    """
+    Utility function to run GET request against Wikipedia API
+    """
+    base_url = "https://en.wikipedia.org/w/api.php"
+
+    # We need to supply headers for the request to work
+    headers = {
+        "User-Agent": f"NoteworthyDifferences/1.0 (j3ffdick@gmail.com) requests/{requests.__version__}"
+    }
+
+    response = requests.get(base_url, params=params, headers=headers)
+    # Handle HTTP errors
+    response.raise_for_status()
+
+    try:
+        json_data = response.json()
+    except Exception:
+        raise ValueError(f"Unable to parse response: {response}")
+
+    return json_data
+
+
+def extract_revision_info(json_data, revision=0):
+    """
+    Utility function to extract page revision info from JSON data returned from API call
+
+    Args:
+        revision: revision before current
+
+    Examples:
+        title = 'David_Szalay'
+        json_data = get_previous_revisions(title, revisions=100)
+        extract_revision_info(json_data)  # Current revision
+        extract_revision_info(json_data, 10)  # 10th revision before current
+        extract_revision_info(json_data, 100)  # 100th revision before current
+    """
+    # Extract page and revision info
+    pages = json_data["query"]["pages"]
+    page_id = list(pages.keys())[0]
+
+    if page_id == "-1":
+        # Page not found, return empty dict
+        return {"revid": None, "timestamp": None}
+
+    try:
+        # Get the specified revision
+        revision = pages[page_id]["revisions"][revision]
+        revid = revision["revid"]
+        timestamp = revision["timestamp"]
+    except:
+        # Revision not found, return empty dict
+        return {"revid": None, "timestamp": None}
+
+    # NOTUSED: Create permanent URL
+    # permanent_url = f"https://en.wikipedia.org/w/index.php?title={title}&oldid={revid}"
+
+    # Remove the parentid key because we don't use it
+    _ = revision.pop("parentid", None)
+    return revision
+
+
+def get_revision_from_age(title: str, age_days: int = 0) -> Dict[str, str]:
+    """
+    Get the revision info of a Wikipedia article closest to the age in days.
+
+    Args:
+        title: Wikipedia article title (e.g., 'David_Szalay')
+        age_days: Age of the article revision in days (0 for current)
+
+    Returns:
+        Dictionary containing:
+        - 'revid': Revision id of the article revision
+        - 'timestamp': Timestamp of the article revision
+    """
+
+    # Get the target date
+    target_date = datetime.utcnow() - timedelta(days=age_days)
+
+    # Get the revision closest to the target date
+    params = {
+        "action": "query",
+        "titles": title,
+        "prop": "revisions",
+        "rvlimit": 1,
+        "rvdir": "older",
+        "rvstart": target_date.isoformat() + "Z",
+        "rvprop": "ids|timestamp",
+        "format": "json",
+    }
+
+    # Run GET request
+    json_data = run_get_request(params)
+
+    # Return revision info
+    return extract_revision_info(json_data)
+
+
+def get_previous_revisions(title: str, revisions: int = 0) -> Dict[str, str]:
+    """
+    Get the revision info of a Wikipedia article a certain number of revisions before the current one.
+
+    Args:
+        title: Wikipedia article title (e.g., 'David_Szalay')
+        revisions: What revision before current (0 for current, must be between 0 and 499)
+
+    Returns:
+        Dictionary containing:
+        - 'revid': Revision id of the article revision
+        - 'timestamp': Timestamp of the article revision
+
+    Note:
+        In the Wikipedia API, rvlimit is how many revisions will be returned and must be between 1 and 500
+        rvlimit = 1 returns a single revision: the current one
+        rvlimit = 101 returns the 100 most recent revisions and the current one
+        This is why we use rvlimit = revisions + 1
+    """
+
+    # Get the requested number of revisions, counting back from the current one
+    params = {
+        "action": "query",
+        "prop": "revisions",
+        "titles": title,
+        "rvlimit": revisions + 1,
+        "rvdir": "older",
+        "rvprop": "ids|timestamp",
+        "format": "json",
+    }
+
+    # Run GET request
+    json_data = run_get_request(params)
+
+    # Return info for all revisions
+    return json_data
+
+
+def get_wikipedia_introduction(revid: int) -> Dict[str, str]:
+    """
+    Retrieve the introduction of a Wikipedia article.
+
+    Args:
+        revid: Revision id of the article
+
+    Returns:
+        Text of the introduction
+
+    Example:
+        # Get intro from current article revision
+        revision_info = get_revision_from_age("David_Szalay")
+        get_wikipedia_introduction(revision_info["revid"])
+    """
+
+    # Return None for missing revid
+    if not revid:
+        return None
+
+    # Get the content of this specific revision
+    params = {"action": "parse", "oldid": revid, "prop": "text", "format": "json"}
+
+    json_data = run_get_request(params)
+
+    # Sometimes a revision is deleted and can't be viewed
+    # E.g. revid = '1276494621' for Turin
+    try:
+        html_content = json_data["parse"]["text"]["*"]
+    except:
+        return None
+
+    # Extract introduction (text before first section heading)
+    # Remove everything from the first <h2> tag onwards
+    intro_html = re.split(r"<h2", html_content, maxsplit=1)[0]
+
+    # Extract text from paragraphs, excluding certain elements
+    from html.parser import HTMLParser
+
+    class IntroParser(HTMLParser):
+        def __init__(self):
+            super().__init__()
+            self.text = []
+            self.in_p = False
+            self.skip = False
+
+        def handle_starttag(self, tag, attrs):
+            if tag == "p":
+                self.in_p = True
+            # Skip certain elements
+            if tag in ["style", "script", "table", "div"]:
+                attrs_dict = dict(attrs)
+                # Skip infoboxes, navboxes, etc.
+                if "class" in attrs_dict:
+                    if any(
+                        x in attrs_dict["class"]
+                        for x in ["infobox", "navbox", "metadata", "toc"]
+                    ):
+                        self.skip = True
+                if tag in ["style", "script"]:
+                    self.skip = True
+
+        def handle_endtag(self, tag):
+            if tag == "p":
+                if self.in_p and self.text and not self.text[-1].endswith("\n\n"):
+                    self.text.append("\n\n")
+                self.in_p = False
+            if tag in ["style", "script", "table", "div"]:
+                self.skip = False
+
+        def handle_data(self, data):
+            if self.in_p and not self.skip:
+                # *Don't* clean up whitespace here - it makes run-on words
+                # text = " ".join(data.split())
+                text = data
+                if text:
+                    self.text.append(text)
+
+    parser = IntroParser()
+    parser.feed(intro_html)
+
+    # Join and clean up the text
+    introduction = "".join(parser.text).strip()
+
+    # Remove multiple newlines
+    introduction = re.sub(r"\n{3,}", "\n\n", introduction)
+
+    # Remove empty paragraphs
+    paragraphs = [p.strip() for p in introduction.split("\n\n") if p.strip()]
+    introduction = "\n\n".join(paragraphs)
+
+    return introduction
+
+
+def get_revisions_behind(title: str, revid: int) -> int:
+    """
+    Get the number of revisions a given revid is behind the current revision of the page.
+
+    Args:
+        title: Wikipedia article title
+        revid: Revision ID of the page
+
+    Returns:
+        Integer representing the number of revisions back (0 if it's the current revision)
+
+    Example:
+        # Get how many revisions behind a specific revid is
+        revisions_behind = get_revisions_behind("David_Szalay", 123456789)
+    """
+
+    ## First, get the page title from the revid
+    # params = {"action": "parse", "oldid": revid, "prop": "title", "format": "json"}
+    # try:
+    #     json_data = run_get_request(params)
+    #     title = json_data["parse"]["title"]
+    # except Exception:
+    #     # If we can't get the title, the revid might be invalid
+    #     raise ValueError(f"Could not retrieve page title for revid {revid}. The revid may be invalid or deleted.")
+
+    # Search through revisions going back from current
+    # We'll paginate through results if needed
+    revision_count = 0
+    continue_token = None
+
+    # Run the loop twice to get up to 1000 revisions behind
+    for i in range(2):
+        params = {
+            "action": "query",
+            "titles": title,
+            "prop": "revisions",
+            "rvlimit": 500,  # API limit per request
+            "rvdir": "older",
+            "rvprop": "ids",
+            "format": "json",
+        }
+
+        if continue_token:
+            params["rvcontinue"] = continue_token
+
+        try:
+            json_data = run_get_request(params)
+            pages = json_data["query"]["pages"]
+            page_id = list(pages.keys())[0]
+
+            if page_id == "-1":
+                raise ValueError(f"Page not found for revid {revid}")
+
+            revisions = pages[page_id]["revisions"]
+
+            # Find the index of the given revid in the current batch of revisions
+            for i, revision in enumerate(revisions):
+                if revision["revid"] == revid:
+                    return revision_count + i
+
+            # Update the count of revisions we've checked
+            revision_count += len(revisions)
+
+            # Check if there are more revisions to search
+            continue_token = json_data.get("continue", {}).get("rvcontinue")
+
+            if not continue_token:
+                # Reached the end of revisions but didn't find the revid
+                raise ValueError(
+                    f"Revid {revid} not found in the revision history of the page. "
+                    f"It may be from a different page or may have been deleted."
+                )
+
+        except ValueError:
+            # Re-raise ValueError exceptions
+            raise
+        except Exception as e:
+            raise ValueError(f"Error searching for revid {revid}: {e}")
+
+    # If we looped without returning the revision count, return it as a negative number
+    negative_revision_count = -revision_count
+    return negative_revision_count
+
+
+def get_random_wikipedia_title():
+    url = "https://en.wikipedia.org/w/api.php"
+    params = {
+        "action": "query",
+        "list": "random",
+        "rnnamespace": 0,
+        "rnlimit": 1,
+        "format": "json",
+    }
+
+    try:
+        json_data = run_get_request(params)
+
+        # Extract the title
+        title = json_data["query"]["random"][0]["title"]
+        return title
+
+    except requests.RequestException as e:
+        print(f"Error fetching random Wikipedia title: {e}")
+        return None
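
A minimal usage sketch for the fetcher (not part of the commit), pulling the current introduction of an article and the one closest to 30 days ago; the title and the 30-day window are illustrative choices:

from wiki_data_fetcher import get_revision_from_age, get_wikipedia_introduction

title = "David_Szalay"
new_rev = get_revision_from_age(title, age_days=0)
old_rev = get_revision_from_age(title, age_days=30)

# Either call may return None if the revision is missing or deleted
new_intro = get_wikipedia_introduction(new_rev["revid"])
old_intro = get_wikipedia_introduction(old_rev["revid"])
print(old_intro == new_intro)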

workflows.py
ADDED
@@ -0,0 +1,23 @@
+from models import classifier, judge
+
+
+def llm_workflow(old_revision, new_revision, mode="aligned-fewshot"):
+    """
+    Run LLM workflow (input to response)
+
+    Args:
+        mode: "aligned-fewshot" for few-shot alignment or "aligned-heuristic" for heuristic alignment
+    """
+
+    # Run classifier and judge models
+    heuristic = classifier(old_revision, new_revision, "heuristic")
+    few_shot = classifier(old_revision, new_revision, "few-shot")
+    judge_response = judge(
+        old_revision,
+        new_revision,
+        heuristic["rationale"],
+        few_shot["rationale"],
+        mode=mode,
+    )
+
+    return {"heuristic": heuristic, "few-shot": few_shot, "judge": judge_response}
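
As an end-to-end sketch (not part of the commit), the fetcher and workflow can be chained to classify the differences between two revisions of a random article; this assumes GEMINI_API_KEY is set in .env and that data/alignment_fewshot.txt is present, and the 365-day window is an arbitrary illustrative choice:

from wiki_data_fetcher import (
    get_random_wikipedia_title,
    get_revision_from_age,
    get_wikipedia_introduction,
)
from workflows import llm_workflow

title = get_random_wikipedia_title()
old = get_wikipedia_introduction(get_revision_from_age(title, age_days=365)["revid"])
new = get_wikipedia_introduction(get_revision_from_age(title, age_days=0)["revid"])

# Skip articles where either introduction could not be retrieved
if old and new:
    result = llm_workflow(old, new, mode="aligned-fewshot")
    print(result["judge"])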