jedick committed on
Commit
e42e305
·
1 Parent(s): 103ea6f

Add production alignments

Browse files
app.py CHANGED
@@ -212,7 +212,7 @@ with gr.Blocks(title="Noteworthy Differences") as demo:
212
  """
213
  *Click to find an interesting example
214
  by running the model on random pages
215
- until we get a confidence score that is not High,
216
  up to 20 tries*"""
217
  )
218
 
 
212
  """
213
  *Click to find an interesting example
214
  by running the model on random pages
215
+ until the confidence score is not High,
216
  up to 20 tries*"""
217
  )
218
 
models.py CHANGED
@@ -9,9 +9,11 @@ from dotenv import load_dotenv
9
  import json
10
  import os
11
  import pandas as pd
12
- from prompts import analyzer_prompts, judge_prompt
13
  from retry_with_backoff import retry_with_backoff
14
  import logfire
 
 
15
 
16
  # Load API keys
17
  load_dotenv()
@@ -24,6 +26,31 @@ logfire.instrument_google_genai()
24
  client = genai.Client()
25
 
26
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
27
  @retry_with_backoff()
28
  def classifier(old_revision, new_revision, prompt_style):
29
  """
@@ -43,7 +70,7 @@ def classifier(old_revision, new_revision, prompt_style):
43
  return {"noteworthy": None, "rationale": None}
44
 
45
  # Get prompt template for given style
46
- prompt_template = analyzer_prompts[prompt_style]
47
 
48
  # Add article revisions to prompt
49
  prompt = prompt_template.replace("{{old_revision}}", old_revision).replace(
@@ -69,7 +96,14 @@ def classifier(old_revision, new_revision, prompt_style):
69
 
70
 
71
  @retry_with_backoff()
72
- def judge(old_revision, new_revision, rationale_1, rationale_2, mode="unaligned"):
 
 
 
 
 
 
 
73
  """
74
  AI judge to settle disagreements between classification models
75
 
@@ -79,6 +113,7 @@ def judge(old_revision, new_revision, rationale_1, rationale_2, mode="unaligned"
79
  rationale_1: Rationale provided by model 1 (i.e., heuristic prompt)
80
  rationale_2: Rationale provided by model 2 (i.e., few-shot prompt)
81
  mode: Prompt mode: unaligned, aligned-fewshot, or aligned-heuristic
 
82
 
83
  Returns:
84
  noteworthy: True if the differences are noteworthy; False if not
@@ -103,7 +138,10 @@ def judge(old_revision, new_revision, rationale_1, rationale_2, mode="unaligned"
103
  lines = file.readlines()
104
  alignment_text = "".join(lines)
105
  elif mode == "aligned-heuristic":
106
- with open("development/alignment_heuristic.txt", "r") as file:
 
 
 
107
  lines = file.readlines()
108
  alignment_text = "".join(lines)
109
  else:
 
9
  import json
10
  import os
11
  import pandas as pd
12
+ from prompts import classifier_prompts, judge_prompt
13
  from retry_with_backoff import retry_with_backoff
14
  import logfire
15
+ import re
16
+ import glob
17
 
18
  # Load API keys
19
  load_dotenv()
 
26
  client = genai.Client()
27
 
28
 
29
def get_latest_iteration(directory="production"):
    """
    Find the latest iteration number among the alignment files in a directory.

    Args:
        directory: Directory searched for alignment_<N>.txt files
            (defaults to "production")

    Returns:
        The highest numeric suffix N found in files named alignment_<N>.txt

    Raises:
        FileNotFoundError: If no file matches alignment_*.txt in the directory
        ValueError: If files match the pattern but none has a numeric suffix
    """
    pattern = f"{directory}/alignment_*.txt"
    files = glob.glob(pattern)

    if not files:
        raise FileNotFoundError(f"No alignment files found matching pattern: {pattern}")

    # Extract numeric suffix from each filename (e.g., "alignment_2.txt" -> 2);
    # names such as "alignment_draft.txt" match the glob but are ignored here
    matches = (re.search(r"alignment_(\d+)\.txt$", file) for file in files)
    iterations = [int(match.group(1)) for match in matches if match]

    # Test for "nothing found" explicitly instead of using 0 as a sentinel,
    # so that a legitimate alignment_0.txt is not reported as invalid
    if not iterations:
        raise ValueError("No valid iteration numbers found in alignment files")

    return max(iterations)
52
+
53
+
54
  @retry_with_backoff()
55
  def classifier(old_revision, new_revision, prompt_style):
56
  """
 
70
  return {"noteworthy": None, "rationale": None}
71
 
72
  # Get prompt template for given style
73
+ prompt_template = classifier_prompts[prompt_style]
74
 
75
  # Add article revisions to prompt
76
  prompt = prompt_template.replace("{{old_revision}}", old_revision).replace(
 
96
 
97
 
98
  @retry_with_backoff()
99
+ def judge(
100
+ old_revision,
101
+ new_revision,
102
+ rationale_1,
103
+ rationale_2,
104
+ mode="aligned-heuristic",
105
+ iteration=None,
106
+ ):
107
  """
108
  AI judge to settle disagreements between classification models
109
 
 
113
  rationale_1: Rationale provided by model 1 (i.e., heuristic prompt)
114
  rationale_2: Rationale provided by model 2 (i.e., few-shot prompt)
115
  mode: Prompt mode: unaligned, aligned-fewshot, or aligned-heuristic
116
+ iteration: Iteration to use for heuristic alignment (None for latest)
117
 
118
  Returns:
119
  noteworthy: True if the differences are noteworthy; False if not
 
138
  lines = file.readlines()
139
  alignment_text = "".join(lines)
140
  elif mode == "aligned-heuristic":
141
+ # Use latest iteration if iteration is None
142
+ if iteration is None:
143
+ iteration = get_latest_iteration()
144
+ with open(f"production/alignment_{str(iteration)}.txt", "r") as file:
145
  lines = file.readlines()
146
  alignment_text = "".join(lines)
147
  else:
production/alignment_1.txt ADDED
@@ -0,0 +1,55 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ## Noteworthy Changes (True)
2
+
3
+ **Factual Additions/Corrections:**
4
+ - Adding significant biographical details (birth year, occupations, cultural names)
5
+ - Updating population data to be significantly more current (5+ years)
6
+ - Adding record-breaking characteristics or superlatives relevant to the article subject
7
+ - Inserting important geographical features (coastal location, landlocked status when not obvious)
8
+ - Including cultural/historical context that wasn't inferable (nicknames with meaning, generational impact)
9
+
10
+ **Tone & Neutrality:**
11
+ - Removing vandalism or inappropriate language that destroys article neutrality
12
+ - Correcting words that radically alter tone (e.g., "lucky" when referring to deaths)
13
+
14
+ **Substantive Content Changes:**
15
+ - Altering the framing of historical events (different causes or contexts presented)
16
+ - Adding causes or factors to complex events (mass displacement, additional contributing factors)
17
+ - Expanding definitions to include significant variations (four-door coupes, multiple pronunciations)
18
+ - Adding technical/scientific explanations for key properties or phenomena
19
+
20
+ **Cultural Recognition:**
21
+ - Including indigenous language names/scripts that honor cultural background
22
+ - Adding alternative official names (Latin, other languages) for historical agreements or places
23
+
24
+ ## Not Noteworthy Changes (False)
25
+
26
+ **Minor Details:**
27
+ - Grammatical adjustments, rephrasing, or structural reorganization
28
+ - Adding locality specificity within already-stated regions (sub-district within district)
29
+ - Removing minor trivia that doesn't affect core understanding
30
+ - Adding nicknames without significant cultural/historical weight
31
+ - Word choice refinements that don't change meaning ("locals" → "Indigenous people" as terminology update only)
32
+
33
+ **Inferable Information:**
34
+ - Details implied by other information (e.g., "landlocked" when no coastal borders mentioned)
35
+ - Ancestry relationships that explain omitted technical details
36
+
37
+ **Depth vs. Substance:**
38
+ - Deeper analysis that doesn't change fundamental conclusions
39
+ - Additional technical details that provide nuance but not new understanding
40
+ - Clarifications of already-conveyed information
41
+ - Minor statistical updates that don't represent significant temporal gaps
42
+
43
+ **Administrative/Format Changes:**
44
+ - Order changes in lists
45
+ - Removal of extraneous text or typos (unless they substantially affected meaning)
46
+ - Template replacements with equivalent content
47
+
48
+ ## Key Principles
49
+
50
+ 1. **Inferability Test**: If the new information could be reasonably inferred from the old revision, it's likely not noteworthy
51
+ 2. **Completeness vs. Depth**: Adding information that makes an entry more complete (new occupation, birth year) is noteworthy; adding depth to existing information usually isn't
52
+ 3. **Context Matters**: The same change can be noteworthy or not depending on article context (longest year trivia is noteworthy on a year's own page)
53
+ 4. **Temporal Significance**: Updates spanning 5+ years are noteworthy; minor year-to-year updates typically aren't
54
+ 5. **Framing Changes**: Alterations to how events/subjects are presented or understood are noteworthy; rewordings that preserve meaning aren't
55
+ 6. **Cultural Respect**: Additions recognizing cultural identity, indigenous languages, or heritage are noteworthy
production/alignment_2.txt ADDED
@@ -0,0 +1,68 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ## Noteworthy Changes (True)
2
+
3
+ **Factual Additions/Corrections:**
4
+ - Adding significant biographical details (birth year, occupations, cultural names). This includes quantifiable traits like height or blood type.
5
+ - Updating population data to be significantly more current (5+ years).
6
+ - Adding record-breaking characteristics or superlatives relevant to the article subject.
7
+ - Inserting important geographical features (coastal location, landlocked status when not obvious).
8
+ - Including cultural/historical context that wasn't inferable (nicknames with meaning, generational impact, direct quotes revealing significant personal preference).
9
+ - Adding specific dates for events or tenures that were previously undated or only broadly described.
10
+
11
+ **Tone & Neutrality:**
12
+ - Removing vandalism or inappropriate language that destroys article neutrality.
13
+ - Correcting words that radically alter tone (e.g., "lucky" when referring to deaths).
14
+
15
+ **Substantive Content Changes:**
16
+ - Altering the framing of historical events (different causes or contexts presented).
17
+ - Adding causes or factors to complex events (mass displacement, additional contributing factors).
18
+ - Expanding definitions to include **new significant categories or fundamental variations** (e.g., four-door coupes, multiple pronunciations, entirely new types of an object).
19
+ - Adding **new or significantly altered** technical/scientific explanations for key properties or phenomena (vs. adding depth or nuance to existing explanations).
20
+ - Removal of explicit lists of components (e.g., list of schools in a district) that define a subject's fundamental structure.
21
+ - Significant factual updates to a subject's current status (e.g., changing 'plays for' to 'played in' and specifying league for a player).
22
+
23
+ **Cultural Recognition:**
24
+ - Including indigenous language names/scripts that honor cultural background.
25
+ - Adding alternative official names (Latin, other languages) for historical agreements or places.
26
+ - Adding native language pronunciation (e.g., IPA or romanization) when previously absent or incorrect/broken template.
27
+
28
+ ## Not Noteworthy Changes (False)
29
+
30
+ **Minor Details:**
31
+ - Grammatical adjustments, rephrasing, or structural reorganization. This includes minor grammatical corrections that may incidentally imply a factual update, but the core meaning remains clear (e.g., 'formally' to 'formerly', 'was' to 'is' when the status is broadly understood or easily inferable).
32
+ - Adding locality specificity within already-stated regions (sub-district within district, specific county within a known state/country).
33
+ - Adding nicknames without significant cultural/historical weight.
34
+ - Word choice refinements that don't change fundamental meaning or fundamentally alter the subject's core identity or role (e.g., "locals" → "Indigenous people" as terminology update only; "fashion model" to "supermodel and activist"; "large" to "coeducational"; clarifying a 'brook' is a 'river' when its nature is implicit).
35
+ - Removing specific birth/death *locations* (city, state/province) if the broader nationality or region is known or implied, and if no other essential geographical information about the person's origin or demise is lost.
36
+ - Removal of alternative names or lesser-known designations if the primary identifier remains and the core identity is unaffected.
37
+ - Adding official subtitles or taglines for media (e.g., TV season subtitles), which serve as additional identifiers rather than core content.
38
+ - Adding an item to an existing, general list of types or instances if it doesn't represent a significant new category or a major increase in scope (e.g., adding one more country to a long list, one more hit song to an existing list, 'rapid transit' to railway types if the article broadly covers railway types).
39
+ - Adding specific ordinal numbers (e.g., '60th governor', 'sixteenth album') or precise dates for tenures/events that are already generally known or broadly described.
40
+ - Adding etymological meanings or detailed historical anecdotes/origins for names/terms if their primary identity and context are already established and the addition does not introduce new fundamental understanding or change the article's core narrative.
41
+ - Adding relational context or familial prominence (e.g., 'disciple of X', 'member of prominent family') if it doesn't introduce specific new achievements, roles, or a fundamental shift in the subject's identity directly attributable to that relationship.
42
+ - Adding universally known dates to major events already mentioned (e.g., "1485" for the Battle of Bosworth).
43
+
44
+ **Inferable Information:**
45
+ - Details implied by other information (e.g., "landlocked" when no coastal borders mentioned).
46
+ - Ancestry relationships that explain omitted technical details.
47
+ - Generalizing a location from a specific city to a broader country, when both are known/implied and no new specific information or clarification is provided.
48
+
49
+ **Depth vs. Substance:**
50
+ - Deeper analysis that doesn't change fundamental conclusions.
51
+ - Additional technical details that provide nuance but not new fundamental understanding or mechanism for a concept.
52
+ - Clarifications of already-conveyed information.
53
+ - Minor statistical updates that don't represent significant temporal gaps.
54
+ - Adding specific criteria or detailed definitions that deepen understanding of an existing concept but do not change its fundamental nature or core conclusion.
55
+
56
+ **Administrative/Format Changes:**
57
+ - Order changes in lists.
58
+ - Removal of extraneous text or typos (unless they substantially affected meaning).
59
+ - Template replacements with equivalent content (unless fixing a broken template that rendered information inaccessible).
60
+
61
+ ## Key Principles
62
+
63
+ 1. **Inferability Test**: If the new information could be reasonably inferred from the old revision, or if it generalizes already specific information without adding new insight, it's likely not noteworthy.
64
+ 2. **Completeness vs. Depth**: Adding information that makes an entry more complete by introducing **new core facts** (e.g., a previously missing occupation, birth year, specific height/blood type, adding a previously missing indigenous script) is noteworthy. Adding **deeper analysis, minor specificity, or additional context** to existing information without changing fundamental conclusions usually isn't.
65
+ 3. **Context Matters**: The same change can be noteworthy or not depending on article context (longest year trivia is noteworthy on a year's own page).
66
+ 4. **Temporal Significance**: Updates spanning 5+ years are noteworthy; minor year-to-year updates typically aren't.
67
+ 5. **Framing Changes**: Alterations to how events or subjects are presented or understood at a fundamental, definitional level are noteworthy; descriptive upgrades or rewordings that preserve core meaning are not.
68
+ 6. **Cultural Respect**: Additions recognizing cultural identity, indigenous languages, or heritage (e.g., native scripts, pronunciations, meaningful cultural names) are noteworthy.
prompts.py CHANGED
@@ -17,7 +17,7 @@ Return a JSON-formatted response with keys for:
17
  </new_revision>
18
  """
19
 
20
- analyzer_prompts = {
21
  "heuristic": skeleton.replace(
22
  "{{instructions}}",
23
  """
@@ -89,3 +89,22 @@ Return a JSON-formatted response with keys for:
89
  {{model_2_rationale}}
90
  </model_2_rationale>
91
  """
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
17
  </new_revision>
18
  """
19
 
20
+ classifier_prompts = {
21
  "heuristic": skeleton.replace(
22
  "{{instructions}}",
23
  """
 
89
  {{model_2_rationale}}
90
  </model_2_rationale>
91
  """
92
+
93
+ update_prompt = """
94
+ You are fine-tuning an AI system for detecting noteworthy differences between Wikipedia article revisions.
95
+ The system has two classifier models and an AI judge.
96
+ The alignment text for the judge is provided below.
97
+ Please update this alignment text based on the example text.
98
+ The example text contains the models' responses as well as human feedback.
99
+ You should change, remove or add alignment text wherever needed to make it consistent with the human feedback.
100
+ The new alignment text should provide guidance to an LLM to make the correct choice on unseen examples.
101
+ Respond only with an updated alignment text.
102
+
103
+ <alignment_text>
104
+ {{alignment_text}}
105
+ </alignment_text>
106
+
107
+ <examples_text>
108
+ {{examples_text}}
109
+ </examples_text>
110
+ """
update_alignment.py ADDED
@@ -0,0 +1,77 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from datasets import load_dataset
2
+ from google import genai
3
+ from dotenv import load_dotenv
4
+ from retry_with_backoff import retry_with_backoff
5
+ from prompts import update_prompt
6
+ import logfire
7
+
8
+ # Load API keys
9
+ load_dotenv()
10
+
11
+ # This wraps Google Gen AI client calls
12
+ # to capture prompts, responses, and metadata
13
+ logfire.configure()
14
+ logfire.instrument_google_genai()
15
+
16
+ # Initialize the Gemini LLM
17
+ client = genai.Client()
18
+
19
+
20
def _format_example(row):
    """Format one feedback row as a text example for the alignment-update prompt."""
    # The human-confirmed label is "noteworthy" when the human agrees with a
    # noteworthy verdict from the judge, or disagrees with a not-noteworthy one
    noteworthy = "not noteworthy differences"
    if row["judge_noteworthy"] and row["feedback"] == "agree":
        noteworthy = "noteworthy differences"
    if not row["judge_noteworthy"] and row["feedback"] == "disagree":
        noteworthy = "noteworthy differences"
    heuristic = f"Model 1: {row['heuristic_rationale']}"
    fewshot = f"Model 2: {row['fewshot_rationale']}"
    judge = f"AI Judge: {row['judge_reasoning']}"
    human = f"Human feedback: {row['feedback']}"
    return f"{heuristic}\n{fewshot}\n{judge}\n{human} ({noteworthy})."


@logfire.instrument("Update alignment")
def update_alignment(
    source_path="production/alignment_1.txt",
    target_path="production/alignment_2.txt",
    max_examples=30,
):
    """
    Ask the LLM to revise the judge's alignment text using human feedback.

    Loads the feedback dataset, formats a subset of rows as examples, inserts
    them together with the current alignment text into update_prompt, and
    writes the model's response to a new alignment file.

    Args:
        source_path: Existing alignment text file to be updated
        target_path: File that receives the updated alignment text
        max_examples: Maximum number of feedback rows included in the prompt

    Raises:
        ValueError: If the model response contains no text
    """
    # Load feedback dataset
    dataset = load_dataset("jedick/noteworthy-differences-feedback")
    # Convert to DataFrame
    df = dataset["train"].to_pandas()
    # Remove samples with High confidence where feedback is "agree"
    high_and_agree = (df["confidence_score"] == "High") & (df["feedback"] == "agree")
    df = df.loc[~high_and_agree]
    # Get examples for training the LLM (first max_examples rows that are not High confidence)
    examples = df[df.confidence_score != "High"].iloc[:max_examples, :]

    # Build the examples text from the selected subset only; looping over the
    # full DataFrame would ignore the example limit computed above
    examples_text = "\n\n".join(_format_example(row) for _, row in examples.iterrows())

    # Read the existing alignment (UTF-8: the text contains non-ASCII characters)
    with open(source_path, "r", encoding="utf-8") as file:
        alignment_text = file.read()

    # Write prompt to update alignment
    prompt = update_prompt.replace("{{alignment_text}}", alignment_text).replace(
        "{{examples_text}}", examples_text
    )

    # Function to generate response
    @retry_with_backoff()
    def get_response():
        response = client.models.generate_content(
            model="gemini-2.5-flash",
            contents=prompt,
        )
        return response

    # Get the response
    response = get_response()
    # Check the text before opening the target so a failed generation
    # does not leave behind an empty or truncated alignment file
    updated_text = response.text
    if not updated_text:
        raise ValueError("Model response contained no alignment text")

    # Save to new alignment text file
    with open(target_path, "w", encoding="utf-8") as file:
        file.write(updated_text)
73
+
74
+
75
+ if __name__ == "__main__":
76
+
77
+ update_alignment()