Spaces:

jedick
/

noteworthy-differences

Sleeping

App Files Files Community

jedick committed on Dec 21, 2025

Commit

e42e305

1 Parent(s): 103ea6f

Add production alignments

Browse files

Files changed (6) hide show

app.py +1 -1
models.py +42 -4
production/alignment_1.txt +55 -0
production/alignment_2.txt +68 -0
prompts.py +20 -1
update_alignment.py +77 -0

app.py CHANGED Viewed

@@ -212,7 +212,7 @@ with gr.Blocks(title="Noteworthy Differences") as demo:
                             """
                         *Click to find an interesting example
                         by running the model on random pages
-                        until we get a confidence score that is not High,
                         up to 20 tries*"""
                         )

                             """
                         *Click to find an interesting example
                         by running the model on random pages
+                        until the confidence score is not High,
                         up to 20 tries*"""
                         )

models.py CHANGED Viewed

@@ -9,9 +9,11 @@ from dotenv import load_dotenv
 import json
 import os
 import pandas as pd
-from prompts import analyzer_prompts, judge_prompt
 from retry_with_backoff import retry_with_backoff
 import logfire
 # Load API keys
 load_dotenv()
@@ -24,6 +26,31 @@ logfire.instrument_google_genai()
 client = genai.Client()
 @retry_with_backoff()
 def classifier(old_revision, new_revision, prompt_style):
     """
@@ -43,7 +70,7 @@ def classifier(old_revision, new_revision, prompt_style):
         return {"noteworthy": None, "rationale": None}
     # Get prompt template for given style
-    prompt_template = analyzer_prompts[prompt_style]
     # Add article revisions to prompt
     prompt = prompt_template.replace("{{old_revision}}", old_revision).replace(
@@ -69,7 +96,14 @@ def classifier(old_revision, new_revision, prompt_style):
 @retry_with_backoff()
-def judge(old_revision, new_revision, rationale_1, rationale_2, mode="unaligned"):
     """
     AI judge to settle disagreements between classification models
@@ -79,6 +113,7 @@ def judge(old_revision, new_revision, rationale_1, rationale_2, mode="unaligned"
         rationale_1: Rationale provided by model 1 (i.e., heuristic prompt)
         rationale_2: Rationale provided by model 2 (i.e., few-shot prompt)
         mode: Prompt mode: unaligned, aligned-fewshot, or aligned-heuristic
     Returns:
         noteworthy: True if the differences are noteworthy; False if not
@@ -103,7 +138,10 @@ def judge(old_revision, new_revision, rationale_1, rationale_2, mode="unaligned"
             lines = file.readlines()
             alignment_text = "".join(lines)
     elif mode == "aligned-heuristic":
-        with open("development/alignment_heuristic.txt", "r") as file:
             lines = file.readlines()
             alignment_text = "".join(lines)
     else:

 import json
 import os
 import pandas as pd
+from prompts import classifier_prompts, judge_prompt
 from retry_with_backoff import retry_with_backoff
 import logfire
+import re
+import glob
 # Load API keys
 load_dotenv()
 client = genai.Client()
def get_latest_iteration():
    """
    Find the latest iteration number from alignment files in the production directory.

    Scans files matching production/alignment_*.txt and returns the highest
    numeric suffix (e.g., 2 when "alignment_1.txt" and "alignment_2.txt" exist).
    Files whose suffix is not purely numeric (e.g., "alignment_notes.txt") are ignored.

    Returns:
        The largest iteration number found (int); 0 is a valid result

    Raises:
        FileNotFoundError: No file matches production/alignment_*.txt
        ValueError: Files match the pattern but none has a numeric suffix
    """
    pattern = "production/alignment_*.txt"
    files = glob.glob(pattern)
    if not files:
        raise FileNotFoundError(f"No alignment files found matching pattern: {pattern}")
    iterations = []
    for path in files:
        # Match the whole basename so that only "alignment_<digits>.txt" counts
        # (a partial match could pick up a suffix from a longer file name)
        match = re.fullmatch(r"alignment_(\d+)\.txt", os.path.basename(path))
        if match:
            iterations.append(int(match.group(1)))
    # Use an empty list rather than a 0 sentinel to detect "nothing valid",
    # so that alignment_0.txt is accepted as a real iteration
    if not iterations:
        raise ValueError("No valid iteration numbers found in alignment files")
    return max(iterations)
 @retry_with_backoff()
 def classifier(old_revision, new_revision, prompt_style):
     """
         return {"noteworthy": None, "rationale": None}
     # Get prompt template for given style
+    prompt_template = classifier_prompts[prompt_style]
     # Add article revisions to prompt
     prompt = prompt_template.replace("{{old_revision}}", old_revision).replace(
 @retry_with_backoff()
+def judge(
+    old_revision,
+    new_revision,
+    rationale_1,
+    rationale_2,
+    mode="aligned-heuristic",
+    iteration=None,
+):
     """
     AI judge to settle disagreements between classification models
         rationale_1: Rationale provided by model 1 (i.e., heuristic prompt)
         rationale_2: Rationale provided by model 2 (i.e., few-shot prompt)
         mode: Prompt mode: unaligned, aligned-fewshot, or aligned-heuristic
+        iteration: Iteration to use for heuristic alignment (None for latest)
     Returns:
         noteworthy: True if the differences are noteworthy; False if not
             lines = file.readlines()
             alignment_text = "".join(lines)
     elif mode == "aligned-heuristic":
+        # Use latest iteration if iteration is None
+        if iteration is None:
+            iteration = get_latest_iteration()
+        with open(f"production/alignment_{str(iteration)}.txt", "r") as file:
             lines = file.readlines()
             alignment_text = "".join(lines)
     else:

production/alignment_1.txt ADDED Viewed

	@@ -0,0 +1,55 @@

+## Noteworthy Changes (True)
+**Factual Additions/Corrections:**
+- Adding significant biographical details (birth year, occupations, cultural names)
+- Updating population data to be significantly more current (5+ years)
+- Adding record-breaking characteristics or superlatives relevant to the article subject
+- Inserting important geographical features (coastal location, landlocked status when not obvious)
+- Including cultural/historical context that wasn't inferable (nicknames with meaning, generational impact)
+**Tone & Neutrality:**
+- Removing vandalism or inappropriate language that destroys article neutrality
+- Correcting words that radically alter tone (e.g., "lucky" when referring to deaths)
+**Substantive Content Changes:**
+- Altering the framing of historical events (different causes or contexts presented)
+- Adding causes or factors to complex events (mass displacement, additional contributing factors)
+- Expanding definitions to include significant variations (four-door coupes, multiple pronunciations)
+- Adding technical/scientific explanations for key properties or phenomena
+**Cultural Recognition:**
+- Including indigenous language names/scripts that honor cultural background
+- Adding alternative official names (Latin, other languages) for historical agreements or places
+## Not Noteworthy Changes (False)
+**Minor Details:**
+- Grammatical adjustments, rephrasing, or structural reorganization
+- Adding locality specificity within already-stated regions (sub-district within district)
+- Removing minor trivia that doesn't affect core understanding
+- Adding nicknames without significant cultural/historical weight
+- Word choice refinements that don't change meaning ("locals" → "Indigenous people" as terminology update only)
+**Inferable Information:**
+- Details implied by other information (e.g., "landlocked" when no coastal borders mentioned)
+- Ancestry relationships that explain omitted technical details
+**Depth vs. Substance:**
+- Deeper analysis that doesn't change fundamental conclusions
+- Additional technical details that provide nuance but not new understanding
+- Clarifications of already-conveyed information
+- Minor statistical updates that don't represent significant temporal gaps
+**Administrative/Format Changes:**
+- Order changes in lists
+- Removal of extraneous text or typos (unless they substantially affected meaning)
+- Template replacements with equivalent content
+## Key Principles
+1. **Inferability Test**: If the new information could be reasonably inferred from the old revision, it's likely not noteworthy
+2. **Completeness vs. Depth**: Adding information that makes an entry more complete (new occupation, birth year) is noteworthy; adding depth to existing information usually isn't
+3. **Context Matters**: The same change can be noteworthy or not depending on article context (longest year trivia is noteworthy on a year's own page)
+4. **Temporal Significance**: Updates spanning 5+ years are noteworthy; minor year-to-year updates typically aren't
+5. **Framing Changes**: Alterations to how events/subjects are presented or understood are noteworthy; rewordings that preserve meaning aren't
+6. **Cultural Respect**: Additions recognizing cultural identity, indigenous languages, or heritage are noteworthy

production/alignment_2.txt ADDED Viewed

	@@ -0,0 +1,68 @@

+## Noteworthy Changes (True)
+**Factual Additions/Corrections:**
+- Adding significant biographical details (birth year, occupations, cultural names). This includes quantifiable traits like height or blood type.
+- Updating population data to be significantly more current (5+ years).
+- Adding record-breaking characteristics or superlatives relevant to the article subject.
+- Inserting important geographical features (coastal location, landlocked status when not obvious).
+- Including cultural/historical context that wasn't inferable (nicknames with meaning, generational impact, direct quotes revealing significant personal preference).
+- Adding specific dates for events or tenures that were previously undated or only broadly described.
+**Tone & Neutrality:**
+- Removing vandalism or inappropriate language that destroys article neutrality.
+- Correcting words that radically alter tone (e.g., "lucky" when referring to deaths).
+**Substantive Content Changes:**
+- Altering the framing of historical events (different causes or contexts presented).
+- Adding causes or factors to complex events (mass displacement, additional contributing factors).
+- Expanding definitions to include **new significant categories or fundamental variations** (e.g., four-door coupes, multiple pronunciations, entirely new types of an object).
+- Adding **new or significantly altered** technical/scientific explanations for key properties or phenomena (vs. adding depth or nuance to existing explanations).
+- Removal of explicit lists of components (e.g., list of schools in a district) that define a subject's fundamental structure.
+- Significant factual updates to a subject's current status (e.g., changing 'plays for' to 'played in' and specifying league for a player).
+**Cultural Recognition:**
+- Including indigenous language names/scripts that honor cultural background.
+- Adding alternative official names (Latin, other languages) for historical agreements or places.
+- Adding native language pronunciation (e.g., IPA or romanization) when previously absent or incorrect/broken template.
+## Not Noteworthy Changes (False)
+**Minor Details:**
+- Grammatical adjustments, rephrasing, or structural reorganization. This includes minor grammatical corrections that may incidentally imply a factual update, but the core meaning remains clear (e.g., 'formally' to 'formerly', 'was' to 'is' when the status is broadly understood or easily inferable).
+- Adding locality specificity within already-stated regions (sub-district within district, specific county within a known state/country).
+- Adding nicknames without significant cultural/historical weight.
+- Word choice refinements that don't change fundamental meaning or fundamentally alter the subject's core identity or role (e.g., "locals" → "Indigenous people" as terminology update only; "fashion model" to "supermodel and activist"; "large" to "coeducational"; clarifying a 'brook' is a 'river' when its nature is implicit).
+- Removing specific birth/death *locations* (city, state/province) if the broader nationality or region is known or implied, and if no other essential geographical information about the person's origin or demise is lost.
+- Removal of alternative names or lesser-known designations if the primary identifier remains and the core identity is unaffected.
+- Adding official subtitles or taglines for media (e.g., TV season subtitles), which serve as additional identifiers rather than core content.
+- Adding an item to an existing, general list of types or instances if it doesn't represent a significant new category or a major increase in scope (e.g., adding one more country to a long list, one more hit song to an existing list, 'rapid transit' to railway types if the article broadly covers railway types).
+- Adding specific ordinal numbers (e.g., '60th governor', 'sixteenth album') or precise dates for tenures/events that are already generally known or broadly described.
+- Adding etymological meanings or detailed historical anecdotes/origins for names/terms if their primary identity and context are already established and the addition does not introduce new fundamental understanding or change the article's core narrative.
+- Adding relational context or familial prominence (e.g., 'disciple of X', 'member of prominent family') if it doesn't introduce specific new achievements, roles, or a fundamental shift in the subject's identity directly attributable to that relationship.
+- Adding universally known dates to major events already mentioned (e.g., "1485" for the Battle of Bosworth).
+**Inferable Information:**
+- Details implied by other information (e.g., "landlocked" when no coastal borders mentioned).
+- Ancestry relationships that explain omitted technical details.
+- Generalizing a location from a specific city to a broader country, when both are known/implied and no new specific information or clarification is provided.
+**Depth vs. Substance:**
+- Deeper analysis that doesn't change fundamental conclusions.
+- Additional technical details that provide nuance but not new fundamental understanding or mechanism for a concept.
+- Clarifications of already-conveyed information.
+- Minor statistical updates that don't represent significant temporal gaps.
+- Adding specific criteria or detailed definitions that deepen understanding of an existing concept but do not change its fundamental nature or core conclusion.
+**Administrative/Format Changes:**
+- Order changes in lists.
+- Removal of extraneous text or typos (unless they substantially affected meaning).
+- Template replacements with equivalent content (unless fixing a broken template that rendered information inaccessible).
+## Key Principles
+1.  **Inferability Test**: If the new information could be reasonably inferred from the old revision, or if it generalizes already specific information without adding new insight, it's likely not noteworthy.
+2.  **Completeness vs. Depth**: Adding information that makes an entry more complete by introducing **new core facts** (e.g., a previously missing occupation, birth year, specific height/blood type, adding a previously missing indigenous script) is noteworthy. Adding **deeper analysis, minor specificity, or additional context** to existing information without changing fundamental conclusions usually isn't.
+3.  **Context Matters**: The same change can be noteworthy or not depending on article context (longest year trivia is noteworthy on a year's own page).
+4.  **Temporal Significance**: Updates spanning 5+ years are noteworthy; minor year-to-year updates typically aren't.
+5.  **Framing Changes**: Alterations to how events or subjects are presented or understood at a fundamental, definitional level are noteworthy; descriptive upgrades or rewordings that preserve core meaning are not.
+6.  **Cultural Respect**: Additions recognizing cultural identity, indigenous languages, or heritage (e.g., native scripts, pronunciations, meaningful cultural names) are noteworthy.

prompts.py CHANGED Viewed

@@ -17,7 +17,7 @@ Return a JSON-formatted response with keys for:
 </new_revision>
 """
-analyzer_prompts = {
     "heuristic": skeleton.replace(
         "{{instructions}}",
         """
@@ -89,3 +89,22 @@ Return a JSON-formatted response with keys for:
 {{model_2_rationale}}
 </model_2_rationale>
 """

 </new_revision>
 """
+classifier_prompts = {
     "heuristic": skeleton.replace(
         "{{instructions}}",
         """
 {{model_2_rationale}}
 </model_2_rationale>
 """
+update_prompt = """
+You are fine-tuning an AI system for detecting noteworthy differences between Wikipedia article revisions.
+The system has two classifier models and an AI judge.
+The alignment text for the judge is provided below.
+Please update this alignment text based on the example text.
+The example text contains the models' responses as well as human feedback.
+You should change, remove or add alignment text wherever needed to make it consistent with the human feedback.
+The new alignment text should provide guidance to an LLM to make the correct choice on unseen examples.
+Respond only with an updated alignment text.
+<alignment_text>
+{{alignment_text}}
+</alignment_text>
+<examples_text>
+{{examples_text}}
+</examples_text>
+"""

update_alignment.py ADDED Viewed

	@@ -0,0 +1,77 @@

+from datasets import load_dataset
+from google import genai
+from dotenv import load_dotenv
+from retry_with_backoff import retry_with_backoff
+from prompts import update_prompt
+import logfire
+# Load API keys
+load_dotenv()
+# This wraps Google Gen AI client calls
+# to capture prompts, responses, and metadata
+logfire.configure()
+logfire.instrument_google_genai()
+# Initialize the Gemini LLM
+client = genai.Client()
def _format_example(row):
    """
    Render one feedback row as example text for the alignment-update prompt.

    Args:
        row: Mapping with keys judge_noteworthy, feedback, heuristic_rationale,
            fewshot_rationale, and judge_reasoning

    Returns:
        Multi-line string with both model rationales, the judge's reasoning,
        and the human feedback followed by the label it implies
    """
    judge_noteworthy = row["judge_noteworthy"]
    feedback = row["feedback"]
    # The human-endorsed label is the judge's label when the human agrees
    # and the opposite label when the human disagrees.
    # Any other feedback value falls through to "not noteworthy".
    noteworthy = "not noteworthy differences"
    if (judge_noteworthy and feedback == "agree") or (
        not judge_noteworthy and feedback == "disagree"
    ):
        noteworthy = "noteworthy differences"
    heuristic = f"Model 1: {row['heuristic_rationale']}"
    fewshot = f"Model 2: {row['fewshot_rationale']}"
    judge = f"AI Judge: {row['judge_reasoning']}"
    human = f"Human feedback: {feedback}"
    return f"{heuristic}\n{fewshot}\n{judge}\n{human} ({noteworthy})."


@logfire.instrument("Update alignment")
def update_alignment(
    alignment_path="production/alignment_1.txt",
    output_path="production/alignment_2.txt",
    n_examples=30,
):
    """
    Ask the LLM to revise the judge's alignment text using human feedback.

    Loads the jedick/noteworthy-differences-feedback dataset, builds example
    text from the first n_examples rows whose confidence score is not High,
    fills update_prompt with the current alignment text and the examples,
    and writes the model's response to output_path.

    Args:
        alignment_path: Existing alignment text file to read
        output_path: File to write the updated alignment text to
        n_examples: Maximum number of feedback rows to include in the prompt

    Raises:
        ValueError: If the model returns no text (output_path is left untouched)
    """
    # Load feedback dataset
    dataset = load_dataset("jedick/noteworthy-differences-feedback")
    # Convert to DataFrame
    df = dataset["train"].to_pandas()

    # Remove samples with High confidence where feedback is "agree"
    high_and_agree = (df["confidence_score"] == "High") & (df["feedback"] == "agree")
    df = df.loc[~high_and_agree]
    # Get examples for training the LLM
    examples = df[df.confidence_score != "High"].iloc[:n_examples, :]

    # Build the example text from the selected subset only
    # (looping over all of df would ignore the cap and the confidence filter)
    examples_text = "\n\n".join(_format_example(row) for _, row in examples.iterrows())

    # Read the existing alignment
    # (explicit UTF-8: the alignment text contains non-ASCII characters)
    with open(alignment_path, "r", encoding="utf-8") as file:
        alignment_text = file.read()

    # Write prompt to update alignment
    prompt = update_prompt.replace("{{alignment_text}}", alignment_text).replace(
        "{{examples_text}}", examples_text
    )

    # Function to generate response
    @retry_with_backoff()
    def get_response():
        return client.models.generate_content(
            model="gemini-2.5-flash",
            contents=prompt,
        )

    # Get the response
    response = get_response()
    updated_text = response.text
    # Check before opening the output file: mode "w" truncates immediately,
    # so an empty response must not be allowed to wipe existing content
    if not updated_text:
        raise ValueError("Model returned no text for the updated alignment")

    # Save to new alignment text file
    with open(output_path, "w", encoding="utf-8") as file:
        file.write(updated_text)
+if __name__ == "__main__":
+    update_alignment()