Spaces:
Sleeping
Sleeping
jedick
committed on
Commit
·
9d450de
1
Parent(s):
e42e305
Change iteration to round
Browse files- evaluate.py +184 -0
- models.py +14 -14
- update_alignment.py +19 -10
evaluate.py
ADDED
|
@@ -0,0 +1,184 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from datasets import load_dataset
|
| 2 |
+
from dotenv import load_dotenv
|
| 3 |
+
from datetime import datetime
|
| 4 |
+
from models import judge
|
| 5 |
+
import pandas as pd
|
| 6 |
+
import logfire
|
| 7 |
+
|
| 8 |
+
# Load API keys
|
| 9 |
+
load_dotenv()
|
| 10 |
+
# Setup logging with Logfire
|
| 11 |
+
logfire.configure()
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
def select_round(dataset, split, round=None):
    """
    Select the production round for a given dataset and split.

    Args:
        dataset: Hugging Face dataset
        split: train or test
        round: round number (None for most recent)

    Returns:
        Tuple of (index, round) with the indices of files in the round
        and the round that was used.
    """
    # Define production time spans for rounds
    time_spans = [
        # First round (development) has no time span
        [None, None],
        ["2025-12-19T13:29:42", "2025-12-20T07:25:12"],
    ]
    # If no round is specified, use the most recent one
    if round is None:
        round = len(time_spans)
        print(f"Selected round {round}")
    # Get file names from the dataset's download URLs
    file_urls = list(dataset.info.download_checksums.keys())
    file_names = [x.split("/data/")[1] for x in file_urls]
    # Keep only the files belonging to the requested split
    split_file_names = [x for x in file_names if f"{split}-" in x]
    # Remove the split prefix and .json suffix, leaving ISO timestamps
    timestamps = [
        x.replace(f"{split}-", "").replace(".json", "") for x in split_file_names
    ]
    # Convert to datetime objects
    dt_timestamps = [datetime.fromisoformat(x) for x in timestamps]
    # Get time span for this round
    time_span = time_spans[round - 1]
    # The development round has no production time span (and therefore no
    # matching production files); return early instead of crashing on
    # datetime.fromisoformat(None)
    if None in time_span:
        return [], round
    dt_cutoffs = [datetime.fromisoformat(x) for x in time_span]
    # Get indices of files whose timestamps fall strictly between the cutoffs
    index = [
        i for i, x in enumerate(dt_timestamps) if dt_cutoffs[0] < x < dt_cutoffs[1]
    ]
    return index, round
|
| 56 |
+
|
| 57 |
+
|
| 58 |
+
def get_evalset(round=None):
    """
    Get the evalset for a given round.

    Args:
        round: evalset round (None for most recent)

    Returns:
        Tuple of (df, y) where df is a DataFrame with model input
        and y is a list of booleans with ground truth.
    """

    dataset = None

    # Get latest round if argument is None
    if round is None:
        dataset = load_dataset("jedick/noteworthy-differences-feedback", split="test")
        _, round = select_round(dataset, "test", round)

    if round == 1:
        # For the 1st round we use the development set (model disagreements
        # on pages linked from the Wikipedia main page)
        df = pd.read_csv("development/test/disagreements_for_AI.csv")
        # Get y list (ground truth)
        y_df = pd.read_csv("development/test/human_alignments.csv")
        y = list(y_df["noteworthy"])
        # Sanity check: page titles are the same
        if not y_df["title"].equals(df["title"]):
            raise ValueError("Titles aren't equal")
        # Rename columns for consistency with later rounds
        df.rename(
            columns={
                "title": "page_title",
                "few-shot_noteworthy": "fewshot_noteworthy",
                "few-shot_rationale": "fewshot_rationale",
            },
            inplace=True,
        )
        # Return results
        return df, y
    else:
        if dataset is None:
            # For the 2nd and higher rounds we use production data
            # (examples with user feedback)
            # Load feedback dataset
            dataset = load_dataset(
                "jedick/noteworthy-differences-feedback", split="test"
            )
        # Get indices of files in this round
        index, _ = select_round(dataset, "test", round)
        # Convert to DataFrame and keep only the examples for this round;
        # reset the index so that positional access (e.g. y[i]) and
        # label-based access agree for downstream consumers
        df = dataset.to_pandas()
        df = df.iloc[index].reset_index(drop=True)
        # Construct y list (ground truth): keep the judge's verdict where the
        # human agreed, flip it where the human disagreed.
        # NOTE: renamed from `judge` to avoid shadowing the imported judge()
        judge_labels = list(df["judge_noteworthy"])
        feedback = list(df["feedback"])
        y = [j if f == "agree" else not j for j, f in zip(judge_labels, feedback)]
        # Return results
        return df, y
|
| 114 |
+
|
| 115 |
+
|
| 116 |
+
def evaluate(e_round=1, a_round=1, rep=1):
    """
    Run evaluation for a given evalset and alignment prompt.

    Args:
        e_round: The round of the evalset to use (> 0).
        a_round: The round of the alignment to use (>= 0).
        rep: Repetition number, used to name the output file.

    Details:
        Round 0 corresponds to the unaligned judge.
        Round 1 corresponds to the development evalset and first heuristic alignment.
        Rounds 2 and higher correspond to production evalsets and alignments.

    Results:
        Saves results in 'evaluations/evalset_{e_round}_alignment_{a_round}_rep_{rep}.csv'.
    """

    span_name = f"Evalset {e_round}, alignment {a_round}"
    with logfire.span(span_name):
        # Select judge mode
        judge_mode = "unaligned" if a_round == 0 else "aligned-heuristic"
        # Define output file
        outfile = f"evaluations/evalset_{e_round}_alignment_{a_round}_rep_{rep}.csv"
        print(f"Saving evaluation results to {outfile}")
        # Get evalset and ground truth
        df, y = get_evalset(e_round)

        # Initialize output lists
        page_title = []
        judge_reasoning = []
        judge_noteworthy = []
        human_noteworthy = []

        # Track the positional counter separately from the DataFrame label:
        # y is a positional list and DataFrame labels are not guaranteed to
        # match positions after filtering
        for pos, (_, row) in enumerate(df.iterrows()):
            # Change this if needed (to restart after errors)
            if pos < 0:
                continue
            # Run judge
            try:
                with logfire.span(row["page_title"]):
                    output = judge(
                        row["old_revision"],
                        row["new_revision"],
                        row["heuristic_rationale"],
                        row["fewshot_rationale"],
                        mode=judge_mode,
                        round=a_round,
                    )
            except Exception:
                # Don't let one failed example abort the whole run; record
                # placeholders so row alignment with y is preserved
                output = {"noteworthy": None, "reasoning": None}
            print(output)
            # Update output lists
            page_title.append(row["page_title"])
            judge_reasoning.append(output["reasoning"])
            judge_noteworthy.append(output["noteworthy"])
            human_noteworthy.append(y[pos])
            # Write CSV in every loop to avoid data loss if errors occur
            data_list = list(
                zip(page_title, judge_reasoning, judge_noteworthy, human_noteworthy)
            )
            columns = [
                "page_title",
                "judge_reasoning",
                "judge_noteworthy",
                "human_noteworthy",
            ]
            out_df = pd.DataFrame(data_list, columns=columns)
            out_df.to_csv(outfile, index=False, encoding="utf-8")
|
models.py
CHANGED
|
@@ -26,9 +26,9 @@ logfire.instrument_google_genai()
|
|
| 26 |
client = genai.Client()
|
| 27 |
|
| 28 |
|
| 29 |
-
def
|
| 30 |
"""
|
| 31 |
-
Find the latest
|
| 32 |
Returns the highest numeric suffix from files matching alignment_*.txt pattern.
|
| 33 |
"""
|
| 34 |
pattern = "production/alignment_*.txt"
|
|
@@ -37,18 +37,18 @@ def get_latest_iteration():
|
|
| 37 |
if not files:
|
| 38 |
raise FileNotFoundError(f"No alignment files found matching pattern: {pattern}")
|
| 39 |
|
| 40 |
-
|
| 41 |
for file in files:
|
| 42 |
# Extract numeric suffix from filename (e.g., "alignment_2.txt" -> 2)
|
| 43 |
match = re.search(r"alignment_(\d+)\.txt$", file)
|
| 44 |
if match:
|
| 45 |
-
|
| 46 |
-
|
| 47 |
|
| 48 |
-
if
|
| 49 |
-
raise ValueError("No valid
|
| 50 |
|
| 51 |
-
return
|
| 52 |
|
| 53 |
|
| 54 |
@retry_with_backoff()
|
|
@@ -102,7 +102,7 @@ def judge(
|
|
| 102 |
rationale_1,
|
| 103 |
rationale_2,
|
| 104 |
mode="aligned-heuristic",
|
| 105 |
-
|
| 106 |
):
|
| 107 |
"""
|
| 108 |
AI judge to settle disagreements between classification models
|
|
@@ -113,7 +113,7 @@ def judge(
|
|
| 113 |
rationale_1: Rationale provided by model 1 (i.e., heuristic prompt)
|
| 114 |
rationale_2: Rationale provided by model 2 (i.e., few-shot prompt)
|
| 115 |
mode: Prompt mode: unaligned, aligned-fewshot, or aligned-heuristic
|
| 116 |
-
|
| 117 |
|
| 118 |
Returns:
|
| 119 |
noteworthy: True if the differences are noteworthy; False if not
|
|
@@ -138,10 +138,10 @@ def judge(
|
|
| 138 |
lines = file.readlines()
|
| 139 |
alignment_text = "".join(lines)
|
| 140 |
elif mode == "aligned-heuristic":
|
| 141 |
-
# Use latest
|
| 142 |
-
if
|
| 143 |
-
|
| 144 |
-
with open(f"production/alignment_{str(
|
| 145 |
lines = file.readlines()
|
| 146 |
alignment_text = "".join(lines)
|
| 147 |
else:
|
|
|
|
| 26 |
client = genai.Client()
|
| 27 |
|
| 28 |
|
| 29 |
+
def get_latest_round():
|
| 30 |
"""
|
| 31 |
+
Find the latest round number from alignment files in the production directory.
|
| 32 |
Returns the highest numeric suffix from files matching alignment_*.txt pattern.
|
| 33 |
"""
|
| 34 |
pattern = "production/alignment_*.txt"
|
|
|
|
| 37 |
if not files:
|
| 38 |
raise FileNotFoundError(f"No alignment files found matching pattern: {pattern}")
|
| 39 |
|
| 40 |
+
max_round = 0
|
| 41 |
for file in files:
|
| 42 |
# Extract numeric suffix from filename (e.g., "alignment_2.txt" -> 2)
|
| 43 |
match = re.search(r"alignment_(\d+)\.txt$", file)
|
| 44 |
if match:
|
| 45 |
+
round = int(match.group(1))
|
| 46 |
+
max_round = max(max_round, round)
|
| 47 |
|
| 48 |
+
if max_round == 0:
|
| 49 |
+
raise ValueError("No valid round numbers found in alignment files")
|
| 50 |
|
| 51 |
+
return max_round
|
| 52 |
|
| 53 |
|
| 54 |
@retry_with_backoff()
|
|
|
|
| 102 |
rationale_1,
|
| 103 |
rationale_2,
|
| 104 |
mode="aligned-heuristic",
|
| 105 |
+
round=None,
|
| 106 |
):
|
| 107 |
"""
|
| 108 |
AI judge to settle disagreements between classification models
|
|
|
|
| 113 |
rationale_1: Rationale provided by model 1 (i.e., heuristic prompt)
|
| 114 |
rationale_2: Rationale provided by model 2 (i.e., few-shot prompt)
|
| 115 |
mode: Prompt mode: unaligned, aligned-fewshot, or aligned-heuristic
|
| 116 |
+
round: Round to use for heuristic alignment (None for latest)
|
| 117 |
|
| 118 |
Returns:
|
| 119 |
noteworthy: True if the differences are noteworthy; False if not
|
|
|
|
| 138 |
lines = file.readlines()
|
| 139 |
alignment_text = "".join(lines)
|
| 140 |
elif mode == "aligned-heuristic":
|
| 141 |
+
# Use latest round if round is None
|
| 142 |
+
if round is None:
|
| 143 |
+
round = get_latest_round()
|
| 144 |
+
with open(f"production/alignment_{str(round)}.txt", "r") as file:
|
| 145 |
lines = file.readlines()
|
| 146 |
alignment_text = "".join(lines)
|
| 147 |
else:
|
update_alignment.py
CHANGED
|
@@ -3,6 +3,7 @@ from google import genai
|
|
| 3 |
from dotenv import load_dotenv
|
| 4 |
from retry_with_backoff import retry_with_backoff
|
| 5 |
from prompts import update_prompt
|
|
|
|
| 6 |
import logfire
|
| 7 |
|
| 8 |
# Load API keys
|
|
@@ -18,16 +19,24 @@ client = genai.Client()
|
|
| 18 |
|
| 19 |
|
| 20 |
@logfire.instrument("Update alignment")
|
| 21 |
-
def update_alignment():
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 22 |
# Load feedback dataset
|
| 23 |
-
dataset = load_dataset("jedick/noteworthy-differences-feedback")
|
| 24 |
# Convert to DataFrame
|
| 25 |
-
df = dataset
|
| 26 |
-
#
|
| 27 |
-
|
| 28 |
-
|
| 29 |
-
|
| 30 |
-
|
|
|
|
|
|
|
| 31 |
examples_text = []
|
| 32 |
# Loop over rows
|
| 33 |
for index, row in df.iterrows():
|
|
@@ -47,7 +56,7 @@ def update_alignment():
|
|
| 47 |
examples_text = "\n\n".join(examples_text)
|
| 48 |
|
| 49 |
# Read the existing alignment
|
| 50 |
-
with open("production/
|
| 51 |
lines = file.readlines()
|
| 52 |
alignment_text = "".join(lines)
|
| 53 |
|
|
@@ -68,7 +77,7 @@ def update_alignment():
|
|
| 68 |
# Get the response
|
| 69 |
response = get_response()
|
| 70 |
# Save to new alignment text file
|
| 71 |
-
with open("production/
|
| 72 |
file.write(response.text)
|
| 73 |
|
| 74 |
|
|
|
|
| 3 |
from dotenv import load_dotenv
|
| 4 |
from retry_with_backoff import retry_with_backoff
|
| 5 |
from prompts import update_prompt
|
| 6 |
+
from evaluate import select_round
|
| 7 |
import logfire
|
| 8 |
|
| 9 |
# Load API keys
|
|
|
|
| 19 |
|
| 20 |
|
| 21 |
@logfire.instrument("Update alignment")
|
| 22 |
+
def update_alignment(round=None):
|
| 23 |
+
"""
|
| 24 |
+
Update the alignment prompt using feedback collect from production app.
|
| 25 |
+
|
| 26 |
+
Args:
|
| 27 |
+
round: alignment round, starting with 2 (None uses most recent available round)
|
| 28 |
+
"""
|
| 29 |
# Load feedback dataset
|
| 30 |
+
dataset = load_dataset("jedick/noteworthy-differences-feedback", split="train")
|
| 31 |
# Convert to DataFrame
|
| 32 |
+
df = dataset.to_pandas()
|
| 33 |
+
# Get examples for this round
|
| 34 |
+
# This also gets the number of the most recent round if the argument is None
|
| 35 |
+
index, round = select_round(dataset, "train", round)
|
| 36 |
+
examples = df.iloc[index]
|
| 37 |
+
## Remove samples with High confidence where feedback is "agree"
|
| 38 |
+
# high_and_agree = (df["confidence_score"] == "High") & (df["feedback"] == "agree")
|
| 39 |
+
# df = df.loc[~high_and_agree]
|
| 40 |
examples_text = []
|
| 41 |
# Loop over rows
|
| 42 |
for index, row in df.iterrows():
|
|
|
|
| 56 |
examples_text = "\n\n".join(examples_text)
|
| 57 |
|
| 58 |
# Read the existing alignment
|
| 59 |
+
with open(f"production/alignment_{str(round - 1)}.txt", "r") as file:
|
| 60 |
lines = file.readlines()
|
| 61 |
alignment_text = "".join(lines)
|
| 62 |
|
|
|
|
| 77 |
# Get the response
|
| 78 |
response = get_response()
|
| 79 |
# Save to new alignment text file
|
| 80 |
+
with open(f"production/alignment_{str(round)}.txt", "w") as file:
|
| 81 |
file.write(response.text)
|
| 82 |
|
| 83 |
|