Spaces:
Sleeping
Sleeping
jedick
committed on
Commit
·
9d450de
1
Parent(s):
e42e305
Change iteration to round
Browse files- evaluate.py +184 -0
- models.py +14 -14
- update_alignment.py +19 -10
evaluate.py
ADDED
|
@@ -0,0 +1,184 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from datasets import load_dataset
|
| 2 |
+
from dotenv import load_dotenv
|
| 3 |
+
from datetime import datetime
|
| 4 |
+
from models import judge
|
| 5 |
+
import pandas as pd
|
| 6 |
+
import logfire
|
| 7 |
+
|
| 8 |
+
# Load API keys
|
| 9 |
+
load_dotenv()
|
| 10 |
+
# Setup logging with Logfire
|
| 11 |
+
logfire.configure()
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
def select_round(dataset, split, round=None):
    """
    Select the production round for a given dataset and split.

    Args:
        dataset: Hugging Face dataset
        split: train or test
        round: round number (None for most recent)

    Returns:
        Tuple of (index, round) with the indices of files in the round
        and the round that was used.
    """
    # Define production time spans for rounds
    time_spans = [
        # First round (development) has no time span
        [None, None],
        ["2025-12-19T13:29:42", "2025-12-20T07:25:12"],
    ]
    # If no round is specified, use the most recent one
    if round is None:
        round = len(time_spans)
        print(f"Selected round {round}")
    # Get file names from the dataset's download URLs
    file_urls = list(dataset.info.download_checksums.keys())
    file_names = [x.split("/data/")[1] for x in file_urls]
    # Keep only the files belonging to the requested split
    split_file_names = [x for x in file_names if f"{split}-" in x]
    # Remove the split prefix and .json suffix, leaving ISO timestamps
    timestamps = [
        x.replace(f"{split}-", "").replace(".json", "") for x in split_file_names
    ]
    # Convert to datetime objects
    dt_timestamps = [datetime.fromisoformat(x) for x in timestamps]
    # Get time span for this round
    time_span = time_spans[round - 1]
    # The development round has no production time span (and therefore no
    # matching production files); return early instead of crashing on
    # datetime.fromisoformat(None)
    if None in time_span:
        return [], round
    dt_cutoffs = [datetime.fromisoformat(x) for x in time_span]
    # Get indices of files whose timestamps fall strictly between the cutoffs
    index = [
        i for i, x in enumerate(dt_timestamps) if dt_cutoffs[0] < x < dt_cutoffs[1]
    ]
    return index, round
|
| 56 |
+
|
| 57 |
+
|
| 58 |
+
def get_evalset(round=None):
    """
    Get the evalset for a given round.

    Args:
        round: evalset round (None for most recent)

    Returns:
        Tuple of (df, y) where df is a DataFrame with model input
        and y is a list of booleans with ground truth.
    """

    dataset = None

    # Get latest round if argument is None
    if round is None:
        dataset = load_dataset("jedick/noteworthy-differences-feedback", split="test")
        _, round = select_round(dataset, "test", round)

    if round == 1:
        # For the 1st round we use the development set (model disagreements
        # on pages linked from the Wikipedia main page)
        df = pd.read_csv("development/test/disagreements_for_AI.csv")
        # Get y list (ground truth)
        y_df = pd.read_csv("development/test/human_alignments.csv")
        y = list(y_df["noteworthy"])
        # Sanity check: page titles are the same
        if not y_df["title"].equals(df["title"]):
            raise ValueError("Titles aren't equal")
        # Rename columns for consistency with later rounds
        df.rename(
            columns={
                "title": "page_title",
                "few-shot_noteworthy": "fewshot_noteworthy",
                "few-shot_rationale": "fewshot_rationale",
            },
            inplace=True,
        )
        # Return results
        return df, y
    else:
        if dataset is None:
            # For the 2nd and higher rounds we use production data
            # (examples with user feedback)
            # Load feedback dataset
            dataset = load_dataset(
                "jedick/noteworthy-differences-feedback", split="test"
            )
        # Get indices of files in this round
        index, _ = select_round(dataset, "test", round)
        # Convert to DataFrame and keep only the examples for this round;
        # reset the index so that positional access (e.g. y[i]) and
        # label-based access agree for downstream consumers
        df = dataset.to_pandas()
        df = df.iloc[index].reset_index(drop=True)
        # Construct y list (ground truth): keep the judge's verdict where the
        # human agreed, flip it where the human disagreed.
        # NOTE: renamed from `judge` to avoid shadowing the imported judge()
        judge_labels = list(df["judge_noteworthy"])
        feedback = list(df["feedback"])
        y = [j if f == "agree" else not j for j, f in zip(judge_labels, feedback)]
        # Return results
        return df, y
|
| 114 |
+
|
| 115 |
+
|
| 116 |
+
def evaluate(e_round=1, a_round=1, rep=1):
    """
    Run evaluation for a given evalset and alignment prompt.

    Args:
        e_round: The round of the evalset to use (> 0).
        a_round: The round of the alignment to use (>= 0).
        rep: Repetition number, used to name the output file.

    Details:
        Round 0 corresponds to the unaligned judge.
        Round 1 corresponds to the development evalset and first heuristic alignment.
        Rounds 2 and higher correspond to production evalsets and alignments.

    Results:
        Saves results in 'evaluations/evalset_{e_round}_alignment_{a_round}_rep_{rep}.csv'.
    """

    span_name = f"Evalset {e_round}, alignment {a_round}"
    with logfire.span(span_name):
        # Select judge mode
        judge_mode = "unaligned" if a_round == 0 else "aligned-heuristic"
        # Define output file
        outfile = f"evaluations/evalset_{e_round}_alignment_{a_round}_rep_{rep}.csv"
        print(f"Saving evaluation results to {outfile}")
        # Get evalset and ground truth
        df, y = get_evalset(e_round)

        # Initialize output lists
        page_title = []
        judge_reasoning = []
        judge_noteworthy = []
        human_noteworthy = []

        # Track the positional counter separately from the DataFrame label:
        # y is a positional list and DataFrame labels are not guaranteed to
        # match positions after filtering
        for pos, (_, row) in enumerate(df.iterrows()):
            # Change this if needed (to restart after errors)
            if pos < 0:
                continue
            # Run judge
            try:
                with logfire.span(row["page_title"]):
                    output = judge(
                        row["old_revision"],
                        row["new_revision"],
                        row["heuristic_rationale"],
                        row["fewshot_rationale"],
                        mode=judge_mode,
                        round=a_round,
                    )
            except Exception:
                # Don't let one failed example abort the whole run; record
                # placeholders so row alignment with y is preserved
                output = {"noteworthy": None, "reasoning": None}
            print(output)
            # Update output lists
            page_title.append(row["page_title"])
            judge_reasoning.append(output["reasoning"])
            judge_noteworthy.append(output["noteworthy"])
            human_noteworthy.append(y[pos])
            # Write CSV in every loop to avoid data loss if errors occur
            data_list = list(
                zip(page_title, judge_reasoning, judge_noteworthy, human_noteworthy)
            )
            columns = [
                "page_title",
                "judge_reasoning",
                "judge_noteworthy",
                "human_noteworthy",
            ]
            out_df = pd.DataFrame(data_list, columns=columns)
            out_df.to_csv(outfile, index=False, encoding="utf-8")
|
models.py
CHANGED
|
@@ -26,9 +26,9 @@ logfire.instrument_google_genai()
|
|
| 26 |
client = genai.Client()
|
| 27 |
|
| 28 |
|
| 29 |
-
def
|
| 30 |
"""
|
| 31 |
-
Find the latest
|
| 32 |
Returns the highest numeric suffix from files matching alignment_*.txt pattern.
|
| 33 |
"""
|
| 34 |
pattern = "production/alignment_*.txt"
|
|
@@ -37,18 +37,18 @@ def get_latest_iteration():
|
|
| 37 |
if not files:
|
| 38 |
raise FileNotFoundError(f"No alignment files found matching pattern: {pattern}")
|
| 39 |
|
| 40 |
-
|
| 41 |
for file in files:
|
| 42 |
# Extract numeric suffix from filename (e.g., "alignment_2.txt" -> 2)
|
| 43 |
match = re.search(r"alignment_(\d+)\.txt$", file)
|
| 44 |
if match:
|
| 45 |
-
|
| 46 |
-
|
| 47 |
|
| 48 |
-
if
|
| 49 |
-
raise ValueError("No valid
|
| 50 |
|
| 51 |
-
return
|
| 52 |
|
| 53 |
|
| 54 |
@retry_with_backoff()
|
|
@@ -102,7 +102,7 @@ def judge(
|
|
| 102 |
rationale_1,
|
| 103 |
rationale_2,
|
| 104 |
mode="aligned-heuristic",
|
| 105 |
-
|
| 106 |
):
|
| 107 |
"""
|
| 108 |
AI judge to settle disagreements between classification models
|
|
@@ -113,7 +113,7 @@ def judge(
|
|
| 113 |
rationale_1: Rationale provided by model 1 (i.e., heuristic prompt)
|
| 114 |
rationale_2: Rationale provided by model 2 (i.e., few-shot prompt)
|
| 115 |
mode: Prompt mode: unaligned, aligned-fewshot, or aligned-heuristic
|
| 116 |
-
|
| 117 |
|
| 118 |
Returns:
|
| 119 |
noteworthy: True if the differences are noteworthy; False if not
|
|
@@ -138,10 +138,10 @@ def judge(
|
|
| 138 |
lines = file.readlines()
|
| 139 |
alignment_text = "".join(lines)
|
| 140 |
elif mode == "aligned-heuristic":
|
| 141 |
-
# Use latest
|
| 142 |
-
if
|
| 143 |
-
|
| 144 |
-
with open(f"production/alignment_{str(
|
| 145 |
lines = file.readlines()
|
| 146 |
alignment_text = "".join(lines)
|
| 147 |
else:
|
|
|
|
| 26 |
client = genai.Client()
|
| 27 |
|
| 28 |
|
| 29 |
+
def get_latest_round():
|
| 30 |
"""
|
| 31 |
+
Find the latest round number from alignment files in the production directory.
|
| 32 |
Returns the highest numeric suffix from files matching alignment_*.txt pattern.
|
| 33 |
"""
|
| 34 |
pattern = "production/alignment_*.txt"
|
|
|
|
| 37 |
if not files:
|
| 38 |
raise FileNotFoundError(f"No alignment files found matching pattern: {pattern}")
|
| 39 |
|
| 40 |
+
max_round = 0
|
| 41 |
for file in files:
|
| 42 |
# Extract numeric suffix from filename (e.g., "alignment_2.txt" -> 2)
|
| 43 |
match = re.search(r"alignment_(\d+)\.txt$", file)
|
| 44 |
if match:
|
| 45 |
+
round = int(match.group(1))
|
| 46 |
+
max_round = max(max_round, round)
|
| 47 |
|
| 48 |
+
if max_round == 0:
|
| 49 |
+
raise ValueError("No valid round numbers found in alignment files")
|
| 50 |
|
| 51 |
+
return max_round
|
| 52 |
|
| 53 |
|
| 54 |
@retry_with_backoff()
|
|
|
|
| 102 |
rationale_1,
|
| 103 |
rationale_2,
|
| 104 |
mode="aligned-heuristic",
|
| 105 |
+
round=None,
|
| 106 |
):
|
| 107 |
"""
|
| 108 |
AI judge to settle disagreements between classification models
|
|
|
|
| 113 |
rationale_1: Rationale provided by model 1 (i.e., heuristic prompt)
|
| 114 |
rationale_2: Rationale provided by model 2 (i.e., few-shot prompt)
|
| 115 |
mode: Prompt mode: unaligned, aligned-fewshot, or aligned-heuristic
|
| 116 |
+
round: Round to use for heuristic alignment (None for latest)
|
| 117 |
|
| 118 |
Returns:
|
| 119 |
noteworthy: True if the differences are noteworthy; False if not
|
|
|
|
| 138 |
lines = file.readlines()
|
| 139 |
alignment_text = "".join(lines)
|
| 140 |
elif mode == "aligned-heuristic":
|
| 141 |
+
# Use latest round if round is None
|
| 142 |
+
if round is None:
|
| 143 |
+
round = get_latest_round()
|
| 144 |
+
with open(f"production/alignment_{str(round)}.txt", "r") as file:
|
| 145 |
lines = file.readlines()
|
| 146 |
alignment_text = "".join(lines)
|
| 147 |
else:
|
update_alignment.py
CHANGED
|
@@ -3,6 +3,7 @@ from google import genai
|
|
| 3 |
from dotenv import load_dotenv
|
| 4 |
from retry_with_backoff import retry_with_backoff
|
| 5 |
from prompts import update_prompt
|
|
|
|
| 6 |
import logfire
|
| 7 |
|
| 8 |
# Load API keys
|
|
@@ -18,16 +19,24 @@ client = genai.Client()
|
|
| 18 |
|
| 19 |
|
| 20 |
@logfire.instrument("Update alignment")
|
| 21 |
-
def update_alignment():
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 22 |
# Load feedback dataset
|
| 23 |
-
dataset = load_dataset("jedick/noteworthy-differences-feedback")
|
| 24 |
# Convert to DataFrame
|
| 25 |
-
df = dataset
|
| 26 |
-
#
|
| 27 |
-
|
| 28 |
-
|
| 29 |
-
|
| 30 |
-
|
|
|
|
|
|
|
| 31 |
examples_text = []
|
| 32 |
# Loop over rows
|
| 33 |
for index, row in df.iterrows():
|
|
@@ -47,7 +56,7 @@ def update_alignment():
|
|
| 47 |
examples_text = "\n\n".join(examples_text)
|
| 48 |
|
| 49 |
# Read the existing alignment
|
| 50 |
-
with open("production/
|
| 51 |
lines = file.readlines()
|
| 52 |
alignment_text = "".join(lines)
|
| 53 |
|
|
@@ -68,7 +77,7 @@ def update_alignment():
|
|
| 68 |
# Get the response
|
| 69 |
response = get_response()
|
| 70 |
# Save to new alignment text file
|
| 71 |
-
with open("production/
|
| 72 |
file.write(response.text)
|
| 73 |
|
| 74 |
|
|
|
|
| 3 |
from dotenv import load_dotenv
|
| 4 |
from retry_with_backoff import retry_with_backoff
|
| 5 |
from prompts import update_prompt
|
| 6 |
+
from evaluate import select_round
|
| 7 |
import logfire
|
| 8 |
|
| 9 |
# Load API keys
|
|
|
|
| 19 |
|
| 20 |
|
| 21 |
@logfire.instrument("Update alignment")
|
| 22 |
+
def update_alignment(round=None):
|
| 23 |
+
"""
|
| 24 |
+
Update the alignment prompt using feedback collect from production app.
|
| 25 |
+
|
| 26 |
+
Args:
|
| 27 |
+
round: alignment round, starting with 2 (None uses most recent available round)
|
| 28 |
+
"""
|
| 29 |
# Load feedback dataset
|
| 30 |
+
dataset = load_dataset("jedick/noteworthy-differences-feedback", split="train")
|
| 31 |
# Convert to DataFrame
|
| 32 |
+
df = dataset.to_pandas()
|
| 33 |
+
# Get examples for this round
|
| 34 |
+
# This also gets the number of the most recent round if the argument is None
|
| 35 |
+
index, round = select_round(dataset, "train", round)
|
| 36 |
+
examples = df.iloc[index]
|
| 37 |
+
## Remove samples with High confidence where feedback is "agree"
|
| 38 |
+
# high_and_agree = (df["confidence_score"] == "High") & (df["feedback"] == "agree")
|
| 39 |
+
# df = df.loc[~high_and_agree]
|
| 40 |
examples_text = []
|
| 41 |
# Loop over rows
|
| 42 |
for index, row in df.iterrows():
|
|
|
|
| 56 |
examples_text = "\n\n".join(examples_text)
|
| 57 |
|
| 58 |
# Read the existing alignment
|
| 59 |
+
with open(f"production/alignment_{str(round - 1)}.txt", "r") as file:
|
| 60 |
lines = file.readlines()
|
| 61 |
alignment_text = "".join(lines)
|
| 62 |
|
|
|
|
| 77 |
# Get the response
|
| 78 |
response = get_response()
|
| 79 |
# Save to new alignment text file
|
| 80 |
+
with open(f"production/alignment_{str(round)}.txt", "w") as file:
|
| 81 |
file.write(response.text)
|
| 82 |
|
| 83 |
|