jedick committed on
Commit
9d450de
·
1 Parent(s): e42e305

Change iteration to round

Browse files
Files changed (3) hide show
  1. evaluate.py +184 -0
  2. models.py +14 -14
  3. update_alignment.py +19 -10
evaluate.py ADDED
@@ -0,0 +1,184 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from datasets import load_dataset
2
+ from dotenv import load_dotenv
3
+ from datetime import datetime
4
+ from models import judge
5
+ import pandas as pd
6
+ import logfire
7
+
8
+ # Load API keys
9
+ load_dotenv()
10
+ # Setup logging with Logfire
11
+ logfire.configure()
12
+
13
+
14
+ def select_round(dataset, split, round=None):
15
+ """
16
+ Select the production round for a given dataset and split.
17
+
18
+ Args:
19
+ dataset: Hugging Face dataset
20
+ split: train or test
21
+ round: round number (None for most recent)
22
+
23
+ Returns a tuple of (index, round) with the the indices of files in the round and the round used.
24
+ """
25
+ # Define production time spans for rounds
26
+ time_spans = [
27
+ # First round (development) has no time span
28
+ [None, None],
29
+ ["2025-12-19T13:29:42", "2025-12-20T07:25:12"],
30
+ ]
31
+ # If no round is specified, use the most recent one
32
+ if round is None:
33
+ round = len(time_spans)
34
+ print(f"Selected round {round}")
35
+ # Get file names
36
+ file_urls = list(dataset.info.download_checksums.keys())
37
+ file_names = [x.split("/data/")[1] for x in file_urls]
38
+ # Filter list using list comprehension
39
+ split_file_names = [x for x in file_names if f"{split}-" in x]
40
+ # Remove test- prefix and .json suffix
41
+ timestamps = [
42
+ x.replace(f"{split}-", "").replace(".json", "") for x in split_file_names
43
+ ]
44
+ # Convert to datetime object
45
+ dt_timestamps = [datetime.fromisoformat(x) for x in timestamps]
46
+ # Get time span for this round
47
+ time_span = time_spans[round - 1]
48
+ dt_cutoffs = [datetime.fromisoformat(x) for x in time_span]
49
+ # Get index of files that are between the cutoff times
50
+ index = [
51
+ i
52
+ for i, x in enumerate(dt_timestamps)
53
+ if x > dt_cutoffs[0] and x < dt_cutoffs[1]
54
+ ]
55
+ return index, round
56
+
57
+
58
+ def get_evalset(round=None):
59
+ """
60
+ Get the evalset for a given round.
61
+
62
+ Returns:
63
+ Tuple of (df, y) where df is a DataFrame with model input
64
+ and y is a list of boolean with ground truth.
65
+ """
66
+
67
+ dataset = None
68
+ index = None
69
+
70
+ # Get latest round if argument is None
71
+ if round is None:
72
+ dataset = load_dataset("jedick/noteworthy-differences-feedback", split="test")
73
+ index, round = select_round(dataset, "test", round)
74
+
75
+ if round == 1:
76
+ # For the 1st round we use development set (model disagreements on pages linked from the Wikipedia main page)
77
+ df = pd.read_csv("development/test/disagreements_for_AI.csv")
78
+ # Get y list (ground truth)
79
+ y_df = pd.read_csv("development/test/human_alignments.csv")
80
+ y = list(y_df["noteworthy"])
81
+ # Sanity check: page titles are the same
82
+ if not y_df["title"].equals(df["title"]):
83
+ raise ValueError("Titles aren't equal")
84
+ # Rename columns for consistency with later rounds
85
+ df.rename(
86
+ columns={
87
+ "title": "page_title",
88
+ "few-shot_noteworthy": "fewshot_noteworthy",
89
+ "few-shot_rationale": "fewshot_rationale",
90
+ },
91
+ inplace=True,
92
+ )
93
+ # Return results
94
+ return df, y
95
+ else:
96
+ if dataset is None:
97
+ # For the 2nd and higher rounds we use production data (examples with user feedback)
98
+ # Load feedback dataset
99
+ dataset = load_dataset(
100
+ "jedick/noteworthy-differences-feedback", split="test"
101
+ )
102
+ # Get indices of files in this round
103
+ index, _ = select_round(dataset, "test", round)
104
+ # Convert to DataFrame
105
+ df = dataset.to_pandas()
106
+ # Use only these examples
107
+ df = df.iloc[index]
108
+ # Construct y list (ground truth)
109
+ judge = list(df["judge_noteworthy"])
110
+ feedback = list(df["feedback"])
111
+ y = [j if f == "agree" else not j for j, f in zip(judge, feedback)]
112
+ # Return results
113
+ return df, y
114
+
115
+
116
+ def evaluate(e_round=1, a_round=1, rep=1):
117
+ """
118
+ Run evaluation for a given evalset and alignment prompt.
119
+
120
+ Args:
121
+ e_round: The round of the evalset to use (> 0).
122
+ a_round: The round of the alignment to use (>= 0).
123
+
124
+ Details:
125
+ Round 0 corresponds to the unaligned judge.
126
+ Round 1 corresponds to the development evalset and first heuristic alignment.
127
+ Rounds 2 and higher correspond to production evalsets and alignments.
128
+
129
+ Results:
130
+ Saves results in 'evals/evalset_{e_round}_alignment_{a_round}.csv'.
131
+ """
132
+
133
+ span_name = f"Evalset {e_round}, alignment {a_round}"
134
+ with logfire.span(span_name):
135
+ # Select judge mode
136
+ judge_mode = "unaligned" if a_round == 0 else "aligned-heuristic"
137
+ # Define output file
138
+ outfile = f"evaluations/evalset_{e_round}_alignment_{a_round}_rep_{rep}.csv"
139
+ print(f"Saving evaluation results to {outfile}")
140
+ # Get evalset and ground truth
141
+ df, y = get_evalset(e_round)
142
+
143
+ # Initialize output lists
144
+ page_title = []
145
+ judge_reasoning = []
146
+ judge_noteworthy = []
147
+ human_noteworthy = []
148
+
149
+ for index, row in df.iterrows():
150
+ # Change this if needed (to restart after errors)
151
+ if index < 0:
152
+ next
153
+ else:
154
+ # Run judge
155
+ try:
156
+ with logfire.span(row["page_title"]):
157
+ output = judge(
158
+ df.iloc[index]["old_revision"],
159
+ df.iloc[index]["new_revision"],
160
+ df.iloc[index]["heuristic_rationale"],
161
+ df.iloc[index]["fewshot_rationale"],
162
+ mode=judge_mode,
163
+ round=a_round,
164
+ )
165
+ except:
166
+ output = {"noteworthy": None, "reasoning": None}
167
+ print(output)
168
+ # Update output lists
169
+ page_title.append(row["page_title"])
170
+ judge_reasoning.append(output["reasoning"])
171
+ judge_noteworthy.append(output["noteworthy"])
172
+ human_noteworthy.append(y[index])
173
+ # Write CSV in every loop to avoid data loss if errors occur
174
+ data_list = list(
175
+ zip(page_title, judge_reasoning, judge_noteworthy, human_noteworthy)
176
+ )
177
+ columns = [
178
+ "page_title",
179
+ "judge_reasoning",
180
+ "judge_noteworthy",
181
+ "human_noteworthy",
182
+ ]
183
+ out_df = pd.DataFrame(data_list, columns=columns)
184
+ out_df.to_csv(outfile, index=False, encoding="utf-8")
models.py CHANGED
@@ -26,9 +26,9 @@ logfire.instrument_google_genai()
26
  client = genai.Client()
27
 
28
 
29
- def get_latest_iteration():
30
  """
31
- Find the latest iteration number from alignment files in the production directory.
32
  Returns the highest numeric suffix from files matching alignment_*.txt pattern.
33
  """
34
  pattern = "production/alignment_*.txt"
@@ -37,18 +37,18 @@ def get_latest_iteration():
37
  if not files:
38
  raise FileNotFoundError(f"No alignment files found matching pattern: {pattern}")
39
 
40
- max_iteration = 0
41
  for file in files:
42
  # Extract numeric suffix from filename (e.g., "alignment_2.txt" -> 2)
43
  match = re.search(r"alignment_(\d+)\.txt$", file)
44
  if match:
45
- iteration = int(match.group(1))
46
- max_iteration = max(max_iteration, iteration)
47
 
48
- if max_iteration == 0:
49
- raise ValueError("No valid iteration numbers found in alignment files")
50
 
51
- return max_iteration
52
 
53
 
54
  @retry_with_backoff()
@@ -102,7 +102,7 @@ def judge(
102
  rationale_1,
103
  rationale_2,
104
  mode="aligned-heuristic",
105
- iteration=None,
106
  ):
107
  """
108
  AI judge to settle disagreements between classification models
@@ -113,7 +113,7 @@ def judge(
113
  rationale_1: Rationale provided by model 1 (i.e., heuristic prompt)
114
  rationale_2: Rationale provided by model 2 (i.e., few-shot prompt)
115
  mode: Prompt mode: unaligned, aligned-fewshot, or aligned-heuristic
116
- iteration: Iteration to use for heuristic alignment (None for latest)
117
 
118
  Returns:
119
  noteworthy: True if the differences are noteworthy; False if not
@@ -138,10 +138,10 @@ def judge(
138
  lines = file.readlines()
139
  alignment_text = "".join(lines)
140
  elif mode == "aligned-heuristic":
141
- # Use latest iteration if iteration is None
142
- if iteration is None:
143
- iteration = get_latest_iteration()
144
- with open(f"production/alignment_{str(iteration)}.txt", "r") as file:
145
  lines = file.readlines()
146
  alignment_text = "".join(lines)
147
  else:
 
26
  client = genai.Client()
27
 
28
 
29
+ def get_latest_round():
30
  """
31
+ Find the latest round number from alignment files in the production directory.
32
  Returns the highest numeric suffix from files matching alignment_*.txt pattern.
33
  """
34
  pattern = "production/alignment_*.txt"
 
37
  if not files:
38
  raise FileNotFoundError(f"No alignment files found matching pattern: {pattern}")
39
 
40
+ max_round = 0
41
  for file in files:
42
  # Extract numeric suffix from filename (e.g., "alignment_2.txt" -> 2)
43
  match = re.search(r"alignment_(\d+)\.txt$", file)
44
  if match:
45
+ round = int(match.group(1))
46
+ max_round = max(max_round, round)
47
 
48
+ if max_round == 0:
49
+ raise ValueError("No valid round numbers found in alignment files")
50
 
51
+ return max_round
52
 
53
 
54
  @retry_with_backoff()
 
102
  rationale_1,
103
  rationale_2,
104
  mode="aligned-heuristic",
105
+ round=None,
106
  ):
107
  """
108
  AI judge to settle disagreements between classification models
 
113
  rationale_1: Rationale provided by model 1 (i.e., heuristic prompt)
114
  rationale_2: Rationale provided by model 2 (i.e., few-shot prompt)
115
  mode: Prompt mode: unaligned, aligned-fewshot, or aligned-heuristic
116
+ round: Round to use for heuristic alignment (None for latest)
117
 
118
  Returns:
119
  noteworthy: True if the differences are noteworthy; False if not
 
138
  lines = file.readlines()
139
  alignment_text = "".join(lines)
140
  elif mode == "aligned-heuristic":
141
+ # Use latest round if round is None
142
+ if round is None:
143
+ round = get_latest_round()
144
+ with open(f"production/alignment_{str(round)}.txt", "r") as file:
145
  lines = file.readlines()
146
  alignment_text = "".join(lines)
147
  else:
update_alignment.py CHANGED
@@ -3,6 +3,7 @@ from google import genai
3
  from dotenv import load_dotenv
4
  from retry_with_backoff import retry_with_backoff
5
  from prompts import update_prompt
 
6
  import logfire
7
 
8
  # Load API keys
@@ -18,16 +19,24 @@ client = genai.Client()
18
 
19
 
20
  @logfire.instrument("Update alignment")
21
- def update_alignment():
 
 
 
 
 
 
22
  # Load feedback dataset
23
- dataset = load_dataset("jedick/noteworthy-differences-feedback")
24
  # Convert to DataFrame
25
- df = dataset["train"].to_pandas()
26
- # Remove samples with High confidence where feedback is "agree"
27
- high_and_agree = (df["confidence_score"] == "High") & (df["feedback"] == "agree")
28
- df = df.loc[~high_and_agree]
29
- # Get 30 examples for training the LLM
30
- examples = df[df.confidence_score != "High"].iloc[:30, :]
 
 
31
  examples_text = []
32
  # Loop over rows
33
  for index, row in df.iterrows():
@@ -47,7 +56,7 @@ def update_alignment():
47
  examples_text = "\n\n".join(examples_text)
48
 
49
  # Read the existing alignment
50
- with open("production/alignment_1.txt", "r") as file:
51
  lines = file.readlines()
52
  alignment_text = "".join(lines)
53
 
@@ -68,7 +77,7 @@ def update_alignment():
68
  # Get the response
69
  response = get_response()
70
  # Save to new alignment text file
71
- with open("production/alignment_2.txt", "w") as file:
72
  file.write(response.text)
73
 
74
 
 
3
  from dotenv import load_dotenv
4
  from retry_with_backoff import retry_with_backoff
5
  from prompts import update_prompt
6
+ from evaluate import select_round
7
  import logfire
8
 
9
  # Load API keys
 
19
 
20
 
21
  @logfire.instrument("Update alignment")
22
+ def update_alignment(round=None):
23
+ """
24
+ Update the alignment prompt using feedback collected from the production app.
25
+
26
+ Args:
27
+ round: alignment round, starting with 2 (None uses most recent available round)
28
+ """
29
  # Load feedback dataset
30
+ dataset = load_dataset("jedick/noteworthy-differences-feedback", split="train")
31
  # Convert to DataFrame
32
+ df = dataset.to_pandas()
33
+ # Get examples for this round
34
+ # This also gets the number of the most recent round if the argument is None
35
+ index, round = select_round(dataset, "train", round)
36
+ examples = df.iloc[index]
37
+ ## Remove samples with High confidence where feedback is "agree"
38
+ # high_and_agree = (df["confidence_score"] == "High") & (df["feedback"] == "agree")
39
+ # df = df.loc[~high_and_agree]
40
  examples_text = []
41
  # Loop over rows
42
  for index, row in df.iterrows():
 
56
  examples_text = "\n\n".join(examples_text)
57
 
58
  # Read the existing alignment
59
+ with open(f"production/alignment_{str(round - 1)}.txt", "r") as file:
60
  lines = file.readlines()
61
  alignment_text = "".join(lines)
62
 
 
77
  # Get the response
78
  response = get_response()
79
  # Save to new alignment text file
80
+ with open(f"production/alignment_{str(round)}.txt", "w") as file:
81
  file.write(response.text)
82
 
83