Spaces:
Sleeping
Sleeping
File size: 7,082 Bytes
9d450de b6882bd 8ae98a8 9d450de b6882bd 9d450de 8ae98a8 9d450de 8ae98a8 b6882bd 9d450de |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 |
from datasets import load_dataset
from dotenv import load_dotenv
from datetime import datetime
from models import judge
import pandas as pd
import logfire
# Load API keys
load_dotenv()
# Setup logging with Logfire
logfire.configure()
def select_round(dataset, split, round=None):
    """
    Select the production round for a given dataset and split.
    Args:
        dataset: Hugging Face dataset
        split: train or test
        round: round number (None for most recent)
    Returns:
        Tuple of (index, round) where index lists the positions of the files
        belonging to the round (None for non-production rounds, i.e. round < 2)
        and round is the round that was actually used.
    """
    # Define production time spans for rounds
    time_spans = [
        # First round (development) has no time span
        [None, None],
        ["2025-12-19T13:29:42", "2025-12-20T07:25:12"],
        ["2025-12-23T01:20:55", "2025-12-23T06:39:43"],
        ["2025-12-25T03:46:46", "2025-12-25T07:38:35"],
    ]
    # If no round is specified, use the most recent one
    if round is None:
        round = len(time_spans)
        print(f"Selected round {round}")
    # Non-production rounds have no file indices; keep the tuple shape so
    # callers that unpack (index, round) don't crash (was a bare `return None`)
    if round < 2:
        return None, round
    # Get file names from the dataset's download URLs
    file_urls = list(dataset.info.download_checksums.keys())
    file_names = [x.split("/data/")[1] for x in file_urls]
    # Keep only files for the requested split (e.g. "test-...")
    split_file_names = [x for x in file_names if f"{split}-" in x]
    # Strip the split prefix and .json suffix, leaving the ISO timestamp
    timestamps = [
        x.replace(f"{split}-", "").replace(".json", "") for x in split_file_names
    ]
    # Convert to (naive) datetime objects for comparison
    dt_timestamps = [datetime.fromisoformat(x) for x in timestamps]
    # Get time span for this round (rounds are 1-based; list is 0-based)
    time_span = time_spans[round - 1]
    dt_cutoffs = [datetime.fromisoformat(x) for x in time_span]
    # Index of files strictly between the cutoff times
    index = [
        i
        for i, x in enumerate(dt_timestamps)
        if x > dt_cutoffs[0] and x < dt_cutoffs[1]
    ]
    return index, round
def get_evalset(round=None):
    """
    Get the evalset for a given round.
    Args:
        round: Evalset round number (None for the most recent production round).
    Returns:
        Tuple of (df, y) where df is a DataFrame with model input
        and y is a list of boolean with ground truth.
    """
    dataset = None
    index = None
    # Resolve the latest round (and its file indices) when none is given
    if round is None:
        dataset = load_dataset("jedick/noteworthy-differences-feedback", split="test")
        index, round = select_round(dataset, "test", round)
    if round == 1:
        # For the 1st round we use development set (model disagreements on pages linked from the Wikipedia main page)
        df = pd.read_csv("development/test/disagreements_for_AI.csv")
        # Get y list (ground truth)
        y_df = pd.read_csv("development/test/human_alignments.csv")
        y = list(y_df["noteworthy"])
        # Sanity check: page titles are the same
        if not y_df["title"].equals(df["title"]):
            raise ValueError("Titles aren't equal")
        # Rename columns for consistency with later rounds
        df.rename(
            columns={
                "title": "page_title",
                "few-shot_noteworthy": "fewshot_noteworthy",
                "few-shot_rationale": "fewshot_rationale",
            },
            inplace=True,
        )
        # Return results
        return df, y
    else:
        if dataset is None:
            # For the 2nd and higher rounds we use production data (examples with user feedback)
            # Load feedback dataset
            dataset = load_dataset(
                "jedick/noteworthy-differences-feedback", split="test"
            )
            # Get indices of files in this round
            index, _ = select_round(dataset, "test", round)
        # Convert to DataFrame
        df = dataset.to_pandas()
        # Use only the examples in the selected round
        df = df.iloc[index]
        # Drop rows with None for judge_noteworthy
        df = df.dropna(subset=["judge_noteworthy"])
        # Reset the index after subsetting
        df.reset_index(drop=True, inplace=True)
        # Construct y list (ground truth): the judge's label, flipped when
        # the human disagreed. Renamed from `judge` to avoid shadowing the
        # judge() function imported from models.
        judge_labels = list(df["judge_noteworthy"])
        feedback = list(df["feedback"])
        y = [j if f == "agree" else not j for j, f in zip(judge_labels, feedback)]
        # Return results
        return df, y
def evaluate(e_round=1, a_round=1, rep=1):
    """
    Run evaluation for a given evalset and alignment prompt.
    Args:
        e_round: The round of the evalset to use (> 0).
        a_round: The round of the alignment to use (>= 0).
        rep: Repetition number, used to distinguish output files.
    Details:
        Round 0 corresponds to the unaligned judge.
        Round 1 corresponds to the development evalset and first heuristic alignment.
        Rounds 2 and higher correspond to production evalsets and alignments.
    Results:
        Saves results in 'evaluations/evalset_{e_round}_alignment_{a_round}_rep_{rep}.csv'.
    """
    span_name = f"Evalset {e_round}, alignment {a_round}"
    with logfire.span(span_name):
        # Select judge mode: round 0 is the baseline (unaligned) judge
        judge_mode = "unaligned" if a_round == 0 else "aligned-heuristic"
        # Define output file
        outfile = f"evaluations/evalset_{e_round}_alignment_{a_round}_rep_{rep}.csv"
        print(f"Saving evaluation results to {outfile}")
        # Get evalset and ground truth
        df, y = get_evalset(e_round)
        # Initialize output lists
        page_title = []
        judge_reasoning = []
        judge_noteworthy = []
        human_noteworthy = []
        for index, row in df.iterrows():
            # Change this threshold if needed (to restart after errors);
            # was a bare `next` (a no-op expression) -- `continue` is the intended skip
            if index < 0:
                continue
            # Run judge; on failure, record empty output so the loop keeps going
            try:
                with logfire.span(row["page_title"]):
                    output = judge(
                        row["old_revision"],
                        row["new_revision"],
                        row["heuristic_rationale"],
                        row["fewshot_rationale"],
                        mode=judge_mode,
                        round=a_round,
                    )
            except Exception as e:
                # Narrowed from a bare `except:` so Ctrl-C/SystemExit still propagate
                print(f"judge() failed for {row['page_title']}: {e}")
                output = {"noteworthy": None, "reasoning": None}
            print(output)
            # Update output lists
            page_title.append(row["page_title"])
            judge_reasoning.append(output["reasoning"])
            judge_noteworthy.append(output["noteworthy"])
            human_noteworthy.append(y[index])
            # Write CSV in every loop to avoid data loss if errors occur
            data_list = list(
                zip(page_title, judge_reasoning, judge_noteworthy, human_noteworthy)
            )
            columns = [
                "page_title",
                "judge_reasoning",
                "judge_noteworthy",
                "human_noteworthy",
            ]
            out_df = pd.DataFrame(data_list, columns=columns)
            out_df.to_csv(outfile, index=False, encoding="utf-8")
|