"""Evaluate the noteworthy-differences judge against human-labelled evalsets."""

from datetime import datetime
from pathlib import Path

import logfire
import pandas as pd
from datasets import load_dataset
from dotenv import load_dotenv

from models import judge

# Load API keys
load_dotenv()

# Setup logging with Logfire
logfire.configure()

# Hugging Face dataset holding production examples with user feedback
_FEEDBACK_DATASET = "jedick/noteworthy-differences-feedback"


def select_round(dataset, split, round=None):
    """
    Select the production round for a given dataset and split.

    Args:
        dataset: Hugging Face dataset
        split: train or test
        round: round number (None for most recent)

    Returns:
        A tuple of (index, round) with the indices of files in the round and
        the round used, or None when the round is below 2 (non-production
        rounds have no time span to select from).

    Raises:
        ValueError: If round is larger than the number of defined rounds.
    """
    # Define production time spans for rounds
    time_spans = [
        # First round (development) has no time span
        [None, None],
        ["2025-12-19T13:29:42", "2025-12-20T07:25:12"],
        ["2025-12-23T01:20:55", "2025-12-23T06:39:43"],
        ["2025-12-25T03:46:46", "2025-12-25T07:38:35"],
    ]

    # If no round is specified, use the most recent one
    if round is None:
        round = len(time_spans)
        print(f"Selected round {round}")

    # Return None for non-production round
    if round < 2:
        return None

    # Fail with a clear message instead of an IndexError on the table lookup
    if round > len(time_spans):
        raise ValueError(
            f"Round {round} is not defined (latest round is {len(time_spans)})"
        )

    # Get file names (the part of each download URL after "/data/")
    file_urls = list(dataset.info.download_checksums.keys())
    file_names = [url.split("/data/")[1] for url in file_urls]

    # Keep only the files belonging to the requested split
    prefix = f"{split}-"
    split_file_names = [name for name in file_names if prefix in name]

    # Remove the "<split>-" prefix and ".json" suffix, leaving an ISO timestamp
    dt_timestamps = [
        datetime.fromisoformat(name.replace(prefix, "").replace(".json", ""))
        for name in split_file_names
    ]

    # Get time span for this round
    start, end = (datetime.fromisoformat(x) for x in time_spans[round - 1])

    # Get index of files that are strictly between the cutoff times
    index = [i for i, ts in enumerate(dt_timestamps) if start < ts < end]

    return index, round


def get_evalset(round=None):
    """
    Get the evalset for a given round.

    Args:
        round: round number (>= 1), or None for the most recent round.

    Returns:
        Tuple of (df, y) where df is a DataFrame with model input
        and y is a list of boolean with ground truth.

    Raises:
        ValueError: If round is below 1, or if the page titles of the two
            development CSV files differ (round 1 only).
    """
    dataset = None
    index = None

    if round is None:
        # Get latest round if argument is None
        dataset = load_dataset(_FEEDBACK_DATASET, split="test")
        index, round = select_round(dataset, "test", round)
    elif round < 1:
        # select_round() returns None for such rounds, which would otherwise
        # surface below as an opaque "cannot unpack NoneType" TypeError
        raise ValueError(f"round must be >= 1 or None, got {round}")

    if round == 1:
        # For the 1st round we use development set
        # (model disagreements on pages linked from the Wikipedia main page)
        df = pd.read_csv("development/test/disagreements_for_AI.csv")

        # Get y list (ground truth)
        y_df = pd.read_csv("development/test/human_alignments.csv")
        y = list(y_df["noteworthy"])

        # Sanity check: page titles are the same
        if not y_df["title"].equals(df["title"]):
            raise ValueError("Titles aren't equal")

        # Rename columns for consistency with later rounds
        df.rename(
            columns={
                "title": "page_title",
                "few-shot_noteworthy": "fewshot_noteworthy",
                "few-shot_rationale": "fewshot_rationale",
            },
            inplace=True,
        )
        return df, y

    # For the 2nd and higher rounds we use production data
    # (examples with user feedback)
    if dataset is None:
        # Load feedback dataset
        dataset = load_dataset(_FEEDBACK_DATASET, split="test")
        # Get indices of files in this round
        index, _ = select_round(dataset, "test", round)

    # Convert to DataFrame and use only the examples in the selected round
    # NOTE(review): this assumes dataset rows line up one-to-one with the
    # split's data files, since index is computed from file names - confirm
    df = dataset.to_pandas().iloc[index]

    # Drop rows with None for judge_noteworthy, then reset the index
    df = df.dropna(subset=["judge_noteworthy"]).reset_index(drop=True)

    # Construct y list (ground truth): the judge's label when the user agreed
    # with it, and the opposite label for any other feedback value
    judge_labels = list(df["judge_noteworthy"])
    feedback = list(df["feedback"])
    y = [j if f == "agree" else not j for j, f in zip(judge_labels, feedback)]

    return df, y


def evaluate(e_round=1, a_round=1, rep=1):
    """
    Run evaluation for a given evalset and alignment prompt.

    Args:
        e_round: The round of the evalset to use (> 0).
        a_round: The round of the alignment to use (>= 0).
        rep: Replicate number, used only to name the output file.

    Details:
        Round 0 corresponds to the unaligned judge.
        Round 1 corresponds to the development evalset and first heuristic alignment.
        Rounds 2 and higher correspond to production evalsets and alignments.

    Results:
        Saves results in
        'evaluations/evalset_{e_round}_alignment_{a_round}_rep_{rep}.csv'.
        The file is rewritten after every example so that partial results
        survive a crash. Examples for which the judge raises an error are
        recorded with empty judge_reasoning and judge_noteworthy.
    """
    columns = [
        "page_title",
        "judge_reasoning",
        "judge_noteworthy",
        "human_noteworthy",
    ]

    # Change this if needed (to restart after errors)
    first_position = 0

    span_name = f"Evalset {e_round}, alignment {a_round}"
    with logfire.span(span_name):
        # Select judge mode
        judge_mode = "unaligned" if a_round == 0 else "aligned-heuristic"

        # Define output file
        outfile = f"evaluations/evalset_{e_round}_alignment_{a_round}_rep_{rep}.csv"
        print(f"Saving evaluation results to {outfile}")

        # Get evalset and ground truth
        df, y = get_evalset(e_round)

        # Make sure the first write cannot fail on a missing directory
        Path(outfile).parent.mkdir(parents=True, exist_ok=True)

        # One (page_title, reasoning, noteworthy, human label) tuple per example
        records = []

        # Count positions explicitly rather than trusting the index labels
        for position, (_, row) in enumerate(df.iterrows()):
            if position < first_position:
                continue

            # Run judge
            try:
                with logfire.span(row["page_title"]):
                    output = judge(
                        row["old_revision"],
                        row["new_revision"],
                        row["heuristic_rationale"],
                        row["fewshot_rationale"],
                        mode=judge_mode,
                        round=a_round,
                    )
            except Exception as exc:
                # Best effort: record a blank result and keep going.
                # KeyboardInterrupt/SystemExit are deliberately not caught so
                # that a long run can still be interrupted.
                print(f"Judge failed for {row['page_title']}: {exc!r}")
                output = {"noteworthy": None, "reasoning": None}
            print(output)

            # Update output records
            records.append(
                (
                    row["page_title"],
                    output["reasoning"],
                    output["noteworthy"],
                    y[position],
                )
            )

            # Write CSV in every loop to avoid data loss if errors occur
            out_df = pd.DataFrame(records, columns=columns)
            out_df.to_csv(outfile, index=False, encoding="utf-8")