from datasets import load_dataset
from dotenv import load_dotenv
from datetime import datetime
from models import judge
import pandas as pd
import logfire

# Load API keys
load_dotenv()

# Set up logging with Logfire
logfire.configure()


def select_round(dataset, split, round=None):
    """
    Select the production round for a given dataset and split.
    Args:
        dataset: Hugging Face dataset
        split: train or test
        round: round number (None for most recent)
    Returns:
        Tuple of (index, round) with the indices of files in the round
        and the round that was used.
    """
    # Define production time spans for rounds
    time_spans = [
        # First round (development) has no time span
        [None, None],
        ["2025-12-19T13:29:42", "2025-12-20T07:25:12"],
        ["2025-12-23T01:20:55", "2025-12-23T06:39:43"],
        ["2025-12-25T03:46:46", "2025-12-25T07:38:35"],
    ]
    # If no round is specified, use the most recent one
    if round is None:
        round = len(time_spans)
        print(f"Selected round {round}")
    # The non-production (development) round has no file indices in the dataset
    if round < 2:
        return None, round
    # Get file names from the download URLs
    file_urls = list(dataset.info.download_checksums.keys())
    file_names = [x.split("/data/")[1] for x in file_urls]
    # Keep only the files for the requested split
    split_file_names = [x for x in file_names if f"{split}-" in x]
    # Remove the split prefix (e.g. test-) and .json suffix to get timestamps
    timestamps = [
        x.replace(f"{split}-", "").replace(".json", "") for x in split_file_names
    ]
    # Convert to datetime objects
    dt_timestamps = [datetime.fromisoformat(x) for x in timestamps]
    # Get the time span for this round
    time_span = time_spans[round - 1]
    dt_cutoffs = [datetime.fromisoformat(x) for x in time_span]
    # Get indices of files whose timestamps fall between the cutoff times
    index = [
        i
        for i, x in enumerate(dt_timestamps)
        if dt_cutoffs[0] < x < dt_cutoffs[1]
    ]
    return index, round
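

# Usage sketch (hypothetical call, not part of the original runs): select the
# files for the most recent production round of the test split.
#
#   dataset = load_dataset("jedick/noteworthy-differences-feedback", split="test")
#   index, round = select_round(dataset, "test")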


def get_evalset(round=None):
    """
    Get the evalset for a given round.
    Args:
        round: round number (None for most recent)
    Returns:
        Tuple of (df, y) where df is a DataFrame with model input
        and y is a list of booleans with ground truth.
    """
    dataset = None
    index = None
    # Get the latest round if the argument is None
    if round is None:
        dataset = load_dataset("jedick/noteworthy-differences-feedback", split="test")
        index, round = select_round(dataset, "test", round)
    if round == 1:
        # For the 1st round, use the development set (model disagreements
        # on pages linked from the Wikipedia main page)
        df = pd.read_csv("development/test/disagreements_for_AI.csv")
        # Get y list (ground truth)
        y_df = pd.read_csv("development/test/human_alignments.csv")
        y = list(y_df["noteworthy"])
        # Sanity check: page titles are the same
        if not y_df["title"].equals(df["title"]):
            raise ValueError("Titles aren't equal")
        # Rename columns for consistency with later rounds
        df.rename(
            columns={
                "title": "page_title",
                "few-shot_noteworthy": "fewshot_noteworthy",
                "few-shot_rationale": "fewshot_rationale",
            },
            inplace=True,
        )
        # Return results
        return df, y
    else:
        if dataset is None:
            # For the 2nd and higher rounds, use production data
            # (examples with user feedback): load the feedback dataset
            dataset = load_dataset(
                "jedick/noteworthy-differences-feedback", split="test"
            )
            # Get indices of files in this round
            index, _ = select_round(dataset, "test", round)
        # Convert to DataFrame
        df = dataset.to_pandas()
        # Use only the examples in the selected round
        df = df.iloc[index]
        # Drop rows with None for judge_noteworthy
        df = df.dropna(subset=["judge_noteworthy"])
        # Reset the index after subsetting
        df.reset_index(drop=True, inplace=True)
        # Construct the y list (ground truth): keep the judge's label when the
        # user agreed with it, and flip the label when the user disagreed
        judge_labels = list(df["judge_noteworthy"])
        feedback = list(df["feedback"])
        y = [j if f == "agree" else not j for j, f in zip(judge_labels, feedback)]
        # Return results
        return df, y
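

# Usage sketch (hypothetical call): load the round-2 evalset and check the
# class balance of the ground-truth labels.
#
#   df, y = get_evalset(round=2)
#   print(f"{len(y)} examples, {sum(y)} noteworthy")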


def evaluate(e_round=1, a_round=1, rep=1):
    """
    Run evaluation for a given evalset and alignment prompt.
    Args:
        e_round: The round of the evalset to use (> 0).
        a_round: The round of the alignment to use (>= 0).
        rep: The replicate number (used in the output file name).
    Details:
        Round 0 corresponds to the unaligned judge.
        Round 1 corresponds to the development evalset and first heuristic alignment.
        Rounds 2 and higher correspond to production evalsets and alignments.
    Results:
        Saves results in 'evaluations/evalset_{e_round}_alignment_{a_round}_rep_{rep}.csv'.
    """
    span_name = f"Evalset {e_round}, alignment {a_round}"
    with logfire.span(span_name):
        # Select judge mode
        judge_mode = "unaligned" if a_round == 0 else "aligned-heuristic"
        # Define output file
        outfile = f"evaluations/evalset_{e_round}_alignment_{a_round}_rep_{rep}.csv"
        print(f"Saving evaluation results to {outfile}")
        # Get evalset and ground truth
        df, y = get_evalset(e_round)
        # Initialize output lists
        page_title = []
        judge_reasoning = []
        judge_noteworthy = []
        human_noteworthy = []
        for index, row in df.iterrows():
            # Change this cutoff if needed (to restart after errors)
            if index < 0:
                continue
            # Run judge
            try:
                with logfire.span(row["page_title"]):
                    output = judge(
                        row["old_revision"],
                        row["new_revision"],
                        row["heuristic_rationale"],
                        row["fewshot_rationale"],
                        mode=judge_mode,
                        round=a_round,
                    )
            except Exception:
                output = {"noteworthy": None, "reasoning": None}
            print(output)
            # Update output lists
            page_title.append(row["page_title"])
            judge_reasoning.append(output["reasoning"])
            judge_noteworthy.append(output["noteworthy"])
            human_noteworthy.append(y[index])
            # Write the CSV on every iteration to avoid data loss if errors occur
            data_list = list(
                zip(page_title, judge_reasoning, judge_noteworthy, human_noteworthy)
            )
            columns = [
                "page_title",
                "judge_reasoning",
                "judge_noteworthy",
                "human_noteworthy",
            ]
            out_df = pd.DataFrame(data_list, columns=columns)
            out_df.to_csv(outfile, index=False, encoding="utf-8")
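

# Hypothetical entry point (the round numbers below are placeholders, not the
# runs from the original experiments): evaluating the same evalset with the
# unaligned judge (alignment 0) and an aligned judge gives a before/after
# comparison on identical examples.
if __name__ == "__main__":
    evaluate(e_round=2, a_round=0)
    evaluate(e_round=2, a_round=1)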