noteworthy-differences / evaluate.py
from datasets import load_dataset
from dotenv import load_dotenv
from datetime import datetime
from models import judge
import pandas as pd
import logfire
# Load API keys
load_dotenv()
# Setup logging with Logfire
logfire.configure()
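
# Example .env contents assumed by load_dotenv() above (a sketch; the actual
# variable names depend on the provider used by models.judge and on Logfire):
#
#   OPENAI_API_KEY=...
#   LOGFIRE_TOKEN=...
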
def select_round(dataset, split, round=None):
"""
Select the production round for a given dataset and split.
Args:
dataset: Hugging Face dataset
split: train or test
round: round number (None for most recent)
    Returns a tuple of (index, round) with the indices of files in the round and the round used.
"""
# Define production time spans for rounds
time_spans = [
# First round (development) has no time span
[None, None],
["2025-12-19T13:29:42", "2025-12-20T07:25:12"],
["2025-12-23T01:20:55", "2025-12-23T06:39:43"],
["2025-12-25T03:46:46", "2025-12-25T07:38:35"],
]
# If no round is specified, use the most recent one
if round is None:
round = len(time_spans)
print(f"Selected round {round}")
    # Non-production rounds have no file index; keep the return shape consistent
    if round < 2:
        return None, round
# Get file names
file_urls = list(dataset.info.download_checksums.keys())
file_names = [x.split("/data/")[1] for x in file_urls]
    # Keep only the file names for the requested split
    split_file_names = [x for x in file_names if f"{split}-" in x]
    # Remove the split prefix and .json suffix to get the timestamps
    timestamps = [
        x.replace(f"{split}-", "").replace(".json", "") for x in split_file_names
    ]
    # Convert the timestamps to datetime objects
    dt_timestamps = [datetime.fromisoformat(x) for x in timestamps]
# Get time span for this round
time_span = time_spans[round - 1]
dt_cutoffs = [datetime.fromisoformat(x) for x in time_span]
    # Get indices of files with timestamps between the cutoff times
    index = [
        i for i, x in enumerate(dt_timestamps) if dt_cutoffs[0] < x < dt_cutoffs[1]
    ]
return index, round
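
# Example usage of select_round() (a sketch; round 2 is illustrative and the
# dataset repo id is the one loaded in get_evalset() below):
#
#   dataset = load_dataset("jedick/noteworthy-differences-feedback", split="test")
#   index, round = select_round(dataset, "test")            # most recent round
#   index, round = select_round(dataset, "test", round=2)   # a specific round
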
def get_evalset(round=None):
    """
    Get the evalset for a given round.

    Args:
        round: round number (None for the most recent round)

    Returns:
        Tuple of (df, y) where df is a DataFrame with the model input
        and y is a list of booleans with the ground truth.
    """
dataset = None
index = None
# Get latest round if argument is None
if round is None:
dataset = load_dataset("jedick/noteworthy-differences-feedback", split="test")
index, round = select_round(dataset, "test", round)
if round == 1:
        # For the 1st round we use the development set (model disagreements on pages linked from the Wikipedia main page)
df = pd.read_csv("development/test/disagreements_for_AI.csv")
# Get y list (ground truth)
y_df = pd.read_csv("development/test/human_alignments.csv")
y = list(y_df["noteworthy"])
# Sanity check: page titles are the same
if not y_df["title"].equals(df["title"]):
raise ValueError("Titles aren't equal")
# Rename columns for consistency with later rounds
df.rename(
columns={
"title": "page_title",
"few-shot_noteworthy": "fewshot_noteworthy",
"few-shot_rationale": "fewshot_rationale",
},
inplace=True,
)
# Return results
return df, y
else:
if dataset is None:
# For the 2nd and higher rounds we use production data (examples with user feedback)
# Load feedback dataset
dataset = load_dataset(
"jedick/noteworthy-differences-feedback", split="test"
)
# Get indices of files in this round
index, _ = select_round(dataset, "test", round)
# Convert to DataFrame
df = dataset.to_pandas()
# Use only the examples in the selected round
df = df.iloc[index]
# Drop rows with None for judge_noteworthy
df = df.dropna(subset=["judge_noteworthy"])
# Reset the index after subsetting
df.reset_index(drop=True, inplace=True)
        # Construct y list (ground truth): keep the judge's label when the user
        # agreed, flip it when the user disagreed
        judge_labels = list(df["judge_noteworthy"])
        feedback = list(df["feedback"])
        y = [j if f == "agree" else not j for j, f in zip(judge_labels, feedback)]
# Return results
return df, y
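
# Example usage of get_evalset() (a sketch; round 2 is illustrative and the
# columns shown are those used for production rounds above):
#
#   df, y = get_evalset(round=2)
#   print(df[["page_title", "judge_noteworthy", "feedback"]].head())
#   print(y[:5])
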
def evaluate(e_round=1, a_round=1, rep=1):
    """
    Run evaluation for a given evalset and alignment prompt.

    Args:
        e_round: The round of the evalset to use (> 0).
        a_round: The round of the alignment to use (>= 0).
        rep: The repetition number, used in the output file name.

    Details:
        Round 0 corresponds to the unaligned judge.
        Round 1 corresponds to the development evalset and first heuristic alignment.
        Rounds 2 and higher correspond to production evalsets and alignments.

    Results:
        Saves results to 'evaluations/evalset_{e_round}_alignment_{a_round}_rep_{rep}.csv'.
    """
span_name = f"Evalset {e_round}, alignment {a_round}"
with logfire.span(span_name):
# Select judge mode
judge_mode = "unaligned" if a_round == 0 else "aligned-heuristic"
# Define output file
outfile = f"evaluations/evalset_{e_round}_alignment_{a_round}_rep_{rep}.csv"
print(f"Saving evaluation results to {outfile}")
# Get evalset and ground truth
df, y = get_evalset(e_round)
# Initialize output lists
page_title = []
judge_reasoning = []
judge_noteworthy = []
human_noteworthy = []
for index, row in df.iterrows():
# Change this if needed (to restart after errors)
if index < 0:
                continue
else:
# Run judge
try:
                    with logfire.span(row["page_title"]):
                        output = judge(
                            row["old_revision"],
                            row["new_revision"],
                            row["heuristic_rationale"],
                            row["fewshot_rationale"],
                            mode=judge_mode,
                            round=a_round,
                        )
                except Exception as e:
                    # Record the failure and continue with the remaining examples
                    print(f"Error for {row['page_title']}: {e}")
                    output = {"noteworthy": None, "reasoning": None}
print(output)
# Update output lists
page_title.append(row["page_title"])
judge_reasoning.append(output["reasoning"])
judge_noteworthy.append(output["noteworthy"])
human_noteworthy.append(y[index])
# Write CSV in every loop to avoid data loss if errors occur
data_list = list(
zip(page_title, judge_reasoning, judge_noteworthy, human_noteworthy)
)
columns = [
"page_title",
"judge_reasoning",
"judge_noteworthy",
"human_noteworthy",
]
out_df = pd.DataFrame(data_list, columns=columns)
out_df.to_csv(outfile, index=False, encoding="utf-8")
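

# Minimal entry point (a sketch; the rounds and rep below are illustrative and
# the 'evaluations/' output directory is assumed to exist):
if __name__ == "__main__":
    # Evaluate the round-2 evalset with the round-2 alignment prompt
    evaluate(e_round=2, a_round=2, rep=1)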