File size: 7,082 Bytes
9d450de
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b6882bd
8ae98a8
9d450de
 
 
 
 
b6882bd
 
 
9d450de
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8ae98a8
9d450de
8ae98a8
 
b6882bd
 
9d450de
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
# Third-party and local imports: HF datasets, dotenv for API keys,
# the project's judge model, pandas, and Logfire telemetry.
from datasets import load_dataset
from dotenv import load_dotenv
from datetime import datetime
from models import judge
import pandas as pd
import logfire

# Load API keys
load_dotenv()
# Setup logging with Logfire
logfire.configure()


def select_round(dataset, split, round=None):
    """
    Select the production round for a given dataset and split.

    Args:
        dataset: Hugging Face dataset (only ``info.download_checksums`` is read)
        split: "train" or "test"
        round: round number (None for most recent)

    Returns:
        Tuple of (index, round): the indices of files in the round and the
        round used. For non-production rounds (round < 2), index is None.
    """
    # Production time spans for each round (1-based rounds)
    time_spans = [
        # First round (development) has no time span
        [None, None],
        ["2025-12-19T13:29:42", "2025-12-20T07:25:12"],
        ["2025-12-23T01:20:55", "2025-12-23T06:39:43"],
        ["2025-12-25T03:46:46", "2025-12-25T07:38:35"],
    ]
    # If no round is specified, use the most recent one
    if round is None:
        round = len(time_spans)
        print(f"Selected round {round}")
    # Non-production rounds have no file index. Return a (None, round) tuple
    # instead of bare None so callers can always unpack two values
    # (the original `return None` broke the documented tuple contract).
    if round < 2:
        return None, round
    # Extract file names from the dataset's download URLs
    file_urls = list(dataset.info.download_checksums.keys())
    file_names = [x.split("/data/")[1] for x in file_urls]
    # Keep only files belonging to the requested split
    split_file_names = [x for x in file_names if f"{split}-" in x]
    # Strip the split prefix and .json suffix, leaving ISO timestamps
    timestamps = [
        x.replace(f"{split}-", "").replace(".json", "") for x in split_file_names
    ]
    # Convert to datetime objects
    dt_timestamps = [datetime.fromisoformat(x) for x in timestamps]
    # Get time span for this round (round is 1-based, list is 0-based)
    time_span = time_spans[round - 1]
    dt_cutoffs = [datetime.fromisoformat(x) for x in time_span]
    # Indices of files strictly between the cutoff times (chained comparison)
    index = [
        i
        for i, x in enumerate(dt_timestamps)
        if dt_cutoffs[0] < x < dt_cutoffs[1]
    ]
    return index, round


def get_evalset(round=None):
    """
    Get the evalset for a given round.

    Args:
        round: Evalset round (None for most recent; 1 = development set,
            2 and higher = production feedback data).

    Returns:
        Tuple of (df, y) where df is a DataFrame with model input
        and y is a list of boolean with ground truth.

    Raises:
        ValueError: if the development CSVs have mismatched page titles.
    """

    dataset = None
    index = None

    # Resolve the latest round when no round was given
    if round is None:
        dataset = load_dataset("jedick/noteworthy-differences-feedback", split="test")
        index, round = select_round(dataset, "test", round)

    if round == 1:
        # For the 1st round we use development set (model disagreements on pages linked from the Wikipedia main page)
        df = pd.read_csv("development/test/disagreements_for_AI.csv")
        # Ground truth comes from human alignment labels
        y_df = pd.read_csv("development/test/human_alignments.csv")
        y = list(y_df["noteworthy"])
        # Sanity check: page titles must line up between the two CSVs
        if not y_df["title"].equals(df["title"]):
            raise ValueError("Titles aren't equal")
        # Rename columns for consistency with later rounds
        df.rename(
            columns={
                "title": "page_title",
                "few-shot_noteworthy": "fewshot_noteworthy",
                "few-shot_rationale": "fewshot_rationale",
            },
            inplace=True,
        )
        # Return results
        return df, y
    else:
        if dataset is None:
            # For the 2nd and higher rounds we use production data (examples with user feedback)
            # Load feedback dataset
            dataset = load_dataset(
                "jedick/noteworthy-differences-feedback", split="test"
            )
            # Get indices of files in this round
            index, _ = select_round(dataset, "test", round)
        # Convert to DataFrame
        df = dataset.to_pandas()
        # Use only the examples in the selected round
        df = df.iloc[index]
        # Drop rows where the judge gave no verdict
        df = df.dropna(subset=["judge_noteworthy"])
        # Reset the index after subsetting
        df.reset_index(drop=True, inplace=True)
        # Ground truth: judge verdict, flipped when the user disagreed.
        # Renamed from `judge` to avoid shadowing the imported judge() function.
        judge_labels = list(df["judge_noteworthy"])
        feedback = list(df["feedback"])
        y = [j if f == "agree" else not j for j, f in zip(judge_labels, feedback)]
        # Return results
        return df, y


def evaluate(e_round=1, a_round=1, rep=1):
    """
    Run evaluation for a given evalset and alignment prompt.

    Args:
        e_round: The round of the evalset to use (> 0).
        a_round: The round of the alignment to use (>= 0).
        rep: Repetition number, used to distinguish output files.

    Details:
        Round 0 corresponds to the unaligned judge.
        Round 1 corresponds to the development evalset and first heuristic alignment.
        Rounds 2 and higher correspond to production evalsets and alignments.

    Results:
        Saves results in 'evaluations/evalset_{e_round}_alignment_{a_round}_rep_{rep}.csv'.
    """

    span_name = f"Evalset {e_round}, alignment {a_round}"
    with logfire.span(span_name):
        # Select judge mode: round 0 is the unaligned baseline
        judge_mode = "unaligned" if a_round == 0 else "aligned-heuristic"
        # Define output file
        outfile = f"evaluations/evalset_{e_round}_alignment_{a_round}_rep_{rep}.csv"
        print(f"Saving evaluation results to {outfile}")
        # Get evalset and ground truth
        df, y = get_evalset(e_round)

        # Initialize output lists
        page_title = []
        judge_reasoning = []
        judge_noteworthy = []
        human_noteworthy = []

        for index, row in df.iterrows():
            # Raise this threshold to skip rows when restarting after errors.
            # (Fixed: original used bare `next`, a no-op expression, not a skip.)
            if index < 0:
                continue
            # Run judge; on failure record empty output so rows stay aligned
            try:
                with logfire.span(row["page_title"]):
                    output = judge(
                        row["old_revision"],
                        row["new_revision"],
                        row["heuristic_rationale"],
                        row["fewshot_rationale"],
                        mode=judge_mode,
                        round=a_round,
                    )
            except Exception:
                # Narrowed from bare `except:` so Ctrl-C still aborts the run
                output = {"noteworthy": None, "reasoning": None}
            print(output)
            # Update output lists
            page_title.append(row["page_title"])
            judge_reasoning.append(output["reasoning"])
            judge_noteworthy.append(output["noteworthy"])
            human_noteworthy.append(y[index])
            # Write CSV in every loop to avoid data loss if errors occur
            data_list = list(
                zip(page_title, judge_reasoning, judge_noteworthy, human_noteworthy)
            )
            columns = [
                "page_title",
                "judge_reasoning",
                "judge_noteworthy",
                "human_noteworthy",
            ]
            out_df = pd.DataFrame(data_list, columns=columns)
            out_df.to_csv(outfile, index=False, encoding="utf-8")