Spaces:
Sleeping
Sleeping
jedick
committed on
Commit
·
b6882bd
1
Parent(s):
9d450de
Add alignment 3
Browse files
- evaluate.py +6 -0
- production/alignment_3.txt +53 -0
- prompts.py +11 -11
- update_alignment.py +6 -11
evaluate.py
CHANGED
|
@@ -27,11 +27,15 @@ def select_round(dataset, split, round=None):
|
|
| 27 |
# First round (development) has no time span
|
| 28 |
[None, None],
|
| 29 |
["2025-12-19T13:29:42", "2025-12-20T07:25:12"],
|
|
|
|
| 30 |
]
|
| 31 |
# If no round is specified, use the most recent one
|
| 32 |
if round is None:
|
| 33 |
round = len(time_spans)
|
| 34 |
print(f"Selected round {round}")
|
|
|
|
|
|
|
|
|
|
| 35 |
# Get file names
|
| 36 |
file_urls = list(dataset.info.download_checksums.keys())
|
| 37 |
file_names = [x.split("/data/")[1] for x in file_urls]
|
|
@@ -105,6 +109,8 @@ def get_evalset(round=None):
|
|
| 105 |
df = dataset.to_pandas()
|
| 106 |
# Use only these examples
|
| 107 |
df = df.iloc[index]
|
|
|
|
|
|
|
| 108 |
# Construct y list (ground truth)
|
| 109 |
judge = list(df["judge_noteworthy"])
|
| 110 |
feedback = list(df["feedback"])
|
|
|
|
| 27 |
# First round (development) has no time span
|
| 28 |
[None, None],
|
| 29 |
["2025-12-19T13:29:42", "2025-12-20T07:25:12"],
|
| 30 |
+
["2025-12-23T01:20:55", "2025-12-23T06:39:43"],
|
| 31 |
]
|
| 32 |
# If no round is specified, use the most recent one
|
| 33 |
if round is None:
|
| 34 |
round = len(time_spans)
|
| 35 |
print(f"Selected round {round}")
|
| 36 |
+
# Return None for non-production round
|
| 37 |
+
if round < 2:
|
| 38 |
+
return None
|
| 39 |
# Get file names
|
| 40 |
file_urls = list(dataset.info.download_checksums.keys())
|
| 41 |
file_names = [x.split("/data/")[1] for x in file_urls]
|
|
|
|
| 109 |
df = dataset.to_pandas()
|
| 110 |
# Use only these examples
|
| 111 |
df = df.iloc[index]
|
| 112 |
+
# Reset the index after subsetting
|
| 113 |
+
df.reset_index(drop=True, inplace=True)
|
| 114 |
# Construct y list (ground truth)
|
| 115 |
judge = list(df["judge_noteworthy"])
|
| 116 |
feedback = list(df["feedback"])
|
production/alignment_3.txt
ADDED
|
@@ -0,0 +1,53 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
## Noteworthy Changes (True)
|
| 2 |
+
|
| 3 |
+
Changes are noteworthy if they introduce new, essential factual information or significantly alter the understanding, scope, or accuracy of the article's core subject.
|
| 4 |
+
|
| 5 |
+
* **Addition or Correction of Core Factual Data:**
|
| 6 |
+
* Adding specific biographical details such as cause of death or specific location of death (city/state) when previously absent or only generally described.
|
| 7 |
+
* Adding specific honors, awards, or significant recognitions, along with their dates, for an individual.
|
| 8 |
+
* Correcting specific biographical dates (e.g., birth/death years, specific event years), even if the numerical change is minor, provided it improves factual accuracy.
|
| 9 |
+
* Adding or correcting specific causes or primary contributing factors for events or phenomena.
|
| 10 |
+
* Adding specific professional roles, contributions, or achievements for individuals (e.g., an actor's specific character in a production, a religious leader's specific denomination).
|
| 11 |
+
* Adding marital status for historically significant figures where the spouse also held prominence, indicating a significant relationship.
|
| 12 |
+
* Adding entirely new and significant political, ideological, or philosophical stances/affiliations for public figures.
|
| 13 |
+
* Providing significantly more precise and comprehensive descriptions of a person's primary profession, area of expertise, or artistic practice.
|
| 14 |
+
* Correcting core biographical facts (e.g., political affiliation) based on new scholarly consensus or confirmed evidence.
|
| 15 |
+
|
| 16 |
+
* **Changes to Identity, Scope, or Definition:**
|
| 17 |
+
* Updating or correcting official primary names for countries, organizations, or other entities due to rebranding or international recognition.
|
| 18 |
+
* Changes that significantly broaden the scope, scale, or importance of an entity or event (e.g., changing a "tournament" to a "national tournament").
|
| 19 |
+
* Adding primary or indigenous language names for places or people, especially when previously omitted.
|
| 20 |
+
* Expanding definitions or lists to include a truly new, distinct category or fundamental variation of the subject, significantly broadening the article's scope (e.g., adding 'electric cars' to a list that previously only had internal combustion and hybrid).
|
| 21 |
+
* Correcting or significantly refining the temporal period for major historical or cultural movements/events (e.g., changing '19th century' to 'early 20th century').
|
| 22 |
+
|
| 23 |
+
* **Removal of Significant Incorrect Information:**
|
| 24 |
+
* Removing significant, factually incorrect content, such as entire sections, misattributed information (e.g., quotes, achievements), or content about the wrong subject.
|
| 25 |
+
|
| 26 |
+
## Not Noteworthy Changes (False)
|
| 27 |
+
|
| 28 |
+
Changes are not noteworthy if they are purely stylistic, minor elaborations, rephrasing, formatting adjustments, or introduce details that are already implicitly understood or do not alter the core factual content or understanding of the article.
|
| 29 |
+
|
| 30 |
+
* **Minor Edits and Formatting:**
|
| 31 |
+
* Correcting minor typos, spelling errors of common words, or punctuation mistakes where the intended meaning was always clear and no factual error was introduced.
|
| 32 |
+
* Purely stylistic rephrasing, grammar adjustments, or minor structural reorganization that does not alter the core factual content or meaning of the text.
|
| 33 |
+
* Reordering items within lists, sections, or paragraphs without altering their content, meaning, or hierarchy.
|
| 34 |
+
* Administrative changes such as adding standard templates for external links or categories, or other formatting adjustments that do not introduce new factual content.
|
| 35 |
+
* Minor adjustments to internal hyperlinks (e.g., linking to a more specific section within the same article or a closely related one) that do not alter the linked text or its fundamental relevance.
|
| 36 |
+
* Removing obsolete, redundant, or superfluous reference citations that do not impact the factual content or verifiability of the surrounding text.
|
| 37 |
+
|
| 38 |
+
* **Minor Detail Additions or Refinements:**
|
| 39 |
+
* Adding more granular geographic detail (e.g., county within a state/country, sub-district within a district) when the broader region is already specified and the added detail does not introduce new essential context.
|
| 40 |
+
* Adding subtitles, taglines, or alternative *secondary* identifiers for media or entities when the primary identifier is already well-known and the addition does not introduce new core content.
|
| 41 |
+
* Adding common honorifics, suffixes (e.g., 'Jr.', 'Sr.'), or minor conventional titles that do not clarify fundamental identity or introduce new factual information if already generally understood.
|
| 42 |
+
* Adding minor, uncritical quotes or brief elaborations that reiterate an already stated fact or opinion without introducing new information, perspective, or core insight.
|
| 43 |
+
* Adding precise day and month to an event already dated by year, if it does not fundamentally alter the understanding of the event's timing, sequence, or historical context.
|
| 44 |
+
* Adding an item to an *already extensive and general list* where the added item is well-known and fits the existing scope, without introducing a new distinct category or significantly expanding the list's foundational definition.
|
| 45 |
+
* Adding minor descriptive or technical details that elaborate on an already defined characteristic or concept, without changing its primary understanding, classification, or core nature (e.g., a minor architectural detail when the primary style is known).
|
| 46 |
+
* Adding purely descriptive, non-essential details about physical appearance, unless directly relevant to the subject's achievements, significance, or identity.
|
| 47 |
+
|
| 48 |
+
* **Synonymous or Implicitly Understood Changes:**
|
| 49 |
+
* Swapping synonyms or making minor word choice refinements that do not change the fundamental meaning, objective scale, or tone (e.g., 'large' to 'significant', 'former' to 'retired', 'United States of America' to 'USA').
|
| 50 |
+
* Generalizing a specific location (e.g., city to country) when the broader location is already implicitly understood or contextually more relevant, and no new specific insight is provided by the more general term.
|
| 51 |
+
|
| 52 |
+
* **Minor Statistical Updates:**
|
| 53 |
+
* Minor statistical updates for ongoing trends (e.g., population, rankings) that fall within expected variation or a short temporal gap (e.g., 5 years or less), and do not represent a significant change in magnitude or implication.
|
prompts.py
CHANGED
|
@@ -91,20 +91,20 @@ Return a JSON-formatted response with keys for:
|
|
| 91 |
"""
|
| 92 |
|
| 93 |
update_prompt = """
|
| 94 |
-
You are
|
| 95 |
-
The
|
| 96 |
-
|
| 97 |
-
|
| 98 |
-
The
|
| 99 |
-
|
| 100 |
-
|
| 101 |
-
Respond only with
|
| 102 |
|
| 103 |
<alignment_text>
|
| 104 |
{{alignment_text}}
|
| 105 |
</alignment_text>
|
| 106 |
|
| 107 |
-
<
|
| 108 |
-
{{
|
| 109 |
-
</
|
| 110 |
"""
|
|
|
|
| 91 |
"""
|
| 92 |
|
| 93 |
update_prompt = """
|
| 94 |
+
You are updating an AI system for detecting noteworthy differences between Wikipedia article revisions.
|
| 95 |
+
The AI judge has alignment text that may have become ineffective due to concept drift.
|
| 96 |
+
Please update the alignment text based on the feedback data.
|
| 97 |
+
Be willing to make major changes (including deletions) to the text to align with the human feedback.
|
| 98 |
+
The new alignment text should provide detailed heuristics to allow an LLM to correctly classify unseen examples.
|
| 99 |
+
Base the new alignment only on the feedback data and not your own ideas of human preferences.
|
| 100 |
+
Furthermore, make the alignment reflect the overall frequency of human True/False classifications in the feedback.
|
| 101 |
+
Respond only with the updated alignment text.
|
| 102 |
|
| 103 |
<alignment_text>
|
| 104 |
{{alignment_text}}
|
| 105 |
</alignment_text>
|
| 106 |
|
| 107 |
+
<feedback_data>
|
| 108 |
+
{{feedback_data}}
|
| 109 |
+
</feedback_data>
|
| 110 |
"""
|
update_alignment.py
CHANGED
|
@@ -34,23 +34,18 @@ def update_alignment(round=None):
|
|
| 34 |
# This also gets the number of the most recent round if the argument is None
|
| 35 |
index, round = select_round(dataset, "train", round)
|
| 36 |
examples = df.iloc[index]
|
| 37 |
-
## Remove samples with High confidence where feedback is "agree"
|
| 38 |
-
# high_and_agree = (df["confidence_score"] == "High") & (df["feedback"] == "agree")
|
| 39 |
-
# df = df.loc[~high_and_agree]
|
| 40 |
examples_text = []
|
| 41 |
# Loop over rows
|
| 42 |
-
for index, row in
|
| 43 |
# Construct training text for this row
|
| 44 |
-
|
| 45 |
if row["judge_noteworthy"] and row["feedback"] == "agree":
|
| 46 |
-
|
| 47 |
if not row["judge_noteworthy"] and row["feedback"] == "disagree":
|
| 48 |
-
|
| 49 |
-
heuristic = f"Model 1: {row['heuristic_rationale']}"
|
| 50 |
-
fewshot = f"Model 2: {row['fewshot_rationale']}"
|
| 51 |
judge = f"AI Judge: {row['judge_reasoning']}"
|
| 52 |
-
human = f"Human feedback: {row['feedback']}"
|
| 53 |
-
row_text = f"{
|
| 54 |
examples_text.append(row_text)
|
| 55 |
|
| 56 |
examples_text = "\n\n".join(examples_text)
|
|
|
|
| 34 |
# This also gets the number of the most recent round if the argument is None
|
| 35 |
index, round = select_round(dataset, "train", round)
|
| 36 |
examples = df.iloc[index]
|
|
|
|
|
|
|
|
|
|
| 37 |
examples_text = []
|
| 38 |
# Loop over rows
|
| 39 |
+
for index, row in examples.iterrows():
|
| 40 |
# Construct training text for this row
|
| 41 |
+
ground_truth = "noteworthy=False"
|
| 42 |
if row["judge_noteworthy"] and row["feedback"] == "agree":
|
| 43 |
+
ground_truth = "noteworthy=True"
|
| 44 |
if not row["judge_noteworthy"] and row["feedback"] == "disagree":
|
| 45 |
+
ground_truth = "noteworthy=True"
|
|
|
|
|
|
|
| 46 |
judge = f"AI Judge: {row['judge_reasoning']}"
|
| 47 |
+
human = f"Human feedback: {row['feedback']} ({ground_truth})."
|
| 48 |
+
row_text = f"{judge} {human}"
|
| 49 |
examples_text.append(row_text)
|
| 50 |
|
| 51 |
examples_text = "\n\n".join(examples_text)
|