# noteworthy-differences / test_models.py
# jedick: Initial commit (48c27bb)
from models import classifier, judge
from dotenv import load_dotenv
import logfire
# Load API keys
# (presumably read from a .env file into the environment; must run before
# any model or Logfire call that needs those keys)
load_dotenv()
# Setup Logfire
# We need send_to_logfire=True to capture traces under Pytest
# https://logfire.pydantic.dev/docs/reference/advanced/testing/
logfire.configure(send_to_logfire=True)
def classifier_logic(i):
    """
    Run both classifiers once on a fixed revision pair and report how they voted.

    Args:
        i: Current iteration (only used to label the Logfire span)

    Returns:
        Tuple of four booleans, exactly one of which is True:
        (only heuristic noteworthy, only few-shot noteworthy,
        both noteworthy, neither noteworthy)
    """
    old_revision = """Henry Purcell (/ˈpɜːrsəl/, rare: /pərˈsɛl/;[n 1] c. 10 September 1659[n 2] – 21 November 1695) was an English composer of Baroque music. He composed more than 100 songs, a tragic opera Dido and Aeneas, and wrote incidental music to a version of Shakespeare's A Midsummer Night's Dream called The Fairy Queen."""
    new_revision = """Henry Purcell (/ˈpɜːrsəl/, rare: /pərˈsɛl/;[n 1] c. 10 September 1659[n 2] – 21 November 1695) was an English composer and organist of the middle Baroque era. He composed more than 100 songs, a tragic opera Dido and Aeneas, and wrote incidental music to a version of Shakespeare's A Midsummer Night's Dream called The Fairy Queen."""

    with logfire.span("classifier_logic {i}", i=i):
        # Run classifier models (heuristic first, then few-shot)
        results = {
            variant: classifier(old_revision, new_revision, variant)
            for variant in ("heuristic", "few-shot")
        }

        # Only an exact True counts as a "noteworthy" vote
        heuristic_vote = results["heuristic"]["noteworthy"] is True
        few_shot_vote = results["few-shot"]["noteworthy"] is True

        return (
            heuristic_vote and not few_shot_vote,
            few_shot_vote and not heuristic_vote,
            heuristic_vote and few_shot_vote,
            not (heuristic_vote or few_shot_vote),
        )
def judge_logic(i):
    """
    Run both classifiers and both aligned judges on a fixed revision pair.

    Args:
        i: Current iteration (only used to label the Logfire span)

    Returns:
        True if both the "aligned-fewshot" and "aligned-heuristic" judges
        report ``noteworthy`` as exactly False, otherwise False.
    """
    old_revision = """Kaman-Kalehöyük Archaeological Museum (Turkish: Kaman-Kalehöyük Arkeoloji Müzesi) is an archaeological museum in Kaman District of Kırşehir Province in Turkey. It exhibits artifacts of seven civilizations excavated in the nearby multi-period mound Kaman-Kalehöyük. It was opened in 2010. A Japanese garden is next to the museum building.[1][2]"""
    new_revision = """The Kaman-Kalehöyük Archaeological Museum (Turkish: Kaman-Kalehöyük Arkeoloji Müzesi) is an archaeological museum in Çağırkan, Kaman District, Kırşehir Province, Turkey. It exhibits artifacts of seven civilizations excavated in the nearby multi-period mound Kaman-Kalehöyük. It opened in 2010. A Japanese garden is next to the museum building.[1][2]"""

    with logfire.span("judge_logic {i}", i=i):
        # The judges are given the rationales produced by both classifiers
        heuristic = classifier(old_revision, new_revision, "heuristic")
        few_shot = classifier(old_revision, new_revision, "few-shot")

        judge_few_shot = judge(
            old_revision,
            new_revision,
            heuristic["rationale"],
            few_shot["rationale"],
            mode="aligned-fewshot",
        )
        judge_heuristic = judge(
            old_revision,
            new_revision,
            heuristic["rationale"],
            few_shot["rationale"],
            mode="aligned-heuristic",
        )

        # Test condition is True if aligned judges both give False.
        # Identity checks (not ``== False``) match the ``is True`` checks in
        # classifier_logic and keep falsy non-boolean values such as 0 from
        # being counted as a "not noteworthy" verdict.
        judge_condition = (
            judge_few_shot["noteworthy"] is False
            and judge_heuristic["noteworthy"] is False
        )
        return judge_condition
# pytest -vv test_models.py::test_classifier
def test_classifier():
    """Sample the classifiers 5 times and check how their votes compare.

    Passes only if the few-shot classifier votes "noteworthy" more often than
    the heuristic one AND the two disagree more often than they agree.
    """
    tries = 5
    with logfire.span("test_classifier"):
        outcomes = [classifier_logic(i) for i in range(tries)]

    # Column-wise totals of the four scenario flags returned per run
    n_heuristic_only, n_few_shot_only, n_both, n_neither = map(sum, zip(*outcomes))

    few_shot_more_often = (n_few_shot_only + n_both) > (n_heuristic_only + n_both)
    disagree_more_than_agree = (n_heuristic_only + n_few_shot_only) > (
        n_both + n_neither
    )

    # Print a diagnostic for each expectation that was not met
    diagnostics = (
        (
            few_shot_more_often,
            "Few-shot classifier did not return True more often than the heuristic classifier.",
        ),
        (
            disagree_more_than_agree,
            "Classifiers did not disagree more often than they agreed.",
        ),
    )
    for expectation_met, message in diagnostics:
        if not expectation_met:
            print(message)

    assert few_shot_more_often and disagree_more_than_agree
# pytest -vv test_models.py::test_judge
def test_judge():
    """Retry judge logic until it returns True, giving up after 5 attempts"""
    max_attempts = 5
    with logfire.span("test_judge"):
        # current_try is 1-based for the printed messages; judge_logic
        # receives the 0-based iteration index
        for current_try in range(1, max_attempts + 1):
            result = judge_logic(current_try - 1)
            if result is True:
                print(f"Try {current_try} succeeded")
                break
            print(f"Try {current_try} failed")

    # The assert for pytest: passes if any attempt succeeded
    assert result is True