from models import classifier, judge
from dotenv import load_dotenv
import logfire

# Load API keys
load_dotenv()

# Setup Logfire
# We need send_to_logfire=True to capture traces under Pytest
# https://logfire.pydantic.dev/docs/reference/advanced/testing/
logfire.configure(send_to_logfire=True)


def classifier_logic(i):
    """
    Return scenario flags for heuristic/few-shot classifier outputs.

    Runs the classifier in "heuristic" and "few-shot" modes on one fixed
    pair of revisions and reports which of the two flagged the change as
    noteworthy.

    Args:
        i: Current iteration (for logging)

    Returns:
        Tuple of four booleans
        ``(only_heuristic_true, only_few_shot_true, both_true, both_false)``.
        Exactly one of them is True for any single call.
    """
    old_revision = """Henry Purcell (/ˈpɜːrsəl/, rare: /pərˈsɛl/;[n 1] c. 10 September 1659[n 2] – 21 November 1695) was an English composer of Baroque music. He composed more than 100 songs, a tragic opera Dido and Aeneas, and wrote incidental music to a version of Shakespeare's A Midsummer Night's Dream called The Fairy Queen."""
    new_revision = """Henry Purcell (/ˈpɜːrsəl/, rare: /pərˈsɛl/;[n 1] c. 10 September 1659[n 2] – 21 November 1695) was an English composer and organist of the middle Baroque era. He composed more than 100 songs, a tragic opera Dido and Aeneas, and wrote incidental music to a version of Shakespeare's A Midsummer Night's Dream called The Fairy Queen."""

    with logfire.span("classifier_logic {i}", i=i):
        # Run classifier models
        heuristic = classifier(old_revision, new_revision, "heuristic")
        few_shot = classifier(old_revision, new_revision, "few-shot")

        # Identity checks: only a literal True counts as "noteworthy"
        heuristic_true = heuristic["noteworthy"] is True
        few_shot_true = few_shot["noteworthy"] is True

        only_heuristic_true = heuristic_true and not few_shot_true
        only_few_shot_true = few_shot_true and not heuristic_true
        both_true = heuristic_true and few_shot_true
        both_false = not (heuristic_true or few_shot_true)

        return (
            only_heuristic_true,
            only_few_shot_true,
            both_true,
            both_false,
        )


def judge_logic(i):
    """
    Return scenario flags for judge outputs.

    Runs both classifiers on one fixed pair of revisions, then passes their
    rationales to the judge in "aligned-fewshot" and "aligned-heuristic"
    modes.

    Args:
        i: Current iteration (for logging)

    Returns:
        True if both aligned judges report ``noteworthy`` as False,
        otherwise False.
    """
    old_revision = """Kaman-Kalehöyük Archaeological Museum (Turkish: Kaman-Kalehöyük Arkeoloji Müzesi) is an archaeological museum in Kaman District of Kırşehir Province in Turkey. It exhibits artifacts of seven civilizations excavated in the nearby multi-period mound Kaman-Kalehöyük. It was opened in 2010. A Japanese garden is next to the museum building.[1][2]"""
    new_revision = """The Kaman-Kalehöyük Archaeological Museum (Turkish: Kaman-Kalehöyük Arkeoloji Müzesi) is an archaeological museum in Çağırkan, Kaman District, Kırşehir Province, Turkey. It exhibits artifacts of seven civilizations excavated in the nearby multi-period mound Kaman-Kalehöyük. It opened in 2010. A Japanese garden is next to the museum building.[1][2]"""

    with logfire.span("judge_logic {i}", i=i):
        heuristic = classifier(old_revision, new_revision, "heuristic")
        few_shot = classifier(old_revision, new_revision, "few-shot")

        judge_few_shot = judge(
            old_revision,
            new_revision,
            heuristic["rationale"],
            few_shot["rationale"],
            mode="aligned-fewshot",
        )
        judge_heuristic = judge(
            old_revision,
            new_revision,
            heuristic["rationale"],
            few_shot["rationale"],
            mode="aligned-heuristic",
        )

        # Test condition is True if aligned judges both give False.
        # Identity comparison (not ``== False``) matches the ``is True``
        # checks used elsewhere in this file.
        judge_condition = (
            judge_few_shot["noteworthy"] is False
            and judge_heuristic["noteworthy"] is False
        )

        return judge_condition


# pytest -vv test_models.py::test_classifier
def test_classifier():
    """Run classifier logic 5 times and compare outcomes.

    Passes only if the few-shot classifier returned True more often than
    the heuristic classifier AND the two classifiers disagreed more often
    than they agreed.
    """
    tries = 5

    with logfire.span("test_classifier"):
        outcomes = [classifier_logic(i) for i in range(tries)]

        # Each outcome is a tuple of four booleans; summing counts the Trues
        only_heuristic_true = sum(result[0] for result in outcomes)
        only_few_shot_true = sum(result[1] for result in outcomes)
        both_true = sum(result[2] for result in outcomes)
        both_false = sum(result[3] for result in outcomes)

        heuristic_true_count = only_heuristic_true + both_true
        few_shot_true_count = only_few_shot_true + both_true
        disagree_count = only_heuristic_true + only_few_shot_true
        agree_count = both_true + both_false

        few_shot_more_often = few_shot_true_count > heuristic_true_count
        disagree_more_than_agree = disagree_count > agree_count

        # Collect every unmet expectation so the output names all of them
        failures = []
        if not few_shot_more_often:
            failures.append(
                "Few-shot classifier did not return True more often than the heuristic classifier."
            )
        if not disagree_more_than_agree:
            failures.append("Classifiers did not disagree more often than they agreed.")

        for message in failures:
            print(message)

        assert few_shot_more_often and disagree_more_than_agree, " ".join(failures)


# pytest -vv test_models.py::test_judge
def test_judge():
    """Run judge logic up to 5 times, stopping at the first successful try."""
    max_trys = 5
    # Pre-initialised so the final assert is well defined even if the loop
    # body never executes
    result = False

    with logfire.span("test_judge"):
        for current_try in range(max_trys):
            # judge_logic receives the 0-based index; messages are 1-based
            result = judge_logic(current_try)
            if result is True:
                print(f"Try {current_try + 1} succeeded")
                break
            print(f"Try {current_try + 1} failed")

        # The assert for pytest
        assert result is True, f"judge_logic did not succeed in {max_trys} tries"