Spaces:

jedick
/

noteworthy-differences

Running

jedick

Initial commit

48c27bb 17 days ago

5.29 kB

	from models import classifier, judge
	from dotenv import load_dotenv
	import logfire


	# Load API keys into the environment (python-dotenv reads them from a .env file)
	load_dotenv()
	# Set up Logfire tracing once, at import time, for every test in this module.
	# We need send_to_logfire=True to capture traces under Pytest
	# https://logfire.pydantic.dev/docs/reference/advanced/testing/
	logfire.configure(send_to_logfire=True)


	def classifier_logic(i):
	    """
	    Compare the heuristic and few-shot classifiers on one fixed revision pair.

	    Both classifier modes are run once on the same old/new revision texts and
	    only a strict ``True`` in their "noteworthy" field counts as a positive.

	    Args:
	        i: Current iteration (for logging)

	    Returns:
	        A 4-tuple of booleans, exactly one of which is True:
	        (only heuristic positive, only few-shot positive, both positive,
	        neither positive).
	    """

	    before = """Henry Purcell (/ˈpɜːrsəl/, rare: /pərˈsɛl/;[n 1] c. 10 September 1659[n 2] – 21 November 1695) was an English composer of Baroque music. He composed more than 100 songs, a tragic opera Dido and Aeneas, and wrote incidental music to a version of Shakespeare's A Midsummer Night's Dream called The Fairy Queen."""

	    after = """Henry Purcell (/ˈpɜːrsəl/, rare: /pərˈsɛl/;[n 1] c. 10 September 1659[n 2] – 21 November 1695) was an English composer and organist of the middle Baroque era. He composed more than 100 songs, a tragic opera Dido and Aeneas, and wrote incidental music to a version of Shakespeare's A Midsummer Night's Dream called The Fairy Queen."""

	    with logfire.span("classifier_logic {i}", i=i):
	        # Call both classifier modes (heuristic first) before reading any result
	        heuristic_out, few_shot_out = [
	            classifier(before, after, mode) for mode in ("heuristic", "few-shot")
	        ]
	        heuristic_pos = heuristic_out["noteworthy"] is True
	        few_shot_pos = few_shot_out["noteworthy"] is True

	        # The four mutually exclusive agreement scenarios
	        return (
	            heuristic_pos and not few_shot_pos,
	            few_shot_pos and not heuristic_pos,
	            heuristic_pos and few_shot_pos,
	            not (heuristic_pos or few_shot_pos),
	        )


	def judge_logic(i):
	    """
	    Check whether both aligned judges rate one fixed revision pair as not noteworthy.

	    The heuristic and few-shot classifiers are run on the same old/new revision
	    texts, and both rationales are passed to the judge twice: once in
	    "aligned-fewshot" mode and once in "aligned-heuristic" mode.

	    Args:
	        i: Current iteration (for logging)

	    Returns:
	        True if the "noteworthy" value of both judge results is False,
	        otherwise False.
	    """

	    old_revision = """Kaman-Kalehöyük Archaeological Museum (Turkish: Kaman-Kalehöyük Arkeoloji Müzesi) is an archaeological museum in Kaman District of Kırşehir Province in Turkey. It exhibits artifacts of seven civilizations excavated in the nearby multi-period mound Kaman-Kalehöyük. It was opened in 2010. A Japanese garden is next to the museum building.[1][2]"""

	    new_revision = """The Kaman-Kalehöyük Archaeological Museum (Turkish: Kaman-Kalehöyük Arkeoloji Müzesi) is an archaeological museum in Çağırkan, Kaman District, Kırşehir Province, Turkey. It exhibits artifacts of seven civilizations excavated in the nearby multi-period mound Kaman-Kalehöyük. It opened in 2010. A Japanese garden is next to the museum building.[1][2]"""

	    with logfire.span("judge_logic {i}", i=i):
	        # The judge needs the rationale from each classifier mode
	        heuristic = classifier(old_revision, new_revision, "heuristic")
	        few_shot = classifier(old_revision, new_revision, "few-shot")
	        judge_few_shot = judge(
	            old_revision,
	            new_revision,
	            heuristic["rationale"],
	            few_shot["rationale"],
	            mode="aligned-fewshot",
	        )
	        judge_heuristic = judge(
	            old_revision,
	            new_revision,
	            heuristic["rationale"],
	            few_shot["rationale"],
	            mode="aligned-heuristic",
	        )

	        # Test condition is True if aligned judges both give False.
	        # Identity checks (not "== False") mirror the strict "is True" checks
	        # in classifier_logic, so a missing/None verdict never counts as False.
	        judge_condition = (
	            judge_few_shot["noteworthy"] is False
	            and judge_heuristic["noteworthy"] is False
	        )

	        return judge_condition


	# pytest -vv test_models.py::test_classifier
	def test_classifier():
	    """Run classifier_logic 5 times and check the aggregate outcome.

	    Passes only if the few-shot classifier returned True more often than the
	    heuristic classifier AND the two disagreed more often than they agreed.
	    A diagnostic line is printed for each condition that does not hold.
	    """
	    n_runs = 5
	    with logfire.span("test_classifier"):
	        runs = [classifier_logic(run) for run in range(n_runs)]

	    # Transpose the per-run flag tuples and count how often each scenario occurred
	    heuristic_only, few_shot_only, both_yes, both_no = map(sum, zip(*runs))

	    few_shot_more_often = (few_shot_only + both_yes) > (heuristic_only + both_yes)
	    disagree_more_than_agree = (heuristic_only + few_shot_only) > (both_yes + both_no)

	    # Report every unmet expectation before asserting
	    expectations = (
	        (
	            few_shot_more_often,
	            "Few-shot classifier did not return True more often than the heuristic classifier.",
	        ),
	        (
	            disagree_more_than_agree,
	            "Classifiers did not disagree more often than they agreed.",
	        ),
	    )
	    for holds, complaint in expectations:
	        if not holds:
	            print(complaint)

	    assert few_shot_more_often and disagree_more_than_agree


	# pytest -vv test_models.py::test_judge
	def test_judge():
	    """Retry judge_logic until it returns True, giving up after 5 tries.

	    Each try prints whether it succeeded or failed (numbered from 1); the test
	    passes if any try returned True.
	    """
	    try_limit = 5
	    with logfire.span("test_judge"):
	        for try_number in range(1, try_limit + 1):
	            # judge_logic receives the 0-based index; messages use the 1-based number
	            result = judge_logic(try_number - 1)
	            if result is True:
	                print(f"Try {try_number} succeeded")
	                break
	            print(f"Try {try_number} failed")
	    # The assert for pytest
	    assert result is True