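"""Test suite for the AutoMathReasoner environment.

Covers task generation, answer verification, reward shaping, the environment step
loop, curriculum progression, scaffold hints, and graduated correctness. Run this
file directly to execute all checks.
"""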
import sys
import os

sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

from env.generator import TaskGenerationEngine
from env.verifier import VerifierSystem
from env.rewards import RewardSystem
from env.environment import AutomathreasonerEnvironment
from env.models import AutomathreasonerAction
def test_generator():
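    """Generate tasks across difficulty bands, variants, and technique-focused problems."""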
    engine = TaskGenerationEngine()

    # Test task generation at various difficulty levels
    for diff in [1.0, 3.0, 5.0]:
        task = engine.generate_task(target_difficulty_band=diff)
        assert "problem" in task
        assert "solution" in task
        assert "difficulty" in task
        assert "technique" in task
        assert "scaffold_hints" in task
        assert task["technique"] in ['power_rule', 'u_substitution', 'by_parts',
                                     'trigonometric', 'exponential', 'logarithmic']
        print(f" ✓ Difficulty {diff}: technique={task['technique']}, problem={task['problem'][:60]}...")

    # Test variant generation
    task = engine.generate_task(target_difficulty_band=4.0)
    variants = engine.generate_variants(task, count=3)
    assert len(variants) > 0
    for v in variants:
        assert "problem" in v
        assert "technique" in v
    print(f" ✓ Generated {len(variants)} variants")

    # Test technique-focused generation
    for tech in ['power_rule', 'u_substitution', 'by_parts']:
        task = engine.generate_technique_focused_task(tech, difficulty=2.0)
        assert task["technique"] == tech
        print(f" ✓ Technique-focused: {tech}")
def test_verifier():
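    """Exercise the verifier checks: exact match, numeric tolerance, execution, and graded scoring."""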
    verifier = VerifierSystem()

    # Exact match
    assert verifier.check_exact_match("42", "42")
    assert verifier.check_exact_match(" 42 ", "42")
    print(" ✓ Exact match")

    # Numeric tolerance
    assert verifier.check_numeric_tolerance("3.14159", "3.1415")
    assert not verifier.check_numeric_tolerance("4.1415", "3.1415")
    print(" ✓ Numeric tolerance")

    # Python execution
    assert verifier.check_python_execution("2 + 2", "4")
    print(" ✓ Python execution")

    # Full verification — now returns 4 values (c, q, p, r)
    c, q, p, r = verifier.verify("Step 1: Because 2 + 2 is 4. Therefore the answer is 4.", "4", "4")
    assert c == 1.0
    assert q > 0.0
    print(f" ✓ Full verify: C={c}, Q={q:.3f}, P={p:.3f}, R={r:.3f}")

    # Graduated correctness — structural similarity
    score = verifier.check_structural_similarity("x**3", "2*x**3")
    assert score > 0.0  # Should get partial credit for same structure
    print(f" ✓ Structural similarity: {score:.2f}")

    # Technique recognition
    tech_score = verifier.check_technique_recognition(
        "Let u = x^2, then du = 2x dx. By substitution we get...",
        "u_substitution"
    )
    assert tech_score > 0.5
    print(f" ✓ Technique recognition: {tech_score:.2f}")

    # Process supervision — improved
    p_good = verifier.check_process_supervision(
        "Step 1: Identify the integrand. Step 2: Apply the power rule. Therefore x^3/3 + C."
    )
    p_bad = verifier.check_process_supervision("so = 42")
    assert p_good > p_bad
    print(f" ✓ Process supervision: good={p_good:.2f}, bad={p_bad:.2f}")
def test_rewards():
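    """Check reward components: diversity, format compliance, the full reward, and trivial-output detection."""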
    reward_sys = RewardSystem(max_len=1000)

    # Test diversity — exact repeat penalty
    history = [{"final_answer": "42"}]
    d = reward_sys.compute_diversity("42", history)
    assert d == -1.0
    print(f" ✓ Diversity repeat penalty: {d}")

    # Test diversity — also works with 'prediction' key (backward compat)
    history_v2 = [{"prediction": "42"}]
    d2 = reward_sys.compute_diversity("42", history_v2)
    assert d2 == -1.0
    print(f" ✓ Diversity backward compat: {d2}")

    # Test diversity — unique answer
    d3 = reward_sys.compute_diversity("99", history)
    assert d3 == 1.0
    print(f" ✓ Diversity unique bonus: {d3}")

    # Test format compliance
    f = reward_sys.compute_format_compliance(
        "Step 1: Apply power rule.\nAnswer: x^2/2",
        "Step 1: Apply power rule.",
        "x^2/2"
    )
    assert f > 0.5
    print(f" ✓ Format compliance: {f:.2f}")

    # Full reward computation — new signature with all params
    r, comps = reward_sys.compute_reward(
        correctness=1.0,
        reasoning_quality=0.8,
        process_supervision=0.5,
        reflection_score=0.0,
        action_str="Step 1: Apply power rule. Step 2: Simplify. Answer: x^2/2",
        final_answer="x^2/2",
        history=[],
        times_seen_problem=0,
        reasoning="Step 1: Apply power rule. Step 2: Simplify.",
    )
    assert r > 0.0
    assert "C_correctness" in comps
    assert "F_format" in comps
    assert comps["F_format"] > 0  # Format compliance should be non-zero
    print(f" ✓ Full reward: {r:.3f}, components: {len(comps)} fields")
    # Verify all 8 reward components are tracked
    expected_keys = ["C_correctness", "Q_reasoning", "P_process_supervision",
                     "R_reflection", "D_diversity", "E_efficiency",
                     "X_exploration", "F_format"]
    for key in expected_keys:
        assert key in comps, f"Missing component: {key}"
    print(f" ✓ All {len(expected_keys)} reward components present")

    # Trivial output detection
    assert reward_sys.detect_trivial_output("a")
    assert reward_sys.detect_trivial_output("aaaaaaaaaaaaa")
    assert not reward_sys.detect_trivial_output("x^2 + 2x + 1")
    print(" ✓ Trivial output detection")
def test_environment_step():
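    """Reset the environment and take one step, checking reward metadata and history bookkeeping."""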
    env = AutomathreasonerEnvironment()
    obs = env.reset()
    assert obs.problem_text != ""
    assert obs.difficulty_level > 0
    assert len(obs.history) == 0
    print(f" ✓ Reset: difficulty={obs.difficulty_level}, problem={obs.problem_text[:60]}...")

    # Technique metadata in observation
    assert "technique" in obs.metadata
    print(f" ✓ Technique metadata: {obs.metadata['technique']}")

    # Dummy action step
    action = AutomathreasonerAction(
        reasoning="Step 1: I identify the integrand. Step 2: Applying the power rule.",
        final_answer="x^2/2"
    )
    obs_after = env.step(action)
    assert obs_after.reward is not None
    assert len(obs_after.history) == 1
    assert "reward_components" in obs_after.metadata
    assert "correctness_score" in obs_after.metadata
    print(f" ✓ Step: reward={obs_after.reward:.3f}, "
          f"correct={obs_after.metadata['is_correct']}, "
          f"C={obs_after.metadata['correctness_score']:.2f}")

    # Verify history stores both keys
    assert "prediction" in obs_after.history[0]
    assert "final_answer" in obs_after.history[0]
    print(" ✓ History backward compatibility")
def test_curriculum_progression():
    """Test that curriculum actually advances with good performance."""
    env = AutomathreasonerEnvironment()
    initial_diff = env.difficulty_level

    # Simulate a series of correct answers
    for _ in range(5):
        env.rolling_results.append(1)
        env.rolling_rewards.append(0.7)
    env._update_curriculum()

    assert env.difficulty_level > initial_diff, (
        f"Curriculum should advance: {initial_diff} -> {env.difficulty_level}"
    )
    print(f" ✓ Curriculum advanced: {initial_diff} -> {env.difficulty_level:.1f}")
def test_scaffold_hints():
    """Test that scaffold hints are generated after failures."""
    env = AutomathreasonerEnvironment()
    env.reset()

    # No hint at 0 failures
    env.consecutive_failures = 0
    hint0 = env._get_scaffold_observation()
    assert hint0 == ""

    # Hint at 2 failures
    env.consecutive_failures = 2
    env.current_scaffold_hints = {
        'hint_level_1': 'Try u-substitution',
        'hint_level_2': 'Let u = x^2',
        'hint_level_3': 'The answer starts with sin(x^2)',
    }
    hint2 = env._get_scaffold_observation()
    assert "Hint" in hint2
    assert "u-substitution" in hint2

    # Stronger hint at 3 failures
    env.consecutive_failures = 3
    hint3 = env._get_scaffold_observation()
    assert "u = x^2" in hint3

    # Strongest hint at 4+ failures
    env.consecutive_failures = 4
    hint4 = env._get_scaffold_observation()
    assert "Strong Hint" in hint4

    print(" ✓ Scaffold hints: level 1, 2, 3 all working")
def test_graduated_correctness_flow():
    """End-to-end test: partial credit flows through the whole system."""
    env = AutomathreasonerEnvironment()
    obs = env.reset()

    # Submit a plausible but wrong math answer
    action = AutomathreasonerAction(
        reasoning="Step 1: I apply the power rule. Step 2: I integrate term by term. Therefore the answer is:",
        final_answer="x**2 + x"  # Almost certainly wrong, but parseable math
    )
    obs_after = env.step(action)
    c_score = obs_after.metadata.get('correctness_score', 0)

    # Should get SOME partial credit (> 0) for parseable math with right techniques
    print(f" ✓ Graduated correctness: C={c_score:.2f}, reward={obs_after.reward:.3f}")

    # Reward should be positive even when wrong (format + reasoning + partial credit)
    assert obs_after.reward > 0.0, f"Expected positive reward for structured wrong answer, got {obs_after.reward}"
    print(f" ✓ Positive reward for structured wrong answer: {obs_after.reward:.3f}")
if __name__ == "__main__":
    print("=" * 60)
    print("AutoMathReasoner Test Suite (v2 - Optimized)")
    print("=" * 60)

    print("\n[TEST] test_generator")
    test_generator()
    print("\n[TEST] test_verifier")
    test_verifier()
    print("\n[TEST] test_rewards")
    test_rewards()
    print("\n[TEST] test_environment_step")
    test_environment_step()
    print("\n[TEST] test_curriculum_progression")
    test_curriculum_progression()
    print("\n[TEST] test_scaffold_hints")
    test_scaffold_hints()
    print("\n[TEST] test_graduated_correctness_flow")
    test_graduated_correctness_flow()

    print("\n" + "=" * 60)
    print("[OK] ALL TESTS PASSED")
    print("=" * 60)