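"""Test suite for the AutoMathReasoner environment.

Covers task generation, answer verification, reward shaping, the environment step
loop, curriculum progression, scaffold hints, and graduated correctness. Run this
file directly to execute all checks.
"""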
import sys
import os

sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

from env.generator import TaskGenerationEngine
from env.verifier import VerifierSystem
from env.rewards import RewardSystem
from env.environment import AutomathreasonerEnvironment
from env.models import AutomathreasonerAction
def test_generator():
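    """Generate tasks across difficulty bands, variants, and technique-focused problems."""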
    engine = TaskGenerationEngine()

    # Test task generation at various difficulty levels
    for diff in [1.0, 3.0, 5.0]:
        task = engine.generate_task(target_difficulty_band=diff)
        assert "problem" in task
        assert "solution" in task
        assert "difficulty" in task
        assert "technique" in task
        assert "scaffold_hints" in task
        assert task["technique"] in ['power_rule', 'u_substitution', 'by_parts',
                                     'trigonometric', 'exponential', 'logarithmic']
        print(f" ✓ Difficulty {diff}: technique={task['technique']}, problem={task['problem'][:60]}...")

    # Test variant generation
    task = engine.generate_task(target_difficulty_band=4.0)
    variants = engine.generate_variants(task, count=3)
    assert len(variants) > 0
    for v in variants:
        assert "problem" in v
        assert "technique" in v
    print(f" ✓ Generated {len(variants)} variants")

    # Test technique-focused generation
    for tech in ['power_rule', 'u_substitution', 'by_parts']:
        task = engine.generate_technique_focused_task(tech, difficulty=2.0)
        assert task["technique"] == tech
        print(f" ✓ Technique-focused: {tech}")
def test_verifier():
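    """Exercise the verifier checks: exact match, numeric tolerance, execution, and graded scoring."""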
    verifier = VerifierSystem()

    # Exact match
    assert verifier.check_exact_match("42", "42")
    assert verifier.check_exact_match(" 42 ", "42")
    print(" ✓ Exact match")

    # Numeric tolerance
    assert verifier.check_numeric_tolerance("3.14159", "3.1415")
    assert not verifier.check_numeric_tolerance("4.1415", "3.1415")
    print(" ✓ Numeric tolerance")

    # Python execution
    assert verifier.check_python_execution("2 + 2", "4")
    print(" ✓ Python execution")

    # Full verification — now returns 4 values (c, q, p, r)
    c, q, p, r = verifier.verify("Step 1: Because 2 + 2 is 4. Therefore the answer is 4.", "4", "4")
    assert c == 1.0
    assert q > 0.0
    print(f" ✓ Full verify: C={c}, Q={q:.3f}, P={p:.3f}, R={r:.3f}")

    # Graduated correctness — structural similarity
    score = verifier.check_structural_similarity("x**3", "2*x**3")
    assert score > 0.0  # Should get partial credit for same structure
    print(f" ✓ Structural similarity: {score:.2f}")

    # Technique recognition
    tech_score = verifier.check_technique_recognition(
        "Let u = x^2, then du = 2x dx. By substitution we get...",
        "u_substitution"
    )
    assert tech_score > 0.5
    print(f" ✓ Technique recognition: {tech_score:.2f}")

    # Process supervision — improved
    p_good = verifier.check_process_supervision(
        "Step 1: Identify the integrand. Step 2: Apply the power rule. Therefore x^3/3 + C."
    )
    p_bad = verifier.check_process_supervision("so = 42")
    assert p_good > p_bad
    print(f" ✓ Process supervision: good={p_good:.2f}, bad={p_bad:.2f}")
def test_rewards():
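    """Check reward components: diversity, format compliance, the full reward, and trivial-output detection."""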
    reward_sys = RewardSystem(max_len=1000)

    # Test diversity — exact repeat penalty
    history = [{"final_answer": "42"}]
    d = reward_sys.compute_diversity("42", history)
    assert d == -1.0
    print(f" ✓ Diversity repeat penalty: {d}")

    # Test diversity — also works with 'prediction' key (backward compat)
    history_v2 = [{"prediction": "42"}]
    d2 = reward_sys.compute_diversity("42", history_v2)
    assert d2 == -1.0
    print(f" ✓ Diversity backward compat: {d2}")

    # Test diversity — unique answer
    d3 = reward_sys.compute_diversity("99", history)
    assert d3 == 1.0
    print(f" ✓ Diversity unique bonus: {d3}")

    # Test format compliance
    f = reward_sys.compute_format_compliance(
        "Step 1: Apply power rule.\nAnswer: x^2/2",
        "Step 1: Apply power rule.",
        "x^2/2"
    )
    assert f > 0.5
    print(f" ✓ Format compliance: {f:.2f}")

    # Full reward computation — new signature with all params
    r, comps = reward_sys.compute_reward(
        correctness=1.0,
        reasoning_quality=0.8,
        process_supervision=0.5,
        reflection_score=0.0,
        action_str="Step 1: Apply power rule. Step 2: Simplify. Answer: x^2/2",
        final_answer="x^2/2",
        history=[],
        times_seen_problem=0,
        reasoning="Step 1: Apply power rule. Step 2: Simplify.",
    )
    assert r > 0.0
    assert "C_correctness" in comps
    assert "F_format" in comps
    assert comps["F_format"] > 0  # Format compliance should be non-zero
    print(f" ✓ Full reward: {r:.3f}, components: {len(comps)} fields")
    # Verify all 8 reward components are tracked
    expected_keys = ["C_correctness", "Q_reasoning", "P_process_supervision",
                     "R_reflection", "D_diversity", "E_efficiency",
                     "X_exploration", "F_format"]
    for key in expected_keys:
        assert key in comps, f"Missing component: {key}"
    print(f" ✓ All {len(expected_keys)} reward components present")

    # Trivial output detection
    assert reward_sys.detect_trivial_output("a")
    assert reward_sys.detect_trivial_output("aaaaaaaaaaaaa")
    assert not reward_sys.detect_trivial_output("x^2 + 2x + 1")
    print(" ✓ Trivial output detection")
def test_environment_step():
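    """Reset the environment and take one step, checking reward metadata and history bookkeeping."""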
    env = AutomathreasonerEnvironment()
    obs = env.reset()
    assert obs.problem_text != ""
    assert obs.difficulty_level > 0
    assert len(obs.history) == 0
    print(f" ✓ Reset: difficulty={obs.difficulty_level}, problem={obs.problem_text[:60]}...")

    # Technique metadata in observation
    assert "technique" in obs.metadata
    print(f" ✓ Technique metadata: {obs.metadata['technique']}")

    # Dummy action step
    action = AutomathreasonerAction(
        reasoning="Step 1: I identify the integrand. Step 2: Applying the power rule.",
        final_answer="x^2/2"
    )
    obs_after = env.step(action)
    assert obs_after.reward is not None
    assert len(obs_after.history) == 1
    assert "reward_components" in obs_after.metadata
    assert "correctness_score" in obs_after.metadata
    print(f" ✓ Step: reward={obs_after.reward:.3f}, "
          f"correct={obs_after.metadata['is_correct']}, "
          f"C={obs_after.metadata['correctness_score']:.2f}")

    # Verify history stores both keys
    assert "prediction" in obs_after.history[0]
    assert "final_answer" in obs_after.history[0]
    print(" ✓ History backward compatibility")
def test_curriculum_progression():
    """Test that curriculum actually advances with good performance."""
    env = AutomathreasonerEnvironment()
    initial_diff = env.difficulty_level

    # Simulate a series of correct answers
    for _ in range(5):
        env.rolling_results.append(1)
        env.rolling_rewards.append(0.7)
    env._update_curriculum()

    assert env.difficulty_level > initial_diff, (
        f"Curriculum should advance: {initial_diff} -> {env.difficulty_level}"
    )
    print(f" ✓ Curriculum advanced: {initial_diff} -> {env.difficulty_level:.1f}")
def test_scaffold_hints():
    """Test that scaffold hints are generated after failures."""
    env = AutomathreasonerEnvironment()
    env.reset()

    # No hint at 0 failures
    env.consecutive_failures = 0
    hint0 = env._get_scaffold_observation()
    assert hint0 == ""

    # Hint at 2 failures
    env.consecutive_failures = 2
    env.current_scaffold_hints = {
        'hint_level_1': 'Try u-substitution',
        'hint_level_2': 'Let u = x^2',
        'hint_level_3': 'The answer starts with sin(x^2)',
    }
    hint2 = env._get_scaffold_observation()
    assert "Hint" in hint2
    assert "u-substitution" in hint2

    # Stronger hint at 3 failures
    env.consecutive_failures = 3
    hint3 = env._get_scaffold_observation()
    assert "u = x^2" in hint3

    # Strongest hint at 4+ failures
    env.consecutive_failures = 4
    hint4 = env._get_scaffold_observation()
    assert "Strong Hint" in hint4

    print(" ✓ Scaffold hints: level 1, 2, 3 all working")
def test_graduated_correctness_flow():
    """End-to-end test: partial credit flows through the whole system."""
    env = AutomathreasonerEnvironment()
    obs = env.reset()

    # Submit a plausible but wrong math answer
    action = AutomathreasonerAction(
        reasoning="Step 1: I apply the power rule. Step 2: I integrate term by term. Therefore the answer is:",
        final_answer="x**2 + x"  # Almost certainly wrong, but parseable math
    )
    obs_after = env.step(action)
    c_score = obs_after.metadata.get('correctness_score', 0)

    # Should get SOME partial credit (> 0) for parseable math with right techniques
    print(f" ✓ Graduated correctness: C={c_score:.2f}, reward={obs_after.reward:.3f}")

    # Reward should be positive even when wrong (format + reasoning + partial credit)
    assert obs_after.reward > 0.0, f"Expected positive reward for structured wrong answer, got {obs_after.reward}"
    print(f" ✓ Positive reward for structured wrong answer: {obs_after.reward:.3f}")
if __name__ == "__main__":
    print("=" * 60)
    print("AutoMathReasoner Test Suite (v2 - Optimized)")
    print("=" * 60)

    print("\n[TEST] test_generator")
    test_generator()
    print("\n[TEST] test_verifier")
    test_verifier()
    print("\n[TEST] test_rewards")
    test_rewards()
    print("\n[TEST] test_environment_step")
    test_environment_step()
    print("\n[TEST] test_curriculum_progression")
    test_curriculum_progression()
    print("\n[TEST] test_scaffold_hints")
    test_scaffold_hints()
    print("\n[TEST] test_graduated_correctness_flow")
    test_graduated_correctness_flow()

    print("\n" + "=" * 60)
    print("[OK] ALL TESTS PASSED")
    print("=" * 60)