File size: 10,309 Bytes
973cd6f
f8319a8
 
 
 
 
 
 
 
 
 
 
 
973cd6f
 
 
 
 
 
 
 
 
 
 
f8319a8
973cd6f
 
 
 
 
 
 
 
 
 
 
 
 
 
f8319a8
 
 
 
 
 
 
973cd6f
f8319a8
 
 
 
973cd6f
f8319a8
 
 
973cd6f
f8319a8
973cd6f
 
f8319a8
973cd6f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f8319a8
 
 
 
973cd6f
 
f8319a8
 
973cd6f
 
 
 
 
 
 
 
 
 
 
 
f8319a8
973cd6f
 
 
 
 
 
 
 
 
 
f8319a8
 
973cd6f
 
 
 
 
f8319a8
973cd6f
 
f8319a8
 
973cd6f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f8319a8
 
 
 
 
 
 
 
973cd6f
 
 
 
 
f8319a8
973cd6f
f8319a8
973cd6f
 
f8319a8
 
 
 
 
 
973cd6f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
import sys
import os
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

from env.generator import TaskGenerationEngine
from env.verifier import VerifierSystem
from env.rewards import RewardSystem
from env.environment import AutomathreasonerEnvironment
from env.models import AutomathreasonerAction

def test_generator():
    """Exercise TaskGenerationEngine: banded tasks, variants, technique focus.

    Checks that tasks generated for difficulty bands 1.0, 3.0 and 5.0 carry
    the expected keys and one of the known technique labels, that
    ``generate_variants`` yields a non-empty collection of entries with
    ``problem`` and ``technique`` keys, and that
    ``generate_technique_focused_task`` returns the technique it was asked for.
    """
    required_fields = ("problem", "solution", "difficulty", "technique", "scaffold_hints")
    known_techniques = ['power_rule', 'u_substitution', 'by_parts',
                        'trigonometric', 'exponential', 'logarithmic']
    generator = TaskGenerationEngine()

    # Tasks requested at several difficulty bands must all be well-formed.
    for band in (1.0, 3.0, 5.0):
        generated = generator.generate_task(target_difficulty_band=band)
        for field in required_fields:
            assert field in generated
        assert generated["technique"] in known_techniques
        print(f"  ✓ Difficulty {band}: technique={generated['technique']}, problem={generated['problem'][:60]}...")

    # Variants derived from a single base task.
    base_task = generator.generate_task(target_difficulty_band=4.0)
    variants = generator.generate_variants(base_task, count=3)
    assert len(variants) > 0
    for variant in variants:
        assert "problem" in variant
        assert "technique" in variant
    print(f"  ✓ Generated {len(variants)} variants")

    # Asking for a specific technique must yield exactly that technique.
    for requested in ('power_rule', 'u_substitution', 'by_parts'):
        focused = generator.generate_technique_focused_task(requested, difficulty=2.0)
        assert focused["technique"] == requested
        print(f"  ✓ Technique-focused: {requested}")

def test_verifier():
    """Exercise the individual VerifierSystem checks and the combined verify().

    Covers exact matching, numeric tolerance, Python-expression execution,
    the four-value ``verify`` result, structural similarity, technique
    recognition and process supervision. Each section prints a progress line
    once its assertions have passed.
    """
    checker = VerifierSystem()

    # Exact match: the padded form " 42 " is expected to match "42" as well.
    for candidate in ("42", " 42 "):
        assert checker.check_exact_match(candidate, "42")
    print("  ✓ Exact match")

    # Numeric tolerance: a near value passes, a value off by one does not.
    assert checker.check_numeric_tolerance("3.14159", "3.1415")
    assert not checker.check_numeric_tolerance("4.1415", "3.1415")
    print("  ✓ Numeric tolerance")

    # Python execution of a simple expression.
    assert checker.check_python_execution("2 + 2", "4")
    print("  ✓ Python execution")

    # Full verification returns four scores, printed below as C, Q, P and R.
    transcript = "Step 1: Because 2 + 2 is 4. Therefore the answer is 4."
    correctness, quality, process, reflection = checker.verify(transcript, "4", "4")
    assert correctness == 1.0
    assert quality > 0.0
    print(f"  ✓ Full verify: C={correctness}, Q={quality:.3f}, P={process:.3f}, R={reflection:.3f}")

    # Graduated correctness: same structure should earn partial credit.
    similarity = checker.check_structural_similarity("x**3", "2*x**3")
    assert similarity > 0.0
    print(f"  ✓ Structural similarity: {similarity:.2f}")

    # Technique recognition on reasoning that spells out a substitution.
    substitution_reasoning = "Let u = x^2, then du = 2x dx. By substitution we get..."
    recognition = checker.check_technique_recognition(substitution_reasoning, "u_substitution")
    assert recognition > 0.5
    print(f"  ✓ Technique recognition: {recognition:.2f}")

    # Process supervision: stepwise reasoning must outscore a bare assertion.
    structured_score = checker.check_process_supervision(
        "Step 1: Identify the integrand. Step 2: Apply the power rule. Therefore x^3/3 + C."
    )
    terse_score = checker.check_process_supervision("so = 42")
    assert structured_score > terse_score
    print(f"  ✓ Process supervision: good={structured_score:.2f}, bad={terse_score:.2f}")

def test_rewards():
    """Exercise RewardSystem: diversity, format compliance, full reward, trivia.

    Checks the repeat penalty and unique bonus of ``compute_diversity`` (with
    history entries keyed by either ``final_answer`` or ``prediction``), the
    format-compliance score, the component breakdown returned by
    ``compute_reward`` and ``detect_trivial_output``.
    """
    rewards = RewardSystem(max_len=1000)

    # Diversity: repeating an answer already in the history is penalised.
    seen_by_final_answer = [{"final_answer": "42"}]
    repeat_score = rewards.compute_diversity("42", seen_by_final_answer)
    assert repeat_score == -1.0
    print(f"  ✓ Diversity repeat penalty: {repeat_score}")

    # Diversity: the older 'prediction' history key must behave the same way.
    seen_by_prediction = [{"prediction": "42"}]
    legacy_score = rewards.compute_diversity("42", seen_by_prediction)
    assert legacy_score == -1.0
    print(f"  ✓ Diversity backward compat: {legacy_score}")

    # Diversity: an answer absent from the history earns the bonus.
    unique_score = rewards.compute_diversity("99", seen_by_final_answer)
    assert unique_score == 1.0
    print(f"  ✓ Diversity unique bonus: {unique_score}")

    # Format compliance for a response with a step and an answer line.
    format_score = rewards.compute_format_compliance(
        "Step 1: Apply power rule.\nAnswer: x^2/2",
        "Step 1: Apply power rule.",
        "x^2/2",
    )
    assert format_score > 0.5
    print(f"  ✓ Format compliance: {format_score:.2f}")

    # Full reward computation using the complete keyword signature.
    reward_inputs = dict(
        correctness=1.0,
        reasoning_quality=0.8,
        process_supervision=0.5,
        reflection_score=0.0,
        action_str="Step 1: Apply power rule. Step 2: Simplify. Answer: x^2/2",
        final_answer="x^2/2",
        history=[],
        times_seen_problem=0,
        reasoning="Step 1: Apply power rule. Step 2: Simplify.",
    )
    total, components = rewards.compute_reward(**reward_inputs)
    assert total > 0.0
    assert "C_correctness" in components
    assert "F_format" in components
    assert components["F_format"] > 0  # Format compliance should be non-zero
    print(f"  ✓ Full reward: {total:.3f}, components: {len(components)} fields")

    # Every named reward component must be reported in the breakdown.
    expected_keys = [
        "C_correctness", "Q_reasoning", "P_process_supervision",
        "R_reflection", "D_diversity", "E_efficiency",
        "X_exploration", "F_format",
    ]
    for name in expected_keys:
        assert name in components, f"Missing component: {name}"
    print(f"  ✓ All {len(expected_keys)} reward components present")

    # Trivial output detection: single/repeated characters versus real math.
    for degenerate in ("a", "aaaaaaaaaaaaa"):
        assert rewards.detect_trivial_output(degenerate)
    assert not rewards.detect_trivial_output("x^2 + 2x + 1")
    print("  ✓ Trivial output detection")

def test_environment_step():
    """Run one reset/step cycle and check the observation contract.

    After ``reset`` the observation must carry a non-empty problem, a positive
    difficulty level, an empty history and a ``technique`` metadata entry.
    After one ``step`` it must carry a reward, a single history entry and the
    ``reward_components``, ``correctness_score`` and ``is_correct`` metadata
    entries; the history entry must expose both the ``prediction`` and
    ``final_answer`` keys.
    """
    env = AutomathreasonerEnvironment()
    obs = env.reset()

    assert obs.problem_text != ""
    assert obs.difficulty_level > 0
    assert len(obs.history) == 0
    print(f"  ✓ Reset: difficulty={obs.difficulty_level}, problem={obs.problem_text[:60]}...")

    # Technique metadata in observation
    assert "technique" in obs.metadata
    print(f"  ✓ Technique metadata: {obs.metadata['technique']}")

    # Dummy action step
    action = AutomathreasonerAction(
        reasoning="Step 1: I identify the integrand. Step 2: Applying the power rule.",
        final_answer="x^2/2"
    )

    obs_after = env.step(action)
    assert obs_after.reward is not None
    assert len(obs_after.history) == 1
    assert "reward_components" in obs_after.metadata
    assert "correctness_score" in obs_after.metadata
    # The progress line below reads 'is_correct'; assert it explicitly so a
    # missing key fails as a test assertion rather than a KeyError in a print.
    assert "is_correct" in obs_after.metadata
    print(f"  ✓ Step: reward={obs_after.reward:.3f}, "
          f"correct={obs_after.metadata['is_correct']}, "
          f"C={obs_after.metadata['correctness_score']:.2f}")

    # Verify history stores both keys
    assert "prediction" in obs_after.history[0]
    assert "final_answer" in obs_after.history[0]
    print("  ✓ History backward compatibility")

def test_curriculum_progression():
    """Check that ``_update_curriculum`` raises difficulty after good results.

    Five simulated correct answers (result 1, reward 0.7) are appended
    directly to the environment's rolling buffers before the curriculum
    update is triggered.
    """
    env = AutomathreasonerEnvironment()
    starting_level = env.difficulty_level

    # Feed the rolling windows by hand instead of stepping the environment.
    simulated_successes = 5
    for _ in range(simulated_successes):
        env.rolling_results.append(1)
        env.rolling_rewards.append(0.7)
    env._update_curriculum()

    assert env.difficulty_level > starting_level, (
        f"Curriculum should advance: {starting_level} -> {env.difficulty_level}"
    )
    print(f"  ✓ Curriculum advanced: {starting_level} -> {env.difficulty_level:.1f}")

def test_scaffold_hints():
    """Check the scaffold observation at 0, 2, 3 and 4 consecutive failures.

    With no failures the scaffold text must be empty. With hints installed on
    the environment, 2 failures must surface the level-1 hint, 3 failures the
    level-2 hint, and 4 failures a text containing "Strong Hint".
    """
    env = AutomathreasonerEnvironment()
    env.reset()

    def scaffold_at(failures, hints=None):
        # Set the failure counter (and optionally the hint table), then
        # return what the environment would show.
        env.consecutive_failures = failures
        if hints is not None:
            env.current_scaffold_hints = hints
        return env._get_scaffold_observation()

    # No hint at 0 failures
    assert scaffold_at(0) == ""

    # Hint at 2 failures, using a hand-written hint table
    canned_hints = {
        'hint_level_1': 'Try u-substitution',
        'hint_level_2': 'Let u = x^2',
        'hint_level_3': 'The answer starts with sin(x^2)',
    }
    gentle = scaffold_at(2, canned_hints)
    assert "Hint" in gentle
    assert "u-substitution" in gentle

    # Stronger hint at 3 failures
    firmer = scaffold_at(3)
    assert "u = x^2" in firmer

    # Strongest hint at 4+ failures
    strongest = scaffold_at(4)
    assert "Strong Hint" in strongest

    print("  ✓ Scaffold hints: level 1, 2, 3 all working")

def test_graduated_correctness_flow():
    """End-to-end test: a structured but wrong answer still earns reward.

    Resets the environment, submits well-formatted reasoning with a parseable
    answer that is almost certainly wrong, and asserts that the resulting
    reward is positive. The correctness score is printed for inspection only.
    """
    env = AutomathreasonerEnvironment()
    # The initial observation is not needed here; reset only loads a problem.
    env.reset()

    # Submit a plausible but wrong math answer
    action = AutomathreasonerAction(
        reasoning="Step 1: I apply the power rule. Step 2: I integrate term by term. Therefore the answer is:",
        final_answer="x**2 + x"  # Almost certainly wrong, but parseable math
    )

    obs_after = env.step(action)
    c_score = obs_after.metadata.get('correctness_score', 0)

    # The partial-credit score is reported but deliberately not asserted;
    # only the overall reward is checked below.
    print(f"  ✓ Graduated correctness: C={c_score:.2f}, reward={obs_after.reward:.3f}")
    # Reward should be positive even when wrong (format + reasoning + partial credit)
    assert obs_after.reward > 0.0, f"Expected positive reward for structured wrong answer, got {obs_after.reward}"
    print(f"  ✓ Positive reward for structured wrong answer: {obs_after.reward:.3f}")


if __name__ == "__main__":
    # Plain-script runner: execute every test in order and stop at the first
    # failing assertion (which propagates as an uncaught exception).
    rule = "=" * 60
    suite = (
        test_generator,
        test_verifier,
        test_rewards,
        test_environment_step,
        test_curriculum_progression,
        test_scaffold_hints,
        test_graduated_correctness_flow,
    )

    print(rule)
    print("AutoMathReasoner Test Suite (v2 - Optimized)")
    print(rule)

    for case in suite:
        # Header line matches the function name of the test being run.
        print(f"\n[TEST] {case.__name__}")
        case()

    print("\n" + rule)
    print("[OK] ALL TESTS PASSED")
    print(rule)