"""Smoke-test suite for the AutoMathReasoner environment.

Exercises the task generator, verifier, reward system, environment step
loop, curriculum progression, scaffold hints, and graduated-correctness
flow. Runnable directly (``python test_file.py``) or under pytest.
"""

import os
import sys

# Make the project root importable when this file is run directly
# (the env/ package lives one directory above this test file).
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

from env.generator import TaskGenerationEngine
from env.verifier import VerifierSystem
from env.rewards import RewardSystem
from env.environment import AutomathreasonerEnvironment
from env.models import AutomathreasonerAction


def test_generator():
    """Task generation: difficulty bands, variants, technique-focused tasks."""
    engine = TaskGenerationEngine()

    # Test task generation at various difficulty levels
    for diff in [1.0, 3.0, 5.0]:
        task = engine.generate_task(target_difficulty_band=diff)
        assert "problem" in task
        assert "solution" in task
        assert "difficulty" in task
        assert "technique" in task
        assert "scaffold_hints" in task
        assert task["technique"] in [
            'power_rule', 'u_substitution', 'by_parts',
            'trigonometric', 'exponential', 'logarithmic',
        ]
        print(f" ✓ Difficulty {diff}: technique={task['technique']}, problem={task['problem'][:60]}...")

    # Test variant generation
    task = engine.generate_task(target_difficulty_band=4.0)
    variants = engine.generate_variants(task, count=3)
    assert len(variants) > 0
    for v in variants:
        assert "problem" in v
        assert "technique" in v
    print(f" ✓ Generated {len(variants)} variants")

    # Test technique-focused generation
    for tech in ['power_rule', 'u_substitution', 'by_parts']:
        task = engine.generate_technique_focused_task(tech, difficulty=2.0)
        assert task["technique"] == tech
        print(f" ✓ Technique-focused: {tech}")


def test_verifier():
    """Verifier checks: exact/numeric/executed matches plus graded scores."""
    verifier = VerifierSystem()

    # Exact match (should be whitespace-tolerant)
    assert verifier.check_exact_match("42", "42")
    assert verifier.check_exact_match(" 42 ", "42")
    print(" ✓ Exact match")

    # Numeric tolerance
    assert verifier.check_numeric_tolerance("3.14159", "3.1415")
    assert not verifier.check_numeric_tolerance("4.1415", "3.1415")
    print(" ✓ Numeric tolerance")

    # Python execution
    assert verifier.check_python_execution("2 + 2", "4")
    print(" ✓ Python execution")

    # Full verification — now returns 4 values (c, q, p, r)
    c, q, p, r = verifier.verify(
        "Step 1: Because 2 + 2 is 4. \nTherefore the answer is 4.",
        "4",
        "4",
    )
    assert c == 1.0
    assert q > 0.0
    print(f" ✓ Full verify: C={c}, Q={q:.3f}, P={p:.3f}, R={r:.3f}")

    # Graduated correctness — structural similarity
    score = verifier.check_structural_similarity("x**3", "2*x**3")
    assert score > 0.0  # Should get partial credit for same structure
    print(f" ✓ Structural similarity: {score:.2f}")

    # Technique recognition
    tech_score = verifier.check_technique_recognition(
        "Let u = x^2, then du = 2x dx. By substitution we get...",
        "u_substitution"
    )
    assert tech_score > 0.5
    print(f" ✓ Technique recognition: {tech_score:.2f}")

    # Process supervision — improved
    p_good = verifier.check_process_supervision(
        "Step 1: Identify the integrand. Step 2: Apply the power rule. Therefore x^3/3 + C."
    )
    p_bad = verifier.check_process_supervision("so = 42")
    assert p_good > p_bad
    print(f" ✓ Process supervision: good={p_good:.2f}, bad={p_bad:.2f}")


def test_rewards():
    """Reward system: diversity, format compliance, full reward breakdown."""
    reward_sys = RewardSystem(max_len=1000)

    # Test diversity — exact repeat penalty
    history = [{"final_answer": "42"}]
    d = reward_sys.compute_diversity("42", history)
    assert d == -1.0
    print(f" ✓ Diversity repeat penalty: {d}")

    # Test diversity — also works with 'prediction' key (backward compat)
    history_v2 = [{"prediction": "42"}]
    d2 = reward_sys.compute_diversity("42", history_v2)
    assert d2 == -1.0
    print(f" ✓ Diversity backward compat: {d2}")

    # Test diversity — unique answer
    d3 = reward_sys.compute_diversity("99", history)
    assert d3 == 1.0
    print(f" ✓ Diversity unique bonus: {d3}")

    # Test format compliance
    f = reward_sys.compute_format_compliance(
        "Step 1: Apply power rule.\nAnswer: x^2/2",
        "Step 1: Apply power rule.",
        "x^2/2"
    )
    assert f > 0.5
    print(f" ✓ Format compliance: {f:.2f}")

    # Full reward computation — new signature with all params
    r, comps = reward_sys.compute_reward(
        correctness=1.0,
        reasoning_quality=0.8,
        process_supervision=0.5,
        reflection_score=0.0,
        action_str="Step 1: Apply power rule. Step 2: Simplify. \nAnswer: x^2/2",
        final_answer="x^2/2",
        history=[],
        times_seen_problem=0,
        reasoning="Step 1: Apply power rule. Step 2: Simplify.",
    )
    assert r > 0.0
    assert "C_correctness" in comps
    assert "F_format" in comps
    assert comps["F_format"] > 0  # Format compliance should be non-zero
    print(f" ✓ Full reward: {r:.3f}, components: {len(comps)} fields")

    # Verify all 7+ components are tracked
    expected_keys = [
        "C_correctness", "Q_reasoning", "P_process_supervision",
        "R_reflection", "D_diversity", "E_efficiency",
        "X_exploration", "F_format",
    ]
    for key in expected_keys:
        assert key in comps, f"Missing component: {key}"
    print(f" ✓ All {len(expected_keys)} reward components present")

    # Trivial output detection
    assert reward_sys.detect_trivial_output("a")
    assert reward_sys.detect_trivial_output("aaaaaaaaaaaaa")
    assert not reward_sys.detect_trivial_output("x^2 + 2x + 1")
    print(" ✓ Trivial output detection")


def test_environment_step():
    """Environment reset/step cycle, metadata, and history key compat."""
    env = AutomathreasonerEnvironment()
    obs = env.reset()
    assert obs.problem_text != ""
    assert obs.difficulty_level > 0
    assert len(obs.history) == 0
    print(f" ✓ Reset: difficulty={obs.difficulty_level}, problem={obs.problem_text[:60]}...")

    # Technique metadata in observation
    assert "technique" in obs.metadata
    print(f" ✓ Technique metadata: {obs.metadata['technique']}")

    # Dummy action step
    action = AutomathreasonerAction(
        reasoning="Step 1: I identify the integrand. \nStep 2: Applying the power rule.",
        final_answer="x^2/2"
    )
    obs_after = env.step(action)
    assert obs_after.reward is not None
    assert len(obs_after.history) == 1
    assert "reward_components" in obs_after.metadata
    assert "correctness_score" in obs_after.metadata
    print(f" ✓ Step: reward={obs_after.reward:.3f}, "
          f"correct={obs_after.metadata['is_correct']}, "
          f"C={obs_after.metadata['correctness_score']:.2f}")

    # Verify history stores both keys
    assert "prediction" in obs_after.history[0]
    assert "final_answer" in obs_after.history[0]
    print(" ✓ History backward compatibility")


def test_curriculum_progression():
    """Test that curriculum actually advances with good performance."""
    env = AutomathreasonerEnvironment()
    initial_diff = env.difficulty_level

    # Simulate a series of correct answers
    # NOTE(review): the mangled source does not show whether
    # _update_curriculum() ran inside or after this loop; calling it each
    # iteration is the conservative reading — confirm against the original.
    for _ in range(5):
        env.rolling_results.append(1)
        env.rolling_rewards.append(0.7)
        env._update_curriculum()

    assert env.difficulty_level > initial_diff, (
        f"Curriculum should advance: {initial_diff} -> {env.difficulty_level}"
    )
    print(f" ✓ Curriculum advanced: {initial_diff} -> {env.difficulty_level:.1f}")


def test_scaffold_hints():
    """Test that scaffold hints are generated after failures."""
    env = AutomathreasonerEnvironment()
    env.reset()

    # No hint at 0 failures
    env.consecutive_failures = 0
    hint0 = env._get_scaffold_observation()
    assert hint0 == ""

    # Hint at 2 failures
    env.consecutive_failures = 2
    env.current_scaffold_hints = {
        'hint_level_1': 'Try u-substitution',
        'hint_level_2': 'Let u = x^2',
        'hint_level_3': 'The answer starts with sin(x^2)',
    }
    hint2 = env._get_scaffold_observation()
    assert "Hint" in hint2
    assert "u-substitution" in hint2

    # Stronger hint at 3 failures
    env.consecutive_failures = 3
    hint3 = env._get_scaffold_observation()
    assert "u = x^2" in hint3

    # Strongest hint at 4+ failures
    env.consecutive_failures = 4
    hint4 = env._get_scaffold_observation()
    assert "Strong Hint" in hint4

    print(" ✓ Scaffold hints: level 1, 2, 3 all working")


def test_graduated_correctness_flow():
    """End-to-end test: partial credit flows through the whole system."""
    env = AutomathreasonerEnvironment()
    obs = env.reset()

    # Submit a plausible but wrong math answer
    action = AutomathreasonerAction(
        reasoning="Step 1: I apply the power rule. Step 2: I integrate term by term. Therefore the answer is:",
        final_answer="x**2 + x"  # Almost certainly wrong, but parseable math
    )
    obs_after = env.step(action)
    c_score = obs_after.metadata.get('correctness_score', 0)

    # Should get SOME partial credit (> 0) for parseable math with right techniques
    print(f" ✓ Graduated correctness: C={c_score:.2f}, reward={obs_after.reward:.3f}")

    # Reward should be positive even when wrong (format + reasoning + partial credit)
    assert obs_after.reward > 0.0, f"Expected positive reward for structured wrong answer, got {obs_after.reward}"
    print(f" ✓ Positive reward for structured wrong answer: {obs_after.reward:.3f}")


if __name__ == "__main__":
    print("=" * 60)
    print("AutoMathReasoner Test Suite (v2 - Optimized)")
    print("=" * 60)

    print("\n[TEST] test_generator")
    test_generator()
    print("\n[TEST] test_verifier")
    test_verifier()
    print("\n[TEST] test_rewards")
    test_rewards()
    print("\n[TEST] test_environment_step")
    test_environment_step()
    print("\n[TEST] test_curriculum_progression")
    test_curriculum_progression()
    print("\n[TEST] test_scaffold_hints")
    test_scaffold_hints()
    print("\n[TEST] test_graduated_correctness_flow")
    test_graduated_correctness_flow()

    print("\n" + "=" * 60)
    print("[OK] ALL TESTS PASSED")
    print("=" * 60)