import numpy as np
import random
from models import Action, Score  # pyre-ignore
from server.env import DebugOpsEnv  # pyre-ignore


def random_agent(obs):
    """Baseline policy that picks an action type uniformly at random.

    Args:
        obs: Observation object; only ``obs.visible_files`` is read, and only
            when the sampled action needs a file target.

    Returns:
        Action: ``analyze_logs`` / ``run_tests`` carry no target or content;
        ``open_file`` / ``edit_file`` target a random visible file (or
        ``"unknown.py"`` when none is visible); ``edit_file`` additionally
        carries a fixed placeholder edit.
    """
    kind = random.choice(["analyze_logs", "run_tests", "open_file", "edit_file"])

    # Actions that do not operate on a file need neither target nor content.
    if kind not in ("open_file", "edit_file"):
        return Action(type=kind, target=None, content=None)

    # Fall back to a dummy name when nothing has been opened yet.
    if obs.visible_files:
        chosen_file = random.choice(obs.visible_files)
    else:
        chosen_file = "unknown.py"

    payload = "print('random edit\\n')" if kind == "edit_file" else None
    return Action(type=kind, target=chosen_file, content=payload)


def heuristic_agent(obs):
    """Rule-based baseline: read logs, guess a file, patch it once, run tests.

    Each call returns the next step of a fixed sequence:
    ``analyze_logs`` (while ``obs.logs`` is empty) -> ``open_file`` on the
    guessed file (while it is not in ``obs.visible_files``) -> one blind
    ``edit_file`` -> ``run_tests`` on every later call.

    Args:
        obs: Observation exposing ``logs`` (text that is whitespace-split to
            find ``*.py`` tokens) and ``visible_files`` (collection of file
            names; a falsy value is treated as "nothing opened").

    Returns:
        Action: the next action to perform.

    Note:
        Whether the single edit has been spent is remembered on the function
        object as ``heuristic_agent.memory``. Callers running several
        episodes must delete that attribute between episodes.
    """
    # 1. Analyze logs if not done
    if not obs.logs:
        return Action(type="analyze_logs")

    logs_lower = obs.logs.lower()

    # 2. Extract potential file names from noisy logs.
    # Punctuation is stripped *before* the suffix test so that tokens such as
    # `utils.py:` or `"parser.py",` are recognised as file names.
    stripped_tokens = (w.strip(":,.'\"") for w in obs.logs.split())
    candidates = [tok for tok in stripped_tokens if tok.endswith(".py")]

    # 3. Fallbacks based on known targets if parsing fails entirely due to noise
    if not candidates:
        if "discount" in logs_lower:
            candidates = ["utils.py"]
        elif "validator" in logs_lower:
            candidates = ["parser.py"]  # trap!
        elif "service" in logs_lower:
            candidates = ["service.py"]
        elif "inconsistent" in logs_lower:
            candidates = ["api.py"]
        else:
            candidates = ["unknown.py"]

    # Misleading attribution trap handling (Heuristic agent fails on medium):
    # any mention of "validator" makes validator.py the first choice.
    if "validator" in logs_lower:
        candidates.insert(0, "validator.py")

    target_file = candidates[0]

    # 4. Open file if not visible (a missing/None file list counts as empty)
    visible_files = obs.visible_files or ()
    if target_file not in visible_files:
        return Action(type="open_file", target=target_file)

    # 5. Blind heuristic edit (assuming we opened the right file, else it fails)
    if "edit_attempted" not in getattr(heuristic_agent, "memory", {}):
        heuristic_agent.memory = {"edit_attempted": True}
        return Action(type="edit_file", target=target_file, content="heuristic patch")

    # 6. Run tests
    return Action(type="run_tests")


def evaluate_agent(agent_name, agent_fn, split, seeds=5, max_steps=30):
    """Run an agent across all difficulty tiers and print one summary row.

    For every difficulty in easy/medium/hard/extreme, one episode is played
    per seed in ``range(seeds)`` on a fresh ``DebugOpsEnv(seed=seed)``. Each
    episode is graded with ``env.grade(trajectory).final()`` and the
    per-difficulty mean and standard deviation are appended to the row.

    Args:
        agent_name: Label printed in the first column of the row.
        agent_fn: Callable mapping an observation to an action.
        split: Dataset split name forwarded to ``env.reset``.
        seeds: Number of seeds (episodes) per difficulty.
        max_steps: Upper bound on actions per episode, guarding against an
            agent that never finishes.

    Returns:
        dict: ``{difficulty: (mean_score, std_score)}`` with plain floats;
        both values are NaN when no episode was run (``seeds <= 0``).
    """
    import sys  # local import: only needed for the stderr diagnostics below

    difficulties = ["easy", "medium", "hard", "extreme"]
    results_str = f"{agent_name:10s} "
    summary = {}

    for diff in difficulties:
        scores = []
        for seed in range(seeds):
            # Reset heuristic memory per episode
            if hasattr(heuristic_agent, "memory"):
                delattr(heuristic_agent, "memory")

            env = DebugOpsEnv(seed=seed)
            try:
                obs = env.reset(difficulty=diff, split=split)
            except Exception as exc:
                # Best effort: a failed reset still scores 0.0, but the
                # failure is reported instead of being silently hidden.
                print(
                    f"[warn] {agent_name}: reset failed for difficulty={diff!r}, "
                    f"split={split!r}, seed={seed}: {exc!r}; scoring 0.0",
                    file=sys.stderr,
                )
                scores.append(0.0)
                continue

            trajectory = []

            # Bounded loop to prevent running forever if the agent bugs out
            for _ in range(max_steps):
                action = agent_fn(obs)
                obs, _reward, done, _info = env.step(action)
                trajectory.append(action)
                if done:
                    break

            scores.append(env.grade(trajectory).final())

        if scores:
            mean_score = float(np.mean(scores))
            std_score = float(np.std(scores))
        else:
            # No episodes were run; avoid numpy's empty-slice warnings.
            mean_score = std_score = float("nan")

        summary[diff] = (mean_score, std_score)
        results_str += f"{mean_score:.2f}±{std_score:.2f}    "

    print(results_str)
    return summary


if __name__ == "__main__":
    # Script entry point: benchmark both baseline agents on every split and
    # print one table per split.

    # Seeds evaluated per (agent, difficulty) cell; also quoted in the footer
    # so the message cannot drift from the value actually used.
    num_seeds = 5

    print("================================================================")
    print("DebugOps-RX (Realistic eXecution) - Formal Benchmark Evaluation")
    print("================================================================")

    splits = ["train", "test", "ood"]

    for split in splits:
        # A real newline ("\n", not an escaped backslash) separates the tables.
        print(f"\n[{split.upper()} SPLIT]")
        print("Agent        Easy         Medium       Hard         Extreme")
        print("-" * 64)
        evaluate_agent("Random", random_agent, split, seeds=num_seeds)
        evaluate_agent("Heuristic", heuristic_agent, split, seeds=num_seeds)

    print(f"\n\n(Evaluation uses {num_seeds} random seeds per cell to calculate mean±std)")