import json
import os
import random

def load_data(raw_dir="data/raw/"):
    """Load and normalise every ``*.json`` sample directly inside *raw_dir*.

    Each matching file is parsed with ``json.load`` and the resulting object
    is adjusted in place:

    * ``is_hallucination`` given as a string becomes a bool; only a
      case-insensitive ``"true"`` maps to ``True``.
    * ``hallucination_step`` is passed through ``int()`` unless it is
      missing or ``None``.
    * ``history`` is aliased as ``trajectory`` when ``trajectory`` is absent.

    Files are read in sorted name order so the returned list is the same on
    every filesystem; callers that seed ``random`` and then draw one number
    per sample depend on that ordering to be reproducible.

    Args:
        raw_dir: Directory to scan (not recursive).

    Returns:
        A list of the parsed samples, one per ``.json`` file.

    Raises:
        OSError: If *raw_dir* or one of its files cannot be read.
        json.JSONDecodeError: If a file is not valid JSON.
        ValueError: If ``hallucination_step`` cannot be converted by ``int()``.
    """
    all_samples = []
    # os.listdir() returns entries in arbitrary order; sorting pins it down.
    for name in sorted(os.listdir(raw_dir)):
        if not name.endswith(".json"):
            continue

        with open(os.path.join(raw_dir, name), 'r', encoding='utf-8') as fh:
            sample = json.load(fh)

        # Fix types since some fields are strings in the raw JSON
        if isinstance(sample.get("is_hallucination"), str):
            sample["is_hallucination"] = sample["is_hallucination"].lower() == "true"

        if sample.get("hallucination_step") is not None:
            sample["hallucination_step"] = int(sample["hallucination_step"])

        # The implementation plan uses 'trajectory', but raw data has 'history'
        if "history" in sample and "trajectory" not in sample:
            sample["trajectory"] = sample["history"]

        all_samples.append(sample)
    return all_samples

def random_baseline_accuracy(samples, rng=None):
    """Score a uniform random guesser at locating the hallucinated step.

    Only samples with a truthy ``is_hallucination`` are considered. For each
    one a step is drawn uniformly from ``1..len(trajectory)`` (inclusive) and
    counted as a hit when it equals the sample's ``hallucination_step``; the
    comparison therefore assumes ``hallucination_step`` is 1-based.

    Samples whose trajectory is missing, ``None`` or empty get no draw and
    count as a miss, since they stay in the denominator.

    Args:
        samples: Iterable of sample dicts.
        rng: Optional object with a ``randint(a, b)`` method, such as a
            ``random.Random`` instance. Defaults to the module-level
            ``random`` generator.

    Returns:
        Hits divided by the number of hallucinated samples, or ``0.0`` when
        there are no hallucinated samples.
    """
    source = random if rng is None else rng
    hallucinated_samples = [s for s in samples if s.get("is_hallucination")]

    if not hallucinated_samples:
        return 0.0

    correct = 0
    for s in hallucinated_samples:
        # `or []` also covers an explicit JSON null, which the .get() default
        # alone would pass straight to len() and crash on.
        n_steps = len(s.get("trajectory") or [])
        if n_steps == 0:
            continue
        predicted_step = source.randint(1, n_steps)
        if predicted_step == s.get("hallucination_step"):
            correct += 1

    return correct / len(hallucinated_samples)

if __name__ == "__main__":
    # Fixed seed so repeated invocations print the same baseline figure
    random.seed(42)

    # Raw data sits under data/raw, two directory levels above this script
    here = os.path.dirname(os.path.abspath(__file__))
    repo_root = os.path.join(here, "..", "..")
    data_dir = os.path.join(repo_root, "data", "raw")

    samples = load_data(data_dir)
    print(f"Total samples loaded: {len(samples)}")

    # Every sample is either hallucinated or clean, so one count gives both
    n_hallucinated = sum(1 for s in samples if s.get("is_hallucination"))
    n_clean = len(samples) - n_hallucinated
    print(f"Hallucinated: {n_hallucinated}")
    print(f"Clean: {n_clean}")

    # Average the accuracy over 1000 independent random-guess runs
    n_runs = 1000
    scores = []
    for _ in range(n_runs):
        scores.append(random_baseline_accuracy(samples))
    avg_score = sum(scores) / len(scores) * 100
    print(f"Random baseline step localization accuracy: {avg_score:.2f}%")