"""Random baseline for hallucination step localization.

Loads raw JSON samples, normalizes a few loosely-typed fields, and estimates
how often a uniformly random guess of the step index matches the annotated
``hallucination_step`` of hallucinated samples.
"""

import json
import os
import random

# Number of independent random-guess passes averaged by the script entry point.
N_RUNS = 1000


def load_data(raw_dir: str = "data/raw/") -> list:
    """Load every ``*.json`` file in *raw_dir* and normalize its fields.

    Normalizations applied to each loaded sample:

    * ``is_hallucination`` given as a string becomes a bool (``"true"``,
      case-insensitive, is True; any other string is False).
    * ``hallucination_step``, when present and not None, is converted with
      ``int()``.
    * ``history`` is copied to ``trajectory`` when ``trajectory`` is absent.

    Args:
        raw_dir: Directory containing the raw JSON sample files.

    Returns:
        The list of loaded samples, ordered by file name.

    Raises:
        OSError: If *raw_dir* or one of its files cannot be read.
        json.JSONDecodeError: If a ``.json`` file is not valid JSON.
        ValueError: If ``hallucination_step`` cannot be converted to int.
    """
    all_samples = []
    # os.listdir() returns entries in arbitrary order; sort so that the sample
    # order (and therefore any seeded random evaluation) is reproducible.
    for file_name in sorted(os.listdir(raw_dir)):
        if not file_name.endswith(".json"):
            continue

        file_path = os.path.join(raw_dir, file_name)
        with open(file_path, "r", encoding="utf-8") as fh:
            sample = json.load(fh)

        # Fix types since some fields are strings in the raw JSON.
        flag = sample.get("is_hallucination")
        if isinstance(flag, str):
            sample["is_hallucination"] = flag.lower() == "true"
        if sample.get("hallucination_step") is not None:
            sample["hallucination_step"] = int(sample["hallucination_step"])

        # The implementation plan uses 'trajectory', but raw data has 'history'.
        if "history" in sample and "trajectory" not in sample:
            sample["trajectory"] = sample["history"]

        all_samples.append(sample)
    return all_samples


def random_baseline_accuracy(samples) -> float:
    """Return the accuracy of one random step-guessing pass.

    Only samples with a truthy ``is_hallucination`` are scored. For each one a
    step is drawn uniformly from ``1..len(trajectory)`` and compared with the
    sample's ``hallucination_step``.

    Args:
        samples: Iterable of sample dicts as produced by :func:`load_data`.

    Returns:
        Fraction of hallucinated samples whose step was guessed correctly, or
        ``0.0`` when there are no hallucinated samples. Samples with an empty
        trajectory can never be guessed but still count in the denominator.
    """
    hallucinated_samples = [s for s in samples if s.get("is_hallucination")]
    if not hallucinated_samples:
        return 0.0

    correct = 0
    for sample in hallucinated_samples:
        n_steps = len(sample.get("trajectory", []))
        if n_steps == 0:
            # Nothing to guess from; counted as a miss.
            continue
        # NOTE(review): the guess is 1-based; this assumes hallucination_step
        # is 1-based as well -- confirm against the dataset definition.
        predicted_step = random.randint(1, n_steps)
        if predicted_step == sample.get("hallucination_step"):
            correct += 1
    return correct / len(hallucinated_samples)


def main() -> None:
    """Load the raw data, print dataset counts and the averaged baseline."""
    # Ensure reproducibility.
    random.seed(42)

    # Load from the correct path relative to the root folder.
    script_dir = os.path.dirname(os.path.abspath(__file__))
    project_root = os.path.join(script_dir, "..", "..")
    data_dir = os.path.join(project_root, "data", "raw")

    samples = load_data(data_dir)
    print(f"Total samples loaded: {len(samples)}")

    hallucinated = [s for s in samples if s.get("is_hallucination")]
    clean = [s for s in samples if not s.get("is_hallucination")]
    print(f"Hallucinated: {len(hallucinated)}")
    print(f"Clean: {len(clean)}")

    # Run N_RUNS times and average.
    scores = [random_baseline_accuracy(samples) for _ in range(N_RUNS)]
    avg_score = sum(scores) / len(scores) * 100
    print(f"Random baseline step localization accuracy: {avg_score:.2f}%")


if __name__ == "__main__":
    main()