Spaces:

talha1234567
/

agentsight-api

Running

Minato Namikaze

Deploy to Hugging Face Spaces

2aed081 4 days ago

2.39 kB

	import json
	import os
	import random

	def load_data(raw_dir="data/raw/"):
	    """Load and normalise every ``*.json`` sample directly inside *raw_dir*.

	    Each matching file is parsed with ``json.load`` (the result is assumed to
	    be a JSON object, since it is accessed with ``.get``) and then adjusted:

	    * a string ``is_hallucination`` becomes a bool that is true only when
	      the lower-cased text equals ``"true"``;
	    * a non-null ``hallucination_step`` is converted with ``int()``;
	    * ``history`` is exposed under ``trajectory`` when the sample has no
	      ``trajectory`` key of its own.

	    Args:
	        raw_dir: Directory to scan; sub-directories are not descended into.

	    Returns:
	        A list of the parsed sample dicts, ordered by file name.

	    Raises:
	        OSError: If *raw_dir* cannot be listed or a file cannot be opened.
	        json.JSONDecodeError: If a ``.json`` file does not contain valid JSON.
	        ValueError, TypeError: If ``hallucination_step`` cannot be converted
	            to ``int``.
	    """
	    all_samples = []
	    # os.listdir() yields entries in arbitrary order; sorting keeps the sample
	    # order (and therefore any seeded random evaluation over it) stable
	    # across machines and filesystems.
	    for file_name in sorted(os.listdir(raw_dir)):
	        if not file_name.endswith(".json"):
	            continue

	        with open(os.path.join(raw_dir, file_name), 'r', encoding='utf-8') as fh:
	            sample = json.load(fh)

	        # Fix types since some fields are strings in the raw JSON.
	        if isinstance(sample.get("is_hallucination"), str):
	            sample["is_hallucination"] = sample["is_hallucination"].lower() == "true"

	        if sample.get("hallucination_step") is not None:
	            sample["hallucination_step"] = int(sample["hallucination_step"])

	        # The raw data stores the steps under 'history'; expose them under
	        # 'trajectory' too, without overwriting an existing 'trajectory'.
	        if "history" in sample and "trajectory" not in sample:
	            sample["trajectory"] = sample["history"]

	        all_samples.append(sample)
	    return all_samples

	def random_baseline_accuracy(samples):
	    """Score a uniform-random guesser on hallucination step localisation.

	    Only samples with a truthy ``is_hallucination`` are considered. For each
	    one with a non-empty ``trajectory``, a step is drawn uniformly from
	    ``1..len(trajectory)`` (both ends inclusive) and compared with the
	    sample's ``hallucination_step``.

	    Args:
	        samples: Iterable of sample dicts.

	    Returns:
	        The fraction of hallucinated samples whose step was guessed exactly,
	        or ``0.0`` when there are no hallucinated samples. Samples with an
	        empty or missing trajectory remain in the denominator and so count
	        as misses.
	    """
	    positives = [item for item in samples if item.get("is_hallucination")]
	    if len(positives) == 0:
	        return 0.0

	    hits = 0
	    for item in positives:
	        length = len(item.get("trajectory", []))
	        if length > 0:
	            # One draw per non-empty trajectory, in sample order.
	            guess = random.randint(1, length)
	            hits += int(guess == item.get("hallucination_step"))

	    return hits / len(positives)

	if __name__ == "__main__":
	    # Seed the RNG so repeated runs draw the same random sequence.
	    random.seed(42)

	    # The raw data directory is resolved two levels above this script's
	    # own directory, independent of the current working directory.
	    here = os.path.dirname(os.path.abspath(__file__))
	    repo_root = os.path.join(here, os.pardir, os.pardir)
	    raw_path = os.path.join(repo_root, "data", "raw")

	    samples = load_data(raw_path)
	    print(f"Total samples loaded: {len(samples)}")

	    # Report how many samples are flagged as hallucinated and how many are not.
	    n_hallucinated = sum(1 for s in samples if s.get("is_hallucination"))
	    n_clean = sum(1 for s in samples if not s.get("is_hallucination"))
	    print(f"Hallucinated: {n_hallucinated}")
	    print(f"Clean: {n_clean}")

	    # Average the random baseline over 1000 independent runs and report it
	    # as a percentage.
	    scores = []
	    for _ in range(1000):
	        scores.append(random_baseline_accuracy(samples))
	    avg_score = sum(scores) / len(scores) * 100
	    print(f"Random baseline step localization accuracy: {avg_score:.2f}%")