# Source: AgentTrace-Demo/evaluation/error_analysis.py
# Uploaded by ayaanO7 using huggingface_hub (commit 4d69237, verified).
import os
import json
import sys
# Add project root to path
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from config import CONFIG
from detection.pipeline import DetectionPipeline
from evaluation.benchmark import BenchmarkRunner
# LaTeX special characters and their escaped forms. Applied with a single
# str.translate() pass so a backslash introduced by one escape is never
# escaped again by another rule.
_LATEX_ESCAPES = str.maketrans({
    "\\": r"\textbackslash{}",
    "&": r"\&",
    "%": r"\%",
    "$": r"\$",
    "#": r"\#",
    "_": r"\_",
    "{": r"\{",
    "}": r"\}",
    "~": r"\textasciitilde{}",
    "^": r"\textasciicircum{}",
})

# Fixed opening and closing parts of the generated LaTeX table.
_TABLE_HEADER = "\n".join([
    r"\begin{table*}[t]",
    r"\centering",
    r"\caption{Qualitative Error Analysis: Representative False Positives and False Negatives}",
    r"\begin{tabular}{lp{6cm}lp{3cm}p{5cm}}",
    r"\toprule",
    r"ID & Step Reasoning & Type & Dominant Signal & Error Cause (Rule-Based) \\",
    r"\midrule",
    "",
])
_TABLE_FOOTER = "\n".join([r"\bottomrule", r"\end{tabular}", r"\end{table*}"])

# Maximum number of raw characters of reasoning shown in a table cell.
_MAX_REASONING_CHARS = 100


def _latex_escape(text):
    """Return *text* on one line with every LaTeX special character escaped."""
    return " ".join(str(text).split()).translate(_LATEX_ESCAPES)


def _explain_false_positive(signals):
    """Return ``(dominant_signal, why_wrong)`` for a false positive."""
    sem_sim = signals.get("semantic_similarity")
    nli = signals.get("nli_score")
    tcm = signals.get("tool_claim_match")
    contra = signals.get("contradiction_with_prev")

    # Candidate (strength, label) pairs in priority order; on ties the
    # earliest candidate wins because the comparison below is strict.
    candidates = []
    if sem_sim is not None:
        candidates.append((1.0 - sem_sim, f"Low Semantic Similarity ({sem_sim:.2f})"))
    if nli is not None:
        candidates.append((nli, f"High NLI Contradiction ({nli:.2f})"))
    if tcm is False:
        candidates.append((1.0, "Tool Claim Mismatch"))
    if contra is True:
        candidates.append((1.0, "Contradiction with Previous Steps"))

    dominant_signal = "None"
    strongest = -1
    for strength, label in candidates:
        if strength > strongest:
            strongest, dominant_signal = strength, label

    if nli is not None and nli > 0.75:
        why_wrong = "NLI model flagged benign semantic mismatch as factual contradiction."
    elif sem_sim is not None and sem_sim < 0.65:
        why_wrong = "Semantic similarity checker flagged synonym-rich correct reasoning as semantic drift."
    elif tcm is False:
        why_wrong = "Tool validator incorrectly identified formatting differences as a claim mismatch."
    else:
        why_wrong = "Aggressive signal fusion threshold triggered a false alarm."
    return dominant_signal, why_wrong


def _explain_false_negative(signals):
    """Return ``(dominant_signal, why_wrong)`` for a false negative."""
    sem_sim = signals.get("semantic_similarity")
    nli = signals.get("nli_score")

    dominant_signal = "None"
    if sem_sim is not None and sem_sim > 0.8:
        dominant_signal = f"High Semantic Similarity ({sem_sim:.2f})"
    elif nli is not None and nli < 0.3:
        dominant_signal = f"Low NLI Contradiction ({nli:.2f})"
    why_wrong = (
        "The hallucination was linguistically subtle or used matching "
        "terminology, bypassing the SLM ensemble."
    )
    return dominant_signal, why_wrong


def _collect_errors(pipeline, trajectories):
    """Run *pipeline* over every step and return one record per misclassification."""
    errors = []
    total_trajs = len(trajectories)
    for i, traj in enumerate(trajectories):
        if (i + 1) % 10 == 0 or i == 0:
            print(f"Processing trajectory {i+1}/{total_trajs}...", flush=True)
        # History is cleared for each trajectory before its steps are scored.
        pipeline.reset_history()
        traj_id = traj.get("trajectory_id", f"traj_{i+1:03d}")

        for step in traj.get("steps", []):
            res = pipeline.detect(step)
            # Coerce to bool so a missing/None label is treated as "clean"
            # instead of being reported as a spurious mismatch.
            pred_detected = bool(res.get("hallucination_detected", False))
            gt_detected = bool(step.get("ground_truth_label", False))
            if pred_detected == gt_detected:
                continue

            # A None confidence would crash abs() below. float() also turns
            # non-builtin numeric scalars into something json can write
            # (assumption: the detector may return such values).
            raw_conf = res.get("confidence")
            conf = 0.0 if raw_conf is None else float(raw_conf)
            signals = res.get("detection_signals") or {}

            if pred_detected:
                error_type = "FP"
                dominant_signal, why_wrong = _explain_false_positive(signals)
            else:
                error_type = "FN"
                dominant_signal, why_wrong = _explain_false_negative(signals)

            errors.append({
                "trajectory_id": traj_id,
                "step": step.get("step", 0),
                "error_type": error_type,
                "action": step.get("action", ""),
                # None reasoning becomes "" so later slicing/escaping is safe.
                "agent_reasoning": step.get("agent_reasoning") or "",
                "tool_output": step.get("tool_output", ""),
                "ground_truth_label": gt_detected,
                "confidence": conf,
                "error_magnitude": abs(conf - (1 if gt_detected else 0)),
                "dominant_signal": dominant_signal,
                "why_wrong": why_wrong,
            })
    return errors


def _print_top_errors(errors, limit=10):
    """Print the first *limit* records of the (already sorted) error list."""
    print("\n--- Top 10 Highest-Confidence Wrong Predictions ---")
    for idx, err in enumerate(errors[:limit]):
        print(f"{idx+1}. Traj: {err['trajectory_id']}, Step: {err['step']}, Type: {err['error_type']}, "
              f"Conf: {err['confidence']:.4f}, Magnitude: {err['error_magnitude']:.4f}")
        print(f"   Reasoning: {str(err['agent_reasoning'])[:120]}...")
        print(f"   Dominant Signal: {err['dominant_signal']}")
        print(f"   Why Wrong: {err['why_wrong']}")
        print()


def _build_latex_table(cases):
    """Render *cases* as a LaTeX ``table*`` with every cell fully escaped."""
    rows = []
    for err in cases:
        reasoning = " ".join(str(err["agent_reasoning"]).split())
        # Truncate BEFORE escaping so an escape sequence is never cut in half.
        if len(reasoning) > _MAX_REASONING_CHARS:
            reasoning = reasoning[:_MAX_REASONING_CHARS - 3] + "..."
        cells = [
            # The trajectory id itself may contain underscores; escape it too.
            _latex_escape(f"{err['trajectory_id']}_{err['step']}"),
            _latex_escape(reasoning),
            err["error_type"],
            _latex_escape(err["dominant_signal"]),
            _latex_escape(err["why_wrong"]),
        ]
        rows.append(" & ".join(cells) + " \\\\\n")
    return _TABLE_HEADER + "".join(rows) + _TABLE_FOOTER


def analyze_errors(pipeline=None, trajectories=None, output_root=None):
    """Run a qualitative error analysis and write LaTeX and JSON reports.

    Every step of every trajectory is passed to the detector; steps whose
    prediction disagrees with the ground-truth label are recorded as false
    positives or false negatives, annotated with a rule-based dominant
    signal and explanation, and sorted by error magnitude (largest first).

    Args:
        pipeline: Object exposing ``detect(step)`` and ``reset_history()``.
            Defaults to ``DetectionPipeline`` with layers 2 and 3 disabled
            (Layer 1 only, so no OpenRouter API key is needed).
        trajectories: Sequence of trajectory dicts. Defaults to whatever
            ``BenchmarkRunner.load_trajectories()`` loads.
        output_root: Directory under which ``paper/figures`` and
            ``evaluation/results`` are created. Defaults to
            ``CONFIG.paths.project_root``.

    Returns:
        None. Results are printed and written to
        ``paper/figures/error_analysis_table.tex`` and
        ``evaluation/results/error_analysis.json``.
    """
    print("Running Qualitative Error Analysis on Synthetic Trajectories...", flush=True)

    if pipeline is None:
        # Use Layer 1 only — no OpenRouter API key needed for error analysis
        pipeline = DetectionPipeline(enable_layer2=False, enable_layer3=False)
    if trajectories is None:
        runner = BenchmarkRunner(detector_fn=pipeline.detect)
        runner.load_trajectories()
        trajectories = runner.trajectories
    if not trajectories:
        print("Error: No trajectories loaded.")
        return

    errors = _collect_errors(pipeline, trajectories)

    # Sort errors by magnitude (highest confidence wrong predictions first)
    errors.sort(key=lambda rec: rec["error_magnitude"], reverse=True)
    _print_top_errors(errors)

    # Select up to 6 representative cases (3 FPs, 3 FNs)
    fps = [e for e in errors if e["error_type"] == "FP"]
    fns = [e for e in errors if e["error_type"] == "FN"]
    tex = _build_latex_table(fps[:3] + fns[:3])

    root = CONFIG.paths.project_root if output_root is None else output_root

    paper_dir = os.path.join(root, "paper", "figures")
    os.makedirs(paper_dir, exist_ok=True)
    tex_path = os.path.join(paper_dir, "error_analysis_table.tex")
    with open(tex_path, "w", encoding="utf-8") as f:
        f.write(tex)
    print(f"LaTeX error table saved to: {tex_path}")

    out_dir = os.path.join(root, "evaluation", "results")
    os.makedirs(out_dir, exist_ok=True)
    json_path = os.path.join(out_dir, "error_analysis.json")
    with open(json_path, "w", encoding="utf-8") as f:
        json.dump(errors, f, indent=2)
    print(f"Error analysis JSON saved to: {json_path}")
if __name__ == "__main__":
    # Script entry point: run the analysis only when this file is executed
    # directly, not when it is imported as a module.
    analyze_errors()