""" Runs CRAG and RAPTOR baselines against a set of test queries. """ import sys import os from typing import Dict, List, Optional sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) from modules.reasoning_engine import create_reasoning_engine from evaluation.metrics import MetricsRunner def _append_score(results: Dict[str, List[float]], baseline: str, score: Optional[float]) -> None: if score is None: print(f"{baseline}: Faithfulness score unavailable for this query.") return results[baseline].append(score) def run_baselines(agent_name: str, queries: List[str]): engine = create_reasoning_engine() metrics = MetricsRunner() results: Dict[str, List[float]] = {"CRAG": [], "RAPTOR": [], "MEXAR": []} for q in queries: print(f"\nProcessing query: {q}") try: # Original MEXAR res_mexar = engine.reason(agent_name, q) mexar_score = metrics.extract_faithfulness(res_mexar) _append_score(results, "MEXAR", mexar_score) # CRAG res_crag = engine.reason_crag_baseline(agent_name, q) crag_score = metrics.extract_faithfulness(res_crag) if crag_score is None: crag_score = metrics.extract_confidence(res_crag) _append_score(results, "CRAG", crag_score) # RAPTOR res_raptor = engine.reason_raptor_baseline(agent_name, q) raptor_score = metrics.extract_faithfulness(res_raptor) if raptor_score is None: raptor_score = metrics.extract_confidence(res_raptor) _append_score(results, "RAPTOR", raptor_score) print( "Scores -> " f"MEXAR: {mexar_score if mexar_score is not None else 'N/A'}, " f"CRAG: {crag_score if crag_score is not None else 'N/A'}, " f"RAPTOR: {raptor_score if raptor_score is not None else 'N/A'}" ) except Exception as e: print(f"Error evaluating query '{q}': {e}") print("\n--- Baseline Comparison (Faithfulness) ---") for b_name, scores in results.items(): if scores: avg = sum(scores) / len(scores) print(f"{b_name}: {avg:.4f} (n={len(scores)})") else: print(f"{b_name}: No results") return results if __name__ == "__main__": # Example usage test_queries = [ "What are the symptoms of a common cold?", "How do I bake a chocolate cake?" ] # Replace 'medical_agent' with an actual compiled agent name in DB run_baselines("medical_agent", test_queries)