"""
Runs CRAG and RAPTOR baselines against a set of test queries.
"""
import sys
import os
from typing import Dict, List, Optional
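# Make the project root importable when this script is run directly.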
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

from modules.reasoning_engine import create_reasoning_engine
from evaluation.metrics import MetricsRunner

def _append_score(results: Dict[str, List[float]], baseline: str, score: Optional[float]) -> None:
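    """Record a score for the given baseline, or note that it is unavailable."""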
    if score is None:
        print(f"{baseline}: Faithfulness score unavailable for this query.")
        return
    results[baseline].append(score)


def run_baselines(agent_name: str, queries: List[str]) -> Dict[str, List[float]]:
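    """Evaluate MEXAR plus the CRAG and RAPTOR baselines on each query and return per-system scores."""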
    engine = create_reasoning_engine()
    metrics = MetricsRunner()

    results: Dict[str, List[float]] = {"CRAG": [], "RAPTOR": [], "MEXAR": []}

    for q in queries:
        print(f"\nProcessing query: {q}")

        try:
            # Original MEXAR
            res_mexar = engine.reason(agent_name, q)
            mexar_score = metrics.extract_faithfulness(res_mexar)
            _append_score(results, "MEXAR", mexar_score)

            # CRAG
            res_crag = engine.reason_crag_baseline(agent_name, q)
            crag_score = metrics.extract_faithfulness(res_crag)
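            # Fall back to the confidence score when faithfulness is unavailable.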
            if crag_score is None:
                crag_score = metrics.extract_confidence(res_crag)
            _append_score(results, "CRAG", crag_score)

            # RAPTOR
            res_raptor = engine.reason_raptor_baseline(agent_name, q)
            raptor_score = metrics.extract_faithfulness(res_raptor)
            if raptor_score is None:
                raptor_score = metrics.extract_confidence(res_raptor)
            _append_score(results, "RAPTOR", raptor_score)

            print(
                "Scores -> "
                f"MEXAR: {mexar_score if mexar_score is not None else 'N/A'}, "
                f"CRAG: {crag_score if crag_score is not None else 'N/A'}, "
                f"RAPTOR: {raptor_score if raptor_score is not None else 'N/A'}"
            )
        except Exception as e:
            print(f"Error evaluating query '{q}': {e}")

    print("\n--- Baseline Comparison (Faithfulness) ---")
    for b_name, scores in results.items():
        if scores:
            avg = sum(scores) / len(scores)
            print(f"{b_name}: {avg:.4f} (n={len(scores)})")
        else:
            print(f"{b_name}: No results")

    return results

if __name__ == "__main__":
    # Example usage
    test_queries = [
        "What are the symptoms of a common cold?",
        "How do I bake a chocolate cake?"
    ]
    # Replace 'medical_agent' with the name of an agent already compiled in the DB
    run_baselines("medical_agent", test_queries)