devrajsinh2012 committed
Commit 29809c8 · 1 Parent(s): 9239751

feat: harden evaluation workflows and docs

README.md CHANGED
@@ -140,6 +140,40 @@ npm start
 
 ---
 
+## 📈 Evaluation Workflows
+
+The scripts in `backend/evaluation` support baseline comparison, guardrail checks, benchmark runs, and ablation studies.
+
+Run from the project root:
+
+```bash
+cd backend
+
+# Baseline comparison: MEXAR vs CRAG vs RAPTOR
+python evaluation/baseline_runner.py
+
+# Backbone comparison (restores the original backbone after completion)
+python evaluation/backbone_comparison.py
+
+# Guardrail boundary query analysis
+python evaluation/guardrail_analysis.py
+
+# Benchmark dataset run (all rows by default) + save report
+python evaluation/benchmark_runner.py --dataset-path ../test_data/medqa_sample.json --agent-name medical_agent --output evaluation_outputs/medqa_report.json
+
+# Quick benchmark smoke test
+python evaluation/benchmark_runner.py --dataset-path ../test_data/medqa_sample.json --agent-name medical_agent --max-samples 25
+
+# McNemar significance helper
+python evaluation/statistical_tests.py
+```
+
+Notes:
+- Faithfulness values are read from `explainability.confidence_breakdown.faithfulness` when available.
+- Benchmark reports include per-query status and aggregate summary metrics.
+
+---
+
 ## 📁 Project Structure
 
 ```
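The README notes above assume a particular response payload shape. For reference, a minimal sketch: only `explainability.confidence_breakdown.faithfulness` and the top-level `confidence` are documented in this commit; the surrounding keys are illustrative.

```python
# Minimal sketch of a reasoning response, assuming the field layout the
# README notes describe; keys beyond the documented ones are illustrative.
sample_response = {
    "answer": "Common cold symptoms include a runny nose and sore throat.",
    "confidence": 0.82,
    "explainability": {
        "confidence_breakdown": {
            "faithfulness": 0.91,  # preferred numeric source
        },
        "faithfulness": "91%",  # legacy percent-string form, also parsed
    },
}
```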
backend/evaluation/ablation_chunk_size.py CHANGED
@@ -7,9 +7,11 @@ sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
 
 from modules.knowledge_compiler import create_knowledge_compiler
 from modules.reasoning_engine import create_reasoning_engine
+from evaluation.metrics import MetricsRunner
 
 def run_chunk_ablation(agent_name: str, parsed_data: list, system_prompt: str, prompt_analysis: dict, test_queries: list):
     sizes = [64, 128, 256, 512, 1024]
+    metrics = MetricsRunner()
 
     for size in sizes:
         print(f"\n=====================")
@@ -28,8 +30,12 @@ def run_chunk_ablation(agent_name: str, parsed_data: list, system_prompt: str, prompt_analysis: dict, test_queries: list):
             engine = create_reasoning_engine()
             for q in test_queries:
                 res = engine.reason(agent_name, q)
+                faithfulness = metrics.extract_faithfulness(res)
                 print(f"Q: {q}")
-                print(f"Faithfulness: {res['explainability']['faithfulness']}")
+                if faithfulness is None:
+                    print("Faithfulness: N/A")
+                else:
+                    print(f"Faithfulness: {faithfulness:.3f}")
         except Exception as e:
             print(f"Failed ablation step for size {size}: {e}")
 
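For orientation, a hypothetical driver for this ablation; the `parsed_data`, `system_prompt`, and `prompt_analysis` values below are placeholders, since the real pipeline builds them upstream.

```python
# Hypothetical invocation of run_chunk_ablation; all argument values are
# placeholders for whatever the project's ingestion pipeline produces.
from evaluation.ablation_chunk_size import run_chunk_ablation

run_chunk_ablation(
    agent_name="medical_agent",
    parsed_data=[{"text": "The common cold is a self-limiting viral infection."}],
    system_prompt="You are a cautious medical assistant.",
    prompt_analysis={"domain": "medical"},
    test_queries=["What are the symptoms of a common cold?"],
)
```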
backend/evaluation/backbone_comparison.py CHANGED
@@ -7,24 +7,35 @@ sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
 
 from core.config import settings
 from modules.reasoning_engine import create_reasoning_engine
+from evaluation.metrics import MetricsRunner
 
 def run_comparison(agent_name: str, queries: list):
     backbones = ["llama3", "mixtral", "gemma"]
-
-    for bb in backbones:
-        settings.LLM_BACKBONE = bb
-        print(f"\n--- Testing Backbone: {bb} ---")
-        try:
-            # Must recreate engine so GroqClient picks up config
-            engine = create_reasoning_engine()
-
-            for q in queries:
-                res = engine.reason(agent_name, q)
-                print(f"Q: {q}")
-                print(f"A ({bb}): {res['answer'][:100]}...")
-                print(f"Faithfulness: {res['explainability']['faithfulness']}")
-        except Exception as e:
-            print(f"Failed to run with backbone {bb}: {e}")
+    metrics = MetricsRunner()
+    original_backbone = getattr(settings, "LLM_BACKBONE", None)
+
+    try:
+        for bb in backbones:
+            settings.LLM_BACKBONE = bb
+            print(f"\n--- Testing Backbone: {bb} ---")
+            try:
+                # Must recreate engine so GroqClient picks up config
+                engine = create_reasoning_engine()
+
+                for q in queries:
+                    res = engine.reason(agent_name, q)
+                    faithfulness = metrics.extract_faithfulness(res)
+                    print(f"Q: {q}")
+                    print(f"A ({bb}): {res['answer'][:100]}...")
+                    if faithfulness is None:
+                        print("Faithfulness: N/A")
+                    else:
+                        print(f"Faithfulness: {faithfulness:.3f}")
+            except Exception as e:
+                print(f"Failed to run with backbone {bb}: {e}")
+    finally:
+        settings.LLM_BACKBONE = original_backbone
+        print(f"\nRestored LLM_BACKBONE to: {original_backbone}")
 
 if __name__ == "__main__":
     test_queries = ["What are the symptoms of a common cold?"]
backend/evaluation/baseline_runner.py CHANGED
@@ -3,43 +3,70 @@ Runs CRAG and RAPTOR baselines against a set of test queries.
 """
 import sys
 import os
+from typing import Dict, List, Optional
+
 sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
 
 from modules.reasoning_engine import create_reasoning_engine
 from evaluation.metrics import MetricsRunner
 
-def run_baselines(agent_name: str, queries: list):
+
+def _append_score(results: Dict[str, List[float]], baseline: str, score: Optional[float]) -> None:
+    if score is None:
+        print(f"{baseline}: Faithfulness score unavailable for this query.")
+        return
+    results[baseline].append(score)
+
+
+def run_baselines(agent_name: str, queries: List[str]):
     engine = create_reasoning_engine()
     metrics = MetricsRunner()
-
-    results = {"CRAG": [], "RAPTOR": [], "MEXAR": []}
-
+
+    results: Dict[str, List[float]] = {"CRAG": [], "RAPTOR": [], "MEXAR": []}
+
     for q in queries:
         print(f"\nProcessing query: {q}")
-
+
         try:
             # Original MEXAR
             res_mexar = engine.reason(agent_name, q)
-            results["MEXAR"].append(float(res_mexar["explainability"]["faithfulness"].strip('%'))/100)
-
+            mexar_score = metrics.extract_faithfulness(res_mexar)
+            _append_score(results, "MEXAR", mexar_score)
+
             # CRAG
             res_crag = engine.reason_crag_baseline(agent_name, q)
-            results["CRAG"].append(res_crag["confidence"])  # The raw score
-
+            crag_score = metrics.extract_faithfulness(res_crag)
+            if crag_score is None:
+                crag_score = metrics.extract_confidence(res_crag)
+            _append_score(results, "CRAG", crag_score)
+
             # RAPTOR
             res_raptor = engine.reason_raptor_baseline(agent_name, q)
-            results["RAPTOR"].append(res_raptor["confidence"])
+            raptor_score = metrics.extract_faithfulness(res_raptor)
+            if raptor_score is None:
+                raptor_score = metrics.extract_confidence(res_raptor)
+            _append_score(results, "RAPTOR", raptor_score)
+
+            print(
+                "Scores -> "
+                f"MEXAR: {mexar_score if mexar_score is not None else 'N/A'}, "
+                f"CRAG: {crag_score if crag_score is not None else 'N/A'}, "
+                f"RAPTOR: {raptor_score if raptor_score is not None else 'N/A'}"
+            )
         except Exception as e:
             print(f"Error evaluating query '{q}': {e}")
-
+
     print("\n--- Baseline Comparison (Faithfulness) ---")
-    for b_name in results:
-        if results[b_name]:
-            avg = sum(results[b_name]) / len(results[b_name])
-            print(f"{b_name}: {avg:.4f}")
+    for b_name, scores in results.items():
+        if scores:
+            avg = sum(scores) / len(scores)
+            print(f"{b_name}: {avg:.4f} (n={len(scores)})")
         else:
             print(f"{b_name}: No results")
 
+    return results
+
+
 if __name__ == "__main__":
     # Example usage
     test_queries = [
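The fallback order above (faithfulness when present, otherwise the raw confidence) can be exercised on its own. A minimal sketch using the `MetricsRunner` helpers this commit adds, assuming the underlying scorer classes can be constructed in your environment:

```python
from evaluation.metrics import MetricsRunner

metrics = MetricsRunner()

# Baseline responses may lack an explainability block entirely, in which
# case extraction falls back to the top-level confidence field.
crag_like = {"answer": "...", "confidence": 0.7}
score = metrics.extract_faithfulness(crag_like)
if score is None:
    score = metrics.extract_confidence(crag_like)
print(score)  # 0.7
```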
backend/evaluation/benchmark_runner.py CHANGED
@@ -4,32 +4,167 @@ Runs evaluation on public benchmarks like MedQA, LegalBench.
 import sys
 import os
 import json
+import argparse
+from datetime import datetime
+from typing import Any, Dict, List, Optional
+
 sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
 
 from modules.reasoning_engine import create_reasoning_engine
+from evaluation.metrics import MetricsRunner
 
-def run_benchmark(dataset_path: str, agent_name: str):
+
+def _extract_query(item: Dict[str, Any]) -> Optional[str]:
+    query = item.get("question") or item.get("query")
+    if not isinstance(query, str):
+        return None
+    query = query.strip()
+    return query if query else None
+
+
+def _summarize_scores(scores: List[float]) -> Optional[float]:
+    if not scores:
+        return None
+    return round(sum(scores) / len(scores), 4)
+
+
+def run_benchmark(
+    dataset_path: str,
+    agent_name: str,
+    max_samples: Optional[int] = None,
+    output_path: Optional[str] = None,
+) -> Dict[str, Any]:
     engine = create_reasoning_engine()
-
+    metrics = MetricsRunner()
+
     if not os.path.exists(dataset_path):
-        print(f"Dataset not found: {dataset_path}")
-        return
-
-    with open(dataset_path, "r") as f:
+        raise FileNotFoundError(f"Dataset not found: {dataset_path}")
+
+    with open(dataset_path, "r", encoding="utf-8") as f:
         data = json.load(f)
-
-    for item in data[:10]:  # Run first 10 for demo
-        query = item.get("question") or item.get("query")
+
+    if not isinstance(data, list):
+        raise ValueError("Benchmark dataset must be a JSON array of records")
+
+    items = data if not max_samples else data[:max_samples]
+
+    records: List[Dict[str, Any]] = []
+    faithfulness_scores: List[float] = []
+    succeeded = 0
+    failed = 0
+    skipped = 0
+
+    for idx, item in enumerate(items, start=1):
+        query = _extract_query(item)
         if not query:
+            skipped += 1
             continue
-
-        print(f"\nQuery: {query}")
+
+        print(f"\n[{idx}/{len(items)}] Query: {query}")
+        row: Dict[str, Any] = {
+            "index": idx,
+            "query": query,
+        }
+
         try:
             result = engine.reason(agent_name, query)
-            print(f"Answer: {result['answer'][:100]}...")
-            print(f"Faithfulness: {result['explainability']['faithfulness']}")
+            faithfulness = metrics.extract_faithfulness(result)
+            confidence = metrics.extract_confidence(result)
+            answer = result.get("answer", "")
+
+            if isinstance(answer, str) and len(answer) > 120:
+                answer_preview = f"{answer[:120]}..."
+            else:
+                answer_preview = answer
+
+            row.update({
+                "status": "ok",
+                "in_domain": result.get("in_domain"),
+                "confidence": confidence,
+                "faithfulness": faithfulness,
+                "answer_preview": answer_preview,
+            })
+            records.append(row)
+
+            if faithfulness is not None:
+                faithfulness_scores.append(faithfulness)
+            succeeded += 1
+
+            print(f"Answer: {answer_preview}")
+            if faithfulness is None:
+                print("Faithfulness: N/A")
+            else:
+                print(f"Faithfulness: {faithfulness:.3f}")
         except Exception as e:
+            row.update({
+                "status": "error",
+                "error": str(e),
+            })
+            records.append(row)
+            failed += 1
             print(f"Failed to process query: {e}")
 
+    summary: Dict[str, Any] = {
+        "dataset_path": dataset_path,
+        "agent_name": agent_name,
+        "total_rows": len(data),
+        "attempted_rows": len(items),
+        "succeeded": succeeded,
+        "failed": failed,
+        "skipped": skipped,
+        "avg_faithfulness": _summarize_scores(faithfulness_scores),
+        "generated_at_utc": datetime.utcnow().isoformat() + "Z",
+    }
+
+    print("\n--- Benchmark Summary ---")
+    print(f"Attempted: {summary['attempted_rows']}")
+    print(f"Succeeded: {summary['succeeded']}")
+    print(f"Failed: {summary['failed']}")
+    print(f"Skipped: {summary['skipped']}")
+    print(f"Avg faithfulness: {summary['avg_faithfulness']}")
+
+    if output_path:
+        output_dir = os.path.dirname(output_path)
+        if output_dir:
+            os.makedirs(output_dir, exist_ok=True)
+        payload = {
+            "summary": summary,
+            "results": records,
+        }
+        with open(output_path, "w", encoding="utf-8") as f:
+            json.dump(payload, f, indent=2)
+        print(f"Saved report to: {output_path}")
+
+    return {
+        "summary": summary,
+        "results": records,
+    }
+
+
+def _default_dataset_path() -> str:
+    return os.path.join(
+        os.path.dirname(os.path.dirname(os.path.dirname(__file__))),
+        "test_data",
+        "medqa_sample.json",
+    )
+
+
+def parse_args() -> argparse.Namespace:
+    parser = argparse.ArgumentParser(description="Run benchmark dataset evaluation")
+    parser.add_argument("--dataset-path", default=_default_dataset_path(), help="Path to benchmark JSON file")
+    parser.add_argument("--agent-name", default="medical_agent", help="Compiled agent name")
+    parser.add_argument(
+        "--max-samples",
+        type=int,
+        default=0,
+        help="Limit to first N records (0 means all)",
+    )
+    parser.add_argument("--output", default="", help="Optional output path for JSON report")
+    return parser.parse_args()
+
+
 if __name__ == "__main__":
-    run_benchmark(os.path.join(os.path.dirname(os.path.dirname(os.path.dirname(__file__))), "test_data", "medqa_sample.json"), "medical_agent")
+    args = parse_args()
+    max_samples = args.max_samples if args.max_samples > 0 else None
+    output_path = args.output if args.output else None
+    run_benchmark(args.dataset_path, args.agent_name, max_samples=max_samples, output_path=output_path)
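With these changes, a saved report would look roughly like the sketch below. The keys mirror the `summary` and per-row `records` built above; the values are invented and each result row is abridged.

```python
# Illustrative shape of the JSON report written via --output; numbers are
# made up, and each result row is abridged to a few of its keys.
example_report = {
    "summary": {
        "dataset_path": "../test_data/medqa_sample.json",
        "agent_name": "medical_agent",
        "total_rows": 100,
        "attempted_rows": 25,
        "succeeded": 24,
        "failed": 1,
        "skipped": 0,
        "avg_faithfulness": 0.8731,
        "generated_at_utc": "2025-01-01T12:00:00Z",
    },
    "results": [
        {"index": 1, "query": "...", "status": "ok", "faithfulness": 0.91},
        {"index": 2, "query": "...", "status": "error", "error": "..."},
    ],
}
```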
backend/evaluation/metrics.py CHANGED
@@ -4,17 +4,20 @@ Calculates common metrics across different baselines and experiments.
 """
 import sys
 import os
+from typing import Any, Dict, Optional
+
 sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
 
 from utils.faithfulness import FaithfulnessScorer, BartNLIScorer, FActScoreCompat
 
+
 class MetricsRunner:
     def __init__(self):
         self.faith_scorer = FaithfulnessScorer()
         self.bart_nli = BartNLIScorer()
         self.factscore = FActScoreCompat()
 
-    def evaluate_all(self, answer: str, context: str):
+    def evaluate_all(self, answer: str, context: str) -> Dict[str, float]:
         faith_res = self.faith_scorer.score(answer, context)
         bart_res = self.bart_nli.score(answer, context)
         fact_res = self.factscore.score(answer, context)
@@ -23,3 +26,62 @@ class MetricsRunner:
             "bart_nli": bart_res.score,
             "factscore": fact_res.score
         }
+
+    def extract_faithfulness(self, response: Dict[str, Any]) -> Optional[float]:
+        """Extract faithfulness score from response payloads across formats."""
+        if not isinstance(response, dict):
+            return None
+
+        explainability = response.get("explainability") or {}
+        confidence_breakdown = explainability.get("confidence_breakdown") or {}
+
+        for candidate in (
+            confidence_breakdown.get("faithfulness"),
+            explainability.get("faithfulness"),
+        ):
+            parsed = self._parse_numeric(candidate)
+            if parsed is not None:
+                return self._clamp(parsed)
+
+        return None
+
+    def extract_confidence(self, response: Dict[str, Any]) -> Optional[float]:
+        """Extract numeric confidence score if available."""
+        if not isinstance(response, dict):
+            return None
+
+        parsed = self._parse_numeric(response.get("confidence"))
+        if parsed is None:
+            return None
+        return self._clamp(parsed)
+
+    @staticmethod
+    def _clamp(value: float) -> float:
+        return max(0.0, min(1.0, value))
+
+    @staticmethod
+    def _parse_numeric(value: Any) -> Optional[float]:
+        if value is None:
+            return None
+
+        if isinstance(value, (int, float)):
+            return float(value)
+
+        if isinstance(value, str):
+            cleaned = value.strip()
+            if not cleaned:
+                return None
+
+            if cleaned.endswith("%"):
+                cleaned = cleaned[:-1].strip()
+                try:
+                    return float(cleaned) / 100.0
+                except ValueError:
+                    return None
+
+            try:
+                return float(cleaned)
+            except ValueError:
+                return None
+
+        return None
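The parser added here normalizes floats, ints, plain numeric strings, and percent strings to a clamped 0..1 value. A quick behavioral sketch (the assertions follow directly from the code above; constructing `MetricsRunner` assumes the scorer classes load in your environment):

```python
from evaluation.metrics import MetricsRunner

runner = MetricsRunner()

# Percent strings are divided by 100, numeric strings parsed directly,
# and results are clamped into [0.0, 1.0]; unparseable values yield None.
assert runner.extract_faithfulness(
    {"explainability": {"confidence_breakdown": {"faithfulness": "85%"}}}
) == 0.85
assert runner.extract_faithfulness({"explainability": {"faithfulness": 0.92}}) == 0.92
assert runner.extract_faithfulness({"explainability": {}}) is None
assert runner.extract_confidence({"confidence": "1.4"}) == 1.0  # clamped
```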