Strands-Agents SDK Evaluators

#8
evals/README.md CHANGED
@@ -11,7 +11,6 @@ evals/
11
  ├── task_functions.py # Agent wrapper functions
12
  ├── langfuse_reporter.py # Langfuse integration
13
  ├── dataset_loader.py # JSON to Case conversion
14
- ├── generate_experiment.py # SDK Experiment Generator
15
  ├── run_helpfulness.py # Helpfulness evaluation
16
  ├── run_trajectory.py # Trajectory evaluation
17
  └── run_full_suite.py # Full evaluation suite
@@ -36,12 +35,6 @@ python -m evals.run_trajectory
36
  python -m evals.run_full_suite
37
  ```
38
 
39
- ### Generate Synthetic Data
40
-
41
- ```bash
42
- # Generate new test cases using SDK ExperimentGenerator
43
- python -m evals.generate_experiment
44
- ```
45
  ## Features
46
 
47
  - **SDK-Aligned**: Uses `Experiment.run_evaluations()` framework
 
11
  ├── task_functions.py # Agent wrapper functions
12
  ├── langfuse_reporter.py # Langfuse integration
13
  ├── dataset_loader.py # JSON to Case conversion
 
14
  ├── run_helpfulness.py # Helpfulness evaluation
15
  ├── run_trajectory.py # Trajectory evaluation
16
  └── run_full_suite.py # Full evaluation suite
 
35
  python -m evals.run_full_suite
36
  ```
37
 
 
 
 
 
 
 
38
  ## Features
39
 
40
  - **SDK-Aligned**: Uses `Experiment.run_evaluations()` framework
evals/__pycache__/__init__.cpython-314.pyc ADDED
Binary file (457 Bytes). View file
 
evals/__pycache__/config.cpython-314.pyc ADDED
Binary file (1.8 kB). View file
 
evals/__pycache__/dataset_loader.cpython-314.pyc ADDED
Binary file (1.55 kB). View file
 
evals/__pycache__/langfuse_reporter.cpython-314.pyc ADDED
Binary file (6 kB). View file
 
evals/__pycache__/run_helpfulness.cpython-314.pyc ADDED
Binary file (2.56 kB). View file
 
evals/__pycache__/task_functions.cpython-314.pyc ADDED
Binary file (3.04 kB). View file
 
evals/config.py CHANGED
@@ -2,7 +2,7 @@
2
  Shared configuration for evaluators and models.
3
  """
4
  from strands.models.openai import OpenAIModel
5
- from strands_evals.evaluators import OutputEvaluator, TrajectoryEvaluator, FaithfulnessEvaluator
6
 
7
  # Configure OpenAI model for evaluators
8
  eval_model = OpenAIModel(model_id="gpt-4o")
@@ -23,7 +23,7 @@ helpfulness_evaluator = OutputEvaluator(
23
  model=eval_model
24
  )
25
 
26
- # Faithfulness Evaluator (Generic for now, to enable data generation)
27
  faithfulness_evaluator = OutputEvaluator(
28
  rubric="""
29
  Evaluate if the response is faithful to the retrieved context.
@@ -43,17 +43,13 @@ trajectory_evaluator = TrajectoryEvaluator(
43
  rubric="""
44
  Evaluate the tool usage trajectory:
45
  1. Correct tool selection - Were the right tools chosen?
46
- 2. Proper sequence - Logical order (Retrieve -> Analyze -> Explain)?
47
  3. Efficiency - No unnecessary tools?
48
 
49
  Score 1.0 if optimal tools used correctly.
50
  Score 0.5 if correct tools but suboptimal sequence.
51
- SCORE 0.0 if wrong tools or major inefficiencies.
52
  """,
53
  include_inputs=True,
54
  model=eval_model
55
  )
56
-
57
- # Key Points Evaluator (Custom)
58
- from evals.key_points_evaluator import KeyPointsEvaluator
59
- key_points_evaluator = KeyPointsEvaluator()
 
2
  Shared configuration for evaluators and models.
3
  """
4
  from strands.models.openai import OpenAIModel
5
+ from strands_evals.evaluators import OutputEvaluator, TrajectoryEvaluator
6
 
7
  # Configure OpenAI model for evaluators
8
  eval_model = OpenAIModel(model_id="gpt-4o")
 
23
  model=eval_model
24
  )
25
 
26
+ # Faithfulness Evaluator
27
  faithfulness_evaluator = OutputEvaluator(
28
  rubric="""
29
  Evaluate if the response is faithful to the retrieved context.
 
43
  rubric="""
44
  Evaluate the tool usage trajectory:
45
  1. Correct tool selection - Were the right tools chosen?
46
+ 2. Proper sequence - Logical order (Retrieve Analyze Explain)?
47
  3. Efficiency - No unnecessary tools?
48
 
49
  Score 1.0 if optimal tools used correctly.
50
  Score 0.5 if correct tools but suboptimal sequence.
51
+ Score 0.0 if wrong tools or major inefficiencies.
52
  """,
53
  include_inputs=True,
54
  model=eval_model
55
  )
 
 
 
 
evals/generate_experiment.py DELETED
@@ -1,86 +0,0 @@
1
- #!/usr/bin/env python
2
- """
3
- Generate synthetic test cases using Strands ExperimentGenerator.
4
-
5
- This script replaces the custom generate_data.py and uses the SDK
6
- to generate diverse, high-quality test cases for the fraud agent.
7
- """
8
- import os
9
- import json
10
- import asyncio
11
- from dotenv import load_dotenv
12
-
13
- load_dotenv()
14
-
15
- from typing import List, Dict
16
- from strands_evals.generators import ExperimentGenerator
17
- from strands_evals.evaluators import OutputEvaluator
18
- from evals.config import eval_model
19
-
20
- # Context description for the generator
21
- CONTEXT = """
22
- You are generating test cases for a Fraud Model Explainability Assistant for a financial services company.
23
- The assistant uses RAG and tools to explain fraud scores (0-1000), SHAP values, and compliance checks.
24
-
25
- Users are typically:
26
- 1. Fraud Analysts (investigating specific cases)
27
- 2. Data Scientists (monitoring model performance)
28
- 3. Compliance Officers (checking for Fair Lending bias)
29
- 4. Executives (asking for high-level summaries)
30
-
31
- Tools available:
32
- - get_application_summary(app_id): Returns score, risk level.
33
- - explain_fraud_score(app_id): Returns SHAP feature contributions.
34
- - compare_to_population(app_id): Returns stats vs approved/denied.
35
- - check_fair_lending_flags(app_id): Returns bias analysis.
36
- - get_identity_network(app_id): Returns linked applications.
37
- """
38
-
39
- async def generate():
40
- print("🚀 Starting Experiment Generation with SDK...")
41
-
42
- # Initialize generator with str input/output
43
- generator = ExperimentGenerator[str, str](
44
- input_type=str,
45
- output_type=str,
46
- model=eval_model
47
- )
48
-
49
- # Generate experiment
50
- print(" Generating cases (this may take a minute)...")
51
- experiment = await generator.from_context_async(
52
- context=CONTEXT,
53
- num_cases=10, # Generate 10 new cases
54
- evaluator=OutputEvaluator, # Pass class, let generator create rubric
55
- task_description="Explain fraud model decisions and risk factors.",
56
- num_topics=5 # Split across different topics (High Risk, Compliance, etc.)
57
- )
58
-
59
- print(f"✅ Generated {len(experiment.cases)} new test cases.")
60
-
61
- # Convert to our JSON format
62
- new_cases = []
63
- for i, case in enumerate(experiment.cases):
64
- # Metadata might be None
65
- metadata = case.metadata if case.metadata else {}
66
- new_case = {
67
- "id": f"synth_sdk_{i+1}",
68
- "question": case.input,
69
- "expected_intent": metadata.get("topic", "General"),
70
- "expected_answer_key_points": [case.expected_output] if case.expected_output else []
71
- }
72
- new_cases.append(new_case)
73
- print(f" - [{new_case['expected_intent']}] {new_case['question'][:60]}...")
74
-
75
- # Load existing cases to append (optional, or overwrite)
76
- output_path = "evaluation/dataset_sdk.json"
77
-
78
- # Saving to a new file to avoid overwriting the main dataset during this test
79
- with open(output_path, "w") as f:
80
- json.dump(new_cases, f, indent=2)
81
-
82
- print(f"\n💾 Saved {len(new_cases)} cases to {output_path}")
83
- print(" Review the file and merge into evaluation/dataset.json if desired.")
84
-
85
- if __name__ == "__main__":
86
- asyncio.run(generate())
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
evals/key_points_evaluator.py DELETED
@@ -1,102 +0,0 @@
1
- """
2
- Custom evaluator for checking if specific key points are present in the response.
3
- """
4
- from typing import Any, List
5
- from strands_evals.evaluators import Evaluator
6
- from strands_evals.types.evaluation import EvaluationData, EvaluationOutput
7
- from strands_evals.types.trace import EvaluationLevel
8
- from typing_extensions import TypeVar
9
-
10
- InputT = TypeVar("InputT")
11
- OutputT = TypeVar("OutputT")
12
-
13
- class KeyPointsEvaluator(Evaluator[InputT, OutputT]):
14
- """Evaluates output by checking for presence of expected key points (keywords/phrases)."""
15
-
16
- evaluation_level = EvaluationLevel.TRACE_LEVEL
17
-
18
- def __init__(self, version: str = "v1"):
19
- super().__init__()
20
- self.version = version
21
-
22
- def evaluate(self, evaluation_case: EvaluationData[InputT, OutputT]) -> List[EvaluationOutput]:
23
- """Synchronous evaluation."""
24
- return self._do_evaluation(evaluation_case)
25
-
26
- async def evaluate_async(self, evaluation_case: EvaluationData[InputT, OutputT]) -> List[EvaluationOutput]:
27
- """Asynchronous evaluation."""
28
- return self._do_evaluation(evaluation_case)
29
-
30
- def _do_evaluation(self, evaluation_case: EvaluationData[InputT, OutputT]) -> List[EvaluationOutput]:
31
- """
32
- Check if expected key points are present in the actual output.
33
- Expects 'expected_key_points' list in case metadata.
34
- """
35
- # Get actual output
36
- actual_output = str(evaluation_case.actual_output)
37
-
38
- # Get expectations from case metadata (which is attached to evaluation_case)
39
- # Note: The SDK passes the whole Case object or relevant parts.
40
- # However, EvaluationData typically has input/output.
41
- # Metadata is likely accessible if evaluation_case is constructed from a Case.
42
- # But SDK EvaluationData doesn't strictly carry metadata field in all versions.
43
- # We rely on how Experiment constructs it.
44
-
45
- # EXPERIMENTAL: The SDK's Experiment loop constructs EvaluationData.
46
- # If it doesn't pass metadata, we need to inspect the source 'case'.
47
- # But Evaluator.evaluate receives EvaluationData, not Case.
48
- # Wait, Strands SDK 1.22 might have metadata on EvaluationData?
49
- # Let's check the type definition if needed.
50
- # For now, assuming we can access it or we need a workaround.
51
-
52
- # Workaround: For this custom evaluator to work with Experiment,
53
- # the Experiment must pass metadata.
54
-
55
- # Actually, looking at the Experiment source (which we can't see right now but inferred),
56
- # it might be easier to pass expected_output as the key points string?
57
- # Dataset loader sets: expected_key_points in metadata.
58
-
59
- # Let's try to access metadata if it exists on EvaluationData,
60
- # Otherwise fall back to a safe default.
61
-
62
- key_points = []
63
- if hasattr(evaluation_case, 'metadata') and evaluation_case.metadata:
64
- key_points = evaluation_case.metadata.get("expected_key_points", [])
65
-
66
- # Calculate score
67
- if not key_points:
68
- return [EvaluationOutput(
69
- score=1.0,
70
- test_pass=True,
71
- reason="No key points defined for this case.",
72
- label="N/A"
73
- )]
74
-
75
- hits = 0
76
- misses = []
77
-
78
- for point in key_points:
79
- point_lower = point.lower()
80
- output_lower = actual_output.lower()
81
-
82
- if point_lower in output_lower:
83
- hits += 1
84
- # partial match check (heuristic from run_full_suite)
85
- elif any(word in output_lower for word in point_lower.split() if len(word) > 4):
86
- hits += 0.5
87
- misses.append(f"{point} (Partial)")
88
- else:
89
- misses.append(point)
90
-
91
- score = min(1.0, hits / len(key_points))
92
-
93
- reason = f"Matched {hits}/{len(key_points)} key points."
94
- if misses:
95
- reason += f" Missed: {', '.join(misses[:3])}..."
96
-
97
- return [EvaluationOutput(
98
- score=score,
99
- test_pass=score >= 0.7, # 70% threshold
100
- reason=reason,
101
- label=f"{int(score*100)}%"
102
- )]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
evals/langfuse_reporter.py CHANGED
@@ -58,14 +58,13 @@ class LangfuseReporter:
58
  self.lf_client = langfuse_client
59
  self.tracer = trace.get_tracer("evaluation_reporter")
60
 
61
- def log_experiment_results(self, reports, case_names: List[str], evaluator_names: List[str] = None) -> Dict[str, str]:
62
  """
63
  Log SDK experiment results to Langfuse with OpenTelemetry trace correlation.
64
 
65
  Args:
66
  reports: List of evaluation reports from Experiment.run_evaluations()
67
  case_names: List of case names to create trace IDs for
68
- evaluator_names: Optional list of evaluator names. If not provided, generic names will be used.
69
 
70
  Returns:
71
  Dict mapping case names to trace IDs
@@ -81,16 +80,14 @@ class LangfuseReporter:
81
  trace_ids[case_name] = trace_id_hex
82
 
83
  # Log all evaluator scores for this case
84
- for j, report in enumerate(reports):
85
- if i < len(report.scores):
86
- eval_name = evaluator_names[j] if evaluator_names and j < len(evaluator_names) else f"Evaluator_{j}"
87
- eval_name = eval_name.replace("Evaluator", "")
88
-
89
  self.lf_client.score_trace(
90
  trace_id=trace_id_hex,
91
- name=eval_name,
92
- value=report.scores[i],
93
- comment=report.reasons[i] if i < len(report.reasons) else None
94
  )
95
 
96
  return trace_ids
 
58
  self.lf_client = langfuse_client
59
  self.tracer = trace.get_tracer("evaluation_reporter")
60
 
61
+ def log_experiment_results(self, reports, case_names: List[str]) -> Dict[str, str]:
62
  """
63
  Log SDK experiment results to Langfuse with OpenTelemetry trace correlation.
64
 
65
  Args:
66
  reports: List of evaluation reports from Experiment.run_evaluations()
67
  case_names: List of case names to create trace IDs for
 
68
 
69
  Returns:
70
  Dict mapping case names to trace IDs
 
80
  trace_ids[case_name] = trace_id_hex
81
 
82
  # Log all evaluator scores for this case
83
+ for report in reports:
84
+ if i < len(report.case_results):
85
+ case_result = report.case_results[i]
 
 
86
  self.lf_client.score_trace(
87
  trace_id=trace_id_hex,
88
+ name=type(report.evaluator).__name__.replace("Evaluator", ""),
89
+ value=case_result.evaluation_output.score,
90
+ comment=case_result.evaluation_output.reason
91
  )
92
 
93
  return trace_ids
evals/run_full_suite.py CHANGED
@@ -8,11 +8,10 @@ and logs results to Langfuse with OpenTelemetry trace correlation.
8
  import json
9
  from strands_evals import Experiment
10
  from strands_evals.types.evaluation import EvaluationData
11
- from evals.config import helpfulness_evaluator, faithfulness_evaluator, trajectory_evaluator, key_points_evaluator
12
  from evals.task_functions import get_fraud_explanation_with_trace
13
  from evals.dataset_loader import load_cases_from_json
14
  from evals.langfuse_reporter import LangfuseClient, LangfuseReporter
15
- from evals.utils import get_report_summary
16
 
17
 
18
  def main():
@@ -36,8 +35,7 @@ def main():
36
  evaluators=[
37
  helpfulness_evaluator,
38
  faithfulness_evaluator,
39
- trajectory_evaluator,
40
- key_points_evaluator
41
  ]
42
  )
43
 
@@ -49,11 +47,11 @@ def main():
49
  print("="*60 + "\n")
50
 
51
  for i, report in enumerate(reports):
52
- evaluator_name = type(experiment.evaluators[i]).__name__
53
  print(f"\n### {evaluator_name} ###")
54
- report.display()
55
 
56
- summary = get_report_summary(report)
57
  print(f"\n📊 {evaluator_name} Summary:")
58
  print(f" Pass Rate: {summary['pass_rate']:.1%}")
59
  print(f" Average Score: {summary['average_score']:.2f}")
@@ -63,12 +61,47 @@ def main():
63
  print("\n" + "="*60)
64
  print("📤 Logging to Langfuse...")
65
  case_names = [case.name for case in cases]
66
- evaluator_names = [type(e).__name__ for e in experiment.evaluators]
67
- trace_ids = reporter.log_experiment_results(reports, case_names, evaluator_names)
68
  print(f" ✅ Logged {len(trace_ids)} traces with {len(reports)} metrics each")
69
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
70
  # Save experiment
71
- experiment.to_file("experiment_files/full_suite_eval")
72
  print("\n💾 Experiment saved to ./experiment_files/full_suite_eval.json")
73
 
74
  # Final summary
@@ -76,7 +109,7 @@ def main():
76
  print("✅ EVALUATION COMPLETE")
77
  print("="*60)
78
  print(f"\nTotal Cases: {len(cases)}")
79
- print(f"Evaluators: {len(reports)}")
80
  print(f"Langfuse Traces: {len(trace_ids)}")
81
  print(f"\nView results in Langfuse dashboard")
82
 
 
8
  import json
9
  from strands_evals import Experiment
10
  from strands_evals.types.evaluation import EvaluationData
11
+ from evals.config import helpfulness_evaluator, faithfulness_evaluator, trajectory_evaluator
12
  from evals.task_functions import get_fraud_explanation_with_trace
13
  from evals.dataset_loader import load_cases_from_json
14
  from evals.langfuse_reporter import LangfuseClient, LangfuseReporter
 
15
 
16
 
17
  def main():
 
35
  evaluators=[
36
  helpfulness_evaluator,
37
  faithfulness_evaluator,
38
+ trajectory_evaluator
 
39
  ]
40
  )
41
 
 
47
  print("="*60 + "\n")
48
 
49
  for i, report in enumerate(reports):
50
+ evaluator_name = type(report.evaluator).__name__
51
  print(f"\n### {evaluator_name} ###")
52
+ report.run_display()
53
 
54
+ summary = report.get_summary()
55
  print(f"\n📊 {evaluator_name} Summary:")
56
  print(f" Pass Rate: {summary['pass_rate']:.1%}")
57
  print(f" Average Score: {summary['average_score']:.2f}")
 
61
  print("\n" + "="*60)
62
  print("📤 Logging to Langfuse...")
63
  case_names = [case.name for case in cases]
64
+ trace_ids = reporter.log_experiment_results(reports, case_names)
 
65
  print(f" ✅ Logged {len(trace_ids)} traces with {len(reports)} metrics each")
66
 
67
+ # Add Goal Success (heuristic evaluator)
68
+ print("\n🎯 Running Goal Success (heuristic)...")
69
+ goal_success_scores = []
70
+
71
+ for case in cases:
72
+ expected_key_points = case.metadata.get("expected_key_points", [])
73
+
74
+ # Get the answer for this case (from first report)
75
+ case_idx = cases.index(case)
76
+ answer = reports[0].case_results[case_idx].actual_output
77
+
78
+ # Calculate hits
79
+ hits = 0
80
+ if expected_key_points:
81
+ for point in expected_key_points:
82
+ if point.lower() in answer.lower():
83
+ hits += 1
84
+ elif any(word in answer.lower() for word in point.split() if len(word) > 4):
85
+ hits += 0.5
86
+ success_rate = min(1.0, hits / len(expected_key_points))
87
+ else:
88
+ success_rate = 1.0
89
+
90
+ goal_success_scores.append(success_rate)
91
+
92
+ # Log to Langfuse
93
+ lf_client.score_trace(
94
+ trace_id=trace_ids[case.name],
95
+ name="Goal Success",
96
+ value=success_rate,
97
+ comment=f"Matched {hits}/{len(expected_key_points)} key points"
98
+ )
99
+
100
+ avg_goal_success = sum(goal_success_scores) / len(goal_success_scores)
101
+ print(f" Average Goal Success: {avg_goal_success:.2f}")
102
+
103
  # Save experiment
104
+ experiment.to_file("full_suite_eval", "json")
105
  print("\n💾 Experiment saved to ./experiment_files/full_suite_eval.json")
106
 
107
  # Final summary
 
109
  print("✅ EVALUATION COMPLETE")
110
  print("="*60)
111
  print(f"\nTotal Cases: {len(cases)}")
112
+ print(f"Evaluators: {len(reports) + 1} (including Goal Success)")
113
  print(f"Langfuse Traces: {len(trace_ids)}")
114
  print(f"\nView results in Langfuse dashboard")
115
 
evals/run_helpfulness.py CHANGED
@@ -10,7 +10,6 @@ from evals.config import helpfulness_evaluator
10
  from evals.task_functions import get_fraud_explanation
11
  from evals.dataset_loader import load_cases_from_json
12
  from evals.langfuse_reporter import LangfuseClient, LangfuseReporter
13
- from evals.utils import get_report_summary
14
 
15
 
16
  def main():
@@ -32,10 +31,10 @@ def main():
32
 
33
  # Display SDK results
34
  print("\n" + "="*60)
35
- reports[0].display()
36
 
37
  # Get summary
38
- summary = get_report_summary(reports[0])
39
  print(f"\n📊 Summary:")
40
  print(f" Pass Rate: {summary['pass_rate']:.1%}")
41
  print(f" Average Score: {summary['average_score']:.2f}")
@@ -45,12 +44,11 @@ def main():
45
  lf_client = LangfuseClient()
46
  reporter = LangfuseReporter(lf_client)
47
  case_names = [case.name for case in cases]
48
- evaluator_names = [type(e).__name__ for e in experiment.evaluators]
49
- trace_ids = reporter.log_experiment_results(reports, case_names, evaluator_names)
50
  print(f" Logged {len(trace_ids)} traces to Langfuse")
51
 
52
  # Save experiment
53
- experiment.to_file("experiment_files/helpfulness_eval")
54
  print("\n💾 Experiment saved to ./experiment_files/helpfulness_eval.json")
55
 
56
 
 
10
  from evals.task_functions import get_fraud_explanation
11
  from evals.dataset_loader import load_cases_from_json
12
  from evals.langfuse_reporter import LangfuseClient, LangfuseReporter
 
13
 
14
 
15
  def main():
 
31
 
32
  # Display SDK results
33
  print("\n" + "="*60)
34
+ reports[0].run_display()
35
 
36
  # Get summary
37
+ summary = reports[0].get_summary()
38
  print(f"\n📊 Summary:")
39
  print(f" Pass Rate: {summary['pass_rate']:.1%}")
40
  print(f" Average Score: {summary['average_score']:.2f}")
 
44
  lf_client = LangfuseClient()
45
  reporter = LangfuseReporter(lf_client)
46
  case_names = [case.name for case in cases]
47
+ trace_ids = reporter.log_experiment_results(reports, case_names)
 
48
  print(f" Logged {len(trace_ids)} traces to Langfuse")
49
 
50
  # Save experiment
51
+ experiment.to_file("helpfulness_eval", "json")
52
  print("\n💾 Experiment saved to ./experiment_files/helpfulness_eval.json")
53
 
54
 
evals/run_trajectory.py CHANGED
@@ -9,7 +9,6 @@ from evals.config import trajectory_evaluator
9
  from evals.task_functions import get_fraud_explanation_with_trace
10
  from evals.dataset_loader import load_cases_from_json
11
  from evals.langfuse_reporter import LangfuseClient, LangfuseReporter
12
- from evals.utils import get_report_summary
13
 
14
 
15
  def main():
@@ -31,10 +30,10 @@ def main():
31
 
32
  # Display SDK results
33
  print("\n" + "="*60)
34
- reports[0].display()
35
 
36
  # Get summary
37
- summary = get_report_summary(reports[0])
38
  print(f"\n📊 Summary:")
39
  print(f" Pass Rate: {summary['pass_rate']:.1%}")
40
  print(f" Average Score: {summary['average_score']:.2f}")
@@ -44,12 +43,11 @@ def main():
44
  lf_client = LangfuseClient()
45
  reporter = LangfuseReporter(lf_client)
46
  case_names = [case.name for case in cases]
47
- evaluator_names = [type(e).__name__ for e in experiment.evaluators]
48
- trace_ids = reporter.log_experiment_results(reports, case_names, evaluator_names)
49
  print(f" Logged {len(trace_ids)} traces to Langfuse")
50
 
51
  # Save experiment
52
- experiment.to_file("experiment_files/trajectory_eval")
53
  print("\n💾 Experiment saved to ./experiment_files/trajectory_eval.json")
54
 
55
 
 
9
  from evals.task_functions import get_fraud_explanation_with_trace
10
  from evals.dataset_loader import load_cases_from_json
11
  from evals.langfuse_reporter import LangfuseClient, LangfuseReporter
 
12
 
13
 
14
  def main():
 
30
 
31
  # Display SDK results
32
  print("\n" + "="*60)
33
+ reports[0].run_display()
34
 
35
  # Get summary
36
+ summary = reports[0].get_summary()
37
  print(f"\n📊 Summary:")
38
  print(f" Pass Rate: {summary['pass_rate']:.1%}")
39
  print(f" Average Score: {summary['average_score']:.2f}")
 
43
  lf_client = LangfuseClient()
44
  reporter = LangfuseReporter(lf_client)
45
  case_names = [case.name for case in cases]
46
+ trace_ids = reporter.log_experiment_results(reports, case_names)
 
47
  print(f" Logged {len(trace_ids)} traces to Langfuse")
48
 
49
  # Save experiment
50
+ experiment.to_file("trajectory_eval", "json")
51
  print("\n💾 Experiment saved to ./experiment_files/trajectory_eval.json")
52
 
53
 
evals/utils.py DELETED
@@ -1,15 +0,0 @@
1
- """
2
- Utility functions for evaluations.
3
- """
4
-
5
- def get_report_summary(report):
6
- """
7
- Calculate summary metrics from an EvaluationReport.
8
-
9
- Returns:
10
- dict: A dictionary containing 'pass_rate' and 'average_score'.
11
- """
12
- return {
13
- "pass_rate": sum(report.test_passes) / len(report.test_passes) if report.test_passes else 0,
14
- "average_score": report.overall_score
15
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
evaluation/dataset.json CHANGED
@@ -230,85 +230,5 @@
230
  "Feature importance ranking",
231
  "Impact of each feature on the model's decision"
232
  ]
233
- },
234
- {
235
- "id": "synth_sdk_1",
236
- "question": "Application ID: 1023456",
237
- "expected_intent": "General",
238
- "expected_answer_key_points": [
239
- "Fraud Score: 750\nRisk Level: High\nSHAP Values:\n- Transaction History Impact: +200\n- Credit Score Impact: +150\n- Recent Loan Applications Impact: -50\n- Employment History Impact: +100\nExplanation: The application has a high fraud score primarily due to the transaction history and credit score, which together contribute significantly to the high risk level. The recent loan applications positively influence the score by lowering it slightly, but not enough to offset the other risk factors. Employment history further increases the score. It's essential to focus on improving transaction patterns to lower the fraud score."
240
- ]
241
- },
242
- {
243
- "id": "synth_sdk_2",
244
- "question": "Application ID: 4321XYZ\n- Request from Fraud Analyst: \"Please provide a detailed breakdown of the fraud score for this application. Include key feature contributions and provide an explanation for any features that heavily influence the score.\"\n- Tools and Capabilities Requested:\n - use explain_fraud_score(4321XYZ) to highlight SHAP values.\n - identify high-contributing features impacting fraud score.\n - analyze borderline scores and explain if any apply.",
245
- "expected_intent": "General",
246
- "expected_answer_key_points": [
247
- "Fraud Score: 680 (Moderate Risk)\nKey SHAP Contribution Analysis:\n1. \"Credit History Length\": +45 (Positive impact due to long credit history)\n2. \"Number of Recent Applications\": +100 (Negative impact due to recent application spike)\n3. \"Unusual Geolocation Activity\": +150 (High negative impact due to recent activity in a high-risk zone)\n4. \"Income Verification\": -65 (Positive impact due to verified and stable income)\n\nExplanation:\n- The fraud score is boosted to a higher risk mainly due to the unusual geolocation activity and the number of recent applications, which are significant red flags in fraud detection. While the credit history length and verified income provide a buffer reducing the risk level somewhat, the overall risk remains moderate.\n- Due to the combination and magnitude of these factors, the borderline score reflects an increased risk with possibilities of both legitimate and fraudulent activities present."
248
- ]
249
- },
250
- {
251
- "id": "synth_sdk_3",
252
- "question": "Task: Analyze the performance of the fraud detection model over the past six months.\n\nData Provided: \n- List of application IDs from the last six months\n- Access to tools: compare_to_population, explain_fraud_score\n\nKey Question: What trends can be identified in the distribution of fraud scores and SHAP value importance over this period?",
253
- "expected_intent": "General",
254
- "expected_answer_key_points": [
255
- "Output:\n1. Statistical report indicating any significant trends or shifts in fraud scores over the six-month period using compare_to_population.\n2. Identification of any changes in SHAP feature importance that may indicate shifts in model predictions.\n3. Summary of observations highlighting potential model drift or changes in population characteristics that could impact fraud detection accuracy."
256
- ]
257
- },
258
- {
259
- "id": "synth_sdk_4",
260
- "question": "A data scientist is analyzing the fraud detection model's performance over the past six months for applications in a specific region. They want to understand how the model's scoring distribution has shifted over this period and identify any significant changes in feature importance that might correspond with this shift. Use the tools compare_to_population and explain_fraud_score. The focus is on applications in the Midwest region, with application IDs ranging from 1001 to 1050. The model should track score distribution shifts and highlight any significant changes in SHAP feature importance.",
261
- "expected_intent": "General",
262
- "expected_answer_key_points": [
263
- "1. Provide a trend report showing score distribution changes over the past six months for the specified application range in the Midwest, indicating whether fraud scores have increased, decreased, or remained stable.\n2. Use compare_to_population to determine how these changes compare to national trends, noting any significant deviations.\n3. Highlight significant shifts in SHAP value contributions for key features over the same period, indicating which features have become more or less significant in model decision-making.\n4. Correlate findings with any historical applications data to suggest possible reasons for observed shifts, such as policy changes or external economic factors."
264
- ]
265
- },
266
- {
267
- "id": "synth_sdk_5",
268
- "question": "The Compliance Officer wants to verify if there is any evidence of gender bias in fraud scoring for the application ID: F12345. Use the 'check_fair_lending_flags' to detect bias indicators and 'compare_to_population' to analyze demographic-specific fraud score patterns.",
269
- "expected_intent": "General",
270
- "expected_answer_key_points": [
271
- "The 'check_fair_lending_flags' tool indicates a potential bias against female applicants due to disparate impact ratios below threshold in the fraud scoring process, indicating females are disproportionately denied compared to males. 'compare_to_population' further shows that female applicants have consistently higher average fraud scores compared to male counterparts for similar profiles, suggesting an underlying model bias."
272
- ]
273
- },
274
- {
275
- "id": "synth_sdk_6",
276
- "question": "compliance_officer: \"I need to verify if application 1234 has any fair lending bias issues. Check for any bias flags and provide basic SHAP contributions for this case.\"",
277
- "expected_intent": "General",
278
- "expected_answer_key_points": [
279
- "1. Use check_fair_lending_flags(app_id=1234) to identify if there are any bias issues. The tool will return 'No Bias Detected' or 'Potential Bias Detected'.\n2. If bias is present, explain_fraud_score(app_id=1234) to understand which features contribute most to the score.\n3. Report: 'No Bias Detected. Main factors influencing the fraud score are {feature1, feature2} contributing {value1, value2}.' OR 'Potential Bias Detected. SHAP analysis reveals major contributors: {biased_feature} with {value}.'"
280
- ]
281
- },
282
- {
283
- "id": "synth_sdk_7",
284
- "question": "You are tasked with preparing an executive summary that highlights the current trends in fraud detection for the past quarter for the executive team of the financial services company. Use the following application IDs: ['A12345', 'B67890', 'C13579'].",
285
- "expected_intent": "General",
286
- "expected_answer_key_points": [
287
- "The executive summary should include:\n1. A concise summary of the fraud scores and risk levels for each application using 'get_application_summary' for the IDs 'A12345', 'B67890', 'C13579'.\n2. A high-level analysis showing any evident trends in fraud scores over the quarter period.\n3. A brief mention of major contributing factors to the high fraud scores, utilizing SHAP values (not the specifics, just an overview of the top features contributing).\n4. A concise explanation of the application network status for each of these IDs using 'get_identity_network'.\n\nThe summary should be formatted in a report-friendly manner, prioritizing clarity and brevity for executive review."
288
- ]
289
- },
290
- {
291
- "id": "synth_sdk_8",
292
- "question": "Generate a high-level summary for executives on the fraud score trends and key contributing factors for applications over the last quarter. Include how applications are linked within the identity network for potential collusion. Use the following application IDs: ['A123', 'B456', 'C789'].",
293
- "expected_intent": "General",
294
- "expected_answer_key_points": [
295
- "The analysis for the last quarter indicates a rising trend in fraud scores, with the average score increasing by approximately 15% compared to the previous period. Key contributing factors identified through SHAP analysis include unusually high transaction volumes and discrepancies in reported income. Specific to applications ['A123', 'B456', 'C789'], notable features contributing to high fraud scores were similar across the board, indicating potential systemic fraud patterns. Moreover, the get_identity_network for these applications reveals interconnected relationships, suggesting possible collusion attempts. The network analysis indicates that several other applications are linked through shared contact information and transaction patterns, necessitating further investigation into these connections. Overall, the data suggests an evolving strategy that involves increasing sophistication in application profiles that require continued vigilance and adaptation of fraud detection measures."
296
- ]
297
- },
298
- {
299
- "id": "synth_sdk_9",
300
- "question": "Application ID: 12345; Requested Action: Analyze impact of linked applications on fraud score using get_identity_network.",
301
- "expected_intent": "General",
302
- "expected_answer_key_points": [
303
- "get_identity_network(12345) returns {\"linked_applications\": [\"11111\", \"22222\", \"33333\"]}. explain_fraud_score(12345) returns {\"SHAP_values\": {\"income\": -100, \"credit_history\": 50, \"linked_applications\": 200}}. compare_to_population(12345) shows linked applications influence fraud score to be 20% higher than similar approved applications. Risk level classified as 'Moderate Risk' due to negative impact from linked applications."
304
- ]
305
- },
306
- {
307
- "id": "synth_sdk_10",
308
- "question": "Application ID: 12345\nTask: Investigate the fraud score and impact of linked applications using the get_identity_network tool. Provide an analysis of how these linked applications influence the fraud score and SHAP values for Application ID 12345. Use compare_to_population to understand how these impacts differ from the general population.",
309
- "expected_intent": "General",
310
- "expected_answer_key_points": [
311
- "1. Identity Network Output: Application 12345 is linked with applications 56789, 98765, and 65432.\n2. Fraud Score Analysis: The fraud score for Application 12345 is 850. Linked applications have high fraud scores: Application 56789 (900), Application 98765 (875), and Application 65432 (910).\n3. SHAP Values: Key features that increased the fraud score include 'cross-application suspicious transactions' and 'irregular account activities.'\n4. Population Comparison: Compared to the approved applications' average fraud score of 500, Application 12345 and its linked applications show significantly higher risk, suggesting a potential fraud ring.\n5. Conclusion: The linkage to high-risk applications within the identity network explains the elevated fraud score for Application 12345, highlighting the influence of network associations in detecting fraud patterns."
312
- ]
313
  }
314
  ]
 
230
  "Feature importance ranking",
231
  "Impact of each feature on the model's decision"
232
  ]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
233
  }
234
  ]