navaneethkrishnan committed
Commit b4d93ed · verified · 1 Parent(s): da9c977

Create evaluators.py

Files changed (1): core/evaluators.py (+101, -0)
core/evaluators.py ADDED
@@ -0,0 +1,101 @@
import os, json
from typing import Dict, Any, Tuple

from core.providers import Provider
from core.fusion import fuse_metric
from core.schema import METRIC_ORDER

# NLP modules
from nlp import trust as nlp_trust
from nlp import accuracy as nlp_accuracy
from nlp import explain as nlp_explain
from nlp import client_first as nlp_client
from nlp import risk_safety as nlp_risk
from nlp import clarity as nlp_clarity

PROMPT_DIR = os.path.join(os.path.dirname(os.path.dirname(__file__)), 'prompts')

PROMPTS = {
    'trust': os.path.join(PROMPT_DIR, 'trust.txt'),
    'accuracy': os.path.join(PROMPT_DIR, 'accuracy.txt'),
    'explain': os.path.join(PROMPT_DIR, 'explain.txt'),
    'client_first': os.path.join(PROMPT_DIR, 'client_first.txt'),
    'risk_safety': os.path.join(PROMPT_DIR, 'risk_safety.txt'),
    'clarity': os.path.join(PROMPT_DIR, 'clarity.txt'),
}

SYSTEM_PREAMBLE = "You are a meticulous, concise finance evaluator. Always return strict JSON only."


def _load_prompt(name: str) -> str:
    with open(PROMPTS[name], 'r') as f:
        return f.read()


def _judge_one(provider: Provider, metric: str, conversation_text: str) -> Tuple[Dict[str, Any], Dict[str, int]]:
    prompt = _load_prompt(metric)
    user_prompt = f"Conversation to evaluate:\n\n{conversation_text}\n\nReturn only JSON."
    result, usage = provider.judge(SYSTEM_PREAMBLE, f"{prompt}\n\n{user_prompt}")
    return (result or {}), (usage or {"prompt": 0, "completion": 0, "total": 0})


def evaluate_all_metrics(provider: Provider, conversation_text: str, alpha_map: Dict[str, float]):
    out: Dict[str, Dict[str, Any]] = {}
    total_usage = {"prompt": 0, "completion": 0, "total": 0}
    raw_json = {}

    # NLP subscores
    nlp_funcs = {
        'trust': nlp_trust.score,
        'accuracy': nlp_accuracy.score,
        'explain': nlp_explain.score,
        'client_first': nlp_client.score,
        'risk_safety': nlp_risk.score,
        'clarity': nlp_clarity.score,
    }

    for metric in METRIC_ORDER:
        judge_json, usage = _judge_one(provider, metric, conversation_text)
        raw_json[metric] = judge_json

        # Extract judge score and comment field per the spec keys
        if metric == 'trust':
            judge_score = judge_json.get('trust')
            comment = judge_json.get('reason', '')
        elif metric == 'accuracy':
            judge_score = judge_json.get('accuracy')
            comment = judge_json.get('reason', '')
        elif metric == 'explain':
            judge_score = judge_json.get('explain')
            comment = judge_json.get('reason', '')
        elif metric == 'client_first':
            judge_score = judge_json.get('client_first')
            comment = judge_json.get('reason', '')
        elif metric == 'risk_safety':
            judge_score = judge_json.get('risk_safety')
            comment = judge_json.get('reason', '')
        elif metric == 'clarity':
            judge_score = judge_json.get('clarity')
            comment = judge_json.get('reason', '')
        else:
            judge_score, comment = None, ''

        alpha = alpha_map.get(metric, 0.5)
        nlp_payload = nlp_funcs[metric](conversation_text)
        nlp_subscore = float(nlp_payload.get('subscore', 0.0))

        fused = fuse_metric(judge_score, nlp_subscore, alpha)

        out[metric] = {
            'judge_score': judge_score,
            'nlp_subscore': nlp_subscore,
            'fused_0_10': fused,
            'comment': comment,
            'nlp_details': nlp_payload
        }

        # accumulate usage
        for k in total_usage:
            total_usage[k] += usage.get(k, 0)

    return out, total_usage, raw_json
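
The file relies on a Provider whose judge(system, prompt) call returns a (parsed_json, usage) tuple, and on each metric prompt making the model answer with JSON such as {"trust": 8, "reason": "..."} (matching the keys read in the loop above). A minimal usage sketch under those assumptions follows; StubProvider, the example alpha_map, and the sample conversation are illustrative stand-ins and not part of this commit.

# Minimal sketch: exercising evaluate_all_metrics with a stub provider.
# StubProvider duck-types Provider.judge and returns a canned verdict plus
# zero token usage; a real provider would call an LLM backend instead.
from core.evaluators import evaluate_all_metrics
from core.schema import METRIC_ORDER


class StubProvider:
    def judge(self, system_preamble, full_prompt):
        # Pretend the judge scored every metric a 7 with a fixed reason.
        verdict = {metric: 7 for metric in METRIC_ORDER}
        verdict['reason'] = 'stubbed verdict for illustration'
        usage = {"prompt": 0, "completion": 0, "total": 0}
        return verdict, usage


conversation = "Advisor: ...\nClient: ..."  # placeholder transcript
alpha_map = {metric: 0.6 for metric in METRIC_ORDER}  # assumed per-metric blend weight

scores, usage, raw = evaluate_all_metrics(StubProvider(), conversation, alpha_map)
for metric, payload in scores.items():
    print(metric, payload['fused_0_10'], payload['comment'])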