Create evaluators.py
core/evaluators.py  ADDED  (+101 -0)
import os, json
from typing import Dict, Any, Tuple

from core.providers import Provider
from core.fusion import fuse_metric
from core.schema import METRIC_ORDER

# NLP modules
from nlp import trust as nlp_trust
from nlp import accuracy as nlp_accuracy
from nlp import explain as nlp_explain
from nlp import client_first as nlp_client
from nlp import risk_safety as nlp_risk
from nlp import clarity as nlp_clarity

PROMPT_DIR = os.path.join(os.path.dirname(os.path.dirname(__file__)), 'prompts')

PROMPTS = {
    'trust': os.path.join(PROMPT_DIR, 'trust.txt'),
    'accuracy': os.path.join(PROMPT_DIR, 'accuracy.txt'),
    'explain': os.path.join(PROMPT_DIR, 'explain.txt'),
    'client_first': os.path.join(PROMPT_DIR, 'client_first.txt'),
    'risk_safety': os.path.join(PROMPT_DIR, 'risk_safety.txt'),
    'clarity': os.path.join(PROMPT_DIR, 'clarity.txt'),
}

SYSTEM_PREAMBLE = "You are a meticulous, concise finance evaluator. Always return strict JSON only."


def _load_prompt(name: str) -> str:
    with open(PROMPTS[name], 'r') as f:
        return f.read()


def _judge_one(provider: Provider, metric: str, conversation_text: str) -> Tuple[Dict[str, Any], Dict[str, int]]:
    prompt = _load_prompt(metric)
    user_prompt = f"Conversation to evaluate:\n\n{conversation_text}\n\nReturn only JSON."
    result, usage = provider.judge(SYSTEM_PREAMBLE, f"{prompt}\n\n{user_prompt}")
    return (result or {}), (usage or {"prompt":0,"completion":0,"total":0})


def evaluate_all_metrics(provider: Provider, conversation_text: str, alpha_map: Dict[str, float]):
    out: Dict[str, Dict[str, Any]] = {}
    total_usage = {"prompt": 0, "completion": 0, "total": 0}
    raw_json = {}

    # NLP subscores
    nlp_funcs = {
        'trust': nlp_trust.score,
        'accuracy': nlp_accuracy.score,
        'explain': nlp_explain.score,
        'client_first': nlp_client.score,
        'risk_safety': nlp_risk.score,
        'clarity': nlp_clarity.score,
    }

    for metric in METRIC_ORDER:
        judge_json, usage = _judge_one(provider, metric, conversation_text)
        raw_json[metric] = judge_json

        # Extract judge score and comment field per the spec keys
        if metric == 'trust':
            judge_score = judge_json.get('trust')
            comment = judge_json.get('reason', '')
        elif metric == 'accuracy':
            judge_score = judge_json.get('accuracy')
            comment = judge_json.get('reason', '')
        elif metric == 'explain':
            judge_score = judge_json.get('explain')
            comment = judge_json.get('reason', '')
        elif metric == 'client_first':
            judge_score = judge_json.get('client_first')
            comment = judge_json.get('reason', '')
        elif metric == 'risk_safety':
            judge_score = judge_json.get('risk_safety')
            comment = judge_json.get('reason', '')
        elif metric == 'clarity':
            judge_score = judge_json.get('clarity')
            comment = judge_json.get('reason', '')
        else:
            judge_score, comment = None, ''

        alpha = alpha_map.get(metric, 0.5)
        nlp_payload = nlp_funcs[metric](conversation_text)
        nlp_subscore = float(nlp_payload.get('subscore', 0.0))

        fused = fuse_metric(judge_score, nlp_subscore, alpha)

        out[metric] = {
            'judge_score': judge_score,
            'nlp_subscore': nlp_subscore,
            'fused_0_10': fused,
            'comment': comment,
            'nlp_details': nlp_payload
        }

        # accumulate usage
        for k in total_usage:
            total_usage[k] += usage.get(k, 0)

    return out, total_usage, raw_json
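For orientation, below is a minimal sketch of how this module might be driven. It assumes the repository layout implied above (importable core/ and nlp/ packages plus the prompts/ directory); StubProvider is hypothetical, not part of the repo, and only mimics the duck-typed contract that _judge_one relies on: judge(system, user) returning a parsed JSON dict and a token-usage dict.

# Hypothetical driver script; StubProvider is an illustration only.
from core.evaluators import evaluate_all_metrics
from core.schema import METRIC_ORDER

class StubProvider:
    def judge(self, system: str, user: str):
        # A real provider would call an LLM and parse its strict-JSON reply;
        # here we return a fixed score under every metric key plus a 'reason'.
        result = {m: 7 for m in METRIC_ORDER}
        result['reason'] = 'stubbed rationale'
        return result, {"prompt": 0, "completion": 0, "total": 0}

conversation = "Advisor: ...\nClient: ..."    # placeholder transcript
alpha_map = {m: 0.5 for m in METRIC_ORDER}    # per-metric blend weight, matching the 0.5 default

scores, usage, raw = evaluate_all_metrics(StubProvider(), conversation, alpha_map)
for metric, payload in scores.items():
    print(metric, payload['fused_0_10'], payload['comment'])

Each entry in scores carries the raw judge score, the NLP subscore, the fused value under 'fused_0_10', the judge's comment, and the full NLP payload, while usage accumulates token counts across the per-metric judge calls.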