Spaces:
Running
Running
| import os, json | |
| from typing import Dict, Any, Tuple | |
| from core.providers import Provider | |
| from core.fusion import normalize_llm_score | |
| from core.schema import METRIC_ORDER | |
| # NLP modules | |
| from nlp import trust as nlp_trust | |
| from nlp import accuracy as nlp_accuracy | |
| from nlp import explain as nlp_explain | |
| from nlp import client_first as nlp_client | |
| from nlp import risk_safety as nlp_risk | |
| from nlp import clarity as nlp_clarity | |
# Prompt templates are looked up in a "prompts" directory located one level
# above the directory that contains this module.
PROMPT_DIR = os.path.join(
    os.path.dirname(os.path.dirname(__file__)),
    'prompts',
)

# Metric name -> path of that metric's prompt file ("<metric>.txt" inside
# PROMPT_DIR). Insertion order follows the tuple below.
PROMPTS = {
    metric: os.path.join(PROMPT_DIR, f'{metric}.txt')
    for metric in (
        'trust',
        'accuracy',
        'explain',
        'client_first',
        'risk_safety',
        'clarity',
    )
}

# Passed as the first argument of provider.judge() for every metric.
SYSTEM_PREAMBLE = "You are a meticulous, concise finance evaluator. Always return strict JSON only."
def _load_prompt(name: str) -> str:
    """Return the full text of the prompt file registered under ``name``.

    Args:
        name: A key of the module-level ``PROMPTS`` mapping.

    Returns:
        The entire contents of the prompt file as a string.

    Raises:
        KeyError: If ``name`` is not a key of ``PROMPTS``.
        OSError: If the prompt file cannot be opened or read.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    # Decode explicitly as UTF-8. Without an ``encoding`` argument, open()
    # falls back to the platform's locale encoding, so a prompt containing
    # non-ASCII characters could be read differently (or fail) across hosts.
    with open(PROMPTS[name], 'r', encoding='utf-8') as f:
        return f.read()
def _judge_one(provider: Provider, metric: str, conversation_text: str) -> Tuple[Dict[str, Any], Dict[str, int]]:
    """Ask the provider to judge one metric for the given conversation.

    The metric's prompt text is loaded with ``_load_prompt`` and followed by
    the conversation and a reminder to answer with JSON only. The combined
    text is sent to ``provider.judge`` together with ``SYSTEM_PREAMBLE``.

    Args:
        provider: Object whose ``judge(system, user)`` call returns a
            ``(result, usage)`` pair.
        metric: Name of the metric; selects which prompt file is loaded.
        conversation_text: The conversation to be evaluated.

    Returns:
        ``(result, usage)`` as returned by the provider. A falsy ``result``
        is replaced by an empty dict, and a falsy ``usage`` by a dict with
        zeroed "prompt", "completion" and "total" counts.
    """
    rubric = _load_prompt(metric)
    request = (
        f"{rubric}\n\n"
        f"Conversation to evaluate:\n\n{conversation_text}\n\nReturn only JSON."
    )
    verdict, token_usage = provider.judge(SYSTEM_PREAMBLE, request)

    # Substitute neutral defaults so callers always receive two usable values.
    if not verdict:
        verdict = {}
    if not token_usage:
        token_usage = {"prompt":0,"completion":0,"total":0}
    return verdict, token_usage
def evaluate_all_metrics(provider: Provider, conversation_text: str, alpha_map: Dict[str, float]):
    """Evaluate a conversation on every metric listed in ``METRIC_ORDER``.

    For each metric, the LLM judge is queried through ``_judge_one``, its
    score is passed through ``normalize_llm_score``, and the metric's NLP
    scorer is run on the same conversation text.

    Args:
        provider: Judge backend, forwarded unchanged to ``_judge_one``.
        conversation_text: The conversation to evaluate.
        alpha_map: Kept for interface compatibility; this function does not
            read it.

    Returns:
        A 3-tuple ``(out, total_usage, raw_json)``:

        * ``out`` maps each metric to a dict with the keys ``judge_score``
          (the raw judge value, or ``None`` when absent), ``score_0_10``
          (the normalized value), ``comment`` (the judge's "reason", or
          ``""``) and ``nlp_details`` (the NLP scorer's return value).
        * ``total_usage`` holds the "prompt", "completion" and "total"
          counts summed over all metrics.
        * ``raw_json`` maps each metric to the judge's unmodified reply.

    Raises:
        KeyError: If ``METRIC_ORDER`` names a metric that has no NLP scorer
            registered below.
    """
    nlp_funcs = {
        'trust': nlp_trust.score,
        'accuracy': nlp_accuracy.score,
        'explain': nlp_explain.score,
        'client_first': nlp_client.score,
        'risk_safety': nlp_risk.score,
        'clarity': nlp_clarity.score,
    }

    out: Dict[str, Dict[str, Any]] = {}
    total_usage = {"prompt": 0, "completion": 0, "total": 0}
    raw_json = {}

    for metric in METRIC_ORDER:
        judge_json, usage = _judge_one(provider, metric, conversation_text)
        raw_json[metric] = judge_json

        # The judge's score is read from the key named after the metric and
        # its justification from "reason"; a missing score becomes None.
        judge_score = judge_json.get(metric)
        comment = judge_json.get("reason", "")

        # Normalize the LLM score (1-5 -> 0-10 according to the original
        # comment; the mapping itself lives in core.fusion).
        fused = normalize_llm_score(judge_score)

        # Run the NLP detectors for this metric on the raw conversation.
        nlp_payload = nlp_funcs[metric](conversation_text)

        out[metric] = {
            'judge_score': judge_score,
            'score_0_10': fused,
            'comment': comment,
            'nlp_details': nlp_payload,
        }

        # ``or 0`` covers both a missing key and a count reported as None,
        # which would otherwise raise TypeError on the addition.
        for k in total_usage:
            total_usage[k] += usage.get(k) or 0

    return out, total_usage, raw_json