Create app.py
app.py ADDED
@@ -0,0 +1,210 @@
import os, io, json, time, tempfile, zipfile
import gradio as gr
import pandas as pd
from datetime import datetime, timezone

from core.providers import get_provider, ProviderKind
from core.preprocess import normalize_conversation, extract_model_utterances
from core.evaluators import evaluate_all_metrics
from core.fusion import weighted_total

# -----------------------------
# Defaults
# -----------------------------
DEFAULT_METRIC_WEIGHTS = {
    "trust": 0.20,
    "accuracy": 0.25,
    "explain": 0.15,
    "client_first": 0.15,
    "risk_safety": 0.15,
    "clarity": 0.10,
}

JUDGE_ALPHA = {
    # α = LLM share in fusion per metric (from spec)
    "trust": 0.70,
    "accuracy": 0.65,
    "explain": 0.50,
    "client_first": 0.70,
    "risk_safety": 0.60,
    "clarity": 0.70,
}

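# NOTE: minimal illustrative sketch of the fusion step, assuming core.evaluators
# blends each metric's LLM judge score (1-5) with its NLP subscore (0-1) using the
# alpha shares above and reports the result on a 0-10 scale. The authoritative
# logic lives in core/evaluators.py; this helper is for reference only and is not
# called anywhere in this file.
def _fusion_sketch(judge_score: float, nlp_subscore: float, alpha: float) -> float:
    # Rescale the 1-5 judge score to 0-1, blend it with the NLP subscore by alpha,
    # then map the blend onto 0-10.
    judge_0_1 = (judge_score - 1.0) / 4.0
    return 10.0 * (alpha * judge_0_1 + (1.0 - alpha) * nlp_subscore)

# Example: _fusion_sketch(4, 0.8, JUDGE_ALPHA["trust"]) = 10 * (0.7*0.75 + 0.3*0.8) = 7.65
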
# -----------------------------
# Core runner
# -----------------------------

def run_eval(conversation: str,
             use_openai: bool,
             use_anthropic: bool,
             w_trust: float, w_accuracy: float, w_explain: float,
             w_client: float, w_risk: float, w_clarity: float,
             model_openai: str = "gpt-4o",
             model_anthropic: str = "claude-3-5-sonnet-20240620"):
    if not conversation or conversation.strip() == "":
        return None, None, None, None, "Please paste a conversation to evaluate."

    # Normalize metric weights
    user_weights = {
        "trust": w_trust,
        "accuracy": w_accuracy,
        "explain": w_explain,
        "client_first": w_client,
        "risk_safety": w_risk,
        "clarity": w_clarity,
    }
    s = sum(user_weights.values()) or 1.0
    for k in user_weights:
        user_weights[k] = user_weights[k] / s

    # Preprocess conversation
    norm = normalize_conversation(conversation)
    # Try to isolate model utterances (LLM fallback inside if ambiguous)
    model_only = extract_model_utterances(norm, prefer_llm_provider=(model_openai if use_openai else (model_anthropic if use_anthropic else None)))

    providers = []
    if use_openai:
        providers.append(get_provider(ProviderKind.OPENAI, model_openai))
    if use_anthropic:
        providers.append(get_provider(ProviderKind.ANTHROPIC, model_anthropic))

    if not providers:
        return None, None, None, None, "Select at least one model provider."

    all_tables = []
    compare_rows = []
    token_usage_blocks = []
    json_blobs = {}

    for p in providers:
        metrics_out, usage, raw_json = evaluate_all_metrics(provider=p,
                                                            conversation_text=model_only,
                                                            alpha_map=JUDGE_ALPHA)
        # Build table
        rows = []
        for m, payload in metrics_out.items():
            rows.append({
                "Metric": m,
                "LLM Score (1-5)": payload.get("judge_score", None),
                "NLP Subscore (0-1)": round(payload.get("nlp_subscore", 0.0), 3),
                "Fused (0-10)": round(payload.get("fused_0_10", 0.0), 2),
                "Comment": payload.get("comment", "")
            })
        df = pd.DataFrame(rows)
        total = weighted_total({k: v.get("fused_0_10", 0.0) for k, v in metrics_out.items()}, user_weights)
        compare_rows.append({"Model": p.label, **{r["Metric"]: r["Fused (0-10)"] for _, r in df.iterrows()}, "Total (0-10)": round(total, 2)})

        # Token usage
        usage_text = f"{p.label}: prompt_tokens={usage.get('prompt',0)}, completion_tokens={usage.get('completion',0)}, total={usage.get('total',0)}"
        token_usage_blocks.append(usage_text)

        # Persist JSON blob per model
        json_blobs[p.label] = raw_json

        all_tables.append((p.label, df, round(total, 2)))

    # Comparison table
    compare_df = pd.DataFrame(compare_rows)

    # If 2 models, compute an average row
    avg_df = None
    if len(providers) > 1:
        # Average across numeric columns only
        num_cols = [c for c in compare_df.columns if c != "Model"]
        avg_row = {"Model": "Average"}
        for c in num_cols:
            avg_row[c] = round(compare_df[c].mean(), 2)
        avg_df = pd.DataFrame([avg_row])

    # Build downloadable CSV and ZIP
    ts = datetime.now(timezone.utc).strftime("%Y%m%dT%H%M%SZ")
    out_dir = tempfile.mkdtemp(prefix="financeeval_")

    # Write per-model CSVs
    csv_paths = []
    for label, df, total in all_tables:
        pth = os.path.join(out_dir, f"results_{label}_{ts}.csv")
        df2 = df.copy()
        df2.loc[len(df2)] = {"Metric": "TOTAL", "LLM Score (1-5)": "-", "NLP Subscore (0-1)": "-", "Fused (0-10)": total, "Comment": ""}
        df2.to_csv(pth, index=False)
        csv_paths.append(pth)

    # Comparison CSV
    comp_path = os.path.join(out_dir, f"comparison_{ts}.csv")
    compare_df.to_csv(comp_path, index=False)

    # JSON outputs
    json_path = os.path.join(out_dir, f"judgments_{ts}.json")
    with open(json_path, 'w') as f:
        json.dump(json_blobs, f, indent=2)

    # Make ZIP
    zip_path = os.path.join(out_dir, f"financeeval_{ts}.zip")
    with zipfile.ZipFile(zip_path, 'w', zipfile.ZIP_DEFLATED) as zf:
        for pth in csv_paths + [comp_path, json_path]:
            zf.write(pth, arcname=os.path.basename(pth))

    # Return artifacts
    merged_tables = []
    for label, df, total in all_tables:
        merged_tables.append(pd.DataFrame({"Metric": [f"— {label} —"], "LLM Score (1-5)": [""], "NLP Subscore (0-1)": [""], "Fused (0-10)": [""], "Comment": [""]}))
        merged_tables.append(df)
    merged_df = pd.concat(merged_tables, ignore_index=True)

    usage_text_all = "\n".join(token_usage_blocks)

    return merged_df, compare_df, (avg_df if avg_df is not None else pd.DataFrame()), zip_path, usage_text_all

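# NOTE: sketch of what core.fusion.weighted_total is assumed to do with the
# normalized weights built in run_eval: a plain weighted sum of the per-metric
# fused 0-10 scores. The implementation in core/fusion.py is authoritative;
# this reference helper is not called anywhere in this file.
def _weighted_total_sketch(fused_scores: dict, weights: dict) -> float:
    # Metrics missing a weight contribute nothing; weights are expected to sum to 1,
    # so the result stays on the 0-10 scale.
    return sum(score * weights.get(metric, 0.0) for metric, score in fused_scores.items())
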
# -----------------------------
# UI
# -----------------------------
with gr.Blocks(title="FinanceEval – Hybrid Judge (Gradio)") as demo:
    gr.Markdown("""
    # 🔎 FinanceEval – Hybrid Evaluation (Gradio / HF Spaces)
    Paste a finance conversation. Choose one or both judge models (OpenAI GPT‑4o, Claude 3.5 Sonnet). Adjust metric weights. Click **Evaluate**.
    """)

    with gr.Row():
        conversation = gr.Textbox(label="Conversation", lines=16, placeholder="Paste the full transcript here...")

    with gr.Accordion("Model Selection", open=True):
        with gr.Row():
            use_openai = gr.Checkbox(value=True, label="Use OpenAI GPT‑4o")
            use_anthropic = gr.Checkbox(value=False, label="Use Claude 3.5 Sonnet")
        with gr.Row():
            model_openai = gr.Textbox(value="gpt-4o", label="OpenAI model name")
            model_anthropic = gr.Textbox(value="claude-3-5-sonnet-20240620", label="Anthropic model name")
        gr.Markdown("**Secrets**: Set `OPENAI_API_KEY` and / or `ANTHROPIC_API_KEY` in your Space settings.")

    with gr.Accordion("Metric Weights (affect only the TOTAL)", open=True):
        with gr.Row():
            w_trust = gr.Slider(0, 1, value=DEFAULT_METRIC_WEIGHTS["trust"], step=0.01, label="Trust")
            w_accuracy = gr.Slider(0, 1, value=DEFAULT_METRIC_WEIGHTS["accuracy"], step=0.01, label="Accuracy")
            w_explain = gr.Slider(0, 1, value=DEFAULT_METRIC_WEIGHTS["explain"], step=0.01, label="Explainability")
        with gr.Row():
            w_client = gr.Slider(0, 1, value=DEFAULT_METRIC_WEIGHTS["client_first"], step=0.01, label="Client‑First")
            w_risk = gr.Slider(0, 1, value=DEFAULT_METRIC_WEIGHTS["risk_safety"], step=0.01, label="Risk Safety")
            w_clarity = gr.Slider(0, 1, value=DEFAULT_METRIC_WEIGHTS["clarity"], step=0.01, label="Clarity")
        gr.Markdown("Weights are normalized to sum to 1 before computing the TOTAL.")

    run_btn = gr.Button("Evaluate")

    with gr.Tab("Per‑Model Results"):
        table_out = gr.Dataframe(label="Metric Scores & Comments (stacked per model)")
    with gr.Tab("Comparison"):
        compare_out = gr.Dataframe(label="Model Comparison (per metric + TOTAL)")
        avg_out = gr.Dataframe(label="Average (if multiple models)")
    with gr.Tab("Downloads & Usage"):
        zip_file = gr.File(label="Download ZIP (CSVs + JSON)")
        usage_text = gr.Textbox(label="Token Usage", lines=4)

    run_btn.click(
        fn=run_eval,
        inputs=[conversation, use_openai, use_anthropic,
                w_trust, w_accuracy, w_explain, w_client, w_risk, w_clarity,
                model_openai, model_anthropic],
        outputs=[table_out, compare_out, avg_out, zip_file, usage_text]
    )

if __name__ == "__main__":
    demo.launch()