File size: 7,589 Bytes
f971355
239406b
524721c
2c49fd3
 
 
 
 
 
 
 
 
5e1d6a9
 
 
2c49fd3
 
 
 
 
 
 
 
 
5e1d6a9
 
 
2c49fd3
 
 
 
f971355
239406b
 
 
 
 
524721c
 
 
 
 
 
 
f971355
239406b
 
 
 
 
 
 
 
 
 
 
 
 
 
f971355
239406b
f971355
239406b
 
 
 
 
 
 
f971355
239406b
 
 
 
 
 
43aa499
 
f971355
239406b
 
f971355
 
 
 
 
239406b
 
43aa499
239406b
2c49fd3
239406b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f971355
239406b
524721c
 
239406b
 
5e1d6a9
 
 
43aa499
 
 
5e1d6a9
239406b
 
 
 
524721c
239406b
6ba4a5e
5e1d6a9
 
 
43aa499
 
 
5e1d6a9
239406b
 
 
 
524721c
239406b
 
 
 
 
 
2c49fd3
5e1d6a9
 
 
f971355
b1025b8
 
f971355
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2c49fd3
 
f971355
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
# app.py 

import os, glob, json, zipfile, traceback
import gradio as gr
import pandas as pd
from datetime import datetime, timezone

from core.providers import get_provider, ProviderKind
from core.preprocess import normalize_conversation, extract_model_utterances
from core.evaluators import evaluate_all_metrics
from core.fusion import weighted_total

# -----------------------------
# Defaults
# -----------------------------
DEFAULT_METRIC_WEIGHTS = {
    "trust": 0.20,
    "accuracy": 0.25,
    "explain": 0.15,
    "client_first": 0.15,
    "risk_safety": 0.15,
    "clarity": 0.10,
}

# -----------------------------
# Core runner
# -----------------------------
def run_eval(conversation: str,
             use_openai: bool,
             use_anthropic: bool,
             w_trust: float, w_accuracy: float, w_explain: float,
             w_client: float, w_risk: float, w_clarity: float):

    try:
        if not conversation or conversation.strip() == "":
            return None, None, None, None, "❌ Please paste a conversation to evaluate."

        # cleanup old ZIPs
        for f in glob.glob("/tmp/financeeval_*.zip"):
            try:
                os.remove(f)
            except Exception:
                pass

        # normalize weights from sliders
        user_weights = {
            "trust": w_trust, "accuracy": w_accuracy, "explain": w_explain,
            "client_first": w_client, "risk_safety": w_risk, "clarity": w_clarity
        }
        s = sum(user_weights.values()) or 1.0
        for k in user_weights:
            user_weights[k] = user_weights[k] / s

        # preprocess
        norm = normalize_conversation(conversation)
        model_only = extract_model_utterances(norm)

        providers = []
        if use_openai:
            providers.append(get_provider(ProviderKind.OPENAI, "gpt-4o"))
        if use_anthropic:
            providers.append(get_provider(ProviderKind.ANTHROPIC, "claude-3-5-sonnet-20240620"))
        if not providers:
            return None, None, None, None, "❌ Select at least one model provider."

        all_tables, compare_rows, token_usage_blocks, json_blobs = [], [], [], {}

        for p in providers:
            metrics_out, usage, raw_json = evaluate_all_metrics(
                provider=p, conversation_text=model_only, alpha_map={}
            )
            rows = []
            for m, payload in metrics_out.items():
                rows.append({
                    "Metric": m,
                    "LLM Score (1-5)": payload.get("judge_score", None),
                    "Final Score (0-10)": round(payload.get("score_0_10", 0.0), 2),
                    "Comment": payload.get("comment", ""),
                    "NLP Flags": json.dumps(payload.get("nlp_details", {}))[:200]
                })
            df = pd.DataFrame(rows)

            # total score with weight sliders
            total = weighted_total({k: v.get("score_0_10", 0.0) for k, v in metrics_out.items()},
                                   user_weights)

            compare_rows.append({
                "Model": p.label,
                **{r["Metric"]: r["Final Score (0-10)"] for _, r in df.iterrows()},
                "Total (0-10)": round(total, 2)
            })
            token_usage_blocks.append(
                f"{p.label}: prompt={usage.get('prompt',0)}, completion={usage.get('completion',0)}, total={usage.get('total',0)}"
            )
            json_blobs[p.label] = raw_json
            all_tables.append((p.label, df, round(total, 2)))

        compare_df = pd.DataFrame(compare_rows)
        avg_df = None
        if len(providers) > 1:
            num_cols = [c for c in compare_df.columns if c != "Model"]
            avg_row = {"Model": "Average"}
            for c in num_cols:
                avg_row[c] = round(compare_df[c].mean(), 2)
            avg_df = pd.DataFrame([avg_row])

        # ---- Write ZIP into /tmp ----
        ts = datetime.now(timezone.utc).strftime("%Y%m%dT%H%M%SZ")
        zip_path = f"/tmp/financeeval_{ts}.zip"
        with zipfile.ZipFile(zip_path, "w", zipfile.ZIP_DEFLATED) as zf:
            for label, df, total in all_tables:
                df2 = df.copy()
                df2.loc[len(df2)] = {
                    "Metric": "TOTAL",
                    "LLM Score (1-5)": "-",
                    "Final Score (0-10)": total,
                    "Comment": "",
                    "NLP Flags": ""
                }
                zf.writestr(f"results_{label}_{ts}.csv", df2.to_csv(index=False).encode("utf-8"))
            zf.writestr(f"comparison_{ts}.csv", compare_df.to_csv(index=False).encode("utf-8"))
            zf.writestr(f"judgments_{ts}.json", json.dumps(json_blobs, indent=2).encode("utf-8"))

        # merge tables for UI
        merged_tables = []
        for label, df, total in all_tables:
            merged_tables.append(pd.DataFrame({
                "Metric": [f"β€” {label} β€”"],
                "LLM Score (1-5)": [""],
                "Final Score (0-10)": [""],
                "Comment": [""],
                "NLP Flags": [""]
            }))
            merged_tables.append(df)
        merged_df = pd.concat(merged_tables, ignore_index=True)
        usage_text_all = "\n".join(token_usage_blocks)

        return merged_df, compare_df, (avg_df if avg_df is not None else pd.DataFrame()), zip_path, usage_text_all

    except Exception as e:
        tb = traceback.format_exc()
        error_text = f"❌ Error: {str(e)}\n\nTraceback:\n{tb}"
        return None, None, None, None, error_text


# -----------------------------
# Gradio UI
# -----------------------------
def create_demo():
    """Build and return the Gradio Blocks UI for FinanceEval.

    Wires the conversation textbox, the two provider checkboxes, and one
    weight slider per metric into ``run_eval``; results land in three tabs
    (per-model table, comparison, downloads/usage).
    """
    with gr.Blocks(title="FinanceEval – HF Spaces") as demo:
        gr.Markdown("# 🔎 FinanceEval")

        conversation = gr.Textbox(label="Conversation", lines=16, placeholder="Paste transcript here...")
        with gr.Accordion("Model Selection", open=True):
            use_openai = gr.Checkbox(value=True, label="Use OpenAI GPT-4o")
            use_anthropic = gr.Checkbox(value=False, label="Use Claude 3.5 Sonnet")

        # Drive slider creation from a (metric_key, label) table so the
        # defaults always come straight from DEFAULT_METRIC_WEIGHTS.
        slider_specs = [
            ("trust", "Trust"),
            ("accuracy", "Accuracy"),
            ("explain", "Explainability"),
            ("client_first", "Client-First"),
            ("risk_safety", "Risk Safety"),
            ("clarity", "Clarity"),
        ]
        with gr.Accordion("Metric Weights", open=True):
            weight_sliders = [
                gr.Slider(0, 1, value=DEFAULT_METRIC_WEIGHTS[key], step=0.01, label=label)
                for key, label in slider_specs
            ]

        run_btn = gr.Button("Evaluate")
        with gr.Tab("Per-Model Results"):
            table_out = gr.Dataframe()
        with gr.Tab("Comparison"):
            compare_out = gr.Dataframe()
            avg_out = gr.Dataframe()
        with gr.Tab("Downloads & Usage"):
            zip_file = gr.File(label="Download ZIP (CSVs + JSON)", type="filepath")
            usage_text = gr.Textbox(label="Token Usage / Errors", lines=8)

        # Slider order matches run_eval's w_* parameter order exactly.
        run_btn.click(
            fn=run_eval,
            inputs=[conversation, use_openai, use_anthropic, *weight_sliders],
            outputs=[table_out, compare_out, avg_out, zip_file, usage_text],
        )
    return demo


if __name__ == "__main__":
    # Script entry point: build the UI and start the Gradio server.
    create_demo().launch()