Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
|
@@ -1,4 +1,4 @@
|
|
| 1 |
-
# app.py (
|
| 2 |
|
| 3 |
import os, glob, json, zipfile, traceback
|
| 4 |
import gradio as gr
|
|
@@ -22,15 +22,6 @@ DEFAULT_METRIC_WEIGHTS = {
|
|
| 22 |
"clarity": 0.10,
|
| 23 |
}
|
| 24 |
|
| 25 |
-
JUDGE_ALPHA = {
|
| 26 |
-
"trust": 0.70,
|
| 27 |
-
"accuracy": 0.65,
|
| 28 |
-
"explain": 0.50,
|
| 29 |
-
"client_first": 0.70,
|
| 30 |
-
"risk_safety": 0.60,
|
| 31 |
-
"clarity": 0.70,
|
| 32 |
-
}
|
| 33 |
-
|
| 34 |
# -----------------------------
|
| 35 |
# Core runner
|
| 36 |
# -----------------------------
|
|
@@ -78,25 +69,22 @@ def run_eval(conversation: str,
|
|
| 78 |
|
| 79 |
for p in providers:
|
| 80 |
metrics_out, usage, raw_json = evaluate_all_metrics(
|
| 81 |
-
provider=p, conversation_text=model_only, alpha_map=
|
| 82 |
)
|
| 83 |
rows = []
|
| 84 |
for m, payload in metrics_out.items():
|
| 85 |
rows.append({
|
| 86 |
"Metric": m,
|
| 87 |
"LLM Score (1-5)": payload.get("judge_score", None),
|
| 88 |
-
"
|
| 89 |
-
"
|
| 90 |
-
"
|
| 91 |
})
|
| 92 |
df = pd.DataFrame(rows)
|
| 93 |
-
total = weighted_total(
|
| 94 |
-
{k: v.get("fused_0_10", 0.0) for k, v in metrics_out.items()},
|
| 95 |
-
user_weights
|
| 96 |
-
)
|
| 97 |
compare_rows.append({
|
| 98 |
"Model": p.label,
|
| 99 |
-
**{r["Metric"]: r["
|
| 100 |
"Total (0-10)": round(total, 2)
|
| 101 |
})
|
| 102 |
token_usage_blocks.append(
|
|
@@ -123,9 +111,9 @@ def run_eval(conversation: str,
|
|
| 123 |
df2.loc[len(df2)] = {
|
| 124 |
"Metric": "TOTAL",
|
| 125 |
"LLM Score (1-5)": "-",
|
| 126 |
-
"
|
| 127 |
-
"
|
| 128 |
-
"
|
| 129 |
}
|
| 130 |
zf.writestr(f"results_{label}_{ts}.csv", df2.to_csv(index=False).encode("utf-8"))
|
| 131 |
zf.writestr(f"comparison_{ts}.csv", compare_df.to_csv(index=False).encode("utf-8"))
|
|
@@ -137,9 +125,9 @@ def run_eval(conversation: str,
|
|
| 137 |
merged_tables.append(pd.DataFrame({
|
| 138 |
"Metric": [f"— {label} —"],
|
| 139 |
"LLM Score (1-5)": [""],
|
| 140 |
-
"
|
| 141 |
-
"
|
| 142 |
-
"
|
| 143 |
}))
|
| 144 |
merged_tables.append(df)
|
| 145 |
merged_df = pd.concat(merged_tables, ignore_index=True)
|
|
@@ -181,7 +169,6 @@ with gr.Blocks(title="FinanceEval — Hybrid Judge (Gradio)") as demo:
|
|
| 181 |
compare_out = gr.Dataframe()
|
| 182 |
avg_out = gr.Dataframe()
|
| 183 |
with gr.Tab("Downloads & Usage"):
|
| 184 |
-
# ✅ Fixed: type must be 'filepath' not 'file'
|
| 185 |
zip_file = gr.File(label="Download ZIP (CSVs + JSON)", type="filepath")
|
| 186 |
usage_text = gr.Textbox(label="Token Usage / Errors", lines=8)
|
| 187 |
|
|
@@ -194,4 +181,4 @@ with gr.Blocks(title="FinanceEval β Hybrid Judge (Gradio)") as demo:
|
|
| 194 |
)
|
| 195 |
|
| 196 |
if __name__ == "__main__":
|
| 197 |
-
demo.launch(
|
|
|
|
| 1 |
+
# app.py (LLM-only scoring, NLP as flags only, privacy-safe temp files)
|
| 2 |
|
| 3 |
import os, glob, json, zipfile, traceback
|
| 4 |
import gradio as gr
|
|
|
|
| 22 |
"clarity": 0.10,
|
| 23 |
}
|
| 24 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 25 |
# -----------------------------
|
| 26 |
# Core runner
|
| 27 |
# -----------------------------
|
|
|
|
| 69 |
|
| 70 |
for p in providers:
|
| 71 |
metrics_out, usage, raw_json = evaluate_all_metrics(
|
| 72 |
+
provider=p, conversation_text=model_only, alpha_map={} # alpha_map ignored now
|
| 73 |
)
|
| 74 |
rows = []
|
| 75 |
for m, payload in metrics_out.items():
|
| 76 |
rows.append({
|
| 77 |
"Metric": m,
|
| 78 |
"LLM Score (1-5)": payload.get("judge_score", None),
|
| 79 |
+
"Final Score (0-10)": round(payload.get("score_0_10", 0.0), 2),
|
| 80 |
+
"Comment": payload.get("comment", ""),
|
| 81 |
+
"NLP Flags": json.dumps(payload.get("nlp_details", {}))[:200] # truncated
|
| 82 |
})
|
| 83 |
df = pd.DataFrame(rows)
|
| 84 |
+
total = weighted_total({k: v.get("score_0_10", 0.0) for k, v in metrics_out.items()}, user_weights)
|
|
|
|
|
|
|
|
|
|
| 85 |
compare_rows.append({
|
| 86 |
"Model": p.label,
|
| 87 |
+
**{r["Metric"]: r["Final Score (0-10)"] for _, r in df.iterrows()},
|
| 88 |
"Total (0-10)": round(total, 2)
|
| 89 |
})
|
| 90 |
token_usage_blocks.append(
|
|
|
|
| 111 |
df2.loc[len(df2)] = {
|
| 112 |
"Metric": "TOTAL",
|
| 113 |
"LLM Score (1-5)": "-",
|
| 114 |
+
"Final Score (0-10)": total,
|
| 115 |
+
"Comment": "",
|
| 116 |
+
"NLP Flags": ""
|
| 117 |
}
|
| 118 |
zf.writestr(f"results_{label}_{ts}.csv", df2.to_csv(index=False).encode("utf-8"))
|
| 119 |
zf.writestr(f"comparison_{ts}.csv", compare_df.to_csv(index=False).encode("utf-8"))
|
|
|
|
| 125 |
merged_tables.append(pd.DataFrame({
|
| 126 |
"Metric": [f"— {label} —"],
|
| 127 |
"LLM Score (1-5)": [""],
|
| 128 |
+
"Final Score (0-10)": [""],
|
| 129 |
+
"Comment": [""],
|
| 130 |
+
"NLP Flags": [""]
|
| 131 |
}))
|
| 132 |
merged_tables.append(df)
|
| 133 |
merged_df = pd.concat(merged_tables, ignore_index=True)
|
|
|
|
| 169 |
compare_out = gr.Dataframe()
|
| 170 |
avg_out = gr.Dataframe()
|
| 171 |
with gr.Tab("Downloads & Usage"):
|
|
|
|
| 172 |
zip_file = gr.File(label="Download ZIP (CSVs + JSON)", type="filepath")
|
| 173 |
usage_text = gr.Textbox(label="Token Usage / Errors", lines=8)
|
| 174 |
|
|
|
|
| 181 |
)
|
| 182 |
|
| 183 |
if __name__ == "__main__":
|
| 184 |
+
demo.launch()
|