navaneethkrishnan committed on
Commit f971355 · verified · 1 Parent(s): 43aa499

Update app.py "Model Bugs Fixed"

Files changed (1)
  1. app.py +50 -47
app.py CHANGED
@@ -1,4 +1,4 @@
-# app.py (LLM-only scoring, NLP as flags only, privacy-safe temp files)
+# app.py
 
 import os, glob, json, zipfile, traceback
 import gradio as gr
@@ -29,9 +29,7 @@ def run_eval(conversation: str,
              use_openai: bool,
              use_anthropic: bool,
              w_trust: float, w_accuracy: float, w_explain: float,
-             w_client: float, w_risk: float, w_clarity: float,
-             model_openai: str = "gpt-4o",
-             model_anthropic: str = "claude-3-5-sonnet-20240620"):
+             w_client: float, w_risk: float, w_clarity: float):
 
     try:
         if not conversation or conversation.strip() == "":
@@ -44,7 +42,7 @@ def run_eval(conversation: str,
         except Exception:
             pass
 
-        # normalize weights
+        # normalize weights from sliders
         user_weights = {
             "trust": w_trust, "accuracy": w_accuracy, "explain": w_explain,
             "client_first": w_client, "risk_safety": w_risk, "clarity": w_clarity
@@ -59,9 +57,9 @@ def run_eval(conversation: str,
 
         providers = []
         if use_openai:
-            providers.append(get_provider(ProviderKind.OPENAI, model_openai))
+            providers.append(get_provider(ProviderKind.OPENAI, "gpt-4o"))
         if use_anthropic:
-            providers.append(get_provider(ProviderKind.ANTHROPIC, model_anthropic))
+            providers.append(get_provider(ProviderKind.ANTHROPIC, "claude-3-5-sonnet-20240620"))
         if not providers:
            return None, None, None, None, "❌ Select at least one model provider."
 
@@ -69,7 +67,7 @@ def run_eval(conversation: str,
 
        for p in providers:
            metrics_out, usage, raw_json = evaluate_all_metrics(
-                provider=p, conversation_text=model_only, alpha_map={} # alpha_map ignored now
+                provider=p, conversation_text=model_only, alpha_map={}
            )
            rows = []
            for m, payload in metrics_out.items():
@@ -78,10 +76,14 @@ def run_eval(conversation: str,
                    "LLM Score (1-5)": payload.get("judge_score", None),
                    "Final Score (0-10)": round(payload.get("score_0_10", 0.0), 2),
                    "Comment": payload.get("comment", ""),
-                    "NLP Flags": json.dumps(payload.get("nlp_details", {}))[:200] # truncated
+                    "NLP Flags": json.dumps(payload.get("nlp_details", {}))[:200]
                })
            df = pd.DataFrame(rows)
-            total = weighted_total({k: v.get("score_0_10", 0.0) for k, v in metrics_out.items()}, user_weights)
+
+            # total score with weight sliders
+            total = weighted_total({k: v.get("score_0_10", 0.0) for k, v in metrics_out.items()},
+                                   user_weights)
+
            compare_rows.append({
                "Model": p.label,
                **{r["Metric"]: r["Final Score (0-10)"] for _, r in df.iterrows()},
@@ -102,7 +104,7 @@ def run_eval(conversation: str,
            avg_row[c] = round(compare_df[c].mean(), 2)
        avg_df = pd.DataFrame([avg_row])
 
-        # ---- Write ZIP into /tmp (ephemeral, privacy-safe) ----
+        # ---- Write ZIP into /tmp ----
        ts = datetime.now(timezone.utc).strftime("%Y%m%dT%H%M%SZ")
        zip_path = f"/tmp/financeeval_{ts}.zip"
        with zipfile.ZipFile(zip_path, "w", zipfile.ZIP_DEFLATED) as zf:
@@ -144,41 +146,42 @@ def run_eval(conversation: str,
 # -----------------------------
 # Gradio UI
 # -----------------------------
-with gr.Blocks(title="FinanceEval – Hybrid Judge (Gradio)") as demo:
-    gr.Markdown("# 🔎 FinanceEval – Hybrid Evaluation (Gradio / HF Spaces)")
-
-    conversation = gr.Textbox(label="Conversation", lines=16, placeholder="Paste transcript here...")
-    with gr.Accordion("Model Selection", open=True):
-        use_openai = gr.Checkbox(value=True, label="Use OpenAI GPT-4o")
-        use_anthropic = gr.Checkbox(value=False, label="Use Claude 3.5 Sonnet")
-        model_openai = gr.Textbox(value="gpt-4o", label="OpenAI model name")
-        model_anthropic = gr.Textbox(value="claude-3-5-sonnet-20240620", label="Anthropic model name")
-
-    with gr.Accordion("Metric Weights", open=True):
-        w_trust = gr.Slider(0,1,value=DEFAULT_METRIC_WEIGHTS["trust"],step=0.01,label="Trust")
-        w_accuracy = gr.Slider(0,1,value=DEFAULT_METRIC_WEIGHTS["accuracy"],step=0.01,label="Accuracy")
-        w_explain = gr.Slider(0,1,value=DEFAULT_METRIC_WEIGHTS["explain"],step=0.01,label="Explainability")
-        w_client = gr.Slider(0,1,value=DEFAULT_METRIC_WEIGHTS["client_first"],step=0.01,label="Client-First")
-        w_risk = gr.Slider(0,1,value=DEFAULT_METRIC_WEIGHTS["risk_safety"],step=0.01,label="Risk Safety")
-        w_clarity = gr.Slider(0,1,value=DEFAULT_METRIC_WEIGHTS["clarity"],step=0.01,label="Clarity")
-
-    run_btn = gr.Button("Evaluate")
-    with gr.Tab("Per-Model Results"):
-        table_out = gr.Dataframe()
-    with gr.Tab("Comparison"):
-        compare_out = gr.Dataframe()
-        avg_out = gr.Dataframe()
-    with gr.Tab("Downloads & Usage"):
-        zip_file = gr.File(label="Download ZIP (CSVs + JSON)", type="filepath")
-        usage_text = gr.Textbox(label="Token Usage / Errors", lines=8)
-
-    run_btn.click(
-        fn=run_eval,
-        inputs=[conversation, use_openai, use_anthropic,
-                w_trust, w_accuracy, w_explain, w_client, w_risk, w_clarity,
-                model_openai, model_anthropic],
-        outputs=[table_out, compare_out, avg_out, zip_file, usage_text]
-    )
+def create_demo():
+    with gr.Blocks(title="FinanceEval – Localhost") as demo:
+        gr.Markdown("# 🔎 FinanceEval – Localhost Evaluation")
+
+        conversation = gr.Textbox(label="Conversation", lines=16, placeholder="Paste transcript here...")
+        with gr.Accordion("Model Selection", open=True):
+            use_openai = gr.Checkbox(value=True, label="Use OpenAI GPT-4o")
+            use_anthropic = gr.Checkbox(value=False, label="Use Claude 3.5 Sonnet")
+
+        with gr.Accordion("Metric Weights", open=True):
+            w_trust = gr.Slider(0,1,value=DEFAULT_METRIC_WEIGHTS["trust"],step=0.01,label="Trust")
+            w_accuracy = gr.Slider(0,1,value=DEFAULT_METRIC_WEIGHTS["accuracy"],step=0.01,label="Accuracy")
+            w_explain = gr.Slider(0,1,value=DEFAULT_METRIC_WEIGHTS["explain"],step=0.01,label="Explainability")
+            w_client = gr.Slider(0,1,value=DEFAULT_METRIC_WEIGHTS["client_first"],step=0.01,label="Client-First")
+            w_risk = gr.Slider(0,1,value=DEFAULT_METRIC_WEIGHTS["risk_safety"],step=0.01,label="Risk Safety")
+            w_clarity = gr.Slider(0,1,value=DEFAULT_METRIC_WEIGHTS["clarity"],step=0.01,label="Clarity")
+
+        run_btn = gr.Button("Evaluate")
+        with gr.Tab("Per-Model Results"):
+            table_out = gr.Dataframe()
+        with gr.Tab("Comparison"):
+            compare_out = gr.Dataframe()
+            avg_out = gr.Dataframe()
+        with gr.Tab("Downloads & Usage"):
+            zip_file = gr.File(label="Download ZIP (CSVs + JSON)", type="filepath")
+            usage_text = gr.Textbox(label="Token Usage / Errors", lines=8)
+
+        run_btn.click(
+            fn=run_eval,
+            inputs=[conversation, use_openai, use_anthropic,
+                    w_trust, w_accuracy, w_explain, w_client, w_risk, w_clarity],
+            outputs=[table_out, compare_out, avg_out, zip_file, usage_text]
        )
+    return demo
+
 
 if __name__ == "__main__":
-    demo.launch()
+    demo = create_demo()
+    demo.launch()
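
For context on the scoring change above: weighted_total is defined outside app.py and is not touched by this commit, so the snippet below is only a hedged sketch of how the six slider weights plausibly combine the per-metric 0-10 scores (normalize the weights, then take a weighted average). The function body and the example numbers are assumptions, not the repository's actual implementation.

# Hypothetical sketch of weighted_total(); the real helper lives in another module.
def weighted_total(scores_0_10: dict, weights: dict) -> float:
    """Combine per-metric 0-10 scores into one 0-10 total using the slider weights."""
    total_weight = sum(weights.values()) or 1.0   # guard against all-zero sliders
    normalized = {k: w / total_weight for k, w in weights.items()}
    return round(sum(scores_0_10.get(k, 0.0) * normalized[k] for k in weights), 2)

scores = {"trust": 8.0, "accuracy": 7.5, "explain": 6.0,
          "client_first": 9.0, "risk_safety": 7.0, "clarity": 8.5}
weights = {k: 0.5 for k in scores}   # equal sliders reduce to a plain average
print(weighted_total(scores, weights))   # -> 7.67

Because the weights are normalized, only their relative values matter; doubling every slider leaves the total unchanged.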
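
The new create_demo() wrapper also means the UI is built on demand instead of at import time, so it can be driven from another script or a test. A minimal usage sketch, assuming app.py is on the import path; the server_name/server_port values are illustrative and not part of the commit:

# Hypothetical local launch using the new create_demo() entry point.
from app import create_demo

demo = create_demo()
# launch() takes standard Gradio options; these particular values are examples only.
demo.launch(server_name="127.0.0.1", server_port=7860)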