cyc00518 committed on
Commit
e33ee5a
·
1 Parent(s): 26d467f

feat: add Baseline Δ analysis settings and visualization

Browse files
Files changed (1) hide show
  1. app.py +263 -1
app.py CHANGED
@@ -106,12 +106,23 @@ with st.sidebar:
106
  df_all, meta_all = load_all(files)
107
  normalize_0_100 = st.checkbox("以 0–100 顯示", value=False)
108
  page_size = st.selectbox("每張圖顯示幾個類別", [10, 20, 30, 50, 100], index=1)
109
- sort_mode = st.selectbox("排序方式", ["依整體平均由高到低", "依整體平均由低到高", "依字母排序"])
 
 
 
 
 
 
 
 
 
 
110
 
111
  if df_all.empty:
112
  st.info("請上傳 Twinkle Eval 檔案")
113
  st.stop()
114
 
 
115
  all_datasets = sorted(df_all["dataset"].unique().tolist())
116
  selected_dataset = st.selectbox("選擇資料集", options=all_datasets)
117
  work = df_all[df_all["dataset"] == selected_dataset].copy()
@@ -132,6 +143,7 @@ work["category"] = pd.Categorical(work["category"], categories=cat_order, ordere
132
  n = len(cat_order)
133
  pages = int(np.ceil(n / page_size))
134
 
 
135
  for p in range(pages):
136
  start, end = p * page_size, min((p + 1) * page_size, n)
137
  subset_cats = cat_order[start:end]
@@ -153,3 +165,253 @@ for p in range(pages):
153
  file_name=f"twinkle_{selected_dataset}_{start+1}_{end}.csv",
154
  mime="text/csv"
155
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
106
  df_all, meta_all = load_all(files)
107
  normalize_0_100 = st.checkbox("以 0–100 顯示", value=False)
108
  page_size = st.selectbox("每張圖顯示幾個類別", [10, 20, 30, 50, 100], index=1)
109
+ sort_mode = st.selectbox("排序方式(原始成績)", ["依整體平均由高到低", "依整體平均由低到高", "依字母排序"])
110
+
111
+ # === Baseline Δ 圖表的控制 ===
112
+ st.markdown("---")
113
+ st.subheader("差距分析設定(Baseline Δ)")
114
+ options = ["|Δ| 由大到小", "Δ 由大到小(提升最多)", "Δ 由小到大(下降最多)", "依類別名稱"]
115
+ default = "Δ 由大到小(提升最多)"
116
+ delta_sort_mode = st.selectbox("差距排序方式(per-category)", options, index=options.index(default), key="delta_sort_mode")
117
+
118
+ abs_threshold = st.number_input("只顯示 |Δ| ≥ 門檻(可選)", min_value=0.0, value=0.0, step=0.1)
119
+ st.caption("Δ = Candidate 分數 − Baseline 分數;建議以 0–100 模式計算更直觀。")
120
 
121
  if df_all.empty:
122
  st.info("請上傳 Twinkle Eval 檔案")
123
  st.stop()
124
 
125
+ # ----------------- 原始成績-----------------
126
  all_datasets = sorted(df_all["dataset"].unique().tolist())
127
  selected_dataset = st.selectbox("選擇資料集", options=all_datasets)
128
  work = df_all[df_all["dataset"] == selected_dataset].copy()
 
143
  n = len(cat_order)
144
  pages = int(np.ceil(n / page_size))
145
 
146
+ st.markdown("## 📈 原始成績(各模型 × 類別)")
147
  for p in range(pages):
148
  start, end = p * page_size, min((p + 1) * page_size, n)
149
  subset_cats = cat_order[start:end]
 
165
  file_name=f"twinkle_{selected_dataset}_{start+1}_{end}.csv",
166
  mime="text/csv"
167
  )
168
+
169
+ # ----------------- 差距(Baseline Δ)分析 -----------------
170
+
171
+ st.markdown("---")
172
+ st.markdown("## ⚖️ 差距分析:Baseline vs. Candidates(Δ = Candidate − Baseline)")
173
+
174
+ # 使用與上方相同的資料集
175
+ dataset_for_delta = selected_dataset
176
+
177
+ df_delta_scope = df_all[df_all["dataset"] == dataset_for_delta].copy()
178
+ if df_delta_scope.empty:
179
+ st.warning(f"在資料集 **{dataset_for_delta}** 找不到資料,請確認上傳的 JSON 含此資料集名稱。")
180
+ try:
181
+ st.stop()
182
+ except Exception:
183
+ raise SystemExit
184
+
185
+ # 統一與上方尺度(建議用 0–100 再做差)
186
+ score_col = "score_0100"
187
+ df_delta_scope[score_col] = df_delta_scope["accuracy_mean"] * (100.0 if normalize_0_100 else 1.0)
188
+
189
+ # 手動指定 Baseline 與 Candidates
190
+ all_sources_in_scope = sorted(df_delta_scope["source_label"].unique().tolist())
191
+ col1, col2 = st.columns([1, 2])
192
+ with col1:
193
+ baseline = st.selectbox("選擇基準模型(Baseline)", options=all_sources_in_scope)
194
+ with col2:
195
+ default_candidates = [s for s in all_sources_in_scope if s != baseline]
196
+ candidates = st.multiselect("選擇要比較的候選模型(Candidates)", options=all_sources_in_scope, default=default_candidates)
197
+
198
+ if not candidates:
199
+ st.info("請至少選擇一個 Candidate。")
200
+ try:
201
+ st.stop()
202
+ except Exception:
203
+ raise SystemExit
204
+
205
+ # 建立寬表(index=category;已固定 dataset_for_delta)
206
+ wide = df_delta_scope.pivot_table(index="category", columns="source_label", values=score_col, aggfunc="mean")
207
+
208
+ # 只比較 baseline 與 candidates 的交集列
209
+ valid_candidates = [c for c in candidates if c in wide.columns]
210
+ if baseline not in wide.columns:
211
+ st.error("Baseline 在此資料集沒有任何分數可比。請換一個 Baseline 或資料集。")
212
+ try:
213
+ st.stop()
214
+ except Exception:
215
+ raise SystemExit
216
+ if not valid_candidates:
217
+ st.error("選取的 Candidates 在此資料集沒有任何分數可比。請換一組 Candidates 或資料集。")
218
+ try:
219
+ st.stop()
220
+ except Exception:
221
+ raise SystemExit
222
+
223
+ # 計算 Δ 長表(保留 baseline/candidate 原始分數)
224
+ delta_rows = []
225
+ for c in valid_candidates:
226
+ pair = wide[[baseline, c]].dropna() # 僅兩者皆有分數的類別
227
+ if pair.empty:
228
+ continue
229
+ for cat, row in pair.iterrows():
230
+ b = float(row[baseline])
231
+ s = float(row[c])
232
+ delta = s - b
233
+ if abs(delta) < abs_threshold: # 門檻過濾
234
+ continue
235
+ delta_rows.append({
236
+ "dataset": dataset_for_delta,
237
+ "category": cat,
238
+ "baseline": baseline,
239
+ "candidate": c,
240
+ "baseline_score": b,
241
+ "candidate_score": s,
242
+ "delta": delta
243
+ })
244
+
245
+ delta_df = pd.DataFrame(delta_rows)
246
+ if delta_df.empty:
247
+ st.warning("沒有符合條件的可比較類別(可能因缺漏或門檻過高)。")
248
+ try:
249
+ st.stop()
250
+ except Exception:
251
+ raise SystemExit
252
+
253
+ # 差距排序
254
+ if delta_sort_mode == "|Δ| 由大到小":
255
+ delta_df = delta_df.sort_values("delta", key=lambda s: s.abs(), ascending=False)
256
+ elif delta_sort_mode == "Δ 由大到小(提升最多)":
257
+ delta_df = delta_df.sort_values("delta", ascending=False)
258
+ elif delta_sort_mode == "Δ 由小到大(下降最多)":
259
+ delta_df = delta_df.sort_values("delta", ascending=True)
260
+ else:
261
+ delta_df = delta_df.sort_values("category", ascending=True)
262
+
263
+ # 圖表(Δ 不分頁,一次顯示全部類別)
264
+ tab1, tab2 = st.tabs(["📊 差距排行(per-category)", "📜 模型總結(per-candidate)"])
265
+
266
+ with tab1:
267
+ sub = delta_df.copy()
268
+
269
+ # === 先在 Pandas 內算出每個 candidate 的排序名次 ===
270
+ if delta_sort_mode == "Δ 由大到小(提升最多)":
271
+ sub["rank_in_candidate"] = sub.groupby("candidate")["delta"].rank(ascending=False, method="first")
272
+ table_sort = lambda df: df.sort_values(["candidate", "rank_in_candidate"], ascending=[True, True])
273
+ y_sort = alt.SortField("rank_in_candidate", order="ascending")
274
+ resolve_y = "independent"
275
+
276
+ elif delta_sort_mode == "Δ 由小到大(下降最多)":
277
+ sub["rank_in_candidate"] = sub.groupby("candidate")["delta"].rank(ascending=True, method="first")
278
+ table_sort = lambda df: df.sort_values(["candidate", "rank_in_candidate"], ascending=[True, True])
279
+ y_sort = alt.SortField("rank_in_candidate", order="ascending")
280
+ resolve_y = "independent"
281
+
282
+ elif delta_sort_mode == "|Δ| 由大到小":
283
+ sub["abs_delta"] = sub["delta"].abs()
284
+ sub["rank_in_candidate"] = sub.groupby("candidate")["abs_delta"].rank(ascending=False, method="first")
285
+ table_sort = lambda df: df.sort_values(["candidate", "rank_in_candidate"], ascending=[True, True])
286
+ y_sort = alt.SortField("rank_in_candidate", order="ascending")
287
+ resolve_y = "independent"
288
+
289
+ else: # 依類別名稱(字母序),共用排序
290
+ # 不用 rank,直接字母序
291
+ table_sort = lambda df: df.sort_values(["category", "candidate"], ascending=[True, True])
292
+ y_sort = alt.SortField("category", order="ascending")
293
+ resolve_y = "shared"
294
+
295
+ st.subheader(f"🔎 {dataset_for_delta}|Δ 排行(全部 {sub['category'].nunique()} 類別)")
296
+
297
+ chart_height = 25 * max(1, sub["category"].nunique())
298
+
299
+ base = alt.Chart(sub).encode(
300
+ y=alt.Y("category:N", sort=y_sort, title="Category"),
301
+ x=alt.X("delta:Q", title="Δ = Candidate − Baseline"),
302
+ color=alt.Color("candidate:N", title="Candidate"),
303
+ tooltip=[
304
+ alt.Tooltip("category:N", title="Category"),
305
+ alt.Tooltip("candidate:N", title="Candidate"),
306
+ alt.Tooltip("baseline:N", title="Baseline"),
307
+ alt.Tooltip("baseline_score:Q", title="Baseline 分數", format=".3f"),
308
+ alt.Tooltip("candidate_score:Q", title="Candidate 分數", format=".3f"),
309
+ alt.Tooltip("delta:Q", title="Δ", format=".3f"),
310
+ ],
311
+ )
312
+
313
+ chart = (
314
+ base.mark_bar()
315
+ .encode(row=alt.Row("candidate:N", header=alt.Header(title=None)))
316
+ .properties(height=chart_height)
317
+ .resolve_scale(y=resolve_y) # 各 candidate 分面各自排序或共用
318
+ )
319
+ st.altair_chart(chart, use_container_width=True)
320
+
321
+ # 表格:依 rank_in_candidate 排序,與圖一致
322
+ table = table_sort(sub)[["category", "candidate", "baseline_score", "candidate_score", "delta"]]
323
+ st.dataframe(table, use_container_width=True)
324
+
325
+ st.download_button(
326
+ label="下載 Δ 排行 CSV(全部類別)",
327
+ data=table.to_csv(index=False).encode("utf-8"),
328
+ file_name=f"delta_{dataset_for_delta}_ALL.csv",
329
+ mime="text/csv",
330
+ )
331
+
332
+
333
+
334
+ with tab2:
335
+ # per-candidate 總結:mean/median Δ、win/lose/tie、覆蓋率、Top/Bottom-N
336
+ summaries = []
337
+ top_k = st.number_input("Top/Bottom-N(顯示每個 Candidate 的最大/最小差距分類)", min_value=1, value=10, step=1)
338
+
339
+ for c in valid_candidates:
340
+ pair = wide[[baseline, c]].dropna()
341
+ if pair.empty:
342
+ continue
343
+ deltas = pair[c] - pair[baseline]
344
+ m = float(np.mean(deltas))
345
+ med = float(np.median(deltas))
346
+ win = int((deltas > 0).sum())
347
+ lose = int((deltas < 0).sum())
348
+ tie = int((deltas == 0).sum())
349
+ coverage = f"{len(deltas)}/{wide.shape[0]}" # 有共同分數的類別數 / 全部類別數
350
+
351
+ # 取 Top/Bottom-N 類別(按 Δ)
352
+ top_rows = (pair.assign(delta=deltas)
353
+ .sort_values("delta", ascending=False)
354
+ .head(top_k)
355
+ .reset_index()[["category", baseline, c, "delta"]])
356
+ bottom_rows = (pair.assign(delta=deltas)
357
+ .sort_values("delta", ascending=True)
358
+ .head(top_k)
359
+ .reset_index()[["category", baseline, c, "delta"]])
360
+
361
+ summaries.append({
362
+ "candidate": c,
363
+ "mean_delta": m,
364
+ "median_delta": med,
365
+ "win": win,
366
+ "lose": lose,
367
+ "tie": tie,
368
+ "coverage": coverage,
369
+ "top_list": top_rows,
370
+ "bottom_list": bottom_rows
371
+ })
372
+
373
+ if not summaries:
374
+ st.warning("沒有可用的 per-candidate 總結(可能都沒有交集)。")
375
+ else:
376
+ # 概覽表
377
+ overview = pd.DataFrame([{
378
+ "Candidate": s["candidate"],
379
+ "Mean Δ": s["mean_delta"],
380
+ "Median Δ": s["median_delta"],
381
+ "Win": s["win"],
382
+ "Lose": s["lose"],
383
+ "Tie": s["tie"],
384
+ "Coverage (交集/總類別)": s["coverage"],
385
+ } for s in summaries]).sort_values("Mean Δ", ascending=False)
386
+ st.markdown("### 總覽(與 Baseline 成對比較)")
387
+ st.dataframe(overview, use_container_width=True)
388
+ st.download_button(
389
+ label="下載 per-candidate 總覽 CSV",
390
+ data=overview.to_csv(index=False).encode("utf-8"),
391
+ file_name=f"delta_overview_{dataset_for_delta}.csv",
392
+ mime="text/csv"
393
+ )
394
+
395
+ # 逐 Candidate 顯示 Top/Bottom-N 清單(可收合)
396
+ st.markdown("### 各 Candidate 的差距清單(Top/Bottom-N)")
397
+ for s in summaries:
398
+ with st.expander(f"🔸 {s['candidate']}"):
399
+ st.write("**Top-N(提升最多)**")
400
+ top_tbl = s["top_list"].rename(columns={baseline: "baseline_score", s["candidate"]: "candidate_score"})
401
+ st.dataframe(top_tbl, use_container_width=True)
402
+ st.download_button(
403
+ label=f"下載 {s['candidate']} Top-N",
404
+ data=top_tbl.to_csv(index=False).encode("utf-8"),
405
+ file_name=f"delta_top_{dataset_for_delta}_{s['candidate']}.csv",
406
+ mime="text/csv"
407
+ )
408
+
409
+ st.write("**Bottom-N(下降最多)**")
410
+ bottom_tbl = s["bottom_list"].rename(columns={baseline: "baseline_score", s["candidate"]: "candidate_score"})
411
+ st.dataframe(bottom_tbl, use_container_width=True)
412
+ st.download_button(
413
+ label=f"下載 {s['candidate']} Bottom-N",
414
+ data=bottom_tbl.to_csv(index=False).encode("utf-8"),
415
+ file_name=f"delta_bottom_{dataset_for_delta}_{s['candidate']}.csv",
416
+ mime="text/csv"
417
+ )