Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
|
@@ -391,16 +391,105 @@ def _build_detail_view(results, tasks):
|
|
| 391 |
return CSS + items
|
| 392 |
|
| 393 |
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 394 |
-
# PART 10:
|
| 395 |
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 396 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 397 |
def run_evaluation(api_key, eval_model, judge_model, pillar_filter, diff_filter,
|
| 398 |
-
max_tasks, fresh_start, progress=gr.Progress()):
|
|
|
|
| 399 |
api_key = api_key.strip() or os.getenv("FIREWORKS_API_KEY", "")
|
| 400 |
if not api_key:
|
| 401 |
yield "β API Keyλ₯Ό οΏ½οΏ½οΏ½λ ₯νμΈμ.", "", "", "", None
|
| 402 |
return
|
| 403 |
|
|
|
|
| 404 |
tasks = ALL_TASKS[:]
|
| 405 |
if pillar_filter != "μ 체":
|
| 406 |
tasks = [t for t in tasks if t.pillar == pillar_filter]
|
|
@@ -412,54 +501,100 @@ def run_evaluation(api_key, eval_model, judge_model, pillar_filter, diff_filter,
|
|
| 412 |
if fresh_start:
|
| 413 |
_clear_run(run_id)
|
| 414 |
|
|
|
|
| 415 |
results = dict(_load_all(run_id))
|
| 416 |
total = len(tasks)
|
| 417 |
-
|
| 418 |
-
|
| 419 |
-
if done > 0 and not fresh_start:
|
| 420 |
-
yield (f"πΎ 체ν¬ν¬μΈνΈ 볡μ: {done}/{total}. μ΄μ΄μ μ§ν.",
|
| 421 |
-
_build_progress_table(results, tasks), "", "", None)
|
| 422 |
-
time.sleep(0.5)
|
| 423 |
-
|
| 424 |
-
for i, task in enumerate(tasks):
|
| 425 |
-
if task.task_id in results:
|
| 426 |
-
continue
|
| 427 |
-
|
| 428 |
-
# Step 1: νΌνκ° λͺ¨λΈ νΈμΆ
|
| 429 |
-
progress((i + 0.3) / total, desc=f"[{i+1}/{total}] {task.task_id} λͺ¨λΈμλ΅...")
|
| 430 |
-
yield (f"π€ [{i+1}/{total}] {task.task_id} ({task.difficulty}) β λͺ¨λΈ μλ΅ λκΈ°...",
|
| 431 |
-
_build_progress_table(results, tasks), "", "", None)
|
| 432 |
|
| 433 |
-
|
|
|
|
| 434 |
|
| 435 |
-
|
| 436 |
-
|
| 437 |
-
_save_result(run_id, task.task_id, model_response, "{}", 0)
|
| 438 |
-
yield (f"β οΈ {task.task_id} API μ€λ₯ β λ€μ κ³Όμ λ‘.",
|
| 439 |
-
_build_progress_table(results, tasks), "", "", None)
|
| 440 |
-
continue
|
| 441 |
-
|
| 442 |
-
# Step 2: Judge μ±μ
|
| 443 |
-
progress((i + 0.7) / total, desc=f"[{i+1}/{total}] {task.task_id} μ±μ ...")
|
| 444 |
-
yield (f"βοΈ [{i+1}/{total}] {task.task_id} β Judge μ±μ μ€...",
|
| 445 |
_build_progress_table(results, tasks), "", "", None)
|
|
|
|
| 446 |
|
| 447 |
-
|
| 448 |
-
|
| 449 |
-
|
| 450 |
-
|
| 451 |
-
|
| 452 |
-
|
| 453 |
-
|
| 454 |
-
|
| 455 |
-
|
| 456 |
-
|
| 457 |
-
|
| 458 |
-
|
| 459 |
-
|
| 460 |
-
|
|
|
|
|
|
|
| 461 |
|
| 462 |
-
# ββ
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 463 |
progress(1.0, desc="μλ£!")
|
| 464 |
|
| 465 |
pillar_scores = {}
|
|
@@ -470,17 +605,22 @@ def run_evaluation(api_key, eval_model, judge_model, pillar_filter, diff_filter,
|
|
| 470 |
aether = calculate_aether_score(pillar_scores)
|
| 471 |
|
| 472 |
csv_str = generate_csv(results, eval_model)
|
| 473 |
-
csv_path = f"/tmp/aether_eval_{
|
| 474 |
with open(csv_path, "w", encoding="utf-8") as f:
|
| 475 |
f.write(csv_str)
|
| 476 |
|
| 477 |
hf_status = upload_to_hf(csv_str, eval_model)
|
| 478 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 479 |
summary = _build_final_summary(results, tasks, pillar_scores, aether, eval_model, hf_status)
|
| 480 |
table = _build_progress_table(results, tasks)
|
| 481 |
detail = _build_detail_view(results, tasks)
|
| 482 |
|
| 483 |
-
yield (f"π νκ° μλ£! AETHER Score: {aether:.1f}",
|
|
|
|
| 484 |
|
| 485 |
|
| 486 |
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
|
@@ -526,11 +666,12 @@ def create_app():
|
|
| 526 |
pillar_dd = gr.Dropdown(PILLAR_CHOICES, value="μ 체", label="κΈ°λ₯ νν°", scale=2)
|
| 527 |
diff_dd = gr.Dropdown(DIFF_CHOICES, value="μ 체", label="λμ΄λ νν°", scale=2)
|
| 528 |
max_tasks = gr.Slider(1, 120, value=120, step=1, label="μ΅λ κ³Όμ μ", scale=2)
|
|
|
|
| 529 |
|
| 530 |
with gr.Row():
|
| 531 |
start_btn = gr.Button("βΆοΈ νκ° μμ (μ΄μ΄νκΈ°)", variant="primary", size="lg", scale=2)
|
| 532 |
fresh_btn = gr.Button("π μλ‘ μμ", variant="secondary", size="lg", scale=2)
|
| 533 |
-
gr.HTML('<p style="color:#888;font-size:0.8em;margin:auto 0;">βΆοΈ μ€λ¨μ μ΄μ΄μ | π μ΄κΈ°νν μ¬μμ
|
| 534 |
|
| 535 |
with gr.Tabs():
|
| 536 |
with gr.Tab("π μ§ν"):
|
|
@@ -545,13 +686,13 @@ def create_app():
|
|
| 545 |
csv_file = gr.File(label="νκ° κ²°κ³Ό CSV")
|
| 546 |
|
| 547 |
start_btn.click(
|
| 548 |
-
fn=lambda ak,em,jm,pf,df,mt: run_evaluation(ak,em,jm,pf,df,mt,False),
|
| 549 |
-
inputs=[api_key, eval_model, judge_model, pillar_dd, diff_dd, max_tasks],
|
| 550 |
outputs=[progress_html, table_html, summary_html, detail_html, csv_file],
|
| 551 |
)
|
| 552 |
fresh_btn.click(
|
| 553 |
-
fn=lambda ak,em,jm,pf,df,mt: run_evaluation(ak,em,jm,pf,df,mt,True),
|
| 554 |
-
inputs=[api_key, eval_model, judge_model, pillar_dd, diff_dd, max_tasks],
|
| 555 |
outputs=[progress_html, table_html, summary_html, detail_html, csv_file],
|
| 556 |
)
|
| 557 |
|
|
|
|
| 391 |
return CSS + items
|
| 392 |
|
| 393 |
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 394 |
+
# PART 10: 병렬 평가 엔진 (기능별 동시 실행)
|
| 395 |
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 396 |
|
| 397 |
+
from concurrent.futures import ThreadPoolExecutor, as_completed
|
| 398 |
+
|
| 399 |
+
def _eval_single_task(task, run_id, api_key, eval_model, judge_model, state):
    """Evaluate one task: query the model under test, then have the judge score it.

    Meant to run inside a worker thread. Every outcome (success, API error,
    or unexpected exception) is persisted through ``_save_result`` and
    counted in ``state["done"]``.

    Args:
        task: Task object; ``prompt``, ``task_id``, ``pillar`` and
            ``scoring_rubric`` are read from it.
        run_id: Run identifier forwarded to ``_save_result``.
        api_key: API key forwarded to ``call_llm``.
        eval_model: Model that answers the task prompt.
        judge_model: Model that scores the answer.
        state: Shared progress dict. Uses the keys ``"lock"``, ``"done"``,
            ``"errors"`` and ``"active"``; all mutations happen while
            ``state["lock"]`` is held.

    Returns:
        A ``(task_id, result)`` tuple where ``result`` is a dict with the
        keys ``"response"``, ``"judge"`` and ``"score"``.
    """

    def _outcome(response, judge, score):
        # Shape shared by every return path.
        return task.task_id, {"response": response, "judge": judge, "score": score}

    try:
        # Step 1: call the model under evaluation.
        answer = call_llm(task.prompt, api_key=api_key, model=eval_model)

        if answer.startswith("[API_ERROR]"):
            # Store the failure with an empty judgement and a zero score,
            # then record it as an error in the shared state.
            _save_result(run_id, task.task_id, answer, "{}", 0)
            with state["lock"]:
                state["done"] += 1
                state["errors"].append(task.task_id)
            return _outcome(answer, "{}", 0)

        # Step 2: judge scoring.
        verdict_raw = call_llm(
            build_judge_prompt(task, answer),
            system=JUDGE_SYSTEM,
            api_key=api_key,
            model=judge_model,
            temperature=0.3,
        )

        verdict = parse_judge_response(verdict_raw, list(task.scoring_rubric.keys()))
        score = compute_weighted_score(verdict["scores"], task.scoring_rubric)

        verdict_json = json.dumps(verdict, ensure_ascii=False)
        _save_result(run_id, task.task_id, answer, verdict_json, score)

        with state["lock"]:
            state["done"] += 1
            icon = PILLAR_INFO.get(task.pillar, {}).get("icon", "")
            state["active"].append("{} {}".format(icon, task.task_id))
            # Keep only the ten most recently finished entries.
            if len(state["active"]) > 10:
                state["active"] = state["active"][-10:]

        return _outcome(answer, verdict_json, score)

    except Exception as exc:
        with state["lock"]:
            state["done"] += 1
            state["errors"].append(f"{task.task_id}: {str(exc)[:80]}")
        failure = f"[ERROR] {exc}"
        _save_result(run_id, task.task_id, failure, "{}", 0)
        return _outcome(failure, "{}", 0)
|
| 439 |
+
|
| 440 |
+
|
| 441 |
+
def _parallel_progress_html(state, total):
    """Build the HTML fragment showing the progress of a parallel run.

    Args:
        state: Shared progress dict. Reads ``"done"``, ``"pillar_total"``
            and ``"pillar_done"`` (required) plus ``"active"`` and
            ``"errors"`` (optional, default to empty lists).
        total: Number of tasks the overall percentage is measured against;
            values below 1 are treated as 1 for the division.

    Returns:
        An HTML string with an overall progress bar, one bar per pillar in
        ``PILLAR_INFO`` that has a non-zero total, the last eight entries
        of ``"active"`` and the last five entries of ``"errors"``.
    """
    finished = state["done"]
    overall_pct = min(int(finished / max(total, 1) * 100), 100)
    recent = state.get("active", [])
    failures = state.get("errors", [])

    # One progress row per pillar.
    rows = []
    for pillar, meta in PILLAR_INFO.items():
        pillar_total = state["pillar_total"].get(pillar, 0)
        pillar_done = state["pillar_done"].get(pillar, 0)
        if pillar_total == 0:
            continue
        pillar_pct = min(int(pillar_done / pillar_total * 100), 100)

        # Green when complete, blue when started, grey when untouched.
        if pillar_pct == 100:
            colour = "#4caf50"
        elif pillar_pct > 0:
            colour = "#1976d2"
        else:
            colour = "#e0e0e0"

        rows.append(
            '<div style="display:flex;align-items:center;gap:8px;margin:3px 0;">\n'
            f'<span style="width:100px;font-size:0.85em">{meta["icon"]} {meta["name"]}</span>\n'
            '<div style="flex:1;background:#e0e0e0;border-radius:6px;height:14px;overflow:hidden">\n'
            f'<div style="width:{pillar_pct}%;height:100%;background:{colour};border-radius:6px;transition:width .3s"></div>\n'
            '</div>\n'
            f'<span style="width:60px;font-size:0.82em;text-align:right;color:{colour}">{pillar_done}/{pillar_total}</span>\n'
            '</div>'
        )
    pillar_rows = "".join(rows)

    pieces = [
        '<div style="margin:8px 0;">\n'
        '<div style="display:flex;justify-content:space-between;font-size:0.95em;margin-bottom:6px;">\n'
        f'<span>β‘ <b>λ³λ ¬ νκ° μ§ν μ€</b> β {finished}/{total} μλ£</span>\n'
        f'<span style="font-weight:700">{overall_pct}%</span>\n'
        '</div>\n'
        f'<div class="progress-bar"><div class="progress-fill" style="width:{overall_pct}%"></div></div>\n'
        f'<div style="margin-top:8px;">{pillar_rows}</div>'
    ]

    if recent:
        # Show the eight most recent entries as small tags.
        chips = " ".join(
            [
                f'<span style="background:#e3f2fd;padding:2px 6px;border-radius:4px;font-size:0.78em;">{entry}</span>'
                for entry in recent[-8:]
            ]
        )
        pieces.append(f'<div style="margin-top:8px;">π μ΅κ·Ό μλ£: {chips}</div>')

    if failures:
        # Error text is truncated to 30 characters and HTML-escaped.
        failure_line = " Β· ".join(
            [f"β οΈ{html.escape(msg[:30])}" for msg in failures[-5:]]
        )
        pieces.append(
            f'<div style="color:#c62828;margin-top:6px;font-size:0.8em;">{failure_line}</div>'
        )

    pieces.append('</div>')
    return "".join(pieces)
|
| 482 |
+
|
| 483 |
+
|
| 484 |
def run_evaluation(api_key, eval_model, judge_model, pillar_filter, diff_filter,
|
| 485 |
+
max_tasks, n_workers, fresh_start, progress=gr.Progress()):
|
| 486 |
+
"""λ©μΈ νκ° β κΈ°λ₯λ³ λ³λ ¬ μ€ν"""
|
| 487 |
api_key = api_key.strip() or os.getenv("FIREWORKS_API_KEY", "")
|
| 488 |
if not api_key:
|
| 489 |
yield "β API Keyλ₯Ό οΏ½οΏ½οΏ½λ ₯νμΈμ.", "", "", "", None
|
| 490 |
return
|
| 491 |
|
| 492 |
+
# ββ κ³Όμ νν°λ§ ββ
|
| 493 |
tasks = ALL_TASKS[:]
|
| 494 |
if pillar_filter != "μ 체":
|
| 495 |
tasks = [t for t in tasks if t.pillar == pillar_filter]
|
|
|
|
| 501 |
if fresh_start:
|
| 502 |
_clear_run(run_id)
|
| 503 |
|
| 504 |
+
# ββ κΈ°μ‘΄ κ²°κ³Ό 볡μ ββ
|
| 505 |
results = dict(_load_all(run_id))
|
| 506 |
total = len(tasks)
|
| 507 |
+
cached = sum(1 for t in tasks if t.task_id in results)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 508 |
|
| 509 |
+
# λ―Έμλ£ κ³Όμ λ§ μΆμΆ
|
| 510 |
+
pending = [t for t in tasks if t.task_id not in results]
|
| 511 |
|
| 512 |
+
if cached > 0 and not fresh_start:
|
| 513 |
+
yield (f"πΎ 체ν¬ν¬μΈνΈ 볡μ: {cached}/{total} μλ£ β {len(pending)}κ° λ¨μ",
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 514 |
_build_progress_table(results, tasks), "", "", None)
|
| 515 |
+
time.sleep(0.5)
|
| 516 |
|
| 517 |
+
if not pending:
|
| 518 |
+
# μ λΆ μΊμ ννΈ
|
| 519 |
+
pillar_scores = {}
|
| 520 |
+
for p in PILLAR_INFO:
|
| 521 |
+
pt = [t for t in tasks if t.pillar == p and t.task_id in results]
|
| 522 |
+
if pt: pillar_scores[p] = np.mean([results[t.task_id]["score"] for t in pt])
|
| 523 |
+
aether = calculate_aether_score(pillar_scores)
|
| 524 |
+
csv_str = generate_csv(results, eval_model)
|
| 525 |
+
csv_path = f"/tmp/aether_eval_{run_id}.csv"
|
| 526 |
+
with open(csv_path, "w", encoding="utf-8") as f: f.write(csv_str)
|
| 527 |
+
hf_status = upload_to_hf(csv_str, eval_model)
|
| 528 |
+
yield (f"π μ λΆ μΊμ! AETHER Score: {aether:.1f}",
|
| 529 |
+
_build_progress_table(results, tasks),
|
| 530 |
+
_build_final_summary(results, tasks, pillar_scores, aether, eval_model, hf_status),
|
| 531 |
+
_build_detail_view(results, tasks), csv_path)
|
| 532 |
+
return
|
| 533 |
|
| 534 |
+
# ββ κΈ°λ₯λ³ κ³Όμ κ·Έλ£Ήν (λ³λ ¬ λ¨μ) ββ
|
| 535 |
+
pillar_tasks = {}
|
| 536 |
+
for t in pending:
|
| 537 |
+
pillar_tasks.setdefault(t.pillar, []).append(t)
|
| 538 |
+
|
| 539 |
+
n_pillars = len(pillar_tasks)
|
| 540 |
+
n_workers = int(n_workers)
|
| 541 |
+
|
| 542 |
+
# μ§ν μν (μ€λ λ μμ )
|
| 543 |
+
state = {
|
| 544 |
+
"lock": threading.Lock(),
|
| 545 |
+
"done": 0,
|
| 546 |
+
"active": [],
|
| 547 |
+
"errors": [],
|
| 548 |
+
"pillar_total": {p: len(ts) for p, ts in pillar_tasks.items()},
|
| 549 |
+
"pillar_done": {p: 0 for p in pillar_tasks},
|
| 550 |
+
}
|
| 551 |
+
|
| 552 |
+
yield (CSS + f'<div style="background:#e8f5e9;padding:12px;border-radius:8px;margin:8px 0;">'
|
| 553 |
+
f'β‘ <b>λ³λ ¬ νκ° μμ!</b> {len(pending)}κ° κ³Όμ Β· {n_pillars}κ° κΈ°λ₯ λμ Β· {n_workers}κ° μ컀'
|
| 554 |
+
f'</div>', _build_progress_table(results, tasks), "", "", None)
|
| 555 |
+
|
| 556 |
+
# ββ ThreadPoolExecutor λ³λ ¬ μ€ν ββ
|
| 557 |
+
with ThreadPoolExecutor(max_workers=n_workers) as executor:
|
| 558 |
+
futures = {}
|
| 559 |
+
for task in pending:
|
| 560 |
+
fut = executor.submit(_eval_single_task, task, run_id, api_key,
|
| 561 |
+
eval_model, judge_model, state)
|
| 562 |
+
futures[fut] = task
|
| 563 |
+
|
| 564 |
+
completed = set()
|
| 565 |
+
while len(completed) < len(futures):
|
| 566 |
+
newly_done = []
|
| 567 |
+
for fut in futures:
|
| 568 |
+
if fut in completed: continue
|
| 569 |
+
if fut.done():
|
| 570 |
+
completed.add(fut)
|
| 571 |
+
newly_done.append(fut)
|
| 572 |
+
|
| 573 |
+
for fut in newly_done:
|
| 574 |
+
try:
|
| 575 |
+
tid, data = fut.result()
|
| 576 |
+
results[tid] = data
|
| 577 |
+
# κΈ°λ₯λ³ μΉ΄μ΄ν° μ
λ°μ΄νΈ
|
| 578 |
+
task_obj = futures[fut]
|
| 579 |
+
with state["lock"]:
|
| 580 |
+
state["pillar_done"][task_obj.pillar] = state["pillar_done"].get(task_obj.pillar, 0) + 1
|
| 581 |
+
except Exception as e:
|
| 582 |
+
with state["lock"]:
|
| 583 |
+
state["errors"].append(str(e)[:60])
|
| 584 |
+
|
| 585 |
+
# μ§ν UI μ
λ°μ΄νΈ
|
| 586 |
+
with state["lock"]:
|
| 587 |
+
done_now = cached + state["done"]
|
| 588 |
+
pct = min(int(done_now / total * 100), 100)
|
| 589 |
+
progress(done_now / total, desc=f"{done_now}/{total} ({pct}%)")
|
| 590 |
+
prog_html = CSS + _parallel_progress_html(state, len(pending))
|
| 591 |
+
|
| 592 |
+
yield (prog_html, _build_progress_table(results, tasks), "", "", None)
|
| 593 |
+
|
| 594 |
+
if len(completed) < len(futures):
|
| 595 |
+
time.sleep(1.0)
|
| 596 |
+
|
| 597 |
+
# ββ μ΅μ’
κ²°κ³Ό ββ
|
| 598 |
progress(1.0, desc="μλ£!")
|
| 599 |
|
| 600 |
pillar_scores = {}
|
|
|
|
| 605 |
aether = calculate_aether_score(pillar_scores)
|
| 606 |
|
| 607 |
csv_str = generate_csv(results, eval_model)
|
| 608 |
+
csv_path = f"/tmp/aether_eval_{run_id}.csv"
|
| 609 |
with open(csv_path, "w", encoding="utf-8") as f:
|
| 610 |
f.write(csv_str)
|
| 611 |
|
| 612 |
hf_status = upload_to_hf(csv_str, eval_model)
|
| 613 |
|
| 614 |
+
n_err = len(state["errors"])
|
| 615 |
+
err_msg = f" (β οΈ {n_err}κ° μ€λ₯)" if n_err > 0 else ""
|
| 616 |
+
restore_msg = f" (πΎ {cached}κ° λ³΅μ)" if cached > 0 else ""
|
| 617 |
+
|
| 618 |
summary = _build_final_summary(results, tasks, pillar_scores, aether, eval_model, hf_status)
|
| 619 |
table = _build_progress_table(results, tasks)
|
| 620 |
detail = _build_detail_view(results, tasks)
|
| 621 |
|
| 622 |
+
yield (f"π νκ° μλ£!{restore_msg}{err_msg} AETHER Score: {aether:.1f}",
|
| 623 |
+
table, summary, detail, csv_path)
|
| 624 |
|
| 625 |
|
| 626 |
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
|
|
|
| 666 |
pillar_dd = gr.Dropdown(PILLAR_CHOICES, value="μ 체", label="κΈ°λ₯ νν°", scale=2)
|
| 667 |
diff_dd = gr.Dropdown(DIFF_CHOICES, value="μ 체", label="λμ΄λ νν°", scale=2)
|
| 668 |
max_tasks = gr.Slider(1, 120, value=120, step=1, label="μ΅λ κ³Όμ μ", scale=2)
|
| 669 |
+
n_workers = gr.Slider(1, 20, value=10, step=1, label="β‘ λ³λ ¬ μ컀 μ", scale=2)
|
| 670 |
|
| 671 |
with gr.Row():
|
| 672 |
start_btn = gr.Button("βΆοΈ νκ° μμ (μ΄μ΄νκΈ°)", variant="primary", size="lg", scale=2)
|
| 673 |
fresh_btn = gr.Button("π μλ‘ μμ", variant="secondary", size="lg", scale=2)
|
| 674 |
+
gr.HTML('<p style="color:#888;font-size:0.8em;margin:auto 0;">β‘ κΈ°λ₯λ³ λ³λ ¬ μ€ν β 5κ° κΈ°λ₯ λμ νκ°<br>βΆοΈ μ€λ¨μ μ΄μ΄μ | π μ΄κΈ°νν μ¬μμ | CSVβHF PRIVATE</p>')
|
| 675 |
|
| 676 |
with gr.Tabs():
|
| 677 |
with gr.Tab("π μ§ν"):
|
|
|
|
| 686 |
csv_file = gr.File(label="νκ° κ²°κ³Ό CSV")
|
| 687 |
|
| 688 |
start_btn.click(
|
| 689 |
+
fn=lambda ak,em,jm,pf,df,mt,nw: run_evaluation(ak,em,jm,pf,df,mt,nw,False),
|
| 690 |
+
inputs=[api_key, eval_model, judge_model, pillar_dd, diff_dd, max_tasks, n_workers],
|
| 691 |
outputs=[progress_html, table_html, summary_html, detail_html, csv_file],
|
| 692 |
)
|
| 693 |
fresh_btn.click(
|
| 694 |
+
fn=lambda ak,em,jm,pf,df,mt,nw: run_evaluation(ak,em,jm,pf,df,mt,nw,True),
|
| 695 |
+
inputs=[api_key, eval_model, judge_model, pillar_dd, diff_dd, max_tasks, n_workers],
|
| 696 |
outputs=[progress_html, table_html, summary_html, detail_html, csv_file],
|
| 697 |
)
|
| 698 |
|