seawolf2357 commited on
Commit
4cc01c5
·
verified ·
1 Parent(s): 5a9c617

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +191 -50
app.py CHANGED
@@ -391,16 +391,105 @@ def _build_detail_view(results, tasks):
391
  return CSS + items
392
 
393
  # ════════════════════════════════════════════════════════════════
394
- # PART 10: 메인 평가 루프
395
  # ════════════════════════════════════════════════════════════════
396
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
397
  def run_evaluation(api_key, eval_model, judge_model, pillar_filter, diff_filter,
398
- max_tasks, fresh_start, progress=gr.Progress()):
 
399
  api_key = api_key.strip() or os.getenv("FIREWORKS_API_KEY", "")
400
  if not api_key:
401
  yield "❌ API Keyλ₯Ό οΏ½οΏ½οΏ½λ ₯ν•˜μ„Έμš”.", "", "", "", None
402
  return
403
 
 
404
  tasks = ALL_TASKS[:]
405
  if pillar_filter != "전체":
406
  tasks = [t for t in tasks if t.pillar == pillar_filter]
@@ -412,54 +501,100 @@ def run_evaluation(api_key, eval_model, judge_model, pillar_filter, diff_filter,
412
  if fresh_start:
413
  _clear_run(run_id)
414
 
 
415
  results = dict(_load_all(run_id))
416
  total = len(tasks)
417
- done = sum(1 for t in tasks if t.task_id in results)
418
-
419
- if done > 0 and not fresh_start:
420
- yield (f"πŸ’Ύ 체크포인트 볡원: {done}/{total}. μ΄μ–΄μ„œ μ§„ν–‰.",
421
- _build_progress_table(results, tasks), "", "", None)
422
- time.sleep(0.5)
423
-
424
- for i, task in enumerate(tasks):
425
- if task.task_id in results:
426
- continue
427
-
428
- # Step 1: 피평가 λͺ¨λΈ 호좜
429
- progress((i + 0.3) / total, desc=f"[{i+1}/{total}] {task.task_id} λͺ¨λΈμ‘λ‹΅...")
430
- yield (f"πŸ€– [{i+1}/{total}] {task.task_id} ({task.difficulty}) β€” λͺ¨λΈ 응닡 λŒ€κΈ°...",
431
- _build_progress_table(results, tasks), "", "", None)
432
 
433
- model_response = call_llm(task.prompt, api_key=api_key, model=eval_model)
 
434
 
435
- if model_response.startswith("[API_ERROR]"):
436
- results[task.task_id] = {"response": model_response, "judge": "{}", "score": 0}
437
- _save_result(run_id, task.task_id, model_response, "{}", 0)
438
- yield (f"⚠️ {task.task_id} API 였λ₯˜ β€” λ‹€μŒ 과제둜.",
439
- _build_progress_table(results, tasks), "", "", None)
440
- continue
441
-
442
- # Step 2: Judge 채점
443
- progress((i + 0.7) / total, desc=f"[{i+1}/{total}] {task.task_id} 채점...")
444
- yield (f"βš–οΈ [{i+1}/{total}] {task.task_id} β€” Judge 채점 쀑...",
445
  _build_progress_table(results, tasks), "", "", None)
 
446
 
447
- judge_prompt = build_judge_prompt(task, model_response)
448
- judge_raw = call_llm(judge_prompt, system=JUDGE_SYSTEM, api_key=api_key,
449
- model=judge_model, temperature=0.3)
450
-
451
- rubric_keys = list(task.scoring_rubric.keys())
452
- judge_data = parse_judge_response(judge_raw, rubric_keys)
453
- weighted = compute_weighted_score(judge_data["scores"], task.scoring_rubric)
454
-
455
- judge_json = json.dumps(judge_data, ensure_ascii=False)
456
- results[task.task_id] = {"response": model_response, "judge": judge_json, "score": weighted}
457
- _save_result(run_id, task.task_id, model_response, judge_json, weighted)
458
-
459
- done = sum(1 for t in tasks if t.task_id in results)
460
- progress(done / total, desc=f"{done}/{total}")
 
 
461
 
462
- # ── μ΅œμ’… ──
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
463
  progress(1.0, desc="μ™„λ£Œ!")
464
 
465
  pillar_scores = {}
@@ -470,17 +605,22 @@ def run_evaluation(api_key, eval_model, judge_model, pillar_filter, diff_filter,
470
  aether = calculate_aether_score(pillar_scores)
471
 
472
  csv_str = generate_csv(results, eval_model)
473
- csv_path = f"/tmp/aether_eval_{_make_run_id(eval_model)}.csv"
474
  with open(csv_path, "w", encoding="utf-8") as f:
475
  f.write(csv_str)
476
 
477
  hf_status = upload_to_hf(csv_str, eval_model)
478
 
 
 
 
 
479
  summary = _build_final_summary(results, tasks, pillar_scores, aether, eval_model, hf_status)
480
  table = _build_progress_table(results, tasks)
481
  detail = _build_detail_view(results, tasks)
482
 
483
- yield (f"🏁 평가 μ™„λ£Œ! AETHER Score: {aether:.1f}", table, summary, detail, csv_path)
 
484
 
485
 
486
  # ════════════════════════════════════════════════════════════════
@@ -526,11 +666,12 @@ def create_app():
526
  pillar_dd = gr.Dropdown(PILLAR_CHOICES, value="전체", label="κΈ°λ‘₯ ν•„ν„°", scale=2)
527
  diff_dd = gr.Dropdown(DIFF_CHOICES, value="전체", label="λ‚œμ΄λ„ ν•„ν„°", scale=2)
528
  max_tasks = gr.Slider(1, 120, value=120, step=1, label="μ΅œλŒ€ 과제 수", scale=2)
 
529
 
530
  with gr.Row():
531
  start_btn = gr.Button("▢️ 평가 μ‹œμž‘ (μ΄μ–΄ν•˜κΈ°)", variant="primary", size="lg", scale=2)
532
  fresh_btn = gr.Button("πŸš€ μƒˆλ‘œ μ‹œμž‘", variant="secondary", size="lg", scale=2)
533
- gr.HTML('<p style="color:#888;font-size:0.8em;margin:auto 0;">▢️ μ€‘λ‹¨μ‹œ μ΄μ–΄μ„œ | πŸš€ μ΄ˆκΈ°ν™”ν›„ μž¬μ‹œμž‘<br>κ²°κ³Όβ†’CSVβ†’HF PRIVATE μžλ™ μ—…λ‘œλ“œ</p>')
534
 
535
  with gr.Tabs():
536
  with gr.Tab("πŸ“Š μ§„ν–‰"):
@@ -545,13 +686,13 @@ def create_app():
545
  csv_file = gr.File(label="평가 κ²°κ³Ό CSV")
546
 
547
  start_btn.click(
548
- fn=lambda ak,em,jm,pf,df,mt: run_evaluation(ak,em,jm,pf,df,mt,False),
549
- inputs=[api_key, eval_model, judge_model, pillar_dd, diff_dd, max_tasks],
550
  outputs=[progress_html, table_html, summary_html, detail_html, csv_file],
551
  )
552
  fresh_btn.click(
553
- fn=lambda ak,em,jm,pf,df,mt: run_evaluation(ak,em,jm,pf,df,mt,True),
554
- inputs=[api_key, eval_model, judge_model, pillar_dd, diff_dd, max_tasks],
555
  outputs=[progress_html, table_html, summary_html, detail_html, csv_file],
556
  )
557
 
 
391
  return CSS + items
392
 
393
  # ════════════════════════════════════════════════════════════════
394
+ # PART 10: 병렬 평가 엔진 (기둥별 동시 실행)
395
  # ════════════════════════════════════════════════════════════════
396
 
397
+ from concurrent.futures import ThreadPoolExecutor, as_completed
398
+
399
+ def _eval_single_task(task, run_id, api_key, eval_model, judge_model, state):
400
+ """단일 과제 평가 (λͺ¨λΈν˜ΈμΆœ + Judge채점). μ›Œμ»€ μŠ€λ ˆλ“œμ—μ„œ μ‹€ν–‰."""
401
+ try:
402
+ # Step 1: 피평가 λͺ¨λΈ 호좜
403
+ model_response = call_llm(task.prompt, api_key=api_key, model=eval_model)
404
+
405
+ if model_response.startswith("[API_ERROR]"):
406
+ _save_result(run_id, task.task_id, model_response, "{}", 0)
407
+ with state["lock"]:
408
+ state["done"] += 1
409
+ state["errors"].append(task.task_id)
410
+ return task.task_id, {"response": model_response, "judge": "{}", "score": 0}
411
+
412
+ # Step 2: Judge 채점
413
+ judge_prompt = build_judge_prompt(task, model_response)
414
+ judge_raw = call_llm(judge_prompt, system=JUDGE_SYSTEM, api_key=api_key,
415
+ model=judge_model, temperature=0.3)
416
+
417
+ rubric_keys = list(task.scoring_rubric.keys())
418
+ judge_data = parse_judge_response(judge_raw, rubric_keys)
419
+ weighted = compute_weighted_score(judge_data["scores"], task.scoring_rubric)
420
+
421
+ judge_json = json.dumps(judge_data, ensure_ascii=False)
422
+ _save_result(run_id, task.task_id, model_response, judge_json, weighted)
423
+
424
+ with state["lock"]:
425
+ state["done"] += 1
426
+ info = PILLAR_INFO.get(task.pillar, {})
427
+ state["active"].append(f'{info.get("icon","")} {task.task_id}')
428
+ if len(state["active"]) > 10:
429
+ state["active"] = state["active"][-10:]
430
+
431
+ return task.task_id, {"response": model_response, "judge": judge_json, "score": weighted}
432
+
433
+ except Exception as e:
434
+ with state["lock"]:
435
+ state["done"] += 1
436
+ state["errors"].append(f"{task.task_id}: {str(e)[:80]}")
437
+ _save_result(run_id, task.task_id, f"[ERROR] {e}", "{}", 0)
438
+ return task.task_id, {"response": f"[ERROR] {e}", "judge": "{}", "score": 0}
439
+
440
+
441
+ def _parallel_progress_html(state, total):
442
+ """병렬 μ‹€ν–‰ μ§„ν–‰ μƒνƒœ HTML"""
443
+ done = state["done"]
444
+ pct = min(int(done / max(total, 1) * 100), 100)
445
+ active = state.get("active", [])
446
+ errors = state.get("errors", [])
447
+
448
+ # κΈ°λ‘₯별 μ§„ν–‰ μƒνƒœ λ°”
449
+ pillar_bars = ""
450
+ for p, info in PILLAR_INFO.items():
451
+ p_total = state["pillar_total"].get(p, 0)
452
+ p_done = state["pillar_done"].get(p, 0)
453
+ if p_total == 0: continue
454
+ p_pct = min(int(p_done / p_total * 100), 100)
455
+ c = "#4caf50" if p_pct == 100 else ("#1976d2" if p_pct > 0 else "#e0e0e0")
456
+ pillar_bars += f'''<div style="display:flex;align-items:center;gap:8px;margin:3px 0;">
457
+ <span style="width:100px;font-size:0.85em">{info["icon"]} {info["name"]}</span>
458
+ <div style="flex:1;background:#e0e0e0;border-radius:6px;height:14px;overflow:hidden">
459
+ <div style="width:{p_pct}%;height:100%;background:{c};border-radius:6px;transition:width .3s"></div>
460
+ </div>
461
+ <span style="width:60px;font-size:0.82em;text-align:right;color:{c}">{p_done}/{p_total}</span>
462
+ </div>'''
463
+
464
+ out = f'''<div style="margin:8px 0;">
465
+ <div style="display:flex;justify-content:space-between;font-size:0.95em;margin-bottom:6px;">
466
+ <span>⚑ <b>병렬 평가 μ§„ν–‰ 쀑</b> β€” {done}/{total} μ™„λ£Œ</span>
467
+ <span style="font-weight:700">{pct}%</span>
468
+ </div>
469
+ <div class="progress-bar"><div class="progress-fill" style="width:{pct}%"></div></div>
470
+ <div style="margin-top:8px;">{pillar_bars}</div>'''
471
+
472
+ if active:
473
+ tags = " ".join([f'<span style="background:#e3f2fd;padding:2px 6px;border-radius:4px;font-size:0.78em;">{a}</span>' for a in active[-8:]])
474
+ out += f'<div style="margin-top:8px;">πŸ”„ 졜근 μ™„λ£Œ: {tags}</div>'
475
+
476
+ if errors:
477
+ err_html = " · ".join([f"⚠️{html.escape(e[:30])}" for e in errors[-5:]])
478
+ out += f'<div style="color:#c62828;margin-top:6px;font-size:0.8em;">{err_html}</div>'
479
+
480
+ out += '</div>'
481
+ return out
482
+
483
+
484
  def run_evaluation(api_key, eval_model, judge_model, pillar_filter, diff_filter,
485
+ max_tasks, n_workers, fresh_start, progress=gr.Progress()):
486
+ """메인 평가 β€” κΈ°λ‘₯별 병렬 μ‹€ν–‰"""
487
  api_key = api_key.strip() or os.getenv("FIREWORKS_API_KEY", "")
488
  if not api_key:
489
  yield "❌ API Keyλ₯Ό οΏ½οΏ½οΏ½λ ₯ν•˜μ„Έμš”.", "", "", "", None
490
  return
491
 
492
+ # ── 과제 필터링 ──
493
  tasks = ALL_TASKS[:]
494
  if pillar_filter != "전체":
495
  tasks = [t for t in tasks if t.pillar == pillar_filter]
 
501
  if fresh_start:
502
  _clear_run(run_id)
503
 
504
+ # ── κΈ°μ‘΄ κ²°κ³Ό 볡원 ──
505
  results = dict(_load_all(run_id))
506
  total = len(tasks)
507
+ cached = sum(1 for t in tasks if t.task_id in results)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
508
 
509
+ # λ―Έμ™„λ£Œ 과제만 μΆ”μΆœ
510
+ pending = [t for t in tasks if t.task_id not in results]
511
 
512
+ if cached > 0 and not fresh_start:
513
+ yield (f"πŸ’Ύ 체크포인트 볡원: {cached}/{total} μ™„λ£Œ β€” {len(pending)}개 λ‚¨μŒ",
 
 
 
 
 
 
 
 
514
  _build_progress_table(results, tasks), "", "", None)
515
+ time.sleep(0.5)
516
 
517
+ if not pending:
518
+ # μ „λΆ€ μΊμ‹œ 히트
519
+ pillar_scores = {}
520
+ for p in PILLAR_INFO:
521
+ pt = [t for t in tasks if t.pillar == p and t.task_id in results]
522
+ if pt: pillar_scores[p] = np.mean([results[t.task_id]["score"] for t in pt])
523
+ aether = calculate_aether_score(pillar_scores)
524
+ csv_str = generate_csv(results, eval_model)
525
+ csv_path = f"/tmp/aether_eval_{run_id}.csv"
526
+ with open(csv_path, "w", encoding="utf-8") as f: f.write(csv_str)
527
+ hf_status = upload_to_hf(csv_str, eval_model)
528
+ yield (f"🏁 μ „λΆ€ μΊμ‹œ! AETHER Score: {aether:.1f}",
529
+ _build_progress_table(results, tasks),
530
+ _build_final_summary(results, tasks, pillar_scores, aether, eval_model, hf_status),
531
+ _build_detail_view(results, tasks), csv_path)
532
+ return
533
 
534
+ # ── κΈ°λ‘₯별 과제 κ·Έλ£Ήν•‘ (병렬 λ‹¨μœ„) ──
535
+ pillar_tasks = {}
536
+ for t in pending:
537
+ pillar_tasks.setdefault(t.pillar, []).append(t)
538
+
539
+ n_pillars = len(pillar_tasks)
540
+ n_workers = int(n_workers)
541
+
542
+ # μ§„ν–‰ μƒνƒœ (μŠ€λ ˆλ“œ μ•ˆμ „)
543
+ state = {
544
+ "lock": threading.Lock(),
545
+ "done": 0,
546
+ "active": [],
547
+ "errors": [],
548
+ "pillar_total": {p: len(ts) for p, ts in pillar_tasks.items()},
549
+ "pillar_done": {p: 0 for p in pillar_tasks},
550
+ }
551
+
552
+ yield (CSS + f'<div style="background:#e8f5e9;padding:12px;border-radius:8px;margin:8px 0;">'
553
+ f'⚑ <b>병렬 평가 μ‹œμž‘!</b> {len(pending)}개 과제 Β· {n_pillars}개 κΈ°λ‘₯ λ™μ‹œ Β· {n_workers}개 μ›Œμ»€'
554
+ f'</div>', _build_progress_table(results, tasks), "", "", None)
555
+
556
+ # ── ThreadPoolExecutor 병렬 μ‹€ν–‰ ──
557
+ with ThreadPoolExecutor(max_workers=n_workers) as executor:
558
+ futures = {}
559
+ for task in pending:
560
+ fut = executor.submit(_eval_single_task, task, run_id, api_key,
561
+ eval_model, judge_model, state)
562
+ futures[fut] = task
563
+
564
+ completed = set()
565
+ while len(completed) < len(futures):
566
+ newly_done = []
567
+ for fut in futures:
568
+ if fut in completed: continue
569
+ if fut.done():
570
+ completed.add(fut)
571
+ newly_done.append(fut)
572
+
573
+ for fut in newly_done:
574
+ try:
575
+ tid, data = fut.result()
576
+ results[tid] = data
577
+ # κΈ°λ‘₯별 μΉ΄μš΄ν„° μ—…λ°μ΄νŠΈ
578
+ task_obj = futures[fut]
579
+ with state["lock"]:
580
+ state["pillar_done"][task_obj.pillar] = state["pillar_done"].get(task_obj.pillar, 0) + 1
581
+ except Exception as e:
582
+ with state["lock"]:
583
+ state["errors"].append(str(e)[:60])
584
+
585
+ # μ§„ν–‰ UI μ—…λ°μ΄νŠΈ
586
+ with state["lock"]:
587
+ done_now = cached + state["done"]
588
+ pct = min(int(done_now / total * 100), 100)
589
+ progress(done_now / total, desc=f"{done_now}/{total} ({pct}%)")
590
+ prog_html = CSS + _parallel_progress_html(state, len(pending))
591
+
592
+ yield (prog_html, _build_progress_table(results, tasks), "", "", None)
593
+
594
+ if len(completed) < len(futures):
595
+ time.sleep(1.0)
596
+
597
+ # ── μ΅œμ’… κ²°κ³Ό ──
598
  progress(1.0, desc="μ™„λ£Œ!")
599
 
600
  pillar_scores = {}
 
605
  aether = calculate_aether_score(pillar_scores)
606
 
607
  csv_str = generate_csv(results, eval_model)
608
+ csv_path = f"/tmp/aether_eval_{run_id}.csv"
609
  with open(csv_path, "w", encoding="utf-8") as f:
610
  f.write(csv_str)
611
 
612
  hf_status = upload_to_hf(csv_str, eval_model)
613
 
614
+ n_err = len(state["errors"])
615
+ err_msg = f" (⚠️ {n_err}개 였λ₯˜)" if n_err > 0 else ""
616
+ restore_msg = f" (πŸ’Ύ {cached}개 볡원)" if cached > 0 else ""
617
+
618
  summary = _build_final_summary(results, tasks, pillar_scores, aether, eval_model, hf_status)
619
  table = _build_progress_table(results, tasks)
620
  detail = _build_detail_view(results, tasks)
621
 
622
+ yield (f"🏁 평가 μ™„λ£Œ!{restore_msg}{err_msg} AETHER Score: {aether:.1f}",
623
+ table, summary, detail, csv_path)
624
 
625
 
626
  # ════════════════════════════════════════════════════════════════
 
666
  pillar_dd = gr.Dropdown(PILLAR_CHOICES, value="전체", label="κΈ°λ‘₯ ν•„ν„°", scale=2)
667
  diff_dd = gr.Dropdown(DIFF_CHOICES, value="전체", label="λ‚œμ΄λ„ ν•„ν„°", scale=2)
668
  max_tasks = gr.Slider(1, 120, value=120, step=1, label="μ΅œλŒ€ 과제 수", scale=2)
669
+ n_workers = gr.Slider(1, 20, value=10, step=1, label="⚑ 병렬 μ›Œμ»€ 수", scale=2)
670
 
671
  with gr.Row():
672
  start_btn = gr.Button("▢️ 평가 μ‹œμž‘ (μ΄μ–΄ν•˜κΈ°)", variant="primary", size="lg", scale=2)
673
  fresh_btn = gr.Button("πŸš€ μƒˆλ‘œ μ‹œμž‘", variant="secondary", size="lg", scale=2)
674
+ gr.HTML('<p style="color:#888;font-size:0.8em;margin:auto 0;">⚑ κΈ°λ‘₯별 병렬 μ‹€ν–‰ β€” 5개 κΈ°λ‘₯ λ™μ‹œ 평가<br>▢️ μ€‘λ‹¨μ‹œ μ΄μ–΄μ„œ | πŸš€ μ΄ˆκΈ°ν™”ν›„ μž¬μ‹œμž‘ | CSVβ†’HF PRIVATE</p>')
675
 
676
  with gr.Tabs():
677
  with gr.Tab("πŸ“Š μ§„ν–‰"):
 
686
  csv_file = gr.File(label="평가 κ²°κ³Ό CSV")
687
 
688
  start_btn.click(
689
+ fn=lambda ak,em,jm,pf,df,mt,nw: run_evaluation(ak,em,jm,pf,df,mt,nw,False),
690
+ inputs=[api_key, eval_model, judge_model, pillar_dd, diff_dd, max_tasks, n_workers],
691
  outputs=[progress_html, table_html, summary_html, detail_html, csv_file],
692
  )
693
  fresh_btn.click(
694
+ fn=lambda ak,em,jm,pf,df,mt,nw: run_evaluation(ak,em,jm,pf,df,mt,nw,True),
695
+ inputs=[api_key, eval_model, judge_model, pillar_dd, diff_dd, max_tasks, n_workers],
696
  outputs=[progress_html, table_html, summary_html, detail_html, csv_file],
697
  )
698