seawolf2357 commited on
Commit
50272a6
·
verified ·
1 Parent(s): 266278b

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +293 -139
app.py CHANGED
@@ -908,156 +908,298 @@ def _parallel_progress_html(state, total):
908
  return out
909
 
910
 
911
- def run_evaluation(api_key, eval_model, judge_model, pillar_filter, diff_filter,
912
- max_tasks, n_workers, proto_agi, fresh_start, progress=gr.Progress()):
913
- """메인 평가 β€” β˜… νƒ€μž„μ•„μ›ƒ λ°©μ§€: λΉ λ₯Έ yield μ£ΌκΈ° + μ΅œμ ν™”λœ 토큰/timeout"""
914
- api_key = api_key.strip() or os.getenv("FIREWORKS_API_KEY", "")
915
- if not api_key:
916
- yield "❌ Fireworks API Keyλ₯Ό μž…λ ₯ν•˜μ„Έμš”.", "", "", "", None
917
- return
918
 
919
- n_workers = int(n_workers)
920
- if proto_agi and n_workers > 5:
921
- n_workers = 5 # β˜… 3β†’5 (μ•½κ°„ 더 곡격적)
922
-
923
- tasks = ALL_TASKS[:]
924
- if pillar_filter != "전체":
925
- tasks = [t for t in tasks if t.pillar == pillar_filter]
926
- if diff_filter != "전체":
927
- tasks = [t for t in tasks if t.difficulty == diff_filter]
928
- tasks = tasks[:int(max_tasks)]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
929
 
930
- mode_suffix = "_PAGI" if proto_agi else ""
931
- run_id = _make_run_id(eval_model + mode_suffix)
932
- if fresh_start:
933
- _clear_run(run_id)
934
 
935
- results = dict(_load_all(run_id))
936
- total = len(tasks)
937
- cached = sum(1 for t in tasks if t.task_id in results)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
938
 
939
- pending = [t for t in tasks if t.task_id not in results]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
940
 
941
- if cached > 0 and not fresh_start:
942
- yield (f"πŸ’Ύ 체크포인트 볡원: {cached}/{total} μ™„λ£Œ β€” {len(pending)}개 λ‚¨μŒ",
943
- _build_progress_table(results, tasks), "", "", None)
944
 
945
- if not pending:
946
- pillar_scores = {}
947
- for p in PILLAR_INFO:
948
- pt = [t for t in tasks if t.pillar == p and t.task_id in results]
949
- if pt: pillar_scores[p] = np.mean([results[t.task_id]["score"] for t in pt])
950
- aether = calculate_aether_score(pillar_scores)
951
- csv_str = generate_csv(results, eval_model)
952
- csv_path = f"/tmp/aether_eval_{run_id}.csv"
953
- with open(csv_path, "w", encoding="utf-8") as f: f.write(csv_str)
954
- hf_status = upload_to_hf(csv_str, eval_model)
955
- yield (f"🏁 μ „λΆ€ μΊμ‹œ! AETHER Score: {aether:.1f}",
956
- _build_progress_table(results, tasks),
957
- _build_final_summary(results, tasks, pillar_scores, aether, eval_model, hf_status),
958
- _build_detail_view(results, tasks), csv_path)
959
- return
960
-
961
- pillar_tasks = {}
962
- for t in pending:
963
- pillar_tasks.setdefault(t.pillar, []).append(t)
964
-
965
- state = {
966
- "lock": threading.Lock(),
967
- "done": 0,
968
- "active": [],
969
- "errors": [],
970
- "pillar_total": {p: len(ts) for p, ts in pillar_tasks.items()},
971
- "pillar_done": {p: 0 for p in pillar_tasks},
972
- "start_time": time.time(),
973
- "parse_ok": 0,
974
- "parse_fail": 0,
975
- }
976
-
977
- mode_tag = '🌟 Proto-AGI ON' if proto_agi else 'πŸ€– 단일 LLM'
978
- yield (CSS + f'<div style="background:{"#fff3e0" if proto_agi else "#e8f5e9"};padding:12px;border-radius:8px;">'
979
- f'⚑ <b>병렬 평가 μ‹œμž‘!</b> {len(pending)}개 Β· {n_workers}μ›Œμ»€ Β· {mode_tag}</div>',
980
- _build_progress_table(results, tasks), "", "", None)
981
-
982
- # ── β˜… 핡심: ThreadPoolExecutor + λΉ λ₯Έ yield (0.3초 간격) ──
983
- with ThreadPoolExecutor(max_workers=n_workers) as executor:
984
- futures = {}
985
- for task in pending:
986
- fut = executor.submit(_eval_single_task, task, run_id, api_key,
987
- eval_model, judge_model, state, proto_agi)
988
- futures[fut] = task
989
-
990
- completed = set()
991
- last_yield = time.time()
992
-
993
- while len(completed) < len(futures):
994
- newly_done = []
995
- for fut in futures:
996
- if fut in completed: continue
997
- if fut.done():
998
- completed.add(fut)
999
- newly_done.append(fut)
1000
-
1001
- for fut in newly_done:
1002
- try:
1003
- tid, data = fut.result()
1004
- results[tid] = data
1005
- task_obj = futures[fut]
1006
- with state["lock"]:
1007
- state["pillar_done"][task_obj.pillar] = state["pillar_done"].get(task_obj.pillar, 0) + 1
1008
- except Exception as e:
1009
- with state["lock"]:
1010
- state["errors"].append(str(e)[:60])
1011
-
1012
- # β˜… 핡심 λ³€κ²½: 0.3μ΄ˆλ§ˆλ‹€ yield (SSE heartbeat μ—­ν• )
1013
- now = time.time()
1014
- if now - last_yield >= 0.3 or newly_done:
1015
- last_yield = now
1016
- with state["lock"]:
1017
- done_now = cached + state["done"]
1018
- pct = min(int(done_now / total * 100), 100)
1019
- progress(done_now / total, desc=f"{done_now}/{total} ({pct}%)")
1020
- prog_html = CSS + _parallel_progress_html(state, len(pending))
1021
 
1022
- yield (prog_html, _build_progress_table(results, tasks), "", "", None)
 
 
 
 
1023
 
1024
- if len(completed) < len(futures):
1025
- time.sleep(0.2) # β˜… 1.0μ΄ˆβ†’0.2초 (λΉ λ₯Έ 폴링)
1026
 
1027
- # ── μ΅œμ’… κ²°κ³Ό ──
1028
- progress(1.0, desc="μ™„λ£Œ!")
 
1029
 
1030
  pillar_scores = {}
1031
  for p in PILLAR_INFO:
1032
  pt = [t for t in tasks if t.pillar == p and t.task_id in results]
1033
- if pt: pillar_scores[p] = np.mean([results[t.task_id]["score"] for t in pt])
 
1034
 
1035
  aether = calculate_aether_score(pillar_scores)
1036
 
1037
  csv_str = generate_csv(results, eval_model)
 
1038
  csv_path = f"/tmp/aether_eval_{run_id}.csv"
1039
  with open(csv_path, "w", encoding="utf-8") as f:
1040
  f.write(csv_str)
1041
 
1042
  hf_status = upload_to_hf(csv_str, eval_model)
1043
 
1044
- n_err = len(state["errors"])
1045
- err_msg = f" (⚠️ {n_err}개 였λ₯˜)" if n_err > 0 else ""
1046
- restore_msg = f" (πŸ’Ύ {cached}개 볡원)" if cached > 0 else ""
1047
  mode_str = "🌟Proto-AGI" if proto_agi else "πŸ€–λ‹¨μΌLLM"
1048
- elapsed_total = int(time.time() - state["start_time"])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1049
 
1050
- display_model = f"{eval_model} [{mode_str}]"
1051
- summary = _build_final_summary(results, tasks, pillar_scores, aether, display_model, hf_status)
1052
- table = _build_progress_table(results, tasks)
1053
- detail = _build_detail_view(results, tasks)
1054
 
1055
- yield (f"🏁 μ™„λ£Œ! {mode_str}{restore_msg}{err_msg} AETHER={aether:.1f} ({elapsed_total}초)",
1056
- table, summary, detail, csv_path)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1057
 
1058
 
1059
  # ════════════════════════════════════════════════════════════════
1060
- # PART 11: Gradio App β€” β˜… νƒ€μž„μ•„μ›ƒ λ°©μ§€ launch μ„€μ •
1061
  # ════════════════════════════════════════════════════════════════
1062
 
1063
  PILLAR_CHOICES = ["전체"] + list(PILLAR_INFO.keys())
@@ -1065,12 +1207,13 @@ DIFF_CHOICES = ["전체", "basic", "intermediate", "advanced", "expert", "fronti
1065
 
1066
  HEADER = """
1067
  <div style="text-align:center;padding:16px 0;">
1068
- <h1 style="margin:0;font-size:1.8em;">πŸŒ€ AETHER-Bench v0.3.0</h1>
1069
  <h2 style="margin:4px 0;color:#555;font-size:1.1em;">LLM 평가 μ‹œμŠ€ν…œ + Proto-AGI μ˜€ν–‰ λ©€ν‹°μ—μ΄μ „νŠΈ</h2>
1070
  <p style="color:#888;font-size:0.9em;max-width:700px;margin:8px auto;">
1071
  120 Tasks Β· 5 Pillars Β· 19 Sub-dimensions Β· HAR Metric<br>
1072
  🌟 <b>Proto-AGI</b>: ζœ¨β†’η«β†’εœŸβ†’ι‡‘β†’ζ°΄ μ˜€ν–‰ νŒŒμ΄ν”„λΌμΈ + λ§ˆλ°©μ§„ μ†Œν†΅ 맀트릭슀<br>
1073
- πŸ€– <b>단일 LLM</b>: 순수 μ‹œν—˜ 평가 | CSV β†’ HuggingFace PRIVATE 기둝
 
1074
  </p>
1075
  <div style="display:flex;justify-content:center;gap:8px;margin-top:8px;flex-wrap:wrap;font-size:0.85em;">
1076
  <span style="background:#e8f5e9;padding:2px 10px;border-radius:12px;">🌳 木 λ°œμƒ(仁)</span>
@@ -1081,6 +1224,7 @@ HEADER = """
1081
  </div>
1082
  </div>"""
1083
 
 
1084
  def create_app():
1085
  with gr.Blocks(title="AETHER-Bench + Proto-AGI", theme=gr.themes.Soft(),
1086
  css=".gradio-container{max-width:1100px !important}") as app:
@@ -1115,7 +1259,12 @@ def create_app():
1115
  with gr.Row():
1116
  start_btn = gr.Button("▢️ 평가 μ‹œμž‘ (μ΄μ–΄ν•˜κΈ°)", variant="primary", size="lg", scale=2)
1117
  fresh_btn = gr.Button("πŸš€ μƒˆλ‘œ μ‹œμž‘", variant="secondary", size="lg", scale=2)
1118
- gr.HTML('<p style="color:#888;font-size:0.8em;margin:auto 0;">⚑ Fireworks 단일 λ°±μ—”λ“œ<br>▢️ μ€‘λ‹¨μ‹œ μ΄μ–΄μ„œ | πŸš€ μ΄ˆκΈ°ν™”ν›„ μž¬μ‹œμž‘ | CSVβ†’HF PRIVATE</p>')
 
 
 
 
 
1119
 
1120
  with gr.Tabs():
1121
  with gr.Tab("πŸ“Š μ§„ν–‰"):
@@ -1129,29 +1278,36 @@ def create_app():
1129
  with gr.Tab("πŸ’Ύ CSV"):
1130
  csv_file = gr.File(label="평가 κ²°κ³Ό CSV")
1131
 
1132
- def _run_resume(ak,em,jm,pagi,pf,df,mt,nw):
1133
- yield from run_evaluation(ak,em,jm,pf,df,mt,nw,pagi,False)
1134
- def _run_fresh(ak,em,jm,pagi,pf,df,mt,nw):
1135
- yield from run_evaluation(ak,em,jm,pf,df,mt,nw,pagi,True)
 
 
1136
 
 
1137
  all_inputs = [api_key, eval_model, judge_model,
1138
  proto_agi_toggle, pillar_dd, diff_dd, max_tasks, n_workers]
1139
 
1140
  start_btn.click(
1141
- fn=_run_resume,
1142
  inputs=all_inputs,
1143
- outputs=[progress_html, table_html, summary_html, detail_html, csv_file],
1144
  )
1145
  fresh_btn.click(
1146
- fn=_run_fresh,
1147
  inputs=all_inputs,
1148
- outputs=[progress_html, table_html, summary_html, detail_html, csv_file],
 
 
 
 
1149
  )
1150
 
1151
  gr.Markdown("""---
1152
- <center>AETHER-Bench v0.3.0 Β· Apache 2.0 Β· Ginigen AI (μ§€λ‹ˆμ  AI)<br>
1153
  🌟 Proto-AGI μ˜€ν–‰ νŒŒμ΄ν”„λΌμΈ | Fireworks: <b>kimi-k2p5</b> (피평가) + <b>kimi-k2p5</b> (Judge)<br>
1154
- <code>HF_TOKEN</code> μ„€μ • μ‹œ PRIVATE μžλ™ 기둝</center>""")
1155
  return app
1156
 
1157
  # ════════════════════════════════════════════════════════════════
@@ -1162,17 +1318,15 @@ if __name__ == "__main__":
1162
  stats = {}
1163
  for t in ALL_TASKS:
1164
  stats[t.pillar] = stats.get(t.pillar, 0) + 1
1165
- print(f"AETHER-Bench v0.3.0 + Proto-AGI: {len(ALL_TASKS)} tasks loaded")
1166
  print(f" Proto-AGI: 木_λ°œμƒβ†’η«_ν‘œν˜„β†’εœŸ_톡합→金_μ‹¬νŒβ†’ζ°΄_μ„±μ°° (5 agents)")
 
1167
  for p, n in stats.items():
1168
  info = PILLAR_INFO[p]
1169
  print(f" {info['icon']} {info['name']}: {n} ({int(info['weight']*100)}%)")
1170
 
1171
  app = create_app()
1172
- # β˜… νƒ€μž„μ•„μ›ƒ λ°©μ§€ 핡심 μ„€μ •
1173
- app.queue(
1174
- default_concurrency_limit=1, # λ™μ‹œ μ‚¬μš©μž 1λͺ…μ”©
1175
- )
1176
  app.launch(
1177
  server_name="0.0.0.0",
1178
  server_port=7860,
 
908
  return out
909
 
910
 
 
 
 
 
 
 
 
911
 
912
+ # ════════════════════════════════════════════════════════════════
913
+ # PART 10-B: 백그라운드 평가 엔진 (세션 타임아웃 방지)
914
+ # ════════════════════════════════════════════════════════════════
915
+ # ★ 핵심 변경: generator(yield) → 백그라운드 스레드 + Timer 폴링
916
+ # - 버튼 클릭 → 즉시 리턴 (SSE 끊김 없음)
917
+ # - gr.Timer(2초) → 상태 폴링 → UI 갱신 (각 폴링은 독립 요청)
918
+ # - νŽ˜μ΄μ§€ μƒˆλ‘œκ³ μΉ¨ β†’ Timer μžλ™ 재개 β†’ μ§„ν–‰ 상황 μ¦‰μ‹œ ν‘œμ‹œ
919
+ # - DB 체크포인트 → 어떤 상황에서도 이어하기 가능
920
+
921
+ _EVAL_STATE = {
922
+ "running": False,
923
+ "stop_requested": False,
924
+ "finished": False,
925
+ "run_id": "",
926
+ "model": "",
927
+ "proto_agi": False,
928
+ "done": 0,
929
+ "total": 0,
930
+ "cached": 0,
931
+ "pending_count": 0,
932
+ "errors": [],
933
+ "active": [],
934
+ "parse_ok": 0,
935
+ "parse_fail": 0,
936
+ "start_time": 0,
937
+ "results": {},
938
+ "tasks": [],
939
+ "pillar_done": {},
940
+ "pillar_total": {},
941
+ "n_workers": 5,
942
+ "lock": threading.Lock(),
943
+ "message": "",
944
+ "csv_path": None,
945
+ "hf_status": "",
946
+ }
947
 
 
 
 
 
948
 
949
def _reset_eval_state():
    """Put the per-run fields of ``_EVAL_STATE`` back to their idle defaults.

    The fields ``run_id``, ``model``, ``proto_agi``, ``total``, ``n_workers``
    and ``lock`` are deliberately not touched here. The update is applied
    while holding ``_EVAL_STATE["lock"]``.
    """
    # Built on every call so each reset gets brand-new list/dict objects.
    idle_defaults = dict(
        running=False,
        stop_requested=False,
        finished=False,
        done=0,
        cached=0,
        pending_count=0,
        errors=[],
        active=[],
        parse_ok=0,
        parse_fail=0,
        start_time=0,
        results={},
        tasks=[],
        pillar_done={},
        pillar_total={},
        message="",
        csv_path=None,
        hf_status="",
    )
    with _EVAL_STATE["lock"]:
        _EVAL_STATE.update(idle_defaults)
973
+
974
+
975
def _bg_evaluate(api_key, eval_model, judge_model, tasks, run_id,
                 n_workers, proto_agi):
    """Background-thread body: evaluate ``tasks`` and publish progress in ``_EVAL_STATE``.

    Steps:
      1. Load the checkpointed results for ``run_id`` and split ``tasks`` into
         cached and pending ones.
      2. Submit every pending task to a thread pool (``_eval_single_task``).
      3. Collect finished futures into ``_EVAL_STATE["results"]`` until all are
         done or a stop is requested.
      4. On normal completion call ``_finalize_results``; on stop or on an
         unexpected exception set ``message`` / ``running`` / ``finished``.

    Changes compared with the previous version:
      * A stop requested before or during submission is reported as "stopped"
        instead of falling through to finalization.
      * ``running`` is cleared only after the pool has been left, i.e. after
        in-flight workers have returned, so a new run cannot be started while
        old workers still write to the shared state.
      * Results of workers that finished during shutdown are still merged.
      * Futures are awaited with ``concurrent.futures.wait`` rather than a
        ``done()`` scan followed by an unconditional sleep.

    Args:
        api_key: API key forwarded to ``_eval_single_task``.
        eval_model: Name of the evaluated model.
        judge_model: Name of the judge model.
        tasks: Task objects (each exposing ``task_id`` and ``pillar``).
        run_id: Checkpoint identifier used for loading/saving results.
        n_workers: Thread-pool size.
        proto_agi: Mode flag forwarded to the worker and the finalizer.

    Returns:
        None. All output goes through ``_EVAL_STATE``.
    """
    # Function-scope import: only ThreadPoolExecutor is known to be imported
    # at file level.
    from concurrent.futures import FIRST_COMPLETED, wait

    lock = _EVAL_STATE["lock"]

    def _collect(fut, task_obj):
        """Merge one finished future into the shared state (or record its error)."""
        try:
            tid, data = fut.result()
            with lock:
                _EVAL_STATE["results"][tid] = data
                _EVAL_STATE["pillar_done"][task_obj.pillar] = \
                    _EVAL_STATE["pillar_done"].get(task_obj.pillar, 0) + 1
        except Exception as e:
            with lock:
                _EVAL_STATE["errors"].append(str(e)[:60])

    try:
        results = dict(_load_all(run_id))
        cached = sum(1 for t in tasks if t.task_id in results)
        pending = [t for t in tasks if t.task_id not in results]

        pillar_tasks = {}
        for t in pending:
            pillar_tasks.setdefault(t.pillar, []).append(t)

        with lock:
            _EVAL_STATE["results"] = results
            _EVAL_STATE["cached"] = cached
            _EVAL_STATE["pending_count"] = len(pending)
            _EVAL_STATE["total"] = len(tasks)
            _EVAL_STATE["pillar_total"] = {p: len(ts) for p, ts in pillar_tasks.items()}
            _EVAL_STATE["pillar_done"] = {p: 0 for p in pillar_tasks}
            _EVAL_STATE["start_time"] = time.time()

        # Everything is already checkpointed: just aggregate.
        if not pending:
            with lock:
                _EVAL_STATE["message"] = f"πŸ’Ύ μ „λΆ€ μΊμ‹œ μ™„λ£Œ! ({cached}개)"
            _finalize_results(tasks, results, eval_model, proto_agi)
            return

        with lock:
            _EVAL_STATE["message"] = f"⚑ μ‹œμž‘! {len(pending)}개 과제 Β· {n_workers}μ›Œμ»€"

        stopped = False
        futures = {}
        not_done = set()
        with ThreadPoolExecutor(max_workers=n_workers) as executor:
            for task in pending:
                if _EVAL_STATE["stop_requested"]:
                    stopped = True
                    break
                fut = executor.submit(_eval_single_task, task, run_id, api_key,
                                      eval_model, judge_model, _EVAL_STATE, proto_agi)
                futures[fut] = task

            not_done = set(futures)
            while not_done:
                if stopped or _EVAL_STATE["stop_requested"]:
                    stopped = True
                    # Drop queued work; tasks already running are awaited when
                    # the ``with`` block exits.
                    executor.shutdown(wait=False, cancel_futures=True)
                    break
                done, not_done = wait(not_done, timeout=0.5,
                                      return_when=FIRST_COMPLETED)
                for fut in done:
                    _collect(fut, futures[fut])

        if stopped:
            # Workers that were mid-flight have finished by now; keep their output.
            for fut in not_done:
                if fut.done() and not fut.cancelled():
                    _collect(fut, futures[fut])
            with lock:
                _EVAL_STATE["message"] = "⏹️ 쀑단됨 (DB에 μ €μž₯된 κ²°κ³ΌλŠ” 보쑴)"
                _EVAL_STATE["running"] = False
                _EVAL_STATE["finished"] = True
            return

        # All futures done: aggregate from a snapshot of the shared results.
        with lock:
            results = dict(_EVAL_STATE["results"])
        _finalize_results(tasks, results, eval_model, proto_agi)

    except Exception as e:
        # Last-resort boundary of the background thread: surface the failure
        # in the UI state instead of letting the thread die silently.
        with lock:
            _EVAL_STATE["message"] = f"❌ 치λͺ…적 였λ₯˜: {str(e)[:100]}"
            _EVAL_STATE["running"] = False
            _EVAL_STATE["finished"] = True
1055
 
 
 
1056
 
1057
def _finalize_results(tasks, results, eval_model, proto_agi):
    """Aggregate scores, write the CSV, upload it and mark the run as finished.

    Args:
        tasks: Task objects of the run (``pillar`` and ``task_id`` are read).
        results: Mapping of task_id to result data; ``["score"]`` is read.
        eval_model: Model name passed to ``generate_csv`` and ``upload_to_hf``.
        proto_agi: Selects the mode label used in the final message.

    Side effects:
        Writes ``/tmp/aether_eval_<run_id>.csv``, calls ``upload_to_hf`` and
        stores ``csv_path``, ``hf_status``, ``message``, ``running=False`` and
        ``finished=True`` in ``_EVAL_STATE`` under its lock.
    """
    # Mean score per pillar, only for pillars that have at least one result.
    pillar_scores = {}
    for pillar in PILLAR_INFO:
        scored = [results[t.task_id]["score"]
                  for t in tasks
                  if t.pillar == pillar and t.task_id in results]
        if scored:
            pillar_scores[pillar] = np.mean(scored)

    aether = calculate_aether_score(pillar_scores)

    csv_str = generate_csv(results, eval_model)
    run_id = _EVAL_STATE["run_id"]
    csv_path = f"/tmp/aether_eval_{run_id}.csv"
    with open(csv_path, "w", encoding="utf-8") as out:
        out.write(csv_str)

    hf_status = upload_to_hf(csv_str, eval_model)

    # start_time of 0 means "never started": report 0 seconds.
    started = _EVAL_STATE["start_time"]
    if started:
        elapsed = int(time.time() - started)
    else:
        elapsed = 0

    if proto_agi:
        mode_str = "🌟Proto-AGI"
    else:
        mode_str = "πŸ€–λ‹¨μΌLLM"

    cached = _EVAL_STATE["cached"]
    n_err = len(_EVAL_STATE["errors"])
    err_msg = ""
    if n_err:
        err_msg = f" (⚠️ {n_err}개 였λ₯˜)"
    restore_msg = ""
    if cached:
        restore_msg = f" (πŸ’Ύ {cached}개 볡원)"

    final_message = f"🏁 μ™„λ£Œ! {mode_str}{restore_msg}{err_msg} AETHER={aether:.1f} ({elapsed}초)"

    with _EVAL_STATE["lock"]:
        _EVAL_STATE["csv_path"] = csv_path
        _EVAL_STATE["hf_status"] = hf_status
        _EVAL_STATE["message"] = final_message
        _EVAL_STATE["running"] = False
        _EVAL_STATE["finished"] = True
1090
+
1091
+
1092
# Serializes _start_eval calls so the "is a run active?" check and the
# "mark as running" step cannot interleave between two concurrent clicks.
_START_EVAL_GUARD = threading.Lock()


def _start_eval(api_key, eval_model, judge_model, proto_agi,
                pillar_filter, diff_filter, max_tasks, n_workers, fresh_start):
    """Button handler: validate input, start the background evaluation, return at once.

    The previous version checked ``_EVAL_STATE["running"]`` without any lock
    and set it to True only later. Two near-simultaneous clicks could both
    pass the check, which would start two background threads on the same
    shared state, or let a fresh start clear the checkpoint of a live run.
    The whole check/reset/claim/start sequence now runs under
    ``_START_EVAL_GUARD``.

    Args:
        api_key: API key from the UI; falls back to ``FIREWORKS_API_KEY``.
        eval_model: Name of the evaluated model.
        judge_model: Name of the judge model.
        proto_agi: Mode flag; also caps ``n_workers`` at 5 when set.
        pillar_filter: Pillar to keep, or "전체" for no filtering.
        diff_filter: Difficulty to keep, or "전체" for no filtering.
        max_tasks: Maximum number of tasks after filtering.
        n_workers: Requested thread-pool size.
        fresh_start: When true, the checkpoint for the run is cleared first.

    Returns:
        A one-line status string for the status textbox.
    """
    with _START_EVAL_GUARD:
        if _EVAL_STATE["running"]:
            return "⚠️ 이미 평가가 μ§„ν–‰ μ€‘μž…λ‹ˆλ‹€. 쀑단 ν›„ λ‹€μ‹œ μ‹œμž‘ν•˜μ„Έμš”."

        api_key = (api_key or "").strip() or os.getenv("FIREWORKS_API_KEY", "")
        if not api_key:
            return "❌ Fireworks API Keyλ₯Ό μž…λ ₯ν•˜μ„Έμš”."

        n_workers = int(n_workers)
        if proto_agi and n_workers > 5:
            n_workers = 5

        tasks = ALL_TASKS[:]
        if pillar_filter != "전체":
            tasks = [t for t in tasks if t.pillar == pillar_filter]
        if diff_filter != "전체":
            tasks = [t for t in tasks if t.difficulty == diff_filter]
        tasks = tasks[:int(max_tasks)]

        mode_suffix = "_PAGI" if proto_agi else ""
        run_id = _make_run_id(eval_model + mode_suffix)

        if fresh_start:
            _clear_run(run_id)

        # _reset_eval_state takes _EVAL_STATE["lock"] itself, so it must be
        # called before (not inside) the locked section below.
        _reset_eval_state()
        with _EVAL_STATE["lock"]:
            _EVAL_STATE["running"] = True
            _EVAL_STATE["run_id"] = run_id
            _EVAL_STATE["model"] = eval_model
            _EVAL_STATE["proto_agi"] = proto_agi
            _EVAL_STATE["tasks"] = tasks
            _EVAL_STATE["total"] = len(tasks)
            _EVAL_STATE["n_workers"] = n_workers
            _EVAL_STATE["message"] = "πŸ”„ 평가 μ€€λΉ„ 쀑..."

        thread = threading.Thread(
            target=_bg_evaluate,
            args=(api_key, eval_model, judge_model, tasks, run_id, n_workers, proto_agi),
            daemon=True,
        )
        thread.start()

    mode_tag = '🌟 Proto-AGI' if proto_agi else 'πŸ€– 단일LLM'
    return f"⚑ {mode_tag} 평가 μ‹œμž‘! ({len(tasks)}개 과제, {n_workers}μ›Œμ»€)"
1141
+
1142
+
1143
def _stop_eval():
    """Stop-button handler: flag the running evaluation for cancellation.

    Returns:
        A status string. When no evaluation is running, nothing is changed.
    """
    if not _EVAL_STATE["running"]:
        return "ℹ️ μ‹€ν–‰ 쀑인 평가가 μ—†μŠ΅λ‹ˆλ‹€."

    # The background loop polls this flag; it is only set here, never cleared.
    _EVAL_STATE["stop_requested"] = True
    return "⏹️ 쀑단 μš”μ²­λ¨... (ν˜„μž¬ μ§„ν–‰ 쀑인 과제 μ™„λ£Œ ν›„ 쀑단)"
1150
+
1151
+
1152
def _poll_status():
    """Timer callback: render the current evaluation state for the UI.

    Every field needed for rendering is copied while holding
    ``_EVAL_STATE["lock"]``. Previously ``model``, ``proto_agi``,
    ``hf_status`` and ``pending_count`` were read after the lock had been
    released, so they could belong to a different run than the snapshot of
    ``results`` / ``tasks``.

    Returns:
        A 5-tuple ``(progress_html, table_html, summary_html, detail_html,
        csv_path_or_None)`` matching the timer's output components.
    """
    with _EVAL_STATE["lock"]:
        running = _EVAL_STATE["running"]
        finished = _EVAL_STATE["finished"]
        tasks = _EVAL_STATE.get("tasks", [])
        results = dict(_EVAL_STATE.get("results", {}))
        message = _EVAL_STATE.get("message", "")
        csv_path = _EVAL_STATE.get("csv_path")
        pending_count = _EVAL_STATE.get("pending_count", 0)
        model = _EVAL_STATE.get("model", "?")
        proto_agi = _EVAL_STATE.get("proto_agi")
        hf_status = _EVAL_STATE.get("hf_status", "")

    # Idle: nothing running, nothing finished, nothing to show.
    if not running and not finished and not results:
        return ("ℹ️ ▢️ 평가 μ‹œμž‘ λ˜λŠ” πŸš€ μƒˆλ‘œ μ‹œμž‘μ„ λˆŒλŸ¬μ£Όμ„Έμš”.",
                "", "", "", None)

    if running:
        # NOTE(review): called outside the lock, as before; whether
        # _parallel_progress_html locks internally is not visible here.
        prog_html = CSS + _parallel_progress_html(_EVAL_STATE, pending_count)
    elif finished:
        prog_html = f'<div style="background:#e8f5e9;padding:12px;border-radius:8px;">{message}</div>'
    else:
        prog_html = message

    table_html = _build_progress_table(results, tasks) if tasks else ""

    summary_html = ""
    detail_html = ""
    csv_out = None

    # Summary, detail view and CSV are only exposed once the run has ended.
    if finished and tasks:
        pillar_scores = {}
        for p in PILLAR_INFO:
            pt = [t for t in tasks if t.pillar == p and t.task_id in results]
            if pt:
                pillar_scores[p] = np.mean([results[t.task_id]["score"] for t in pt])
        aether = calculate_aether_score(pillar_scores)

        mode_label = "🌟Proto-AGI" if proto_agi else "πŸ€–λ‹¨μΌLLM"
        display_model = f"{model} [{mode_label}]"
        summary_html = _build_final_summary(results, tasks, pillar_scores, aether,
                                            display_model, hf_status)
        detail_html = _build_detail_view(results, tasks)
        csv_out = csv_path

    return (prog_html, table_html, summary_html, detail_html, csv_out)
1199
 
1200
 
1201
  # ════════════════════════════════════════════════════════════════
1202
+ # PART 11: Gradio App β€” β˜… Timer 폴링 기반 (μ„Έμ…˜ λŠκΉ€ μ™„μ „ λ°©μ§€)
1203
  # ════════════════════════════════════════════════════════════════
1204
 
1205
  PILLAR_CHOICES = ["전체"] + list(PILLAR_INFO.keys())
 
1207
 
1208
  HEADER = """
1209
  <div style="text-align:center;padding:16px 0;">
1210
+ <h1 style="margin:0;font-size:1.8em;">πŸŒ€ AETHER-Bench v0.3.3</h1>
1211
  <h2 style="margin:4px 0;color:#555;font-size:1.1em;">LLM 평가 μ‹œμŠ€ν…œ + Proto-AGI μ˜€ν–‰ λ©€ν‹°μ—μ΄μ „νŠΈ</h2>
1212
  <p style="color:#888;font-size:0.9em;max-width:700px;margin:8px auto;">
1213
  120 Tasks Β· 5 Pillars Β· 19 Sub-dimensions Β· HAR Metric<br>
1214
  🌟 <b>Proto-AGI</b>: ζœ¨β†’η«β†’εœŸβ†’ι‡‘β†’ζ°΄ μ˜€ν–‰ νŒŒμ΄ν”„λΌμΈ + λ§ˆλ°©μ§„ μ†Œν†΅ 맀트릭슀<br>
1215
+ πŸ€– <b>단일 LLM</b>: 순수 μ‹œν—˜ 평가 | CSV β†’ HuggingFace PRIVATE 기둝<br>
1216
+ ⚑ <b>v0.3.3</b>: λ°±κ·ΈλΌμš΄λ“œ μ‹€ν–‰ β€” μ„Έμ…˜ λŠκΉ€/μƒˆλ‘œκ³ μΉ¨ μ‹œμ—λ„ 평가 계속 μ§„ν–‰
1217
  </p>
1218
  <div style="display:flex;justify-content:center;gap:8px;margin-top:8px;flex-wrap:wrap;font-size:0.85em;">
1219
  <span style="background:#e8f5e9;padding:2px 10px;border-radius:12px;">🌳 木 λ°œμƒ(仁)</span>
 
1224
  </div>
1225
  </div>"""
1226
 
1227
+
1228
  def create_app():
1229
  with gr.Blocks(title="AETHER-Bench + Proto-AGI", theme=gr.themes.Soft(),
1230
  css=".gradio-container{max-width:1100px !important}") as app:
 
1259
  with gr.Row():
1260
  start_btn = gr.Button("▢️ 평가 μ‹œμž‘ (μ΄μ–΄ν•˜κΈ°)", variant="primary", size="lg", scale=2)
1261
  fresh_btn = gr.Button("πŸš€ μƒˆλ‘œ μ‹œμž‘", variant="secondary", size="lg", scale=2)
1262
+ stop_btn = gr.Button("⏹️ 쀑단", variant="stop", size="lg", scale=1)
1263
+ gr.HTML('''<p style="color:#888;font-size:0.8em;margin:auto 0;">
1264
+ ⚑ λ°±κ·ΈλΌμš΄λ“œ μ‹€ν–‰ β€” νŽ˜μ΄μ§€ μƒˆλ‘œκ³ μΉ¨ν•΄λ„ 평가 계속 μ§„ν–‰<br>
1265
+ ▢️ μ€‘λ‹¨μ‹œ μ΄μ–΄μ„œ | πŸš€ μ΄ˆκΈ°ν™”ν›„ μž¬μ‹œμž‘ | ⏹️ κΈ΄κΈ‰ 쀑단</p>''')
1266
+
1267
+ status_msg = gr.Textbox(label="μƒνƒœ", interactive=False, max_lines=1)
1268
 
1269
  with gr.Tabs():
1270
  with gr.Tab("πŸ“Š μ§„ν–‰"):
 
1278
  with gr.Tab("πŸ’Ύ CSV"):
1279
  csv_file = gr.File(label="평가 κ²°κ³Ό CSV")
1280
 
1281
+ # ── Timer: 2μ΄ˆλ§ˆλ‹€ UI κ°±μ‹  (SSE λŠκΉ€κ³Ό 무관) ──
1282
+ timer = gr.Timer(value=2, active=True)
1283
+ timer.tick(
1284
+ fn=_poll_status,
1285
+ outputs=[progress_html, table_html, summary_html, detail_html, csv_file],
1286
+ )
1287
 
1288
+ # ── λ²„νŠΌ: μ¦‰μ‹œ 리턴 (generator μ•„λ‹˜!) ──
1289
  all_inputs = [api_key, eval_model, judge_model,
1290
  proto_agi_toggle, pillar_dd, diff_dd, max_tasks, n_workers]
1291
 
1292
  start_btn.click(
1293
+ fn=lambda *args: _start_eval(*args, fresh_start=False),
1294
  inputs=all_inputs,
1295
+ outputs=[status_msg],
1296
  )
1297
  fresh_btn.click(
1298
+ fn=lambda *args: _start_eval(*args, fresh_start=True),
1299
  inputs=all_inputs,
1300
+ outputs=[status_msg],
1301
+ )
1302
+ stop_btn.click(
1303
+ fn=_stop_eval,
1304
+ outputs=[status_msg],
1305
  )
1306
 
1307
  gr.Markdown("""---
1308
+ <center>AETHER-Bench v0.3.3 Β· Apache 2.0 Β· Ginigen AI (μ§€λ‹ˆμ  AI)<br>
1309
  🌟 Proto-AGI μ˜€ν–‰ νŒŒμ΄ν”„λΌμΈ | Fireworks: <b>kimi-k2p5</b> (피평가) + <b>kimi-k2p5</b> (Judge)<br>
1310
+ ⚑ λ°±κ·ΈλΌμš΄λ“œ μ‹€ν–‰ β€” μ„Έμ…˜ λŠκΉ€ μ™„μ „ λ°©μ§€ | <code>HF_TOKEN</code> μ„€μ • μ‹œ PRIVATE μžλ™ 기둝</center>""")
1311
  return app
1312
 
1313
  # ════════════════════════════════════════════════════════════════
 
1318
  stats = {}
1319
  for t in ALL_TASKS:
1320
  stats[t.pillar] = stats.get(t.pillar, 0) + 1
1321
+ print(f"AETHER-Bench v0.3.3 + Proto-AGI: {len(ALL_TASKS)} tasks loaded")
1322
  print(f" Proto-AGI: 木_λ°œμƒβ†’η«_ν‘œν˜„β†’εœŸ_톡합→金_μ‹¬νŒβ†’ζ°΄_μ„±μ°° (5 agents)")
1323
+ print(f" β˜… Background thread + Timer polling (session-safe)")
1324
  for p, n in stats.items():
1325
  info = PILLAR_INFO[p]
1326
  print(f" {info['icon']} {info['name']}: {n} ({int(info['weight']*100)}%)")
1327
 
1328
  app = create_app()
1329
+ app.queue(default_concurrency_limit=2)
 
 
 
1330
  app.launch(
1331
  server_name="0.0.0.0",
1332
  server_port=7860,