EXAM-FINALBENCH3

Running

App Files Files Community

seawolf2357 committed on Feb 18

Commit

50272a6

verified ·

1 Parent(s): 266278b

Update app.py

Browse files

Files changed (1) hide show

app.py +293 -139

app.py CHANGED Viewed

@@ -908,156 +908,298 @@ def _parallel_progress_html(state, total):
     return out
-def run_evaluation(api_key, eval_model, judge_model, pillar_filter, diff_filter,
-                   max_tasks, n_workers, proto_agi, fresh_start, progress=gr.Progress()):
-    """메인 평가 — ★ 타임아웃 방지: 빠른 yield 주기 + 최적화된 토큰/timeout"""
-    api_key = api_key.strip() or os.getenv("FIREWORKS_API_KEY", "")
-    if not api_key:
-        yield "❌ Fireworks API Key를 입력하세요.", "", "", "", None
-        return
-    n_workers = int(n_workers)
-    if proto_agi and n_workers > 5:
-        n_workers = 5  # ★ 3→5 (약간 더 공격적)
-    tasks = ALL_TASKS[:]
-    if pillar_filter != "전체":
-        tasks = [t for t in tasks if t.pillar == pillar_filter]
-    if diff_filter != "전체":
-        tasks = [t for t in tasks if t.difficulty == diff_filter]
-    tasks = tasks[:int(max_tasks)]
-    mode_suffix = "_PAGI" if proto_agi else ""
-    run_id = _make_run_id(eval_model + mode_suffix)
-    if fresh_start:
-        _clear_run(run_id)
-    results = dict(_load_all(run_id))
-    total = len(tasks)
-    cached = sum(1 for t in tasks if t.task_id in results)
-    pending = [t for t in tasks if t.task_id not in results]
-    if cached > 0 and not fresh_start:
-        yield (f"💾 체크포인트 복원: {cached}/{total} 완료 — {len(pending)}개 남음",
-               _build_progress_table(results, tasks), "", "", None)
-    if not pending:
-        pillar_scores = {}
-        for p in PILLAR_INFO:
-            pt = [t for t in tasks if t.pillar == p and t.task_id in results]
-            if pt: pillar_scores[p] = np.mean([results[t.task_id]["score"] for t in pt])
-        aether = calculate_aether_score(pillar_scores)
-        csv_str = generate_csv(results, eval_model)
-        csv_path = f"/tmp/aether_eval_{run_id}.csv"
-        with open(csv_path, "w", encoding="utf-8") as f: f.write(csv_str)
-        hf_status = upload_to_hf(csv_str, eval_model)
-        yield (f"🏁 전부 캐시! AETHER Score: {aether:.1f}",
-               _build_progress_table(results, tasks),
-               _build_final_summary(results, tasks, pillar_scores, aether, eval_model, hf_status),
-               _build_detail_view(results, tasks), csv_path)
-        return
-    pillar_tasks = {}
-    for t in pending:
-        pillar_tasks.setdefault(t.pillar, []).append(t)
-    state = {
-        "lock": threading.Lock(),
-        "done": 0,
-        "active": [],
-        "errors": [],
-        "pillar_total": {p: len(ts) for p, ts in pillar_tasks.items()},
-        "pillar_done": {p: 0 for p in pillar_tasks},
-        "start_time": time.time(),
-        "parse_ok": 0,
-        "parse_fail": 0,
-    }
-    mode_tag = '🌟 Proto-AGI ON' if proto_agi else '🤖 단일 LLM'
-    yield (CSS + f'<div style="background:{"#fff3e0" if proto_agi else "#e8f5e9"};padding:12px;border-radius:8px;">'
-           f'⚡ <b>병렬 평가 시작!</b> {len(pending)}개 · {n_workers}워커 · {mode_tag}</div>',
-           _build_progress_table(results, tasks), "", "", None)
-    # ── ★ 핵심: ThreadPoolExecutor + 빠른 yield (0.3초 간격) ──
-    with ThreadPoolExecutor(max_workers=n_workers) as executor:
-        futures = {}
-        for task in pending:
-            fut = executor.submit(_eval_single_task, task, run_id, api_key,
-                                  eval_model, judge_model, state, proto_agi)
-            futures[fut] = task
-        completed = set()
-        last_yield = time.time()
-        while len(completed) < len(futures):
-            newly_done = []
-            for fut in futures:
-                if fut in completed: continue
-                if fut.done():
-                    completed.add(fut)
-                    newly_done.append(fut)
-            for fut in newly_done:
-                try:
-                    tid, data = fut.result()
-                    results[tid] = data
-                    task_obj = futures[fut]
-                    with state["lock"]:
-                        state["pillar_done"][task_obj.pillar] = state["pillar_done"].get(task_obj.pillar, 0) + 1
-                except Exception as e:
-                    with state["lock"]:
-                        state["errors"].append(str(e)[:60])
-            # ★ 핵심 변경: 0.3초마다 yield (SSE heartbeat 역할)
-            now = time.time()
-            if now - last_yield >= 0.3 or newly_done:
-                last_yield = now
-                with state["lock"]:
-                    done_now = cached + state["done"]
-                    pct = min(int(done_now / total * 100), 100)
-                    progress(done_now / total, desc=f"{done_now}/{total} ({pct}%)")
-                    prog_html = CSS + _parallel_progress_html(state, len(pending))
-                yield (prog_html, _build_progress_table(results, tasks), "", "", None)
-            if len(completed) < len(futures):
-                time.sleep(0.2)  # ★ 1.0초→0.2초 (빠른 폴링)
-    # ── 최종 결과 ──
-    progress(1.0, desc="완료!")
     pillar_scores = {}
     for p in PILLAR_INFO:
         pt = [t for t in tasks if t.pillar == p and t.task_id in results]
-        if pt: pillar_scores[p] = np.mean([results[t.task_id]["score"] for t in pt])
     aether = calculate_aether_score(pillar_scores)
     csv_str = generate_csv(results, eval_model)
     csv_path = f"/tmp/aether_eval_{run_id}.csv"
     with open(csv_path, "w", encoding="utf-8") as f:
         f.write(csv_str)
     hf_status = upload_to_hf(csv_str, eval_model)
-    n_err = len(state["errors"])
-    err_msg = f" (⚠️ {n_err}개 오류)" if n_err > 0 else ""
-    restore_msg = f" (💾 {cached}개 복원)" if cached > 0 else ""
     mode_str = "🌟Proto-AGI" if proto_agi else "🤖단일LLM"
-    elapsed_total = int(time.time() - state["start_time"])
-    display_model = f"{eval_model} [{mode_str}]"
-    summary = _build_final_summary(results, tasks, pillar_scores, aether, display_model, hf_status)
-    table = _build_progress_table(results, tasks)
-    detail = _build_detail_view(results, tasks)
-    yield (f"🏁 완료! {mode_str}{restore_msg}{err_msg} AETHER={aether:.1f} ({elapsed_total}초)",
-           table, summary, detail, csv_path)
 # ════════════════════════════════════════════════════════════════
-# PART 11: Gradio App — ★ 타임아웃 방지 launch 설정
 # ════════════════════════════════════════════════════════════════
 PILLAR_CHOICES = ["전체"] + list(PILLAR_INFO.keys())
@@ -1065,12 +1207,13 @@ DIFF_CHOICES = ["전체", "basic", "intermediate", "advanced", "expert", "fronti
 HEADER = """
 <div style="text-align:center;padding:16px 0;">
-    <h1 style="margin:0;font-size:1.8em;">🌀 AETHER-Bench v0.3.0</h1>
     <h2 style="margin:4px 0;color:#555;font-size:1.1em;">LLM 평가 시스템 + Proto-AGI 오행 멀티에이전트</h2>
     <p style="color:#888;font-size:0.9em;max-width:700px;margin:8px auto;">
         120 Tasks · 5 Pillars · 19 Sub-dimensions · HAR Metric<br>
         🌟 <b>Proto-AGI</b>: 木→火→土→金→水 오행 파이프라인 + 마방진 소통 매트릭스<br>
-        🤖 <b>단일 LLM</b>: 순수 시험 평가 | CSV → HuggingFace PRIVATE 기록
     </p>
     <div style="display:flex;justify-content:center;gap:8px;margin-top:8px;flex-wrap:wrap;font-size:0.85em;">
         <span style="background:#e8f5e9;padding:2px 10px;border-radius:12px;">🌳 木 발상(仁)</span>
@@ -1081,6 +1224,7 @@ HEADER = """
     </div>
 </div>"""
 def create_app():
     with gr.Blocks(title="AETHER-Bench + Proto-AGI", theme=gr.themes.Soft(),
                    css=".gradio-container{max-width:1100px !important}") as app:
@@ -1115,7 +1259,12 @@ def create_app():
         with gr.Row():
             start_btn = gr.Button("▶️ 평가 시작 (이어하기)", variant="primary", size="lg", scale=2)
             fresh_btn = gr.Button("🚀 새로 시작", variant="secondary", size="lg", scale=2)
-            gr.HTML('<p style="color:#888;font-size:0.8em;margin:auto 0;">⚡ Fireworks 단일 백엔드<br>▶️ 중단시 이어서 | 🚀 초기화후 재시작 | CSV→HF PRIVATE</p>')
         with gr.Tabs():
             with gr.Tab("📊 진행"):
@@ -1129,29 +1278,36 @@ def create_app():
             with gr.Tab("💾 CSV"):
                 csv_file = gr.File(label="평가 결과 CSV")
-        def _run_resume(ak,em,jm,pagi,pf,df,mt,nw):
-            yield from run_evaluation(ak,em,jm,pf,df,mt,nw,pagi,False)
-        def _run_fresh(ak,em,jm,pagi,pf,df,mt,nw):
-            yield from run_evaluation(ak,em,jm,pf,df,mt,nw,pagi,True)
         all_inputs = [api_key, eval_model, judge_model,
                       proto_agi_toggle, pillar_dd, diff_dd, max_tasks, n_workers]
         start_btn.click(
-            fn=_run_resume,
             inputs=all_inputs,
-            outputs=[progress_html, table_html, summary_html, detail_html, csv_file],
         )
         fresh_btn.click(
-            fn=_run_fresh,
             inputs=all_inputs,
-            outputs=[progress_html, table_html, summary_html, detail_html, csv_file],
         )
         gr.Markdown("""---
-<center>AETHER-Bench v0.3.0 · Apache 2.0 · Ginigen AI (지니젠AI)<br>
 🌟 Proto-AGI 오행 파이프라인 | Fireworks: <b>kimi-k2p5</b> (피평가) + <b>kimi-k2p5</b> (Judge)<br>
-<code>HF_TOKEN</code> 설정 시 PRIVATE 자동 기록</center>""")
     return app
 # ════════════════════════════════════════════════════════════════
@@ -1162,17 +1318,15 @@ if __name__ == "__main__":
     stats = {}
     for t in ALL_TASKS:
         stats[t.pillar] = stats.get(t.pillar, 0) + 1
-    print(f"AETHER-Bench v0.3.0 + Proto-AGI: {len(ALL_TASKS)} tasks loaded")
     print(f"  Proto-AGI: 木_발상→火_표현→土_통합→金_심판→水_성찰 (5 agents)")
     for p, n in stats.items():
         info = PILLAR_INFO[p]
         print(f"  {info['icon']} {info['name']}: {n} ({int(info['weight']*100)}%)")
     app = create_app()
-    # ★ 타임아웃 방지 핵심 설정
-    app.queue(
-        default_concurrency_limit=1,  # 동시 사용자 1명씩
-    )
     app.launch(
         server_name="0.0.0.0",
         server_port=7860,

     return out
+# ════════════════════════════════════════════════════════════════
+# PART 10-B: Background evaluation engine (avoids session timeouts)
+# ════════════════════════════════════════════════════════════════
+# ★ Key change: generator (yield) → background thread + Timer polling
+#   - Button click → returns immediately (no SSE disconnect)
+#   - gr.Timer (2 s) → polls state → refreshes the UI (each poll is an independent request)
+#   - Page refresh → Timer resumes automatically → progress is shown right away
+#   - DB checkpoints → a run can be resumed in any situation
+#
+# Module-wide state shared between the button handlers, the background
+# evaluation thread and the Timer poll. Updates are made under "lock".
+_EVAL_STATE = {
+    "running": False,  # True from _start_eval until the run finishes, stops or fails
+    "stop_requested": False,  # set by _stop_eval, polled by _bg_evaluate
+    "finished": False,  # True once a run has ended (completed, stopped or failed)
+    "run_id": "",  # id from _make_run_id; also used in the CSV file name
+    "model": "",  # evaluated model name, shown in the final summary
+    "proto_agi": False,  # whether the run uses Proto-AGI mode
+    "done": 0,  # presumably incremented by _eval_single_task — TODO confirm
+    "total": 0,  # number of selected tasks
+    "cached": 0,  # selected tasks that already had a stored result at start
+    "pending_count": 0,  # selected tasks still to be evaluated at start
+    "errors": [],  # truncated exception messages from failed task futures
+    "active": [],  # presumably maintained by _eval_single_task — TODO confirm
+    "parse_ok": 0,  # presumably maintained by _eval_single_task — TODO confirm
+    "parse_fail": 0,  # presumably maintained by _eval_single_task — TODO confirm
+    "start_time": 0,  # time.time() when the worker thread starts; 0 means not started
+    "results": {},  # task_id -> result data
+    "tasks": [],  # tasks selected for the current run
+    "pillar_done": {},  # pillar -> pending tasks finished so far
+    "pillar_total": {},  # pillar -> pending tasks at start
+    "n_workers": 5,  # thread-pool size for the current run
+    "lock": threading.Lock(),  # non-reentrant; never re-acquire while holding it
+    "message": "",  # latest status line, rendered by _poll_status
+    "csv_path": None,  # path of the CSV written by _finalize_results
+    "hf_status": "",  # value returned by upload_to_hf
+}
+def _reset_eval_state():
+    """Put the per-run fields of ``_EVAL_STATE`` back to their idle defaults.
+
+    Fields not listed here (``run_id``, ``model``, ``proto_agi``, ``total``,
+    ``n_workers`` and the lock itself) are left untouched.
+    """
+    # Built on every call so each run gets its own fresh lists and dicts.
+    idle_defaults = {
+        # run flags
+        "running": False,
+        "stop_requested": False,
+        "finished": False,
+        # counters
+        "done": 0,
+        "cached": 0,
+        "pending_count": 0,
+        "parse_ok": 0,
+        "parse_fail": 0,
+        "start_time": 0,
+        # containers
+        "errors": [],
+        "active": [],
+        "results": {},
+        "tasks": [],
+        "pillar_done": {},
+        "pillar_total": {},
+        # outputs
+        "message": "",
+        "csv_path": None,
+        "hf_status": "",
+    }
+    with _EVAL_STATE["lock"]:
+        _EVAL_STATE.update(idle_defaults)
+def _bg_evaluate(api_key, eval_model, judge_model, tasks, run_id,
+                 n_workers, proto_agi):
+    """Background-thread body: run the evaluation and mirror progress into _EVAL_STATE.
+
+    Loads the results already stored for ``run_id``, submits every task that
+    has no result yet to a thread pool, collects finished futures in a polling
+    loop, and finally calls ``_finalize_results``. Any exception escaping the
+    body is turned into a status message and the run is marked finished.
+    Persisting individual results is presumably done inside
+    ``_eval_single_task`` (it receives ``run_id``) — TODO confirm.
+
+    Args:
+        api_key: API key forwarded to ``_eval_single_task``.
+        eval_model: name of the model being evaluated.
+        judge_model: name of the judge model, forwarded to ``_eval_single_task``.
+        tasks: tasks selected for this run.
+        run_id: identifier used to load previously stored results.
+        n_workers: maximum number of worker threads.
+        proto_agi: whether Proto-AGI mode is enabled.
+    """
+    global _EVAL_STATE
+    try:
+        # Split the selection into tasks that already have a result and
+        # tasks that still need to run.
+        results = dict(_load_all(run_id))
+        cached = sum(1 for t in tasks if t.task_id in results)
+        pending = [t for t in tasks if t.task_id not in results]
+        # Group pending tasks by pillar for the per-pillar progress counters.
+        pillar_tasks = {}
+        for t in pending:
+            pillar_tasks.setdefault(t.pillar, []).append(t)
+        with _EVAL_STATE["lock"]:
+            _EVAL_STATE["results"] = results
+            _EVAL_STATE["cached"] = cached
+            _EVAL_STATE["pending_count"] = len(pending)
+            _EVAL_STATE["total"] = len(tasks)
+            _EVAL_STATE["pillar_total"] = {p: len(ts) for p, ts in pillar_tasks.items()}
+            _EVAL_STATE["pillar_done"] = {p: 0 for p in pillar_tasks}
+            _EVAL_STATE["start_time"] = time.time()
+        # Nothing left to evaluate: go straight to aggregation.
+        if not pending:
+            with _EVAL_STATE["lock"]:
+                _EVAL_STATE["message"] = f"💾 전부 캐시 완료! ({cached}개)"
+            _finalize_results(tasks, results, eval_model, proto_agi)
+            return
+        with _EVAL_STATE["lock"]:
+            _EVAL_STATE["message"] = f"⚡ 시작! {len(pending)}개 과제 · {n_workers}워커"
+        # ── ThreadPoolExecutor ──
+        with ThreadPoolExecutor(max_workers=n_workers) as executor:
+            futures = {}
+            for task in pending:
+                # Stop submitting as soon as a stop has been requested.
+                if _EVAL_STATE["stop_requested"]:
+                    break
+                # The shared state dict is handed to every worker.
+                fut = executor.submit(_eval_single_task, task, run_id, api_key,
+                                      eval_model, judge_model, _EVAL_STATE, proto_agi)
+                futures[fut] = task
+            completed = set()
+            # Poll every 0.5 s until every submitted future has been collected.
+            while len(completed) < len(futures):
+                if _EVAL_STATE["stop_requested"]:
+                    # Cancel futures that have not started yet.
+                    # NOTE(review): leaving the `with` block still runs the
+                    # executor's shutdown(wait=True), so this thread blocks
+                    # until tasks already running finish, although `running`
+                    # has been cleared below — confirm that a restart during
+                    # that window is acceptable.
+                    executor.shutdown(wait=False, cancel_futures=True)
+                    with _EVAL_STATE["lock"]:
+                        _EVAL_STATE["message"] = "⏹️ 중단됨 (DB에 저장된 결과는 보존)"
+                        _EVAL_STATE["running"] = False
+                        _EVAL_STATE["finished"] = True
+                    return
+                for fut in list(futures):
+                    if fut in completed:
+                        continue
+                    if fut.done():
+                        completed.add(fut)
+                        try:
+                            # fut.result() re-raises any exception from the worker.
+                            tid, data = fut.result()
+                            with _EVAL_STATE["lock"]:
+                                _EVAL_STATE["results"][tid] = data
+                                task_obj = futures[fut]
+                                _EVAL_STATE["pillar_done"][task_obj.pillar] = \
+                                    _EVAL_STATE["pillar_done"].get(task_obj.pillar, 0) + 1
+                        except Exception as e:
+                            # Record a truncated message; the run continues.
+                            with _EVAL_STATE["lock"]:
+                                _EVAL_STATE["errors"].append(str(e)[:60])
+                time.sleep(0.5)
+        # ── Done → aggregate results ──
+        with _EVAL_STATE["lock"]:
+            results = dict(_EVAL_STATE["results"])
+        _finalize_results(tasks, results, eval_model, proto_agi)
+    except Exception as e:
+        # Last-resort handler so the UI never stays stuck in "running".
+        with _EVAL_STATE["lock"]:
+            _EVAL_STATE["message"] = f"❌ 치명적 오류: {str(e)[:100]}"
+            _EVAL_STATE["running"] = False
+            _EVAL_STATE["finished"] = True
+def _finalize_results(tasks, results, eval_model, proto_agi):
+    """Aggregate the final scores, write the CSV and upload it to HF.
+
+    Stores the CSV path, the upload status and a completion message in
+    ``_EVAL_STATE`` and marks the run as finished.
+
+    Args:
+        tasks: tasks selected for the run.
+        results: mapping of task_id to result data; each entry read here
+            has a "score" key.
+        eval_model: name of the evaluated model, passed to the CSV and
+            upload helpers.
+        proto_agi: selects the mode label used in the completion message.
+    """
+    global _EVAL_STATE
     pillar_scores = {}
     for p in PILLAR_INFO:
         pt = [t for t in tasks if t.pillar == p and t.task_id in results]
+        # Pillars with no scored task are left out of pillar_scores.
+        if pt:
+            pillar_scores[p] = np.mean([results[t.task_id]["score"] for t in pt])
     aether = calculate_aether_score(pillar_scores)
     csv_str = generate_csv(results, eval_model)
+    # The run id comes from shared state rather than a parameter.
+    run_id = _EVAL_STATE["run_id"]
     csv_path = f"/tmp/aether_eval_{run_id}.csv"
     with open(csv_path, "w", encoding="utf-8") as f:
         f.write(csv_str)
     hf_status = upload_to_hf(csv_str, eval_model)
+    # start_time is 0 until the worker thread sets it; report 0 seconds then.
+    elapsed = int(time.time() - _EVAL_STATE["start_time"]) if _EVAL_STATE["start_time"] else 0
     mode_str = "🌟Proto-AGI" if proto_agi else "🤖단일LLM"
+    # NOTE(review): these fields are read without the lock; presumably safe
+    # because the visible call sites run after the worker pool has exited —
+    # confirm.
+    cached = _EVAL_STATE["cached"]
+    n_err = len(_EVAL_STATE["errors"])
+    err_msg = f" (⚠️ {n_err}개 오류)" if n_err else ""
+    restore_msg = f" (💾 {cached}개 복원)" if cached else ""
+    with _EVAL_STATE["lock"]:
+        _EVAL_STATE["csv_path"] = csv_path
+        _EVAL_STATE["hf_status"] = hf_status
+        _EVAL_STATE["message"] = f"🏁 완료! {mode_str}{restore_msg}{err_msg} AETHER={aether:.1f} ({elapsed}초)"
+        _EVAL_STATE["running"] = False
+        _EVAL_STATE["finished"] = True
+def _start_eval(api_key, eval_model, judge_model, proto_agi,
+                pillar_filter, diff_filter, max_tasks, n_workers, fresh_start):
+    """Button-click handler: validate the inputs, then launch the run in a thread.
+
+    Returns at once with a one-line status string; the evaluation itself runs
+    in a daemon thread executing ``_bg_evaluate``.
+
+    Args:
+        api_key: Fireworks API key; falls back to the ``FIREWORKS_API_KEY``
+            environment variable when empty.
+        eval_model: name of the model to evaluate.
+        judge_model: name of the judge model.
+        proto_agi: enable Proto-AGI mode (caps the worker count at 5).
+        pillar_filter: pillar to keep, or "전체" for all pillars.
+        diff_filter: difficulty to keep, or "전체" for all difficulties.
+        max_tasks: maximum number of tasks to run after filtering.
+        n_workers: requested thread-pool size (clamped to at least 1).
+        fresh_start: when true, clear the stored results of this run id first.
+
+    Returns:
+        A status message for the UI.
+    """
+    busy_msg = "⚠️ 이미 평가가 진행 중입니다. 중단 후 다시 시작하세요."
+    # Cheap early exit so a busy run is reported before any input validation.
+    if _EVAL_STATE["running"]:
+        return busy_msg
+    api_key = (api_key or "").strip() or os.getenv("FIREWORKS_API_KEY", "")
+    if not api_key:
+        return "❌ Fireworks API Key를 입력하세요."
+    # ThreadPoolExecutor rejects max_workers <= 0, so never go below one.
+    n_workers = max(1, int(n_workers))
+    if proto_agi and n_workers > 5:
+        n_workers = 5
+    tasks = ALL_TASKS[:]
+    if pillar_filter != "전체":
+        tasks = [t for t in tasks if t.pillar == pillar_filter]
+    if diff_filter != "전체":
+        tasks = [t for t in tasks if t.difficulty == diff_filter]
+    tasks = tasks[:int(max_tasks)]
+    mode_suffix = "_PAGI" if proto_agi else ""
+    run_id = _make_run_id(eval_model + mode_suffix)
+    # Check and claim `running` in ONE critical section so that two clicks
+    # arriving together cannot both start a run. The state is reset inline
+    # instead of via _reset_eval_state(): that helper takes the same
+    # non-reentrant lock and clears `running`, which would reopen the race.
+    with _EVAL_STATE["lock"]:
+        if _EVAL_STATE["running"]:
+            return busy_msg
+        _EVAL_STATE.update({
+            "running": True,
+            "stop_requested": False,
+            "finished": False,
+            "run_id": run_id,
+            "model": eval_model,
+            "proto_agi": proto_agi,
+            "done": 0,
+            "total": len(tasks),
+            "cached": 0,
+            "pending_count": 0,
+            "errors": [],
+            "active": [],
+            "parse_ok": 0,
+            "parse_fail": 0,
+            "start_time": 0,
+            "results": {},
+            "tasks": tasks,
+            "pillar_done": {},
+            "pillar_total": {},
+            "n_workers": n_workers,
+            "message": "🔄 평가 준비 중...",
+            "csv_path": None,
+            "hf_status": "",
+        })
+    try:
+        # Stored results are cleared only after the run slot has been claimed.
+        if fresh_start:
+            _clear_run(run_id)
+        thread = threading.Thread(
+            target=_bg_evaluate,
+            args=(api_key, eval_model, judge_model, tasks, run_id, n_workers, proto_agi),
+            daemon=True,
+        )
+        thread.start()
+    except Exception:
+        # Release the claim so a failed launch does not block later starts.
+        with _EVAL_STATE["lock"]:
+            _EVAL_STATE["running"] = False
+        raise
+    mode_tag = '🌟 Proto-AGI' if proto_agi else '🤖 단일LLM'
+    return f"⚡ {mode_tag} 평가 시작! ({len(tasks)}개 과제, {n_workers}워커)"
+def _stop_eval():
+    """Stop-button handler: flag the running evaluation to stop.
+
+    Returns a status message; the flag is only raised while a run is active.
+    """
+    if not _EVAL_STATE["running"]:
+        return "ℹ️ 실행 중인 평가가 없습니다."
+    # The background thread polls this flag and winds the run down.
+    _EVAL_STATE["stop_requested"] = True
+    return "⏹️ 중단 요청됨... (현재 진행 중인 과제 완료 후 중단)"
+def _poll_status():
+    """Timer callback: build the full UI refresh from the shared run state.
+
+    Returns:
+        A 5-tuple ``(progress_html, table_html, summary_html, detail_html,
+        csv_path)``. The summary, the detail view and the CSV path are only
+        filled once a run has finished.
+    """
+    # Snapshot everything under the lock: worker threads mutate the same
+    # dict. The live progress HTML is rendered while the lock is held too,
+    # as the previous generator-based code did, so the renderer never sees
+    # a half-updated state.
+    with _EVAL_STATE["lock"]:
+        running = _EVAL_STATE["running"]
+        finished = _EVAL_STATE["finished"]
+        tasks = _EVAL_STATE.get("tasks", [])
+        results = dict(_EVAL_STATE.get("results", {}))
+        message = _EVAL_STATE.get("message", "")
+        csv_path = _EVAL_STATE.get("csv_path")
+        model = _EVAL_STATE.get("model", "?")
+        proto_agi = _EVAL_STATE.get("proto_agi")
+        hf_status = _EVAL_STATE.get("hf_status", "")
+        live_html = ""
+        if running:
+            live_html = _parallel_progress_html(
+                _EVAL_STATE, _EVAL_STATE.get("pending_count", 0))
+    # Nothing has run yet: show the minimal idle UI.
+    if not running and not finished and not results:
+        return ("ℹ️ ▶️ 평가 시작 또는 🚀 새로 시작을 눌러주세요.",
+                "", "", "", None)
+    # Running or finished.
+    if running:
+        prog_html = CSS + live_html
+    elif finished:
+        prog_html = f'<div style="background:#e8f5e9;padding:12px;border-radius:8px;">{message}</div>'
+    else:
+        prog_html = message
+    table_html = _build_progress_table(results, tasks) if tasks else ""
+    summary_html = ""
+    detail_html = ""
+    csv_out = None
+    if finished and tasks:
+        # Mean score per pillar over the tasks that have a result.
+        pillar_scores = {}
+        for p in PILLAR_INFO:
+            pt = [t for t in tasks if t.pillar == p and t.task_id in results]
+            if pt:
+                pillar_scores[p] = np.mean([results[t.task_id]["score"] for t in pt])
+        aether = calculate_aether_score(pillar_scores)
+        mode_label = "🌟Proto-AGI" if proto_agi else "🤖단일LLM"
+        display_model = f"{model} [{mode_label}]"
+        summary_html = _build_final_summary(results, tasks, pillar_scores, aether,
+                                             display_model, hf_status)
+        detail_html = _build_detail_view(results, tasks)
+        csv_out = csv_path
+    return (prog_html, table_html, summary_html, detail_html, csv_out)
 # ════════════════════════════════════════════════════════════════
+# PART 11: Gradio App — ★ Timer 폴링 기반 (세션 끊김 완전 방지)
 # ════════════════════════════════════════════════════════════════
 PILLAR_CHOICES = ["전체"] + list(PILLAR_INFO.keys())
 HEADER = """
 <div style="text-align:center;padding:16px 0;">
+    <h1 style="margin:0;font-size:1.8em;">🌀 AETHER-Bench v0.3.3</h1>
     <h2 style="margin:4px 0;color:#555;font-size:1.1em;">LLM 평가 시스템 + Proto-AGI 오행 멀티에이전트</h2>
     <p style="color:#888;font-size:0.9em;max-width:700px;margin:8px auto;">
         120 Tasks · 5 Pillars · 19 Sub-dimensions · HAR Metric<br>
         🌟 <b>Proto-AGI</b>: 木→火→土→金→水 오행 파이프라인 + 마방진 소통 매트릭스<br>
+        🤖 <b>단일 LLM</b>: 순수 시험 평가 | CSV → HuggingFace PRIVATE 기록<br>
+        ⚡ <b>v0.3.3</b>: 백그라운드 실행 — 세션 끊김/새로고침 시에도 평가 계속 진행
     </p>
     <div style="display:flex;justify-content:center;gap:8px;margin-top:8px;flex-wrap:wrap;font-size:0.85em;">
         <span style="background:#e8f5e9;padding:2px 10px;border-radius:12px;">🌳 木 발상(仁)</span>
     </div>
 </div>"""
 def create_app():
     with gr.Blocks(title="AETHER-Bench + Proto-AGI", theme=gr.themes.Soft(),
                    css=".gradio-container{max-width:1100px !important}") as app:
         with gr.Row():
             start_btn = gr.Button("▶️ 평가 시작 (이어하기)", variant="primary", size="lg", scale=2)
             fresh_btn = gr.Button("🚀 새로 시작", variant="secondary", size="lg", scale=2)
+            stop_btn = gr.Button("⏹️ 중단", variant="stop", size="lg", scale=1)
+            gr.HTML('''<p style="color:#888;font-size:0.8em;margin:auto 0;">
+                ⚡ 백그라운드 실행 — 페이지 새로고침해도 평가 계속 진행<br>
+                ▶️ 중단시 이어서 | 🚀 초기화후 재시작 | ⏹️ 긴급 중단</p>''')
+        status_msg = gr.Textbox(label="상태", interactive=False, max_lines=1)
         with gr.Tabs():
             with gr.Tab("📊 진행"):
             with gr.Tab("💾 CSV"):
                 csv_file = gr.File(label="평가 결과 CSV")
+        # ── Timer: 2초마다 UI 갱신 (SSE 끊김과 무관) ──
+        timer = gr.Timer(value=2, active=True)
+        timer.tick(
+            fn=_poll_status,
+            outputs=[progress_html, table_html, summary_html, detail_html, csv_file],
+        )
+        # ── 버튼: 즉시 리턴 (generator 아님!) ──
         all_inputs = [api_key, eval_model, judge_model,
                       proto_agi_toggle, pillar_dd, diff_dd, max_tasks, n_workers]
         start_btn.click(
+            fn=lambda *args: _start_eval(*args, fresh_start=False),
             inputs=all_inputs,
+            outputs=[status_msg],
         )
         fresh_btn.click(
+            fn=lambda *args: _start_eval(*args, fresh_start=True),
             inputs=all_inputs,
+            outputs=[status_msg],
+        )
+        stop_btn.click(
+            fn=_stop_eval,
+            outputs=[status_msg],
         )
         gr.Markdown("""---
+<center>AETHER-Bench v0.3.3 · Apache 2.0 · Ginigen AI (지니젠AI)<br>
 🌟 Proto-AGI 오행 파이프라인 | Fireworks: <b>kimi-k2p5</b> (피평가) + <b>kimi-k2p5</b> (Judge)<br>
+⚡ 백그라운드 실행 — 세션 끊김 완전 방지 | <code>HF_TOKEN</code> 설정 시 PRIVATE 자동 기록</center>""")
     return app
 # ════════════════════════════════════════════════════════════════
     stats = {}
     for t in ALL_TASKS:
         stats[t.pillar] = stats.get(t.pillar, 0) + 1
+    print(f"AETHER-Bench v0.3.3 + Proto-AGI: {len(ALL_TASKS)} tasks loaded")
     print(f"  Proto-AGI: 木_발상→火_표현→土_통합→金_심판→水_성찰 (5 agents)")
+    print(f"  ★ Background thread + Timer polling (session-safe)")
     for p, n in stats.items():
         info = PILLAR_INFO[p]
         print(f"  {info['icon']} {info['name']}: {n} ({int(info['weight']*100)}%)")
     app = create_app()
+    app.queue(default_concurrency_limit=2)
     app.launch(
         server_name="0.0.0.0",
         server_port=7860,