seawolf2357 committed on
Commit
03eaf8c
ยท
verified ยท
1 Parent(s): 90ead22

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +461 -54
app.py CHANGED
@@ -1,7 +1,8 @@
1
  """
2
- AETHER-Bench v0.2.0 โ€” LLM ํ‰๊ฐ€ ์‹œ์Šคํ…œ
3
- ========================================
4
- 120๊ฐœ ๊ณผ์ œ๋กœ LLM์„ ์ˆœ์ˆ˜ ์‹œํ—˜ ํ‰๊ฐ€ (Proto-AGI ๋ฏธ๋ฐœ๋™)
 
5
  ํ‰๊ฐ€ โ†’ Judge ์ฑ„์  โ†’ CSV โ†’ HuggingFace PRIVATE ๋ฐ์ดํ„ฐ์…‹
6
 
7
  Author: Ginigen AI (์ง€๋‹ˆ์  AI) โ€” Choi Sunyoung
@@ -129,11 +130,43 @@ def generate_all_tasks() -> List[EvalTask]:
129
  ALL_TASKS = generate_all_tasks()
130
 
131
  # โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•
132
- # PART 4: Fireworks API ํ˜ธ์ถœ
133
  # โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•
134
 
135
- def call_llm(prompt, system="", api_key="", model="accounts/fireworks/models/glm-4p7",
136
- max_tokens=4096, temperature=0.6):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
137
  messages = []
138
  if system:
139
  messages.append({"role": "system", "content": system})
@@ -152,7 +185,309 @@ def call_llm(prompt, system="", api_key="", model="accounts/fireworks/models/glm
152
  if attempt < 2:
153
  time.sleep(3 * (attempt + 1))
154
  else:
155
- return f"[API_ERROR] {e}"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
156
 
157
  # โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•
158
  # PART 5: LLM-as-Judge ์ฑ„์ 
@@ -167,15 +502,19 @@ def build_judge_prompt(task, response):
167
  rubric = task.scoring_rubric
168
  rubric_text = "\n".join([f" - {k} (x{v['weight']}): {v['desc']}" for k, v in rubric.items()])
169
  expected = task.expected_behavior or "N/A"
 
 
170
  return f"""[๊ณผ์ œ] {task.task_id} | {task.pillar} | {task.difficulty}
171
  [ํ”„๋กฌํ”„ํŠธ] {task.prompt[:1500]}
172
  [๊ธฐ๋Œ€] {expected[:500]}
173
- [ํ”ผํ‰๊ฐ€ ์‘๋‹ต] {response[:3000]}
174
  [๋ฃจ๋ธŒ๋ฆญ]
175
  {rubric_text}
176
  ์œ„ ๋ฃจ๋ธŒ๋ฆญ์— ๋”ฐ๋ผ JSON์œผ๋กœ ์ฑ„์ ."""
177
 
178
  def parse_judge_response(text, rubric_keys):
 
 
179
  try:
180
  match = re.search(r'\{[^{}]*"scores"\s*:\s*\{[^{}]*\}[^{}]*\}', text, re.DOTALL)
181
  if match:
@@ -187,6 +526,34 @@ def parse_judge_response(text, rubric_keys):
187
  return {"scores": scores, "comment": data.get("comment", "")}
188
  except:
189
  pass
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
190
  return {"scores": {k: 0.5 for k in rubric_keys}, "comment": "ํŒŒ์‹ฑ์‹คํŒจ"}
191
 
192
  def compute_weighted_score(scores, rubric):
@@ -396,22 +763,22 @@ def _build_detail_view(results, tasks):
396
 
397
  from concurrent.futures import ThreadPoolExecutor, as_completed
398
 
399
- def _eval_single_task(task, run_id, api_key, eval_model, judge_model, state):
400
  """๋‹จ์ผ ๊ณผ์ œ ํ‰๊ฐ€ (๋ชจ๋ธํ˜ธ์ถœ + Judge์ฑ„์ ). ์›Œ์ปค ์Šค๋ ˆ๋“œ์—์„œ ์‹คํ–‰."""
401
  try:
402
- # Step 1: ํ”ผํ‰๊ฐ€ ๋ชจ๋ธ ํ˜ธ์ถœ
403
- model_response = call_llm(task.prompt, api_key=api_key, model=eval_model)
404
 
405
- if model_response.startswith("[API_ERROR]"):
406
  _save_result(run_id, task.task_id, model_response, "{}", 0)
407
  with state["lock"]:
408
  state["done"] += 1
409
  state["errors"].append(task.task_id)
410
  return task.task_id, {"response": model_response, "judge": "{}", "score": 0}
411
 
412
- # Step 2: Judge ์ฑ„์ 
413
  judge_prompt = build_judge_prompt(task, model_response)
414
- judge_raw = call_llm(judge_prompt, system=JUDGE_SYSTEM, api_key=api_key,
415
  model=judge_model, temperature=0.3)
416
 
417
  rubric_keys = list(task.scoring_rubric.keys())
@@ -481,14 +848,23 @@ def _parallel_progress_html(state, total):
481
  return out
482
 
483
 
484
- def run_evaluation(api_key, eval_model, judge_model, pillar_filter, diff_filter,
485
- max_tasks, n_workers, fresh_start, progress=gr.Progress()):
486
- """๋ฉ”์ธ ํ‰๊ฐ€ โ€” ๊ธฐ๋‘ฅ๋ณ„ ๋ณ‘๋ ฌ ์‹คํ–‰"""
487
- api_key = api_key.strip() or os.getenv("FIREWORKS_API_KEY", "")
488
- if not api_key:
489
- yield "โŒ API Key๋ฅผ ์ž…๋ ฅํ•˜์„ธ์š”.", "", "", "", None
 
 
 
 
490
  return
491
 
 
 
 
 
 
492
  # โ”€โ”€ ๊ณผ์ œ ํ•„ํ„ฐ๋ง โ”€โ”€
493
  tasks = ALL_TASKS[:]
494
  if pillar_filter != "์ „์ฒด":
@@ -497,7 +873,9 @@ def run_evaluation(api_key, eval_model, judge_model, pillar_filter, diff_filter,
497
  tasks = [t for t in tasks if t.difficulty == diff_filter]
498
  tasks = tasks[:int(max_tasks)]
499
 
500
- run_id = _make_run_id(eval_model)
 
 
501
  if fresh_start:
502
  _clear_run(run_id)
503
 
@@ -549,16 +927,17 @@ def run_evaluation(api_key, eval_model, judge_model, pillar_filter, diff_filter,
549
  "pillar_done": {p: 0 for p in pillar_tasks},
550
  }
551
 
552
- yield (CSS + f'<div style="background:#e8f5e9;padding:12px;border-radius:8px;margin:8px 0;">'
553
- f'โšก <b>๋ณ‘๋ ฌ ํ‰๊ฐ€ ์‹œ์ž‘!</b> {len(pending)}๊ฐœ ๊ณผ์ œ ยท {n_pillars}๊ฐœ ๊ธฐ๋‘ฅ ๋™์‹œ ยท {n_workers}๊ฐœ ์›Œ์ปค'
554
- f'</div>', _build_progress_table(results, tasks), "", "", None)
 
555
 
556
  # โ”€โ”€ ThreadPoolExecutor ๋ณ‘๋ ฌ ์‹คํ–‰ โ”€โ”€
557
  with ThreadPoolExecutor(max_workers=n_workers) as executor:
558
  futures = {}
559
  for task in pending:
560
- fut = executor.submit(_eval_single_task, task, run_id, api_key,
561
- eval_model, judge_model, state)
562
  futures[fut] = task
563
 
564
  completed = set()
@@ -614,12 +993,14 @@ def run_evaluation(api_key, eval_model, judge_model, pillar_filter, diff_filter,
614
  n_err = len(state["errors"])
615
  err_msg = f" (โš ๏ธ {n_err}๊ฐœ ์˜ค๋ฅ˜)" if n_err > 0 else ""
616
  restore_msg = f" (๐Ÿ’พ {cached}๊ฐœ ๋ณต์›)" if cached > 0 else ""
 
617
 
618
- summary = _build_final_summary(results, tasks, pillar_scores, aether, eval_model, hf_status)
 
619
  table = _build_progress_table(results, tasks)
620
  detail = _build_detail_view(results, tasks)
621
 
622
- yield (f"๐Ÿ ํ‰๊ฐ€ ์™„๋ฃŒ!{restore_msg}{err_msg} AETHER Score: {aether:.1f}",
623
  table, summary, detail, csv_path)
624
 
625
 
@@ -632,35 +1013,56 @@ DIFF_CHOICES = ["์ „์ฒด", "basic", "intermediate", "advanced", "expert", "fronti
632
 
633
  HEADER = """
634
  <div style="text-align:center;padding:16px 0;">
635
- <h1 style="margin:0;font-size:1.8em;">๐ŸŒ€ AETHER-Bench v0.2.0</h1>
636
- <h2 style="margin:4px 0;color:#555;font-size:1.1em;">LLM ์ˆœ์ˆ˜ ์‹œํ—˜ ํ‰๊ฐ€ ์‹œ์Šคํ…œ</h2>
637
- <p style="color:#888;font-size:0.9em;max-width:650px;margin:8px auto;">
638
  120 Tasks ยท 5 Pillars ยท 19 Sub-dimensions ยท HAR Metric<br>
639
- <b>Proto-AGI ๋ฏธ๋ฐœ๋™</b> โ€” ๋ฐ์ดํ„ฐ์…‹๋งŒ์œผ๋กœ 1:1 ์‹œํ—˜ โ†’ HuggingFace PRIVATE ๊ธฐ๋ก
 
640
  </p>
641
  <div style="display:flex;justify-content:center;gap:8px;margin-top:8px;flex-wrap:wrap;font-size:0.85em;">
642
- <span style="background:#fff3e0;padding:2px 10px;border-radius:12px;">โœฆ ์ฐฝ๋ฐœ 20%</span>
643
- <span style="background:#f3e5f5;padding:2px 10px;border-radius:12px;">โ—‰ ๋ฉ”ํƒ€์ธ์ง€ 25%</span>
644
- <span style="background:#e0f7fa;padding:2px 10px;border-radius:12px;">โ—ˆ ์ž๊ฐ€์ง„ํ™” 15%</span>
645
- <span style="background:#e8f5e9;padding:2px 10px;border-radius:12px;">โ—ฌ ๋‹ค์ค‘์ง€๋Šฅ 15%</span>
646
- <span style="background:#ffebee;padding:2px 10px;border-radius:12px;">โ˜ฏ ์ƒ์ƒ์ƒ๊ทน 25%</span>
647
  </div>
648
  </div>"""
649
 
650
  def create_app():
651
- with gr.Blocks(title="AETHER-Bench Evaluator", theme=gr.themes.Soft(),
652
  css=".gradio-container{max-width:1100px !important}") as app:
653
  gr.HTML(HEADER)
654
 
655
  with gr.Row():
656
- api_key = gr.Textbox(label="๐Ÿ”‘ Fireworks API Key", type="password",
657
- placeholder="fw_...", value=os.getenv("FIREWORKS_API_KEY", ""), scale=3)
 
 
658
 
659
  with gr.Row():
660
- eval_model = gr.Textbox(label="๐Ÿค– ํ”ผํ‰๊ฐ€ ๋ชจ๋ธ",
661
- value="accounts/fireworks/models/glm-4p7", scale=3)
662
- judge_model = gr.Textbox(label="โš–๏ธ ์‹ฌํŒ ๋ชจ๋ธ",
663
- value="accounts/fireworks/models/kimi-k2p5", scale=3)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
664
 
665
  with gr.Row():
666
  pillar_dd = gr.Dropdown(PILLAR_CHOICES, value="์ „์ฒด", label="๊ธฐ๋‘ฅ ํ•„ํ„ฐ", scale=2)
@@ -671,7 +1073,7 @@ def create_app():
671
  with gr.Row():
672
  start_btn = gr.Button("โ–ถ๏ธ ํ‰๊ฐ€ ์‹œ์ž‘ (์ด์–ดํ•˜๊ธฐ)", variant="primary", size="lg", scale=2)
673
  fresh_btn = gr.Button("๐Ÿš€ ์ƒˆ๋กœ ์‹œ์ž‘", variant="secondary", size="lg", scale=2)
674
- gr.HTML('<p style="color:#888;font-size:0.8em;margin:auto 0;">โšก ๊ธฐ๋‘ฅ๋ณ„ ๋ณ‘๋ ฌ ์‹คํ–‰ โ€” 5๊ฐœ ๊ธฐ๋‘ฅ ๋™์‹œ ํ‰๊ฐ€<br>โ–ถ๏ธ ์ค‘๋‹จ์‹œ ์ด์–ด์„œ | ๐Ÿš€ ์ดˆ๊ธฐํ™”ํ›„ ์žฌ์‹œ์ž‘ | CSVโ†’HF PRIVATE</p>')
675
 
676
  with gr.Tabs():
677
  with gr.Tab("๐Ÿ“Š ์ง„ํ–‰"):
@@ -685,25 +1087,29 @@ def create_app():
685
  with gr.Tab("๐Ÿ’พ CSV"):
686
  csv_file = gr.File(label="ํ‰๊ฐ€ ๊ฒฐ๊ณผ CSV")
687
 
688
- def _run_resume(ak,em,jm,pf,df,mt,nw):
689
- yield from run_evaluation(ak,em,jm,pf,df,mt,nw,False)
690
- def _run_fresh(ak,em,jm,pf,df,mt,nw):
691
- yield from run_evaluation(ak,em,jm,pf,df,mt,nw,True)
 
 
 
692
 
693
  start_btn.click(
694
  fn=_run_resume,
695
- inputs=[api_key, eval_model, judge_model, pillar_dd, diff_dd, max_tasks, n_workers],
696
  outputs=[progress_html, table_html, summary_html, detail_html, csv_file],
697
  )
698
  fresh_btn.click(
699
  fn=_run_fresh,
700
- inputs=[api_key, eval_model, judge_model, pillar_dd, diff_dd, max_tasks, n_workers],
701
  outputs=[progress_html, table_html, summary_html, detail_html, csv_file],
702
  )
703
 
704
  gr.Markdown("""---
705
- <center>AETHER-Bench v0.2.0 ยท Apache 2.0 ยท Ginigen AI (์ง€๋‹ˆ์  AI)<br>
706
- <code>HF_TOKEN</code> ์„ค์ • ์‹œ <b>seawolf2357/AETHER-Bench-Results</b> (PRIVATE)์— ์ž๋™ ๊ธฐ๋ก</center>""")
 
707
  return app
708
 
709
  # โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•
@@ -714,7 +1120,8 @@ if __name__ == "__main__":
714
  stats = {}
715
  for t in ALL_TASKS:
716
  stats[t.pillar] = stats.get(t.pillar, 0) + 1
717
- print(f"AETHER-Bench Evaluator: {len(ALL_TASKS)} tasks loaded")
 
718
  for p, n in stats.items():
719
  info = PILLAR_INFO[p]
720
  print(f" {info['icon']} {info['name']}: {n} ({int(info['weight']*100)}%)")
 
1
  """
2
+ AETHER-Bench v0.3.0 โ€” LLM ํ‰๊ฐ€ ์‹œ์Šคํ…œ + Proto-AGI ์˜คํ–‰ ๋ฉ€ํ‹ฐ์—์ด์ „ํŠธ
3
+ =====================================================================
4
+ 120๊ฐœ ๊ณผ์ œ ร— Proto-AGI(ๆœจโ†’็ซโ†’ๅœŸโ†’้‡‘โ†’ๆฐด) or ๋‹จ์ผLLM ํ‰๊ฐ€
5
+ ๋งˆ๋ฐฉ์ง„ ์†Œํ†ต ๋งคํŠธ๋ฆญ์Šค + ์ƒ์ƒยท์ƒ๊ทน + ๆฐด ๋ฉ”ํƒ€ ์žฌ๊ฒ€ํ† 
6
  ํ‰๊ฐ€ โ†’ Judge ์ฑ„์  โ†’ CSV โ†’ HuggingFace PRIVATE ๋ฐ์ดํ„ฐ์…‹
7
 
8
  Author: Ginigen AI (์ง€๋‹ˆ์  AI) โ€” Choi Sunyoung
 
130
  ALL_TASKS = generate_all_tasks()
131
 
132
  # โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•
133
+ # PART 4: ๋“€์–ผ ๋ฐฑ์—”๋“œ API (Groq + Fireworks)
134
  # โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•
135
 
136
# Model IDs known to be served by Groq; used by _detect_backend to route calls
# (anything else that starts with "accounts/" goes to Fireworks).
GROQ_MODELS = {"qwen/qwen3-32b", "deepseek-r1-distill-llama-70b", "llama-3.3-70b-versatile",
               "llama-3.1-8b-instant", "meta-llama/llama-4-scout-17b-16e-instruct",
               "mistral-saba-24b", "gemma2-9b-it", "qwen-qwq-32b"}
139
+
140
def _call_groq(prompt, system="", api_key="", model="qwen/qwen3-32b",
               max_tokens=8192, temperature=0.6):
    """Call the Groq chat-completions API via its SDK (non-streaming).

    Args:
        prompt: User message content.
        system: Optional system message; prepended only when non-empty.
        api_key: Groq API key.
        model: Groq model id.
        max_tokens: Forwarded as ``max_completion_tokens``.
        temperature: Sampling temperature.

    Returns:
        The completion text, or an ``"[API_ERROR:Groq] ..."`` string after
        three failed attempts (callers detect failure via this prefix).
    """
    # Function-scope import — presumably so the module loads without the
    # groq SDK installed; confirm against deployment requirements.
    from groq import Groq
    client = Groq(api_key=api_key)
    messages = []
    if system:
        messages.append({"role": "system", "content": system})
    messages.append({"role": "user", "content": prompt})
    # Up to 3 attempts with linear backoff (3s then 6s) between retries.
    for attempt in range(3):
        try:
            resp = client.chat.completions.create(
                model=model, messages=messages,
                temperature=temperature, max_completion_tokens=max_tokens,
                top_p=0.95, stream=False, stop=None,
            )
            content = resp.choices[0].message.content or ""
            # Strip qwen3-style <think>...</think> reasoning tags.
            if "<think>" in content:
                content = re.sub(r'<think>.*?</think>\s*', '', content, flags=re.DOTALL).strip()
            return content
        except Exception as e:
            if attempt < 2:
                time.sleep(3 * (attempt + 1))
            else:
                return f"[API_ERROR:Groq] {e}"
166
+
167
+ def _call_fireworks(prompt, system="", api_key="", model="accounts/fireworks/models/kimi-k2p5",
168
+ max_tokens=8192, temperature=0.6):
169
+ """Fireworks REST API ํ˜ธ์ถœ"""
170
  messages = []
171
  if system:
172
  messages.append({"role": "system", "content": system})
 
185
  if attempt < 2:
186
  time.sleep(3 * (attempt + 1))
187
  else:
188
+ return f"[API_ERROR:Fireworks] {e}"
189
+
190
def _detect_backend(model_name):
    """Pick the API backend ("groq" or "fireworks") from a model name."""
    # Fireworks models use the "accounts/..." path convention; everything
    # else — including every explicitly listed Groq model — routes to Groq.
    routed_to_fireworks = model_name.startswith("accounts/") and model_name not in GROQ_MODELS
    return "fireworks" if routed_to_fireworks else "groq"
195
+
196
def call_llm(prompt, system="", api_key="", model="qwen/qwen3-32b",
             max_tokens=8192, temperature=0.6, backend=None):
    """Unified LLM entry point — routes to the Groq or Fireworks caller.

    When ``backend`` is None the target is inferred from the model name
    via _detect_backend; pass "groq"/"fireworks" to force a backend.
    """
    chosen = _detect_backend(model) if backend is None else backend
    caller = _call_groq if chosen == "groq" else _call_fireworks
    return caller(prompt, system, api_key, model, max_tokens, temperature)
205
+
206
+ # โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•
207
+ # PART 4-B: ๋‹ค์ค‘ ๋ผ์šด๋“œ ์‹คํ–‰๊ธฐ (mutual_verification, feedback_incorporation)
208
+ # โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•
209
+
210
def _run_mutual_verification(topic, api_key, model):
    """Chain the 4-round generate/critique/revise/meta cycle as separate API calls.

    Each round's prompt embeds truncated output from earlier rounds; the
    labelled transcripts of all four rounds are joined with blank lines.
    """
    rounds = []

    # R1: generation — initial analysis report.
    r1 = call_llm(f"[R1-์ƒ์ƒ] '{topic}'์— ๋Œ€ํ•ด 500๋‹จ์–ด ๋ถ„์„ ๋ณด๊ณ ์„œ๋ฅผ ์ž‘์„ฑํ•˜์„ธ์š”. "
                  "๊ตฌ์ฒด์  ๋ฐ์ดํ„ฐ์™€ ๊ทผ๊ฑฐ๋ฅผ ํฌํ•จํ•˜์„ธ์š”.",
                  api_key=api_key, model=model)
    rounds.append(f"[R1-์ƒ์ƒ ๋ณด๊ณ ์„œ]\n{r1}")

    # R2: critique of the R1 report.
    r2 = call_llm(f"[R2-์ƒ๊ทน ๋น„ํŒ] ์•„๋ž˜ ๋ณด๊ณ ์„œ๋ฅผ ๋ƒ‰์ฒ ํ•˜๊ฒŒ ๋น„ํŒ ๊ฒ€ํ† ํ•˜์„ธ์š”.\n"
                  f"์‚ฌ์‹ค ์˜ค๋ฅ˜, ๋…ผ๋ฆฌ์  ์•ฝ์ , ๋ˆ„๋ฝ๋œ ๊ด€์ , ๊ณผ์žฅ๋œ ์ฃผ์žฅ์„ ์ง€์ ํ•˜์„ธ์š”.\n\n"
                  f"--- ์›๋ฌธ ๋ณด๊ณ ์„œ ---\n{r1[:3000]}",
                  api_key=api_key, model=model)
    rounds.append(f"[R2-์ƒ๊ทน ๋น„ํŒ]\n{r2}")

    # R3: revision incorporating the critique.
    r3 = call_llm(f"[R3-์ƒ์ƒ ์ˆ˜์ •] ๋น„ํŒ์„ ๋ฐ˜์˜ํ•˜์—ฌ ์›๋ฌธ ๋ณด๊ณ ์„œ๋ฅผ ์ˆ˜์ •ํ•˜์„ธ์š”.\n\n"
                  f"--- ์›๋ฌธ ---\n{r1[:2000]}\n\n--- ๋น„ํŒ ---\n{r2[:2000]}",
                  api_key=api_key, model=model)
    rounds.append(f"[R3-์ƒ์ƒ ์ˆ˜์ •]\n{r3}")

    # R4: meta-analysis of the whole cycle (the key round).
    r4 = call_llm(f"[R4-๋ฉ”ํƒ€ ๋ถ„์„] ์œ„ 3๋ผ์šด๋“œ(์ƒ์ƒโ†’์ƒ๊ทนโ†’์ˆ˜์ •) ์‚ฌ์ดํด์˜ ๋ฉ”ํƒ€ ๋ถ„์„์„ ์ˆ˜ํ–‰ํ•˜์„ธ์š”.\n"
                  f"๋ฐ˜๋“œ์‹œ ๋‹ค์Œ์„ ํฌํ•จ:\n"
                  f"1. ๋ฐœ๊ฒฌ๋œ ํ™˜๊ฐ/์˜ค๋ฅ˜ ์œ ํ˜• ๋ถ„๋ฅ˜\n"
                  f"2. ์ƒ๊ทน ๋‹จ๊ณ„์˜ ๊ธฐ์—ฌ๋„ ์ •๋Ÿ‰ ํ‰๊ฐ€\n"
                  f"3. ์‚ฌ์ดํด์„ ํ†ตํ•œ ํ’ˆ์งˆ ํ–ฅ์ƒ ๋ถ„์„\n\n"
                  f"--- R1 ์š”์•ฝ ---\n{r1[:1000]}\n--- R2 ์š”์•ฝ ---\n{r2[:1000]}\n--- R3 ์š”์•ฝ ---\n{r3[:1000]}",
                  api_key=api_key, model=model)
    rounds.append(f"[R4-๋ฉ”ํƒ€ ๋ถ„์„]\n{r4}")

    return "\n\n".join(rounds)
244
+
245
+
246
def _run_feedback_incorporation(prompt_json, api_key, model):
    """Run a feedback-incorporation task as a chain of per-round API calls.

    ``prompt_json`` is expected to be a JSON object with a ``"topic"`` and a
    ``"rounds"`` list of ``{"instruction": ..., "feedback": ...}``. Each
    round's prompt is built from the previous round's (truncated) output and
    that round's feedback.

    Args:
        prompt_json: JSON round spec, or an arbitrary prompt string.
        api_key: API key forwarded to call_llm.
        model: Model id forwarded to call_llm.

    Returns:
        All labelled round outputs (with interleaved feedback notes) joined
        by blank lines — or a single call_llm response when ``prompt_json``
        is not valid JSON.
    """
    try:
        data = json.loads(prompt_json)
    except (ValueError, TypeError):
        # Narrowed from a bare `except:` — json.loads raises ValueError
        # (JSONDecodeError) for malformed JSON and TypeError for non-string
        # input; anything else should surface rather than be swallowed.
        return call_llm(prompt_json, api_key=api_key, model=model)

    topic = data.get("topic", "")
    rounds_spec = data.get("rounds", [])
    outputs = []
    prev = ""

    for i, rd in enumerate(rounds_spec):
        instruction = rd.get("instruction", "")
        feedback = rd.get("feedback")

        if i == 0:
            # Opening round: no prior version to reference.
            prompt = f"'{topic}' โ€” {instruction}."
        elif feedback:
            # Middle rounds: revise the previous version per the feedback.
            prompt = (f"์•„๋ž˜๋Š” ์ด์ „ ๋ฒ„์ „๊ณผ ํ”ผ๋“œ๋ฐฑ์ž…๋‹ˆ๋‹ค. ํ”ผ๋“œ๋ฐฑ์„ ๋ฐ˜์˜ํ•˜์—ฌ {instruction}.\n\n"
                      f"--- ์ด์ „ ๋ฒ„์ „ ---\n{prev[:2500]}\n\n"
                      f"--- ํ”ผ๋“œ๋ฐฑ ---\n{feedback}")
        else:
            # Final round: self-assess the last version.
            prompt = (f"์•„๋ž˜๋Š” ์ตœ์ข… ๋ฒ„์ „์ž…๋‹ˆ๋‹ค. {instruction}.\n"
                      f"๋ณ€๊ฒฝ์ ์„ ์ •๋Ÿ‰์ ์œผ๋กœ ๋ถ„์„ํ•˜๊ณ  ์ž๊ธฐ ํ‰๊ฐ€๋ฅผ ํฌํ•จํ•˜์„ธ์š”.\n\n"
                      f"--- ์ตœ์ข… ๋ฒ„์ „ ---\n{prev[:3000]}")

        resp = call_llm(prompt, api_key=api_key, model=model)
        outputs.append(f"[๋ผ์šด๋“œ {i+1}: {instruction}]\n{resp}")
        prev = resp

        # Surface the feedback in the transcript before the next round.
        if feedback and i < len(rounds_spec) - 1:
            outputs.append(f"[ํ”ผ๋“œ๋ฐฑ] {feedback}")

    return "\n\n".join(outputs)
282
+
283
+
284
+ def _is_multi_round(task):
285
+ """๋‹ค์ค‘ ๋ผ์šด๋“œ ๊ณผ์ œ ์—ฌ๋ถ€ ํŒ๋ณ„"""
286
+ return task.sub_dimension in ("mutual_verification", "feedback_incorporation")
287
+
288
+
289
+ # โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•
290
+ # PART 4-C: Proto-AGI ์˜คํ–‰ ๋ฉ€ํ‹ฐ์—์ด์ „ํŠธ ์—”์ง„
291
+ # โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•
292
+
293
# โ”€โ”€ 5x5 magic-square communication matrix โ”€โ”€
# Every row and column of the magic square sums to 65, so row-normalizing
# yields listening weights with no agent systematically favored.
MAGIC_SQUARE_5x5 = np.array([
    [17, 24, 1, 8, 15],
    [23, 5, 7, 14, 16],
    [ 4, 6, 13, 20, 22],
    [10, 12, 19, 21, 3],
    [11, 18, 25, 2, 9]
], dtype=np.float64)
# Row-stochastic: COMM_MATRIX[listener][speaker] is the reference weight.
COMM_MATRIX = MAGIC_SQUARE_5x5 / MAGIC_SQUARE_5x5.sum(axis=1, keepdims=True)
303
+
304
+ def _comm_level(weight):
305
+ if weight >= 0.30: return "ํ•ต์‹ฌ ์ฐธ์กฐ"
306
+ elif weight >= 0.18: return "์ฃผ์š” ์ฐธ์กฐ"
307
+ elif weight >= 0.10: return "์ผ๋ฐ˜ ์ฐธ์กฐ"
308
+ else: return "๊ฒฝ๋Ÿ‰ ์ฐธ์กฐ"
309
+
310
# โ”€โ”€ Five-element (ไบ”่กŒ) agent definitions โ”€โ”€
# index maps each agent to its row/column in COMM_MATRIX; shengsheng_from is
# the element it generatively inherits from; shengke_target is the element it
# restrains in the controlling cycle.
PROTO_AGENTS = {
    "ๆœจ_๋ฐœ์ƒ": {
        "role": "๋ฐœ์ƒ ์ง€๋Šฅ(Ideation). ๋ด„์˜ ์ƒˆ์‹น์ฒ˜๋Ÿผ ๊ธฐ์กด ๊ฒฝ๊ณ„๋ฅผ ๋„˜์–ด ํ˜์‹ ์  ์ ‘๊ทผ์„ ์ƒ์„ฑํ•œ๋‹ค. "
                "ไป์˜ ๋• โ€” ๋ชจ๋“  ๊ฐ€๋Šฅ์„ฑ์„ ํ’ˆ๊ณ  ํ‚ค์šฐ๋Š” ๊ฐœ์ฒ™์ž.",
        "element": "ๆœจ", "index": 2, "shengsheng_from": "ๆฐด", "shengke_target": "ๅœŸ",
        "virtue": "ไป", "principle": "ๆ›ฒ็›ด",
    },
    "็ซ_ํ‘œํ˜„": {
        "role": "ํ‘œํ˜„ ์ง€๋Šฅ(Expression). ์—ฌ๋ฆ„ ๋ถˆ๊ฝƒ์ฒ˜๋Ÿผ ์•„์ด๋””์–ด๋ฅผ ์‚ฌ๋ฐฉ์œผ๋กœ ๊ตฌ์ฒดํ™”ํ•˜๊ณ  ํ™•์žฅํ•œ๋‹ค. "
                "็ฆฎ์˜ ๋• โ€” ํ˜•์‹์„ ๊ฐ–์ถ”๊ณ  ๋น›๋‚˜๊ฒŒ ๋งŒ๋“œ๋Š” ์—ฐ์ถœ๊ฐ€.",
        "element": "็ซ", "index": 3, "shengsheng_from": "ๆœจ", "shengke_target": "้‡‘",
        "virtue": "็ฆฎ", "principle": "็‚ŽไธŠ",
    },
    "ๅœŸ_ํ†ตํ•ฉ": {
        "role": "ํ†ตํ•ฉ ์ง€๋Šฅ(Integration). ๋Œ€์ง€์ฒ˜๋Ÿผ ์ค‘์‹ฌ์—์„œ ๋‹ค์–‘ํ•œ ๊ด€์ ์„ ์ข…ํ•ฉํ•˜๊ณ  ๊ฐˆ๋“ฑ์„ ์ค‘์žฌํ•œ๋‹ค. "
                "ไฟก์˜ ๋• โ€” ํ”๋“ค๋ฆฌ์ง€ ์•Š๋Š” ์ค‘์‹ฌ์ถ•์œผ๋กœ ๊ท ํ˜• ์žกํžŒ ๊ฒฐ๋ก ์„ ๋„์ถœํ•˜๋Š” ์กฐ์œจ์ž.",
        "element": "ๅœŸ", "index": 4, "shengsheng_from": "็ซ", "shengke_target": "ๆฐด",
        "virtue": "ไฟก", "principle": "็จผ็ฉก",
    },
    # BUGFIX: this key was corrupted (contained U+FFFD replacement chars) and
    # never matched AGENT_ORDER's "้‡‘_์‹ฌํŒ", so PROTO_AGENTS[aname] raised
    # KeyError in _run_proto_agi_pipeline. Restored to match AGENT_ORDER.
    "้‡‘_์‹ฌํŒ": {
        "role": "์‹ฌํŒ ์ง€๋Šฅ(Judgment). ๊ฐ€์„์˜ ๋‚ซ์ฒ˜๋Ÿผ ๋…ผ๋ฆฌ์  ๊ฒฐํ•จ์„ ์ž๋ฅด๊ณ  ์˜ณ๊ณ  ๊ทธ๋ฆ„์„ ๊ฐ€๋ฆฐ๋‹ค. "
                "็พฉ์˜ ๋• โ€” ๋ƒ‰์ฒ ํ•œ ๊ฒ€์ฆ์œผ๋กœ ๊ฑฐ์ง“ ์ „์ œ๋ฅผ ์ ๋ฐœํ•˜๊ณ  ๊ณผ์žฅ์„ ์ œ๊ฑฐํ•˜๋Š” ์‹ฌํŒ๊ด€.",
        "element": "้‡‘", "index": 0, "shengsheng_from": "ๅœŸ", "shengke_target": "ๆœจ",
        "virtue": "็พฉ", "principle": "ๅพž้ฉ",
    },
    "ๆฐด_์„ฑ์ฐฐ": {
        "role": "์„ฑ์ฐฐ ์ง€๋Šฅ(Wisdom). ๊ฒจ์šธ ์‹ฌ์—ฐ์ฒ˜๋Ÿผ ๊ฐ€์žฅ ๊นŠ์€ ๊ณณ๊นŒ์ง€ ์Šค๋ฉฐ๋“œ๋Š” ๋ฉ”ํƒ€์ธ์ง€๋ฅผ ์ˆ˜ํ–‰ํ•œ๋‹ค. "
                "ๆ™บ์˜ ๋• โ€” ์ „์ฒด ๊ณผ์ •์„ ๋Œ์•„๋ณด๋ฉฐ ๊ทผ๋ณธ ์ „์ œ๋ฅผ ๊ฒ€ํ† ํ•˜๊ณ  ๋ฐฉํ–ฅ์„ ์žฌ์„ค์ •ํ•˜๋Š” ํ˜„์ž.",
        "element": "ๆฐด", "index": 1, "shengsheng_from": "้‡‘", "shengke_target": "็ซ",
        "virtue": "ๆ™บ", "principle": "ๆฝคไธ‹",
    },
}
343
# Fixed execution order of the pipeline (generative cycle) and per-element
# display emojis used when assembling the combined transcript.
AGENT_ORDER = ["ๆœจ_๋ฐœ์ƒ", "็ซ_ํ‘œํ˜„", "ๅœŸ_ํ†ตํ•ฉ", "้‡‘_์‹ฌํŒ", "ๆฐด_์„ฑ์ฐฐ"]
AGENT_EMOJIS = {"ๆœจ": "๐ŸŒณ", "็ซ": "๐Ÿ”ฅ", "ๅœŸ": "๐Ÿ”๏ธ", "้‡‘": "โš”๏ธ", "ๆฐด": "๐Ÿ’ง"}
345
+
346
# โ”€โ”€ Per-element behavioral directives appended to each agent's system prompt โ”€โ”€
AGENT_INSTRUCTIONS = {
    "ๆœจ": "\n\n[ํ–‰๋™ ์ง€์นจ] ์ƒˆ์‹น์ด ๋•…์„ ๋šซ๋“ฏ, ๊ธฐ์กด ํ‹€์— ์–ฝ๋งค์ด์ง€ ์•Š๊ณ  ๋‹ค์–‘ํ•œ ๊ฐ€๋Šฅ์„ฑ์„ ํƒ์ƒ‰ํ•˜๋ผ. ์ฐธ์‹ ํ•œ ์ ‘๊ทผ๊ณผ ํ•ต์‹ฌ ์›๋ฆฌ๋ฅผ ๋ช…ํ™•ํžˆ ์„œ์ˆ ํ•˜๋ผ.",
    "็ซ": "\n\n[ํ–‰๋™ ์ง€์นจ] ๋ถˆ๊ฝƒ์ด ์‚ฌ๋ฐฉ์„ ๋ฐํžˆ๋“ฏ, ๆœจ์ด ์ œ์‹œํ•œ ์•„์ด๋””์–ด๋ฅผ ๊ตฌ์ฒด์ ์œผ๋กœ ํ™•์žฅํ•˜๋ผ. ์ •๋Ÿ‰ ์ˆ˜์น˜์™€ ์ฒด๊ณ„์  ๊ตฌ์„ฑ์„ ํฌํ•จํ•˜๋ผ.",
    "ๅœŸ": "\n\n[ํ–‰๋™ ์ง€์นจ] ๋Œ€์ง€๊ฐ€ ๋งŒ๋ฌผ์„ ํ’ˆ๋“ฏ, ์ด์ „ ์—์ด์ „ํŠธ๋“ค์˜ ์ถœ๋ ฅ์„ ์ข…ํ•ฉํ•˜์—ฌ ๋ชจ์ˆœ์„ ์กฐ์ •ํ•˜๊ณ  ๊ท ํ˜• ์žกํžŒ ํ†ตํ•ฉ ๊ฒฐ๋ก ์„ ๋„์ถœํ•˜๋ผ.",
    "้‡‘": "\n\n[ํ–‰๋™ ์ง€์นจ] ๊ฐ€์„์˜ ๋‚ซ์ด ๋ฌด๋ฅด์ต์€ ๊ฒƒ๊ณผ ์ฉ์€ ๊ฒƒ์„ ๊ฐ€๋ฆฌ๋“ฏ, ์ด์ „ ์ถœ๋ ฅ์˜ ๋…ผ๋ฆฌ์  ๊ฒฐํ•จ, ๊ฑฐ์ง“ ์ „์ œ, ๊ณผ์žฅ๋œ ์ˆ˜์น˜๋ฅผ ๋ƒ‰์ฒ ํ•˜๊ฒŒ ์ ๋ฐœํ•˜๋ผ. "
          "์˜์‹ฌ์Šค๋Ÿฌ์šด ์ฃผ์žฅ์—๋Š” [๊ฒ€์ฆ ํ•„์š”] ํƒœ๊ทธ๋ฅผ ๋ถ™์—ฌ๋ผ."
          "\n\n[์‹ฌํŒ ํ•ต์‹ฌ ์ž„๋ฌด] 1. ์ˆ˜์น˜๊ฐ€ ์ •ํ™•ํ•œ์ง€ ๊ฒ€์ฆ 2. ๋ฌด๋น„ํŒ์  ์ˆ˜์šฉ์„ ์ง€์  3. ๊ณผ์žฅ ์‹๋ณ„ 4. ๊ฒ€์ฆ ๋ถˆ๊ฐ€์— [๊ทผ๊ฑฐ ๋ถˆ์ถฉ๋ถ„] ํ‘œ์‹œ 5. ์‹คํ˜„ ๊ฐ€๋Šฅ์„ฑ ๋ƒ‰์ • ํ‰๊ฐ€",
    "ๆฐด": "\n\n[ํ–‰๋™ ์ง€์นจ] ๋ฌผ์ด ๊นŠ์€ ๊ณณ๊นŒ์ง€ ์Šค๋ฉฐ๋“ค๋“ฏ, ์ „์ฒด ๊ณผ์ •์„ ๊ทผ๋ณธ๋ถ€ํ„ฐ ๋˜๋Œ์•„๋ณด๋ผ."
          "\n\n[ํŠน๋ณ„ ๊ถŒํ•œ: ๋ฉ”ํƒ€ ์žฌ๊ฒ€ํ†  โ€” ๆ™บ์˜ ๊ทน์น˜] 1. ์ดˆ๊ธฐ ์ „์ œ๊ฐ€ ์‚ฌ์‹ค์ธ์ง€ ๊ฒ€์ฆ 2. ๊ฑฐ์ง“/๊ณผ์žฅ ๋ฐœ๊ฒฌ์‹œ ์ˆ˜์ •์•ˆ ์ œ์‹œ "
          "3. ์ž˜๋ชป๋œ ์ „์ œ ์œ„์˜ ๋ชฉํ‘œ๋ผ๋ฉด ์žฌ์„ค์ • ์„ ์–ธ 4. ๊ณตํ†ต ์˜ค๋ฅ˜ ํŒจํ„ด ๋ฐœ๊ฒฌ์‹œ ๋ฐฉํ–ฅ ์ „ํ™˜ ์ œ์•ˆ"
          "\n\n๋ฐ˜๋“œ์‹œ [๋ฉ”ํƒ€ ํŒ๋‹จ] ์„น์…˜์„ ํฌํ•จํ•˜๋ผ: ์ „์ฒด ๋ฌธ์ œ์ , ๊ฐ ์ „์ œ ๊ฒ€์ฆ ๊ฒฐ๊ณผ, ๋ชฉํ‘œ ์œ ์ง€/์ˆ˜์ •/ํ๊ธฐ ํŒ๋‹จ๊ณผ ๊ทผ๊ฑฐ"
          "\n\n[์ตœ์ข… ๊ฒฐ๋ก ] ๋ชจ๋“  ์—์ด์ „ํŠธ์˜ ํ† ๋ก ์„ ์ข…ํ•ฉํ•˜์—ฌ ์ด ๊ณผ์ œ์— ๋Œ€ํ•œ ์ตœ์ข… ๋‹ต๋ณ€์„ ๋ช…ํ™•ํžˆ ์ œ์‹œํ•˜๋ผ.",
}
360
+
361
# โ”€โ”€ Generative (์ƒ์ƒ) / controlling (์ƒ๊ทน) relationship descriptions,
# keyed by element and interpolated into each agent's system prompt โ”€โ”€
SHENG_DESC = {
    'ๆœจ': 'ๆฐด(์„ฑ์ฐฐ)์˜ ๊นŠ์€ ํ†ต์ฐฐ์ด ์ƒˆ๋กœ์šด ๋ฐœ์ƒ์˜ ์”จ์•—์ด ๋œ๋‹ค',
    '็ซ': 'ๆœจ(๋ฐœ์ƒ)์˜ ์•„์ด๋””์–ด๊ฐ€ ํ‘œํ˜„์˜ ์—ฐ๋ฃŒ๊ฐ€ ๋œ๋‹ค',
    'ๅœŸ': '็ซ(ํ‘œํ˜„)์˜ ๊ตฌ์ฒดํ™”๊ฐ€ ํ†ตํ•ฉ์˜ ์žฌ๋ฃŒ๊ฐ€ ๋œ๋‹ค',
    '้‡‘': 'ๅœŸ(ํ†ตํ•ฉ)์˜ ์ข…ํ•ฉ๋œ ๊ฒฐ๋ก ์ด ์‹ฌํŒ์˜ ๋Œ€์ƒ์ด ๋œ๋‹ค',
    'ๆฐด': '้‡‘(์‹ฌํŒ)์˜ ๊ฒ€์ฆ ๊ฒฐ๊ณผ๊ฐ€ ์„ฑ์ฐฐ์˜ ํ† ๋Œ€๊ฐ€ ๋œ๋‹ค',
}
KE_DESC = {
    'ๆœจ': '้‡‘(์‹ฌํŒ)์ด ํ—ˆํ™ฉ๋œ ๋ฐœ์ƒ์„ ๋ฒจ ์ˆ˜ ์žˆ์œผ๋ฏ€๋กœ, ๊ทผ๊ฑฐ ์žˆ๋Š” ์•„์ด๋””์–ด๋ฅผ ์ œ์‹œํ•˜๋ผ',
    '็ซ': 'ๆฐด(์„ฑ์ฐฐ)์ด ๊ณผ์ž‰ ํ‘œํ˜„์„ ์‹ํž ์ˆ˜ ์žˆ์œผ๋ฏ€๋กœ, ๊ณผ์žฅ ์—†์ด ์ •ํ™•ํ•˜๊ฒŒ ์„œ์ˆ ํ•˜๋ผ',
    'ๅœŸ': 'ๆœจ(๋ฐœ์ƒ)์ด ํ†ตํ•ฉ์˜ ์•ˆ์ฃผ๋ฅผ ๊นจ๋œจ๋ฆด ์ˆ˜ ์žˆ์œผ๋ฏ€๋กœ, ์ƒˆ๋กœ์šด ๊ด€์ ๋„ ์ˆ˜์šฉํ•˜๋ผ',
    '้‡‘': '็ซ(ํ‘œํ˜„)์ด ์‹ฌํŒ์˜ ๊ฒฝ์ง์„ ๋…น์ผ ์ˆ˜ ์žˆ์œผ๋ฏ€๋กœ, ์œ ์—ฐํ•œ ํŒ๋‹จ๋„ ๊ณ ๋ คํ•˜๋ผ',
    'ๆฐด': 'ๅœŸ(ํ†ตํ•ฉ)์ด ์„ฑ์ฐฐ์˜ ๊ณตํ—ˆ๋ฅผ ๋ง‰์„ ์ˆ˜ ์žˆ์œผ๋ฏ€๋กœ, ์‹ค์งˆ์  ๊ฒฐ๋ก ์„ ๋‚ด๋ ค๋ผ',
}
376
+
377
def _build_agent_prompt(agent_name, info, task_prompt, prev_outputs):
    """Build the (system, user) prompt pair for one Proto-AGI agent.

    Full AETHER C5 construction: role + behavioral directives + generative/
    controlling relationship notes + element-specific extras, with previous
    agent outputs attached at magic-square-weighted truncation levels.

    Args:
        agent_name: Key into PROTO_AGENTS / AGENT_ORDER.
        info: The agent's PROTO_AGENTS entry.
        task_prompt: The original task prompt.
        prev_outputs: {agent_name: response} from agents already run.

    Returns:
        (system_prompt, user_prompt) tuple.
    """
    elem = info['element']

    # System prompt: identity, element, virtue, role.
    # (Local renamed from `sys` to avoid shadowing the stdlib `sys` module.)
    sys_prompt = (f"๋‹น์‹ ์€ AETHER Proto-AGI ์‹œ์Šคํ…œ์˜ [{agent_name}] ์—์ด์ „ํŠธ์ž…๋‹ˆ๋‹ค.\n"
                  f"์˜คํ–‰ ์›์†Œ: {elem} ({info['principle']}) | ๋•๋ชฉ: {info['virtue']}\n"
                  f"์—ญํ• : {info['role']}")

    # Element-specific behavioral directive.
    sys_prompt += AGENT_INSTRUCTIONS.get(elem, "")

    # Generative (์ƒ์ƒ) note — only when the feeding agent has already run.
    if info['shengsheng_from'] in [a.split('_')[0] for a in prev_outputs]:
        sys_prompt += f"\n\n[์ƒ์ƒ ยท {info['shengsheng_from']}โ†’{elem}] {SHENG_DESC.get(elem, '')} โ€” ์ด์ „ ์ถœ๋ ฅ์„ ๋ฐœ์ „์ ์œผ๋กœ ๊ณ„์Šนํ•˜๋ผ."

    # Controlling (์ƒ๊ทน) notes — always present.
    sys_prompt += f"\n[์ƒ๊ทน ยท {elem}ๅ…‹{info['shengke_target']}] {info['shengke_target']} ์—์ด์ „ํŠธ์˜ ๊ณผ๋„ํ•œ ๊ฒฝํ–ฅ์„ ๊ฒฌ์ œํ•˜๋ผ."
    sys_prompt += f"\n[ํ”ผ๊ทน ์ฃผ์˜] {KE_DESC.get(elem, '')}"

    # โ”€โ”€ ้‡‘: reinforced judgment duties (Document 2 TC-1 full) โ”€โ”€
    if elem == '้‡‘':
        # BUGFIX: item 2 contained U+FFFD corruption ("์—์ด์ „ํŠธ๏ฟฝ๏ฟฝ");
        # restored to "์—์ด์ „ํŠธ์˜" per the parallel wording elsewhere.
        sys_prompt += """

[์‹ฌํŒ ์ง€๋Šฅ ํ•ต์‹ฌ ์ž„๋ฌด]
็พฉ์˜ ๋•์œผ๋กœ ๋‹ค์Œ์„ ์ˆ˜ํ–‰ํ•˜๋ผ:
1. ์ฃผ์–ด์ง„ ์ „์ œ์˜ ์ˆ˜์น˜๊ฐ€ ์ •ํ™•ํ•œ์ง€ ๊ฒ€์ฆ โ€” ์˜์‹ฌ์Šค๋Ÿฌ์šด ์ˆ˜์น˜์— [๊ฒ€์ฆ ํ•„์š”] ํƒœ๊ทธ
2. ์ด์ „ ์—์ด์ „ํŠธ์˜ ์ „์ œ๋ฅผ ๋ฌด๋น„ํŒ์ ์œผ๋กœ ์ˆ˜์šฉํ–ˆ๋‹ค๋ฉด ์ง€์ 
3. ๊ณผ์žฅ๋œ ์ •๋Ÿ‰ ์ˆ˜์น˜(๋น„ํ˜„์‹ค์  % ์ ˆ๊ฐ, ๋น„ํ˜„์‹ค์  ์ˆ˜์œจ ๋“ฑ) ์‹๋ณ„
4. ๊ฒ€์ฆ ๋ถˆ๊ฐ€๋Šฅํ•œ ์ฃผ์žฅ์— [๊ทผ๊ฑฐ ๋ถˆ์ถฉ๋ถ„] ํ‘œ์‹œ
5. ์ „์ฒด ์ถœ๋ ฅ์˜ ์‹คํ˜„ ๊ฐ€๋Šฅ์„ฑ์„ ๋ƒ‰์ •ํ•˜๊ฒŒ ํ‰๊ฐ€"""

    # โ”€โ”€ ๆฐด: meta-review authority (Document 2 TC-1 full) โ”€โ”€
    if elem == 'ๆฐด':
        sys_prompt += """

[ํŠน๋ณ„ ๊ถŒํ•œ: ๋ฉ”ํƒ€ ์žฌ๊ฒ€ํ†  โ€” ๆ™บ์˜ ๊ทน์น˜]
๋ฌผ์ด ๋ชจ๋“  ๊ฒƒ์˜ ๊ทผ์›๊นŒ์ง€ ์Šค๋ฉฐ๋“ค๋“ฏ, ์ „์ฒด ๊ณผ์ •์„ ๊ทผ๋ณธ๋ถ€ํ„ฐ ์žฌ๊ฒ€ํ† ํ•˜๋ผ.
1. ์ดˆ๊ธฐ ์ „์ œ(๊ธฐ์ˆ  ํ˜„ํ™ฉ, ์ˆ˜์น˜, ๊ทœ์ œ ๋“ฑ)๊ฐ€ ์‚ฌ์‹ค์ธ์ง€ ํ•˜๋‚˜ํ•˜๋‚˜ ๊ฒ€์ฆํ•˜๋ผ
2. ๊ฑฐ์ง“์ด๋‚˜ ๊ณผ์žฅ์ด ๋ฐœ๊ฒฌ๋˜๋ฉด ๋ช…์‹œ์ ์œผ๋กœ ์ง€์ ํ•˜๊ณ  ์ˆ˜์ •์•ˆ์„ ์ œ์‹œํ•˜๋ผ
3. ์ „์ฒด ๋ชฉํ‘œ๊ฐ€ ์ž˜๋ชป๋œ ์ „์ œ ์œ„์— ์„ธ์›Œ์กŒ๋‹ค๋ฉด ๋ชฉํ‘œ ์žฌ์„ค์ •์„ ์„ ์–ธํ•˜๋ผ
4. ์ด์ „ ์—์ด์ „ํŠธ ์ถœ๋ ฅ์—์„œ ๊ณตํ†ต ์˜ค๋ฅ˜ ํŒจํ„ด ๋ฐœ๊ฒฌ์‹œ ์ „์ฒด ๋ฐฉํ–ฅ ์ „ํ™˜์„ ์ œ์•ˆํ•˜๋ผ

๋ฐ˜๋“œ์‹œ [๋ฉ”ํƒ€ ํŒ๋‹จ] ์„น์…˜์„ ํฌํ•จํ•˜๋ผ:
- ์ „์ฒด ๊ณผ์ •์˜ ๊ทผ๋ณธ์  ๋ฌธ์ œ์ 
- ๊ฐ ์ „์ œ์˜ ๊ฒ€์ฆ ๊ฒฐ๊ณผ (์‚ฌ์‹ค/๊ฑฐ์ง“/๋ถˆํ™•์‹ค)
- ๋ชฉํ‘œ ์œ ์ง€/์ˆ˜์ •/ํ๊ธฐ ํŒ๋‹จ๊ณผ ๊ทผ๊ฑฐ

[์ตœ์ข… ๊ฒฐ๋ก ] ๋ชจ๋“  ์—์ด์ „ํŠธ์˜ ํ† ๋ก ์„ ์ข…ํ•ฉํ•˜์—ฌ ์ด ๊ณผ์ œ์— ๋Œ€ํ•œ ์ตœ์ข… ๋‹ต๋ณ€์„ ๋ช…ํ™•ํžˆ ์ œ์‹œํ•˜๋ผ."""

    # โ”€โ”€ Attach previous outputs, truncated per magic-square listening weight โ”€โ”€
    ctx = ""
    if prev_outputs:
        listener_idx = AGENT_ORDER.index(agent_name) if agent_name in AGENT_ORDER else 0
        weights = COMM_MATRIX[listener_idx]
        ctx = "\n\n[์ด์ „ ์—์ด์ „ํŠธ ์ถœ๋ ฅ โ€” ๋งˆ๋ฐฉ์ง„ ์†Œํ†ต ๋งคํŠธ๋ฆญ์Šค ์ ์šฉ]\n"
        ctx += "(์ฐธ์กฐ ๊ฐ•๋„๊ฐ€ ๋†’์„์ˆ˜๋ก ํ•ด๋‹น ์—์ด์ „ํŠธ์˜ ์ถœ๋ ฅ์„ ๊นŠ์ด ๋ถ„์„ํ•˜๊ณ  ๋ฐ˜์˜ํ•˜๋ผ)\n"

        for aname, output in prev_outputs.items():
            src_idx = AGENT_ORDER.index(aname) if aname in AGENT_ORDER else 0
            w = weights[src_idx]
            level = _comm_level(w)
            ctx += f"\n--- {aname} [{level} ยท ๊ฐ•๋„ {w:.0%}] ---\n"
            # Higher weight => longer excerpt and stronger review instruction.
            if w >= 0.30:
                ctx += f"{output[:3000]}\nโš ๏ธ ์œ„ ์—์ด์ „ํŠธ์˜ ์ฃผ์žฅ์„ ๋ฐ˜๋“œ์‹œ ์ •๋ฐ€ ๊ฒ€ํ† ํ•˜๊ณ  ์‘๋‹ต์— ๋ฐ˜์˜ํ•˜๋ผ.\n"
            elif w >= 0.18:
                ctx += f"{output[:2000]}\n"
            elif w >= 0.10:
                ctx += f"{output[:1500]}\n"
            else:
                ctx += f"{output[:800]}\n(๊ฒฝ๋Ÿ‰ ์ฐธ์กฐ โ€” ํ•ต์‹ฌ ๊ฒฐ๋ก ๋งŒ ์ฐธ๊ณ )\n"

    return sys_prompt, f"{task_prompt}\n{ctx}"
450
+
451
+
452
def _run_proto_agi_pipeline(task_prompt, api_key, eval_model):
    """Proto-AGI Full AETHER pipeline: run the ๆœจโ†’็ซโ†’ๅœŸโ†’้‡‘โ†’ๆฐด agents in sequence.

    Each agent sees the accumulated outputs of earlier agents (weighted by
    the communication matrix inside _build_agent_prompt).

    Returns:
        (final_output, agent_trace)
        final_output: all agent responses concatenated in order, ๆฐด's
            conclusion last — this combined text is what goes to the Judge.
        agent_trace: {agent_name: response} full record.
    """
    prev_outputs = {}

    for aname in AGENT_ORDER:
        info = PROTO_AGENTS[aname]
        sys_prompt, usr_prompt = _build_agent_prompt(aname, info, task_prompt, prev_outputs)
        resp = call_llm(usr_prompt, system=sys_prompt, api_key=api_key, model=eval_model)
        # qwen3 <think> tags are already stripped inside call_llm.
        prev_outputs[aname] = resp

    # Final output: every agent response combined (ๆฐด's conclusion is last).
    combined = []
    for aname in AGENT_ORDER:
        elem = PROTO_AGENTS[aname]['element']
        emoji = AGENT_EMOJIS.get(elem, "")
        combined.append(f"{'='*40}\n{emoji} [{aname}] ์‘๋‹ต\n{'='*40}\n{prev_outputs[aname]}")

    return "\n\n".join(combined), prev_outputs
476
+
477
+
478
def _execute_task(task, api_key, eval_model, proto_agi=False):
    """Dispatch a task to the right execution strategy.

    Proto-AGI mode runs every task through the five-element pipeline;
    otherwise multi-round sub-dimensions get their dedicated chains and
    everything else is a single direct call.
    """
    if proto_agi:
        # Proto-AGI: route every task through the five-element pipeline;
        # only the combined transcript is returned for judging.
        final_output, _trace = _run_proto_agi_pipeline(task.prompt, api_key, eval_model)
        return final_output

    sub = task.sub_dimension
    if sub == "mutual_verification":
        # The topic is the first line of the prompt, minus the cycle tag.
        topic = task.prompt.replace("[์ƒ์ƒ-์ƒ๊ทน ์‚ฌ์ดํด] ", "").split("\n")[0]
        return _run_mutual_verification(topic, api_key, eval_model)
    if sub == "feedback_incorporation":
        return _run_feedback_incorporation(task.prompt, api_key, eval_model)
    return call_llm(task.prompt, api_key=api_key, model=eval_model)
491
 
492
  # โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•
493
  # PART 5: LLM-as-Judge ์ฑ„์ 
 
502
  rubric = task.scoring_rubric
503
  rubric_text = "\n".join([f" - {k} (x{v['weight']}): {v['desc']}" for k, v in rubric.items()])
504
  expected = task.expected_behavior or "N/A"
505
+ # ๋‹ค์ค‘ ๋ผ์šด๋“œ๋Š” ์‘๋‹ต์ด ๊ธธ๋ฏ€๋กœ ๋” ๋งŽ์ด ํฌํ•จ
506
+ resp_limit = 6000 if _is_multi_round(task) else 3000
507
  return f"""[๊ณผ์ œ] {task.task_id} | {task.pillar} | {task.difficulty}
508
  [ํ”„๋กฌํ”„ํŠธ] {task.prompt[:1500]}
509
  [๊ธฐ๋Œ€] {expected[:500]}
510
+ [ํ”ผํ‰๊ฐ€ ์‘๋‹ต] {response[:resp_limit]}
511
  [๋ฃจ๋ธŒ๋ฆญ]
512
  {rubric_text}
513
  ์œ„ ๋ฃจ๋ธŒ๋ฆญ์— ๋”ฐ๋ผ JSON์œผ๋กœ ์ฑ„์ ."""
514
 
515
  def parse_judge_response(text, rubric_keys):
516
+ """Judge ์‘๋‹ต์—์„œ ์ ์ˆ˜ JSON ์ถ”์ถœ โ€” ๋‹ค์ค‘ ํŒจํ„ด ํŒŒ์‹ฑ"""
517
+ # Pattern 1: ํ‘œ์ค€ {"scores": {...}, "comment": ...}
518
  try:
519
  match = re.search(r'\{[^{}]*"scores"\s*:\s*\{[^{}]*\}[^{}]*\}', text, re.DOTALL)
520
  if match:
 
526
  return {"scores": scores, "comment": data.get("comment", "")}
527
  except:
528
  pass
529
+
530
+ # Pattern 2: ```json ๋ธ”๋ก ๋‚ด๋ถ€
531
+ try:
532
+ match = re.search(r'```json\s*(\{.*?\})\s*```', text, re.DOTALL)
533
+ if match:
534
+ data = json.loads(match.group(1))
535
+ scores = data.get("scores", {})
536
+ for k in rubric_keys:
537
+ if k not in scores:
538
+ scores[k] = 0.5
539
+ return {"scores": scores, "comment": data.get("comment", "")}
540
+ except:
541
+ pass
542
+
543
+ # Pattern 3: ๊ฐœ๋ณ„ ํ•ญ๋ชฉ ์ถ”์ถœ (key: 0.75 ํŒจํ„ด)
544
+ try:
545
+ scores = {}
546
+ for k in rubric_keys:
547
+ m = re.search(rf'["\']?{k}["\']?\s*[:=]\s*([\d.]+)', text)
548
+ if m:
549
+ scores[k] = min(max(float(m.group(1)), 0), 1.0)
550
+ else:
551
+ scores[k] = 0.5
552
+ if any(v != 0.5 for v in scores.values()):
553
+ return {"scores": scores, "comment": "ํŒจํ„ด3 ํŒŒ์‹ฑ"}
554
+ except:
555
+ pass
556
+
557
  return {"scores": {k: 0.5 for k in rubric_keys}, "comment": "ํŒŒ์‹ฑ์‹คํŒจ"}
558
 
559
  def compute_weighted_score(scores, rubric):
 
763
 
764
  from concurrent.futures import ThreadPoolExecutor, as_completed
765
 
766
+ def _eval_single_task(task, run_id, eval_api_key, eval_model, judge_api_key, judge_model, state, proto_agi=False):
767
  """๋‹จ์ผ ๊ณผ์ œ ํ‰๊ฐ€ (๋ชจ๋ธํ˜ธ์ถœ + Judge์ฑ„์ ). ์›Œ์ปค ์Šค๋ ˆ๋“œ์—์„œ ์‹คํ–‰."""
768
  try:
769
+ # Step 1: ํ”ผํ‰๊ฐ€ ๋ชจ๋ธ ํ˜ธ์ถœ (Proto-AGI / ๋‹ค์ค‘ ๋ผ์šด๋“œ ์ž๋™ ๋ถ„๊ธฐ)
770
+ model_response = _execute_task(task, eval_api_key, eval_model, proto_agi=proto_agi)
771
 
772
+ if model_response.startswith("[API_ERROR"):
773
  _save_result(run_id, task.task_id, model_response, "{}", 0)
774
  with state["lock"]:
775
  state["done"] += 1
776
  state["errors"].append(task.task_id)
777
  return task.task_id, {"response": model_response, "judge": "{}", "score": 0}
778
 
779
+ # Step 2: Judge ์ฑ„์  (๋ณ„๋„ API ํ‚ค/๋ชจ๋ธ)
780
  judge_prompt = build_judge_prompt(task, model_response)
781
+ judge_raw = call_llm(judge_prompt, system=JUDGE_SYSTEM, api_key=judge_api_key,
782
  model=judge_model, temperature=0.3)
783
 
784
  rubric_keys = list(task.scoring_rubric.keys())
 
848
  return out
849
 
850
 
851
+ def run_evaluation(eval_api_key, judge_api_key, eval_model, judge_model, pillar_filter, diff_filter,
852
+ max_tasks, n_workers, proto_agi, fresh_start, progress=gr.Progress()):
853
+ """๋ฉ”์ธ ํ‰๊ฐ€ โ€” ๊ธฐ๋‘ฅ๋ณ„ ๋ณ‘๋ ฌ ์‹คํ–‰ (Eval: Groq, Judge: Fireworks ๋ถ„๋ฆฌ, Proto-AGI ์ง€์›)"""
854
+ eval_api_key = eval_api_key.strip() or os.getenv("GROQ_API_KEY", "")
855
+ judge_api_key = judge_api_key.strip() or os.getenv("FIREWORKS_API_KEY", "")
856
+ if not eval_api_key:
857
+ yield "โŒ ํ”ผํ‰๊ฐ€ ๋ชจ๋ธ API Key๋ฅผ ์ž…๋ ฅํ•˜์„ธ์š”.", "", "", "", None
858
+ return
859
+ if not judge_api_key:
860
+ yield "โŒ Judge ๋ชจ๋ธ API Key๋ฅผ ์ž…๋ ฅํ•˜์„ธ์š”.", "", "", "", None
861
  return
862
 
863
+ # Proto-AGI ํ™œ์„ฑํ™” ์‹œ ์›Œ์ปค ์ˆ˜ ์ž๋™ ์กฐ์ • (๊ณผ์ œ๋‹น 5ํšŒ API ํ˜ธ์ถœ)
864
+ n_workers = int(n_workers)
865
+ if proto_agi and n_workers > 3:
866
+ n_workers = 3 # 5 agents ร— 3 workers = 15 ๋™์‹œ API ํ˜ธ์ถœ
867
+
868
  # โ”€โ”€ ๊ณผ์ œ ํ•„ํ„ฐ๋ง โ”€โ”€
869
  tasks = ALL_TASKS[:]
870
  if pillar_filter != "์ „์ฒด":
 
873
  tasks = [t for t in tasks if t.difficulty == diff_filter]
874
  tasks = tasks[:int(max_tasks)]
875
 
876
+ # run_id์— proto_agi ๋ชจ๋“œ ํฌํ•จ (์ฒดํฌํฌ์ธํŠธ ๋ถ„๋ฆฌ)
877
+ mode_suffix = "_PAGI" if proto_agi else ""
878
+ run_id = _make_run_id(eval_model + mode_suffix)
879
  if fresh_start:
880
  _clear_run(run_id)
881
 
 
927
  "pillar_done": {p: 0 for p in pillar_tasks},
928
  }
929
 
930
+ mode_tag = '๐ŸŒŸ <b>Proto-AGI ON</b> (ๆœจโ†’็ซโ†’ๅœŸโ†’้‡‘โ†’ๆฐด)' if proto_agi else '๐Ÿค– <b>๋‹จ์ผ LLM ๋ชจ๋“œ</b>'
931
+ yield (CSS + f'<div style="background:{"#fff3e0" if proto_agi else "#e8f5e9"};padding:12px;border-radius:8px;margin:8px 0;">'
932
+ f'โšก <b>๋ณ‘๋ ฌ ํ‰๊ฐ€ ์‹œ์ž‘!</b> {len(pending)}๊ฐœ ๊ณผ์ œ ยท {n_pillars}๊ฐœ ๊ธฐ๋‘ฅ ๋™์‹œ ยท {n_workers}๊ฐœ ์›Œ์ปค<br>'
933
+ f'{mode_tag}</div>', _build_progress_table(results, tasks), "", "", None)
934
 
935
  # โ”€โ”€ ThreadPoolExecutor ๋ณ‘๋ ฌ ์‹คํ–‰ โ”€โ”€
936
  with ThreadPoolExecutor(max_workers=n_workers) as executor:
937
  futures = {}
938
  for task in pending:
939
+ fut = executor.submit(_eval_single_task, task, run_id, eval_api_key,
940
+ eval_model, judge_api_key, judge_model, state, proto_agi)
941
  futures[fut] = task
942
 
943
  completed = set()
 
993
  n_err = len(state["errors"])
994
  err_msg = f" (โš ๏ธ {n_err}๊ฐœ ์˜ค๋ฅ˜)" if n_err > 0 else ""
995
  restore_msg = f" (๐Ÿ’พ {cached}๊ฐœ ๋ณต์›)" if cached > 0 else ""
996
+ mode_str = "๐ŸŒŸProto-AGI" if proto_agi else "๐Ÿค–๋‹จ์ผLLM"
997
 
998
+ display_model = f"{eval_model} [{mode_str}]"
999
+ summary = _build_final_summary(results, tasks, pillar_scores, aether, display_model, hf_status)
1000
  table = _build_progress_table(results, tasks)
1001
  detail = _build_detail_view(results, tasks)
1002
 
1003
+ yield (f"๐Ÿ ํ‰๊ฐ€ ์™„๋ฃŒ! {mode_str}{restore_msg}{err_msg} AETHER Score: {aether:.1f}",
1004
  table, summary, detail, csv_path)
1005
 
1006
 
 
1013
 
1014
  HEADER = """
1015
  <div style="text-align:center;padding:16px 0;">
1016
+ <h1 style="margin:0;font-size:1.8em;">๐ŸŒ€ AETHER-Bench v0.3.0</h1>
1017
+ <h2 style="margin:4px 0;color:#555;font-size:1.1em;">LLM ํ‰๊ฐ€ ์‹œ์Šคํ…œ + Proto-AGI ์˜คํ–‰ ๋ฉ€ํ‹ฐ์—์ด์ „ํŠธ</h2>
1018
+ <p style="color:#888;font-size:0.9em;max-width:700px;margin:8px auto;">
1019
  120 Tasks ยท 5 Pillars ยท 19 Sub-dimensions ยท HAR Metric<br>
1020
+ ๐ŸŒŸ <b>Proto-AGI</b>: ๆœจโ†’็ซโ†’ๅœŸโ†’้‡‘โ†’ๆฐด ์˜คํ–‰ ํŒŒ์ดํ”„๋ผ์ธ + ๋งˆ๋ฐฉ์ง„ ์†Œํ†ต ๋งคํŠธ๋ฆญ์Šค<br>
1021
+ ๐Ÿค– <b>๋‹จ์ผ LLM</b>: ์ˆœ์ˆ˜ ์‹œํ—˜ ํ‰๊ฐ€ | CSV โ†’ HuggingFace PRIVATE ๊ธฐ๋ก
1022
  </p>
1023
  <div style="display:flex;justify-content:center;gap:8px;margin-top:8px;flex-wrap:wrap;font-size:0.85em;">
1024
+ <span style="background:#e8f5e9;padding:2px 10px;border-radius:12px;">๐ŸŒณ ๆœจ ๋ฐœ์ƒ(ไป)</span>
1025
+ <span style="background:#ffebee;padding:2px 10px;border-radius:12px;">๐Ÿ”ฅ ็ซ ํ‘œํ˜„(็ฆฎ)</span>
1026
+ <span style="background:#fff3e0;padding:2px 10px;border-radius:12px;">๐Ÿ”๏ธ ๅœŸ ํ†ตํ•ฉ(ไฟก)</span>
1027
+ <span style="background:#f5f5f5;padding:2px 10px;border-radius:12px;">โš”๏ธ ้‡‘ ์‹ฌํŒ(็พฉ)</span>
1028
+ <span style="background:#e3f2fd;padding:2px 10px;border-radius:12px;">๐Ÿ’ง ๆฐด ์„ฑ์ฐฐ(ๆ™บ)</span>
1029
  </div>
1030
  </div>"""
1031
 
1032
  def create_app():
1033
+ with gr.Blocks(title="AETHER-Bench + Proto-AGI", theme=gr.themes.Soft(),
1034
  css=".gradio-container{max-width:1100px !important}") as app:
1035
  gr.HTML(HEADER)
1036
 
1037
  with gr.Row():
1038
+ eval_api_key = gr.Textbox(label="๐Ÿ”‘ ํ”ผํ‰๊ฐ€ API Key (Groq)", type="password",
1039
+ placeholder="gsk_...", value=os.getenv("GROQ_API_KEY", ""), scale=3)
1040
+ judge_api_key = gr.Textbox(label="โš–๏ธ Judge API Key (Fireworks)", type="password",
1041
+ placeholder="fw_...", value=os.getenv("FIREWORKS_API_KEY", ""), scale=3)
1042
 
1043
  with gr.Row():
1044
+ eval_model = gr.Dropdown(
1045
+ choices=["qwen/qwen3-32b", "qwen-qwq-32b", "deepseek-r1-distill-llama-70b",
1046
+ "llama-3.3-70b-versatile", "meta-llama/llama-4-scout-17b-16e-instruct",
1047
+ "mistral-saba-24b", "gemma2-9b-it", "llama-3.1-8b-instant"],
1048
+ value="qwen/qwen3-32b", label="๐Ÿค– ํ”ผํ‰๊ฐ€ ๋ชจ๋ธ (Groq)", allow_custom_value=True, scale=3)
1049
+ judge_model = gr.Dropdown(
1050
+ choices=["accounts/fireworks/models/kimi-k2p5",
1051
+ "qwen/qwen3-32b", "deepseek-r1-distill-llama-70b",
1052
+ "llama-3.3-70b-versatile"],
1053
+ value="accounts/fireworks/models/kimi-k2p5",
1054
+ label="โš–๏ธ ์‹ฌํŒ ๋ชจ๋ธ (Fireworks/Groq)", allow_custom_value=True, scale=3)
1055
+
1056
+ # โ”€โ”€ Proto-AGI ํ† ๊ธ€ โ”€โ”€
1057
+ with gr.Row():
1058
+ proto_agi_toggle = gr.Checkbox(
1059
+ label="๐ŸŒŸ Proto-AGI ํ™œ์„ฑํ™” (ๆœจโ†’็ซโ†’ๅœŸโ†’้‡‘โ†’ๆฐด ์˜คํ–‰ ํŒŒ์ดํ”„๋ผ์ธ)",
1060
+ value=True, scale=3)
1061
+ gr.HTML('''<div style="font-size:0.82em;color:#666;padding:8px;background:#fffde7;border-radius:8px;margin:auto 0;" id="pagi-info">
1062
+ <b>Proto-AGI ON:</b> ๊ณผ์ œ๋‹น 5ํšŒ ์ˆœ์ฐจ API ํ˜ธ์ถœ (๋ฐœ์ƒโ†’ํ‘œํ˜„โ†’ํ†ตํ•ฉโ†’์‹ฌํŒโ†’์„ฑ์ฐฐ)<br>
1063
+ ์ƒ์ƒยท์ƒ๊ทน + ๋งˆ๋ฐฉ์ง„ ์†Œํ†ต ๋งคํŠธ๋ฆญ์Šค + ๆฐด ๋ฉ”ํƒ€ ์žฌ๊ฒ€ํ†  | ์›Œ์ปค ์ž๋™ ์ œํ•œ 3๊ฐœ<br>
1064
+ <b>Proto-AGI OFF:</b> ๊ณผ์ œ๋‹น 1ํšŒ API ํ˜ธ์ถœ (์ˆœ์ˆ˜ LLM ์‹œํ—˜)
1065
+ </div>''', scale=3)
1066
 
1067
  with gr.Row():
1068
  pillar_dd = gr.Dropdown(PILLAR_CHOICES, value="์ „์ฒด", label="๊ธฐ๋‘ฅ ํ•„ํ„ฐ", scale=2)
 
1073
  with gr.Row():
1074
  start_btn = gr.Button("โ–ถ๏ธ ํ‰๊ฐ€ ์‹œ์ž‘ (์ด์–ดํ•˜๊ธฐ)", variant="primary", size="lg", scale=2)
1075
  fresh_btn = gr.Button("๐Ÿš€ ์ƒˆ๋กœ ์‹œ์ž‘", variant="secondary", size="lg", scale=2)
1076
+ gr.HTML('<p style="color:#888;font-size:0.8em;margin:auto 0;">โšก ๋“€์–ผ ๋ฐฑ์—”๋“œ: Groq(ํ”ผํ‰๊ฐ€) + Fireworks(Judge)<br>โ–ถ๏ธ ์ค‘๋‹จ์‹œ ์ด์–ด์„œ | ๐Ÿš€ ์ดˆ๊ธฐํ™”ํ›„ ์žฌ์‹œ์ž‘ | CSVโ†’HF PRIVATE</p>')
1077
 
1078
  with gr.Tabs():
1079
  with gr.Tab("๐Ÿ“Š ์ง„ํ–‰"):
 
1087
  with gr.Tab("๐Ÿ’พ CSV"):
1088
  csv_file = gr.File(label="ํ‰๊ฐ€ ๊ฒฐ๊ณผ CSV")
1089
 
1090
+ def _run_resume(eak,jak,em,jm,pagi,pf,df,mt,nw):
1091
+ yield from run_evaluation(eak,jak,em,jm,pf,df,mt,nw,pagi,False)
1092
+ def _run_fresh(eak,jak,em,jm,pagi,pf,df,mt,nw):
1093
+ yield from run_evaluation(eak,jak,em,jm,pf,df,mt,nw,pagi,True)
1094
+
1095
+ all_inputs = [eval_api_key, judge_api_key, eval_model, judge_model,
1096
+ proto_agi_toggle, pillar_dd, diff_dd, max_tasks, n_workers]
1097
 
1098
  start_btn.click(
1099
  fn=_run_resume,
1100
+ inputs=all_inputs,
1101
  outputs=[progress_html, table_html, summary_html, detail_html, csv_file],
1102
  )
1103
  fresh_btn.click(
1104
  fn=_run_fresh,
1105
+ inputs=all_inputs,
1106
  outputs=[progress_html, table_html, summary_html, detail_html, csv_file],
1107
  )
1108
 
1109
  gr.Markdown("""---
1110
+ <center>AETHER-Bench v0.3.0 ยท Apache 2.0 ยท Ginigen AI (์ง€๋‹ˆ์  AI)<br>
1111
+ ๐ŸŒŸ Proto-AGI ์˜คํ–‰ ํŒŒ์ดํ”„๋ผ์ธ + ๋“€์–ผ ๋ฐฑ์—”๋“œ: <b>Groq</b> (ํ”ผํ‰๊ฐ€) + <b>Fireworks</b> (Judge)<br>
1112
+ <code>HF_TOKEN</code> ์„ค์ • ์‹œ PRIVATE ์ž๋™ ๊ธฐ๋ก</center>""")
1113
  return app
1114
 
1115
  # โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•
 
1120
  stats = {}
1121
  for t in ALL_TASKS:
1122
  stats[t.pillar] = stats.get(t.pillar, 0) + 1
1123
+ print(f"AETHER-Bench v0.3.0 + Proto-AGI: {len(ALL_TASKS)} tasks loaded")
1124
+ print(f" Proto-AGI: ๆœจ_๋ฐœ์ƒโ†’็ซ_ํ‘œํ˜„โ†’ๅœŸ_ํ†ตํ•ฉโ†’้‡‘_์‹ฌํŒโ†’ๆฐด_์„ฑ์ฐฐ (5 agents)")
1125
  for p, n in stats.items():
1126
  info = PILLAR_INFO[p]
1127
  print(f" {info['icon']} {info['name']}: {n} ({int(info['weight']*100)}%)")