Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
|
@@ -1,7 +1,8 @@
|
|
| 1 |
"""
|
| 2 |
-
AETHER-Bench v0.
|
| 3 |
-
========================================
|
| 4 |
-
120๊ฐ ๊ณผ์
|
|
|
|
| 5 |
ํ๊ฐ โ Judge ์ฑ์ โ CSV โ HuggingFace PRIVATE ๋ฐ์ดํฐ์
|
| 6 |
|
| 7 |
Author: Ginigen AI (์ง๋์ AI) โ Choi Sunyoung
|
|
@@ -129,11 +130,43 @@ def generate_all_tasks() -> List[EvalTask]:
|
|
| 129 |
ALL_TASKS = generate_all_tasks()
|
| 130 |
|
| 131 |
# โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
|
| 132 |
-
# PART 4:
|
| 133 |
# โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
|
| 134 |
|
| 135 |
-
|
| 136 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 137 |
messages = []
|
| 138 |
if system:
|
| 139 |
messages.append({"role": "system", "content": system})
|
|
@@ -152,7 +185,309 @@ def call_llm(prompt, system="", api_key="", model="accounts/fireworks/models/glm
|
|
| 152 |
if attempt < 2:
|
| 153 |
time.sleep(3 * (attempt + 1))
|
| 154 |
else:
|
| 155 |
-
return f"[API_ERROR] {e}"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 156 |
|
| 157 |
# โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
|
| 158 |
# PART 5: LLM-as-Judge ์ฑ์
|
|
@@ -167,15 +502,19 @@ def build_judge_prompt(task, response):
|
|
| 167 |
rubric = task.scoring_rubric
|
| 168 |
rubric_text = "\n".join([f" - {k} (x{v['weight']}): {v['desc']}" for k, v in rubric.items()])
|
| 169 |
expected = task.expected_behavior or "N/A"
|
|
|
|
|
|
|
| 170 |
return f"""[๊ณผ์ ] {task.task_id} | {task.pillar} | {task.difficulty}
|
| 171 |
[ํ๋กฌํํธ] {task.prompt[:1500]}
|
| 172 |
[๊ธฐ๋] {expected[:500]}
|
| 173 |
-
[ํผํ๊ฐ ์๋ต] {response[:
|
| 174 |
[๋ฃจ๋ธ๋ฆญ]
|
| 175 |
{rubric_text}
|
| 176 |
์ ๋ฃจ๋ธ๋ฆญ์ ๋ฐ๋ผ JSON์ผ๋ก ์ฑ์ ."""
|
| 177 |
|
| 178 |
def parse_judge_response(text, rubric_keys):
|
|
|
|
|
|
|
| 179 |
try:
|
| 180 |
match = re.search(r'\{[^{}]*"scores"\s*:\s*\{[^{}]*\}[^{}]*\}', text, re.DOTALL)
|
| 181 |
if match:
|
|
@@ -187,6 +526,34 @@ def parse_judge_response(text, rubric_keys):
|
|
| 187 |
return {"scores": scores, "comment": data.get("comment", "")}
|
| 188 |
except:
|
| 189 |
pass
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 190 |
return {"scores": {k: 0.5 for k in rubric_keys}, "comment": "ํ์ฑ์คํจ"}
|
| 191 |
|
| 192 |
def compute_weighted_score(scores, rubric):
|
|
@@ -396,22 +763,22 @@ def _build_detail_view(results, tasks):
|
|
| 396 |
|
| 397 |
from concurrent.futures import ThreadPoolExecutor, as_completed
|
| 398 |
|
| 399 |
-
def _eval_single_task(task, run_id,
|
| 400 |
"""๋จ์ผ ๊ณผ์ ํ๊ฐ (๋ชจ๋ธํธ์ถ + Judge์ฑ์ ). ์์ปค ์ค๋ ๋์์ ์คํ."""
|
| 401 |
try:
|
| 402 |
-
# Step 1: ํผํ๊ฐ ๋ชจ๋ธ ํธ์ถ
|
| 403 |
-
model_response =
|
| 404 |
|
| 405 |
-
if model_response.startswith("[API_ERROR
|
| 406 |
_save_result(run_id, task.task_id, model_response, "{}", 0)
|
| 407 |
with state["lock"]:
|
| 408 |
state["done"] += 1
|
| 409 |
state["errors"].append(task.task_id)
|
| 410 |
return task.task_id, {"response": model_response, "judge": "{}", "score": 0}
|
| 411 |
|
| 412 |
-
# Step 2: Judge ์ฑ์
|
| 413 |
judge_prompt = build_judge_prompt(task, model_response)
|
| 414 |
-
judge_raw = call_llm(judge_prompt, system=JUDGE_SYSTEM, api_key=
|
| 415 |
model=judge_model, temperature=0.3)
|
| 416 |
|
| 417 |
rubric_keys = list(task.scoring_rubric.keys())
|
|
@@ -481,14 +848,23 @@ def _parallel_progress_html(state, total):
|
|
| 481 |
return out
|
| 482 |
|
| 483 |
|
| 484 |
-
def run_evaluation(
|
| 485 |
-
max_tasks, n_workers, fresh_start, progress=gr.Progress()):
|
| 486 |
-
"""๋ฉ์ธ ํ๊ฐ โ ๊ธฐ๋ฅ๋ณ ๋ณ๋ ฌ ์คํ"""
|
| 487 |
-
|
| 488 |
-
|
| 489 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 490 |
return
|
| 491 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 492 |
# โโ ๊ณผ์ ํํฐ๋ง โโ
|
| 493 |
tasks = ALL_TASKS[:]
|
| 494 |
if pillar_filter != "์ ์ฒด":
|
|
@@ -497,7 +873,9 @@ def run_evaluation(api_key, eval_model, judge_model, pillar_filter, diff_filter,
|
|
| 497 |
tasks = [t for t in tasks if t.difficulty == diff_filter]
|
| 498 |
tasks = tasks[:int(max_tasks)]
|
| 499 |
|
| 500 |
-
run_id
|
|
|
|
|
|
|
| 501 |
if fresh_start:
|
| 502 |
_clear_run(run_id)
|
| 503 |
|
|
@@ -549,16 +927,17 @@ def run_evaluation(api_key, eval_model, judge_model, pillar_filter, diff_filter,
|
|
| 549 |
"pillar_done": {p: 0 for p in pillar_tasks},
|
| 550 |
}
|
| 551 |
|
| 552 |
-
|
| 553 |
-
|
| 554 |
-
f'</
|
|
|
|
| 555 |
|
| 556 |
# โโ ThreadPoolExecutor ๋ณ๋ ฌ ์คํ โโ
|
| 557 |
with ThreadPoolExecutor(max_workers=n_workers) as executor:
|
| 558 |
futures = {}
|
| 559 |
for task in pending:
|
| 560 |
-
fut = executor.submit(_eval_single_task, task, run_id,
|
| 561 |
-
eval_model, judge_model, state)
|
| 562 |
futures[fut] = task
|
| 563 |
|
| 564 |
completed = set()
|
|
@@ -614,12 +993,14 @@ def run_evaluation(api_key, eval_model, judge_model, pillar_filter, diff_filter,
|
|
| 614 |
n_err = len(state["errors"])
|
| 615 |
err_msg = f" (โ ๏ธ {n_err}๊ฐ ์ค๋ฅ)" if n_err > 0 else ""
|
| 616 |
restore_msg = f" (๐พ {cached}๊ฐ ๋ณต์)" if cached > 0 else ""
|
|
|
|
| 617 |
|
| 618 |
-
|
|
|
|
| 619 |
table = _build_progress_table(results, tasks)
|
| 620 |
detail = _build_detail_view(results, tasks)
|
| 621 |
|
| 622 |
-
yield (f"๐ ํ๊ฐ ์๋ฃ!{restore_msg}{err_msg} AETHER Score: {aether:.1f}",
|
| 623 |
table, summary, detail, csv_path)
|
| 624 |
|
| 625 |
|
|
@@ -632,35 +1013,56 @@ DIFF_CHOICES = ["์ ์ฒด", "basic", "intermediate", "advanced", "expert", "fronti
|
|
| 632 |
|
| 633 |
HEADER = """
|
| 634 |
<div style="text-align:center;padding:16px 0;">
|
| 635 |
-
<h1 style="margin:0;font-size:1.8em;">๐ AETHER-Bench v0.
|
| 636 |
-
<h2 style="margin:4px 0;color:#555;font-size:1.1em;">LLM
|
| 637 |
-
<p style="color:#888;font-size:0.9em;max-width:
|
| 638 |
120 Tasks ยท 5 Pillars ยท 19 Sub-dimensions ยท HAR Metric<br>
|
| 639 |
-
<b>Proto-AGI
|
|
|
|
| 640 |
</p>
|
| 641 |
<div style="display:flex;justify-content:center;gap:8px;margin-top:8px;flex-wrap:wrap;font-size:0.85em;">
|
| 642 |
-
<span style="background:#
|
| 643 |
-
<span style="background:#
|
| 644 |
-
<span style="background:#
|
| 645 |
-
<span style="background:#
|
| 646 |
-
<span style="background:#
|
| 647 |
</div>
|
| 648 |
</div>"""
|
| 649 |
|
| 650 |
def create_app():
|
| 651 |
-
with gr.Blocks(title="AETHER-Bench
|
| 652 |
css=".gradio-container{max-width:1100px !important}") as app:
|
| 653 |
gr.HTML(HEADER)
|
| 654 |
|
| 655 |
with gr.Row():
|
| 656 |
-
|
| 657 |
-
|
|
|
|
|
|
|
| 658 |
|
| 659 |
with gr.Row():
|
| 660 |
-
eval_model = gr.
|
| 661 |
-
|
| 662 |
-
|
| 663 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 664 |
|
| 665 |
with gr.Row():
|
| 666 |
pillar_dd = gr.Dropdown(PILLAR_CHOICES, value="์ ์ฒด", label="๊ธฐ๋ฅ ํํฐ", scale=2)
|
|
@@ -671,7 +1073,7 @@ def create_app():
|
|
| 671 |
with gr.Row():
|
| 672 |
start_btn = gr.Button("โถ๏ธ ํ๊ฐ ์์ (์ด์ดํ๊ธฐ)", variant="primary", size="lg", scale=2)
|
| 673 |
fresh_btn = gr.Button("๐ ์๋ก ์์", variant="secondary", size="lg", scale=2)
|
| 674 |
-
gr.HTML('<p style="color:#888;font-size:0.8em;margin:auto 0;">โก
|
| 675 |
|
| 676 |
with gr.Tabs():
|
| 677 |
with gr.Tab("๐ ์งํ"):
|
|
@@ -685,25 +1087,29 @@ def create_app():
|
|
| 685 |
with gr.Tab("๐พ CSV"):
|
| 686 |
csv_file = gr.File(label="ํ๊ฐ ๊ฒฐ๊ณผ CSV")
|
| 687 |
|
| 688 |
-
def _run_resume(
|
| 689 |
-
yield from run_evaluation(
|
| 690 |
-
def _run_fresh(
|
| 691 |
-
yield from run_evaluation(
|
|
|
|
|
|
|
|
|
|
| 692 |
|
| 693 |
start_btn.click(
|
| 694 |
fn=_run_resume,
|
| 695 |
-
inputs=
|
| 696 |
outputs=[progress_html, table_html, summary_html, detail_html, csv_file],
|
| 697 |
)
|
| 698 |
fresh_btn.click(
|
| 699 |
fn=_run_fresh,
|
| 700 |
-
inputs=
|
| 701 |
outputs=[progress_html, table_html, summary_html, detail_html, csv_file],
|
| 702 |
)
|
| 703 |
|
| 704 |
gr.Markdown("""---
|
| 705 |
-
<center>AETHER-Bench v0.
|
| 706 |
-
<
|
|
|
|
| 707 |
return app
|
| 708 |
|
| 709 |
# โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
|
|
@@ -714,7 +1120,8 @@ if __name__ == "__main__":
|
|
| 714 |
stats = {}
|
| 715 |
for t in ALL_TASKS:
|
| 716 |
stats[t.pillar] = stats.get(t.pillar, 0) + 1
|
| 717 |
-
print(f"AETHER-Bench
|
|
|
|
| 718 |
for p, n in stats.items():
|
| 719 |
info = PILLAR_INFO[p]
|
| 720 |
print(f" {info['icon']} {info['name']}: {n} ({int(info['weight']*100)}%)")
|
|
|
|
| 1 |
"""
|
| 2 |
+
AETHER-Bench v0.3.0 โ LLM ํ๊ฐ ์์คํ
+ Proto-AGI ์คํ ๋ฉํฐ์์ด์ ํธ
|
| 3 |
+
=====================================================================
|
| 4 |
+
120๊ฐ ๊ณผ์ ร Proto-AGI(ๆจโ็ซโๅโ้โๆฐด) or ๋จ์ผLLM ํ๊ฐ
|
| 5 |
+
๋ง๋ฐฉ์ง ์ํต ๋งคํธ๋ฆญ์ค + ์์ยท์๊ทน + ๆฐด ๋ฉํ ์ฌ๊ฒํ
|
| 6 |
ํ๊ฐ โ Judge ์ฑ์ โ CSV โ HuggingFace PRIVATE ๋ฐ์ดํฐ์
|
| 7 |
|
| 8 |
Author: Ginigen AI (์ง๋์ AI) โ Choi Sunyoung
|
|
|
|
| 130 |
ALL_TASKS = generate_all_tasks()
|
| 131 |
|
| 132 |
# โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
|
| 133 |
+
# PART 4: ๋์ผ ๋ฐฑ์๋ API (Groq + Fireworks)
|
| 134 |
# โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
|
| 135 |
|
| 136 |
+
GROQ_MODELS = {"qwen/qwen3-32b", "deepseek-r1-distill-llama-70b", "llama-3.3-70b-versatile",
|
| 137 |
+
"llama-3.1-8b-instant", "meta-llama/llama-4-scout-17b-16e-instruct",
|
| 138 |
+
"mistral-saba-24b", "gemma2-9b-it", "qwen-qwq-32b"}
|
| 139 |
+
|
| 140 |
+
def _call_groq(prompt, system="", api_key="", model="qwen/qwen3-32b",
|
| 141 |
+
max_tokens=8192, temperature=0.6):
|
| 142 |
+
"""Groq SDK ํธ์ถ (non-streaming)"""
|
| 143 |
+
from groq import Groq
|
| 144 |
+
client = Groq(api_key=api_key)
|
| 145 |
+
messages = []
|
| 146 |
+
if system:
|
| 147 |
+
messages.append({"role": "system", "content": system})
|
| 148 |
+
messages.append({"role": "user", "content": prompt})
|
| 149 |
+
for attempt in range(3):
|
| 150 |
+
try:
|
| 151 |
+
resp = client.chat.completions.create(
|
| 152 |
+
model=model, messages=messages,
|
| 153 |
+
temperature=temperature, max_completion_tokens=max_tokens,
|
| 154 |
+
top_p=0.95, stream=False, stop=None,
|
| 155 |
+
)
|
| 156 |
+
content = resp.choices[0].message.content or ""
|
| 157 |
+
# qwen3 thinking ํ๊ทธ ์ ๊ฑฐ
|
| 158 |
+
if "<think>" in content:
|
| 159 |
+
content = re.sub(r'<think>.*?</think>\s*', '', content, flags=re.DOTALL).strip()
|
| 160 |
+
return content
|
| 161 |
+
except Exception as e:
|
| 162 |
+
if attempt < 2:
|
| 163 |
+
time.sleep(3 * (attempt + 1))
|
| 164 |
+
else:
|
| 165 |
+
return f"[API_ERROR:Groq] {e}"
|
| 166 |
+
|
| 167 |
+
def _call_fireworks(prompt, system="", api_key="", model="accounts/fireworks/models/kimi-k2p5",
|
| 168 |
+
max_tokens=8192, temperature=0.6):
|
| 169 |
+
"""Fireworks REST API ํธ์ถ"""
|
| 170 |
messages = []
|
| 171 |
if system:
|
| 172 |
messages.append({"role": "system", "content": system})
|
|
|
|
| 185 |
if attempt < 2:
|
| 186 |
time.sleep(3 * (attempt + 1))
|
| 187 |
else:
|
| 188 |
+
return f"[API_ERROR:Fireworks] {e}"
|
| 189 |
+
|
| 190 |
+
def _detect_backend(model_name):
|
| 191 |
+
"""๋ชจ๋ธ๋ช
์ผ๋ก ๋ฐฑ์๋ ์๋ ๊ฐ์ง"""
|
| 192 |
+
if model_name in GROQ_MODELS or not model_name.startswith("accounts/"):
|
| 193 |
+
return "groq"
|
| 194 |
+
return "fireworks"
|
| 195 |
+
|
| 196 |
+
def call_llm(prompt, system="", api_key="", model="qwen/qwen3-32b",
|
| 197 |
+
max_tokens=8192, temperature=0.6, backend=None):
|
| 198 |
+
"""ํตํฉ LLM ํธ์ถ โ ๋ฐฑ์๋ ์๋ ๊ฐ์ง ๋๋ ์ง์ """
|
| 199 |
+
if backend is None:
|
| 200 |
+
backend = _detect_backend(model)
|
| 201 |
+
if backend == "groq":
|
| 202 |
+
return _call_groq(prompt, system, api_key, model, max_tokens, temperature)
|
| 203 |
+
else:
|
| 204 |
+
return _call_fireworks(prompt, system, api_key, model, max_tokens, temperature)
|
| 205 |
+
|
| 206 |
+
# โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
|
| 207 |
+
# PART 4-B: ๋ค์ค ๋ผ์ด๋ ์คํ๊ธฐ (mutual_verification, feedback_incorporation)
|
| 208 |
+
# โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
|
| 209 |
+
|
| 210 |
+
def _run_mutual_verification(topic, api_key, model):
|
| 211 |
+
"""์์-์๊ทน 4๋ผ์ด๋๋ฅผ ๊ฐ๋ณ API ํธ์ถ๋ก ์ฒด์ด๋"""
|
| 212 |
+
rounds = []
|
| 213 |
+
|
| 214 |
+
# R1: ์์ (๋ณด๊ณ ์)
|
| 215 |
+
r1 = call_llm(f"[R1-์์] '{topic}'์ ๋ํด 500๋จ์ด ๋ถ์ ๋ณด๊ณ ์๋ฅผ ์์ฑํ์ธ์. "
|
| 216 |
+
"๊ตฌ์ฒด์ ๋ฐ์ดํฐ์ ๊ทผ๊ฑฐ๋ฅผ ํฌํจํ์ธ์.",
|
| 217 |
+
api_key=api_key, model=model)
|
| 218 |
+
rounds.append(f"[R1-์์ ๋ณด๊ณ ์]\n{r1}")
|
| 219 |
+
|
| 220 |
+
# R2: ์๊ทน (๋นํ)
|
| 221 |
+
r2 = call_llm(f"[R2-์๊ทน ๋นํ] ์๋ ๋ณด๊ณ ์๋ฅผ ๋์ฒ ํ๊ฒ ๋นํ ๊ฒํ ํ์ธ์.\n"
|
| 222 |
+
f"์ฌ์ค ์ค๋ฅ, ๋
ผ๋ฆฌ์ ์ฝ์ , ๋๋ฝ๋ ๊ด์ , ๊ณผ์ฅ๋ ์ฃผ์ฅ์ ์ง์ ํ์ธ์.\n\n"
|
| 223 |
+
f"--- ์๋ฌธ ๋ณด๊ณ ์ ---\n{r1[:3000]}",
|
| 224 |
+
api_key=api_key, model=model)
|
| 225 |
+
rounds.append(f"[R2-์๊ทน ๋นํ]\n{r2}")
|
| 226 |
+
|
| 227 |
+
# R3: ์์ (์์ )
|
| 228 |
+
r3 = call_llm(f"[R3-์์ ์์ ] ๋นํ์ ๋ฐ์ํ์ฌ ์๋ฌธ ๋ณด๊ณ ์๋ฅผ ์์ ํ์ธ์.\n\n"
|
| 229 |
+
f"--- ์๋ฌธ ---\n{r1[:2000]}\n\n--- ๋นํ ---\n{r2[:2000]}",
|
| 230 |
+
api_key=api_key, model=model)
|
| 231 |
+
rounds.append(f"[R3-์์ ์์ ]\n{r3}")
|
| 232 |
+
|
| 233 |
+
# R4: ๋ฉํ ๋ถ์ (ํต์ฌ!)
|
| 234 |
+
r4 = call_llm(f"[R4-๋ฉํ ๋ถ์] ์ 3๋ผ์ด๋(์์โ์๊ทนโ์์ ) ์ฌ์ดํด์ ๋ฉํ ๋ถ์์ ์ํํ์ธ์.\n"
|
| 235 |
+
f"๋ฐ๋์ ๋ค์์ ํฌํจ:\n"
|
| 236 |
+
f"1. ๋ฐ๊ฒฌ๋ ํ๊ฐ/์ค๋ฅ ์ ํ ๋ถ๋ฅ\n"
|
| 237 |
+
f"2. ์๊ทน ๋จ๊ณ์ ๊ธฐ์ฌ๋ ์ ๋ ํ๊ฐ\n"
|
| 238 |
+
f"3. ์ฌ์ดํด์ ํตํ ํ์ง ํฅ์ ๋ถ์\n\n"
|
| 239 |
+
f"--- R1 ์์ฝ ---\n{r1[:1000]}\n--- R2 ์์ฝ ---\n{r2[:1000]}\n--- R3 ์์ฝ ---\n{r3[:1000]}",
|
| 240 |
+
api_key=api_key, model=model)
|
| 241 |
+
rounds.append(f"[R4-๋ฉํ ๋ถ์]\n{r4}")
|
| 242 |
+
|
| 243 |
+
return "\n\n".join(rounds)
|
| 244 |
+
|
| 245 |
+
|
| 246 |
+
def _run_feedback_incorporation(prompt_json, api_key, model):
|
| 247 |
+
"""ํผ๋๋ฐฑ ๋ฐ์ ๊ณผ์ ๋ฅผ ๋ผ์ด๋๋ณ ๊ฐ๋ณ ํธ์ถ๋ก ์ฒด์ด๋"""
|
| 248 |
+
try:
|
| 249 |
+
data = json.loads(prompt_json)
|
| 250 |
+
except:
|
| 251 |
+
return call_llm(prompt_json, api_key=api_key, model=model)
|
| 252 |
+
|
| 253 |
+
topic = data.get("topic", "")
|
| 254 |
+
rounds_spec = data.get("rounds", [])
|
| 255 |
+
outputs = []
|
| 256 |
+
prev = ""
|
| 257 |
+
|
| 258 |
+
for i, rd in enumerate(rounds_spec):
|
| 259 |
+
instruction = rd.get("instruction", "")
|
| 260 |
+
feedback = rd.get("feedback")
|
| 261 |
+
|
| 262 |
+
if i == 0:
|
| 263 |
+
prompt = f"'{topic}' โ {instruction}."
|
| 264 |
+
elif feedback:
|
| 265 |
+
prompt = (f"์๋๋ ์ด์ ๋ฒ์ ๊ณผ ํผ๋๋ฐฑ์
๋๋ค. ํผ๋๋ฐฑ์ ๋ฐ์ํ์ฌ {instruction}.\n\n"
|
| 266 |
+
f"--- ์ด์ ๋ฒ์ ---\n{prev[:2500]}\n\n"
|
| 267 |
+
f"--- ํผ๋๋ฐฑ ---\n{feedback}")
|
| 268 |
+
else:
|
| 269 |
+
prompt = (f"์๋๋ ์ต์ข
๋ฒ์ ์
๋๋ค. {instruction}.\n"
|
| 270 |
+
f"๋ณ๊ฒฝ์ ์ ์ ๋์ ์ผ๋ก ๋ถ์ํ๊ณ ์๊ธฐ ํ๊ฐ๋ฅผ ํฌํจํ์ธ์.\n\n"
|
| 271 |
+
f"--- ์ต์ข
๋ฒ์ ---\n{prev[:3000]}")
|
| 272 |
+
|
| 273 |
+
resp = call_llm(prompt, api_key=api_key, model=model)
|
| 274 |
+
outputs.append(f"[๋ผ์ด๋ {i+1}: {instruction}]\n{resp}")
|
| 275 |
+
prev = resp
|
| 276 |
+
|
| 277 |
+
# ํผ๋๋ฐฑ์ด ์์ผ๋ฉด ๋ค์ ๋ผ์ด๋์ ์ ๋ฌ
|
| 278 |
+
if feedback and i < len(rounds_spec) - 1:
|
| 279 |
+
outputs.append(f"[ํผ๋๋ฐฑ] {feedback}")
|
| 280 |
+
|
| 281 |
+
return "\n\n".join(outputs)
|
| 282 |
+
|
| 283 |
+
|
| 284 |
+
def _is_multi_round(task):
|
| 285 |
+
"""๋ค์ค ๋ผ์ด๋ ๊ณผ์ ์ฌ๋ถ ํ๋ณ"""
|
| 286 |
+
return task.sub_dimension in ("mutual_verification", "feedback_incorporation")
|
| 287 |
+
|
| 288 |
+
|
| 289 |
+
# โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
|
| 290 |
+
# PART 4-C: Proto-AGI ์คํ ๋ฉํฐ์์ด์ ํธ ์์ง
|
| 291 |
+
# โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
|
| 292 |
+
|
| 293 |
+
# โโ ๋ง๋ฐฉ์ง 5ร5 ์ํต ๋งคํธ๋ฆญ์ค โโ
|
| 294 |
+
# ํ ํฉ = ์ด ํฉ = 65 โ ์ ๊ทํ ํ ํธํฅ ์๋ ๋ฏผ์ฃผ์ ์ํต
|
| 295 |
+
MAGIC_SQUARE_5x5 = np.array([
|
| 296 |
+
[17, 24, 1, 8, 15],
|
| 297 |
+
[23, 5, 7, 14, 16],
|
| 298 |
+
[ 4, 6, 13, 20, 22],
|
| 299 |
+
[10, 12, 19, 21, 3],
|
| 300 |
+
[11, 18, 25, 2, 9]
|
| 301 |
+
], dtype=np.float64)
|
| 302 |
+
COMM_MATRIX = MAGIC_SQUARE_5x5 / MAGIC_SQUARE_5x5.sum(axis=1, keepdims=True)
|
| 303 |
+
|
| 304 |
+
def _comm_level(weight):
|
| 305 |
+
if weight >= 0.30: return "ํต์ฌ ์ฐธ์กฐ"
|
| 306 |
+
elif weight >= 0.18: return "์ฃผ์ ์ฐธ์กฐ"
|
| 307 |
+
elif weight >= 0.10: return "์ผ๋ฐ ์ฐธ์กฐ"
|
| 308 |
+
else: return "๊ฒฝ๋ ์ฐธ์กฐ"
|
| 309 |
+
|
| 310 |
+
# โโ ์คํ ์์ด์ ํธ ์ ์ โโ
|
| 311 |
+
PROTO_AGENTS = {
|
| 312 |
+
"ๆจ_๋ฐ์": {
|
| 313 |
+
"role": "๋ฐ์ ์ง๋ฅ(Ideation). ๋ด์ ์์น์ฒ๋ผ ๊ธฐ์กด ๊ฒฝ๊ณ๋ฅผ ๋์ด ํ์ ์ ์ ๊ทผ์ ์์ฑํ๋ค. "
|
| 314 |
+
"ไป์ ๋ โ ๋ชจ๋ ๊ฐ๋ฅ์ฑ์ ํ๊ณ ํค์ฐ๋ ๊ฐ์ฒ์.",
|
| 315 |
+
"element": "ๆจ", "index": 2, "shengsheng_from": "ๆฐด", "shengke_target": "ๅ",
|
| 316 |
+
"virtue": "ไป", "principle": "ๆฒ็ด",
|
| 317 |
+
},
|
| 318 |
+
"็ซ_ํํ": {
|
| 319 |
+
"role": "ํํ ์ง๋ฅ(Expression). ์ฌ๋ฆ ๋ถ๊ฝ์ฒ๋ผ ์์ด๋์ด๋ฅผ ์ฌ๋ฐฉ์ผ๋ก ๊ตฌ์ฒดํํ๊ณ ํ์ฅํ๋ค. "
|
| 320 |
+
"็ฆฎ์ ๋ โ ํ์์ ๊ฐ์ถ๊ณ ๋น๋๊ฒ ๋ง๋๋ ์ฐ์ถ๊ฐ.",
|
| 321 |
+
"element": "็ซ", "index": 3, "shengsheng_from": "ๆจ", "shengke_target": "้",
|
| 322 |
+
"virtue": "็ฆฎ", "principle": "็ไธ",
|
| 323 |
+
},
|
| 324 |
+
"ๅ_ํตํฉ": {
|
| 325 |
+
"role": "ํตํฉ ์ง๋ฅ(Integration). ๋์ง์ฒ๋ผ ์ค์ฌ์์ ๋ค์ํ ๊ด์ ์ ์ข
ํฉํ๊ณ ๊ฐ๋ฑ์ ์ค์ฌํ๋ค. "
|
| 326 |
+
"ไฟก์ ๋ โ ํ๋ค๋ฆฌ์ง ์๋ ์ค์ฌ์ถ์ผ๋ก ๊ท ํ ์กํ ๊ฒฐ๋ก ์ ๋์ถํ๋ ์กฐ์จ์.",
|
| 327 |
+
"element": "ๅ", "index": 4, "shengsheng_from": "็ซ", "shengke_target": "ๆฐด",
|
| 328 |
+
"virtue": "ไฟก", "principle": "็จผ็ฉก",
|
| 329 |
+
},
|
| 330 |
+
"้_๏ฟฝ๏ฟฝ๏ฟฝํ": {
|
| 331 |
+
"role": "์ฌํ ์ง๋ฅ(Judgment). ๊ฐ์์ ๋ซ์ฒ๋ผ ๋
ผ๋ฆฌ์ ๊ฒฐํจ์ ์๋ฅด๊ณ ์ณ๊ณ ๊ทธ๋ฆ์ ๊ฐ๋ฆฐ๋ค. "
|
| 332 |
+
"็พฉ์ ๋ โ ๋์ฒ ํ ๊ฒ์ฆ์ผ๋ก ๊ฑฐ์ง ์ ์ ๋ฅผ ์ ๋ฐํ๊ณ ๊ณผ์ฅ์ ์ ๊ฑฐํ๋ ์ฌํ๊ด.",
|
| 333 |
+
"element": "้", "index": 0, "shengsheng_from": "ๅ", "shengke_target": "ๆจ",
|
| 334 |
+
"virtue": "็พฉ", "principle": "ๅพ้ฉ",
|
| 335 |
+
},
|
| 336 |
+
"ๆฐด_์ฑ์ฐฐ": {
|
| 337 |
+
"role": "์ฑ์ฐฐ ์ง๋ฅ(Wisdom). ๊ฒจ์ธ ์ฌ์ฐ์ฒ๋ผ ๊ฐ์ฅ ๊น์ ๊ณณ๊น์ง ์ค๋ฉฐ๋๋ ๋ฉํ์ธ์ง๋ฅผ ์ํํ๋ค. "
|
| 338 |
+
"ๆบ์ ๋ โ ์ ์ฒด ๊ณผ์ ์ ๋์๋ณด๋ฉฐ ๊ทผ๋ณธ ์ ์ ๋ฅผ ๊ฒํ ํ๊ณ ๋ฐฉํฅ์ ์ฌ์ค์ ํ๋ ํ์.",
|
| 339 |
+
"element": "ๆฐด", "index": 1, "shengsheng_from": "้", "shengke_target": "็ซ",
|
| 340 |
+
"virtue": "ๆบ", "principle": "ๆฝคไธ",
|
| 341 |
+
},
|
| 342 |
+
}
|
| 343 |
+
AGENT_ORDER = ["ๆจ_๋ฐ์", "็ซ_ํํ", "ๅ_ํตํฉ", "้_์ฌํ", "ๆฐด_์ฑ์ฐฐ"]
|
| 344 |
+
AGENT_EMOJIS = {"ๆจ": "๐ณ", "็ซ": "๐ฅ", "ๅ": "๐๏ธ", "้": "โ๏ธ", "ๆฐด": "๐ง"}
|
| 345 |
+
|
| 346 |
+
# โโ ์์ด์ ํธ๋ณ ํ๋ ์ง์นจ โโ
|
| 347 |
+
AGENT_INSTRUCTIONS = {
|
| 348 |
+
"ๆจ": "\n\n[ํ๋ ์ง์นจ] ์์น์ด ๋
์ ๋ซ๋ฏ, ๊ธฐ์กด ํ์ ์ฝ๋งค์ด์ง ์๊ณ ๋ค์ํ ๊ฐ๋ฅ์ฑ์ ํ์ํ๋ผ. ์ฐธ์ ํ ์ ๊ทผ๊ณผ ํต์ฌ ์๋ฆฌ๋ฅผ ๋ช
ํํ ์์ ํ๋ผ.",
|
| 349 |
+
"็ซ": "\n\n[ํ๋ ์ง์นจ] ๋ถ๊ฝ์ด ์ฌ๋ฐฉ์ ๋ฐํ๋ฏ, ๆจ์ด ์ ์ํ ์์ด๋์ด๋ฅผ ๊ตฌ์ฒด์ ์ผ๋ก ํ์ฅํ๋ผ. ์ ๋ ์์น์ ์ฒด๊ณ์ ๊ตฌ์ฑ์ ํฌํจํ๋ผ.",
|
| 350 |
+
"ๅ": "\n\n[ํ๋ ์ง์นจ] ๋์ง๊ฐ ๋ง๋ฌผ์ ํ๋ฏ, ์ด์ ์์ด์ ํธ๋ค์ ์ถ๋ ฅ์ ์ข
ํฉํ์ฌ ๋ชจ์์ ์กฐ์ ํ๊ณ ๊ท ํ ์กํ ํตํฉ ๊ฒฐ๋ก ์ ๋์ถํ๋ผ.",
|
| 351 |
+
"้": "\n\n[ํ๋ ์ง์นจ] ๊ฐ์์ ๋ซ์ด ๋ฌด๋ฅด์ต์ ๊ฒ๊ณผ ์ฉ์ ๊ฒ์ ๊ฐ๋ฆฌ๋ฏ, ์ด์ ์ถ๋ ฅ์ ๋
ผ๋ฆฌ์ ๊ฒฐํจ, ๊ฑฐ์ง ์ ์ , ๊ณผ์ฅ๋ ์์น๋ฅผ ๋์ฒ ํ๊ฒ ์ ๋ฐํ๋ผ. "
|
| 352 |
+
"์์ฌ์ค๋ฌ์ด ์ฃผ์ฅ์๋ [๊ฒ์ฆ ํ์] ํ๊ทธ๋ฅผ ๋ถ์ฌ๋ผ."
|
| 353 |
+
"\n\n[์ฌํ ํต์ฌ ์๋ฌด] 1. ์์น๊ฐ ์ ํํ์ง ๊ฒ์ฆ 2. ๋ฌด๋นํ์ ์์ฉ์ ์ง์ 3. ๊ณผ์ฅ ์๋ณ 4. ๊ฒ์ฆ ๋ถ๊ฐ์ [๊ทผ๊ฑฐ ๋ถ์ถฉ๋ถ] ํ์ 5. ์คํ ๊ฐ๋ฅ์ฑ ๋์ ํ๊ฐ",
|
| 354 |
+
"ๆฐด": "\n\n[ํ๋ ์ง์นจ] ๋ฌผ์ด ๊น์ ๊ณณ๊น์ง ์ค๋ฉฐ๋ค๋ฏ, ์ ์ฒด ๊ณผ์ ์ ๊ทผ๋ณธ๋ถํฐ ๋๋์๋ณด๋ผ."
|
| 355 |
+
"\n\n[ํน๋ณ ๊ถํ: ๋ฉํ ์ฌ๊ฒํ โ ๆบ์ ๊ทน์น] 1. ์ด๊ธฐ ์ ์ ๊ฐ ์ฌ์ค์ธ์ง ๊ฒ์ฆ 2. ๊ฑฐ์ง/๊ณผ์ฅ ๋ฐ๊ฒฌ์ ์์ ์ ์ ์ "
|
| 356 |
+
"3. ์๋ชป๋ ์ ์ ์์ ๋ชฉํ๋ผ๋ฉด ์ฌ์ค์ ์ ์ธ 4. ๊ณตํต ์ค๋ฅ ํจํด ๋ฐ๊ฒฌ์ ๋ฐฉํฅ ์ ํ ์ ์"
|
| 357 |
+
"\n\n๋ฐ๋์ [๋ฉํ ํ๋จ] ์น์
์ ํฌํจํ๋ผ: ์ ์ฒด ๋ฌธ์ ์ , ๊ฐ ์ ์ ๊ฒ์ฆ ๊ฒฐ๊ณผ, ๋ชฉํ ์ ์ง/์์ /ํ๊ธฐ ํ๋จ๊ณผ ๊ทผ๊ฑฐ"
|
| 358 |
+
"\n\n[์ต์ข
๊ฒฐ๋ก ] ๋ชจ๋ ์์ด์ ํธ์ ํ ๋ก ์ ์ข
ํฉํ์ฌ ์ด ๊ณผ์ ์ ๋ํ ์ต์ข
๋ต๋ณ์ ๋ช
ํํ ์ ์ํ๋ผ.",
|
| 359 |
+
}
|
| 360 |
+
|
| 361 |
+
# โโ ์์/์๊ทน ์ค๋ช
โโ
|
| 362 |
+
SHENG_DESC = {
|
| 363 |
+
'ๆจ': 'ๆฐด(์ฑ์ฐฐ)์ ๊น์ ํต์ฐฐ์ด ์๋ก์ด ๋ฐ์์ ์จ์์ด ๋๋ค',
|
| 364 |
+
'็ซ': 'ๆจ(๋ฐ์)์ ์์ด๋์ด๊ฐ ํํ์ ์ฐ๋ฃ๊ฐ ๋๋ค',
|
| 365 |
+
'ๅ': '็ซ(ํํ)์ ๊ตฌ์ฒดํ๊ฐ ํตํฉ์ ์ฌ๋ฃ๊ฐ ๋๋ค',
|
| 366 |
+
'้': 'ๅ(ํตํฉ)์ ์ข
ํฉ๋ ๊ฒฐ๋ก ์ด ์ฌํ์ ๋์์ด ๋๋ค',
|
| 367 |
+
'ๆฐด': '้(์ฌํ)์ ๊ฒ์ฆ ๊ฒฐ๊ณผ๊ฐ ์ฑ์ฐฐ์ ํ ๋๊ฐ ๋๋ค',
|
| 368 |
+
}
|
| 369 |
+
KE_DESC = {
|
| 370 |
+
'ๆจ': '้(์ฌํ)์ด ํํฉ๋ ๋ฐ์์ ๋ฒจ ์ ์์ผ๋ฏ๋ก, ๊ทผ๊ฑฐ ์๋ ์์ด๋์ด๋ฅผ ์ ์ํ๋ผ',
|
| 371 |
+
'็ซ': 'ๆฐด(์ฑ์ฐฐ)์ด ๊ณผ์ ํํ์ ์ํ ์ ์์ผ๋ฏ๋ก, ๊ณผ์ฅ ์์ด ์ ํํ๊ฒ ์์ ํ๋ผ',
|
| 372 |
+
'ๅ': 'ๆจ(๋ฐ์)์ด ํตํฉ์ ์์ฃผ๋ฅผ ๊นจ๋จ๋ฆด ์ ์์ผ๋ฏ๋ก, ์๋ก์ด ๊ด์ ๋ ์์ฉํ๋ผ',
|
| 373 |
+
'้': '็ซ(ํํ)์ด ์ฌํ์ ๊ฒฝ์ง์ ๋
น์ผ ์ ์์ผ๋ฏ๋ก, ์ ์ฐํ ํ๋จ๋ ๊ณ ๋ คํ๋ผ',
|
| 374 |
+
'ๆฐด': 'ๅ(ํตํฉ)์ด ์ฑ์ฐฐ์ ๊ณตํ๋ฅผ ๋ง์ ์ ์์ผ๋ฏ๋ก, ์ค์ง์ ๊ฒฐ๋ก ์ ๋ด๋ ค๋ผ',
|
| 375 |
+
}
|
| 376 |
+
|
| 377 |
+
def _build_agent_prompt(agent_name, info, task_prompt, prev_outputs):
|
| 378 |
+
"""Proto-AGI ์์ด์ ํธ ํ๋กฌํํธ ๋น๋ (Full AETHER C5: ์์ยท์๊ทน + ๋ง๋ฐฉ์ง + ๋ฉํ)"""
|
| 379 |
+
elem = info['element']
|
| 380 |
+
|
| 381 |
+
# System prompt
|
| 382 |
+
sys = (f"๋น์ ์ AETHER Proto-AGI ์์คํ
์ [{agent_name}] ์์ด์ ํธ์
๋๋ค.\n"
|
| 383 |
+
f"์คํ ์์: {elem} ({info['principle']}) | ๋๋ชฉ: {info['virtue']}\n"
|
| 384 |
+
f"์ญํ : {info['role']}")
|
| 385 |
+
|
| 386 |
+
# ํ๋ ์ง์นจ
|
| 387 |
+
sys += AGENT_INSTRUCTIONS.get(elem, "")
|
| 388 |
+
|
| 389 |
+
# ์์ ๊ด๊ณ
|
| 390 |
+
if info['shengsheng_from'] in [a.split('_')[0] for a in prev_outputs]:
|
| 391 |
+
sys += f"\n\n[์์ ยท {info['shengsheng_from']}โ{elem}] {SHENG_DESC.get(elem, '')} โ ์ด์ ์ถ๋ ฅ์ ๋ฐ์ ์ ์ผ๋ก ๊ณ์นํ๋ผ."
|
| 392 |
+
|
| 393 |
+
# ์๊ทน ๊ด๊ณ
|
| 394 |
+
sys += f"\n[์๊ทน ยท {elem}ๅ
{info['shengke_target']}] {info['shengke_target']} ์์ด์ ํธ์ ๊ณผ๋ํ ๊ฒฝํฅ์ ๊ฒฌ์ ํ๋ผ."
|
| 395 |
+
sys += f"\n[ํผ๊ทน ์ฃผ์] {KE_DESC.get(elem, '')}"
|
| 396 |
+
|
| 397 |
+
# โโ ้: ์ฌํ ๊ฐํ (Document 2 TC-1 full) โโ
|
| 398 |
+
if elem == '้':
|
| 399 |
+
sys += """
|
| 400 |
+
|
| 401 |
+
[์ฌํ ์ง๋ฅ ํต์ฌ ์๋ฌด]
|
| 402 |
+
็พฉ์ ๋์ผ๋ก ๋ค์์ ์ํํ๋ผ:
|
| 403 |
+
1. ์ฃผ์ด์ง ์ ์ ์ ์์น๊ฐ ์ ํํ์ง ๊ฒ์ฆ โ ์์ฌ์ค๋ฌ์ด ์์น์ [๊ฒ์ฆ ํ์] ํ๊ทธ
|
| 404 |
+
2. ์ด์ ์์ด์ ํธ๏ฟฝ๏ฟฝ ์ ์ ๋ฅผ ๋ฌด๋นํ์ ์ผ๋ก ์์ฉํ๋ค๋ฉด ์ง์
|
| 405 |
+
3. ๊ณผ์ฅ๋ ์ ๋ ์์น(๋นํ์ค์ % ์ ๊ฐ, ๋นํ์ค์ ์์จ ๋ฑ) ์๋ณ
|
| 406 |
+
4. ๊ฒ์ฆ ๋ถ๊ฐ๋ฅํ ์ฃผ์ฅ์ [๊ทผ๊ฑฐ ๋ถ์ถฉ๋ถ] ํ์
|
| 407 |
+
5. ์ ์ฒด ์ถ๋ ฅ์ ์คํ ๊ฐ๋ฅ์ฑ์ ๋์ ํ๊ฒ ํ๊ฐ"""
|
| 408 |
+
|
| 409 |
+
# โโ ๆฐด: ๋ฉํ ์ฌ๊ฒํ ๊ถํ (Document 2 TC-1 full) โโ
|
| 410 |
+
if elem == 'ๆฐด':
|
| 411 |
+
sys += """
|
| 412 |
+
|
| 413 |
+
[ํน๋ณ ๊ถํ: ๋ฉํ ์ฌ๊ฒํ โ ๆบ์ ๊ทน์น]
|
| 414 |
+
๋ฌผ์ด ๋ชจ๋ ๊ฒ์ ๊ทผ์๊น์ง ์ค๋ฉฐ๋ค๋ฏ, ์ ์ฒด ๊ณผ์ ์ ๊ทผ๋ณธ๋ถํฐ ์ฌ๊ฒํ ํ๋ผ.
|
| 415 |
+
1. ์ด๊ธฐ ์ ์ (๊ธฐ์ ํํฉ, ์์น, ๊ท์ ๋ฑ)๊ฐ ์ฌ์ค์ธ์ง ํ๋ํ๋ ๊ฒ์ฆํ๋ผ
|
| 416 |
+
2. ๊ฑฐ์ง์ด๋ ๊ณผ์ฅ์ด ๋ฐ๊ฒฌ๋๋ฉด ๋ช
์์ ์ผ๋ก ์ง์ ํ๊ณ ์์ ์์ ์ ์ํ๋ผ
|
| 417 |
+
3. ์ ์ฒด ๋ชฉํ๊ฐ ์๋ชป๋ ์ ์ ์์ ์ธ์์ก๋ค๋ฉด ๋ชฉํ ์ฌ์ค์ ์ ์ ์ธํ๋ผ
|
| 418 |
+
4. ์ด์ ์์ด์ ํธ ์ถ๋ ฅ์์ ๊ณตํต ์ค๋ฅ ํจํด ๋ฐ๊ฒฌ์ ์ ์ฒด ๋ฐฉํฅ ์ ํ์ ์ ์ํ๋ผ
|
| 419 |
+
|
| 420 |
+
๋ฐ๋์ [๋ฉํ ํ๋จ] ์น์
์ ํฌํจํ๋ผ:
|
| 421 |
+
- ์ ์ฒด ๊ณผ์ ์ ๊ทผ๋ณธ์ ๋ฌธ์ ์
|
| 422 |
+
- ๊ฐ ์ ์ ์ ๊ฒ์ฆ ๊ฒฐ๊ณผ (์ฌ์ค/๊ฑฐ์ง/๋ถํ์ค)
|
| 423 |
+
- ๋ชฉํ ์ ์ง/์์ /ํ๊ธฐ ํ๋จ๊ณผ ๊ทผ๊ฑฐ
|
| 424 |
+
|
| 425 |
+
[์ต์ข
๊ฒฐ๋ก ] ๋ชจ๋ ์์ด์ ํธ์ ํ ๋ก ์ ์ข
ํฉํ์ฌ ์ด ๊ณผ์ ์ ๋ํ ์ต์ข
๋ต๋ณ์ ๋ช
ํํ ์ ์ํ๋ผ."""
|
| 426 |
+
|
| 427 |
+
# โโ ๋ง๋ฐฉ์ง ์ํต ๋งคํธ๋ฆญ์ค๋ก ์ด์ ์ถ๋ ฅ ์ฐธ์กฐ โโ
|
| 428 |
+
ctx = ""
|
| 429 |
+
if prev_outputs:
|
| 430 |
+
listener_idx = AGENT_ORDER.index(agent_name) if agent_name in AGENT_ORDER else 0
|
| 431 |
+
weights = COMM_MATRIX[listener_idx]
|
| 432 |
+
ctx = "\n\n[์ด์ ์์ด์ ํธ ์ถ๋ ฅ โ ๋ง๋ฐฉ์ง ์ํต ๋งคํธ๋ฆญ์ค ์ ์ฉ]\n"
|
| 433 |
+
ctx += "(์ฐธ์กฐ ๊ฐ๋๊ฐ ๋์์๋ก ํด๋น ์์ด์ ํธ์ ์ถ๋ ฅ์ ๊น์ด ๋ถ์ํ๊ณ ๋ฐ์ํ๋ผ)\n"
|
| 434 |
+
|
| 435 |
+
for aname, output in prev_outputs.items():
|
| 436 |
+
src_idx = AGENT_ORDER.index(aname) if aname in AGENT_ORDER else 0
|
| 437 |
+
w = weights[src_idx]
|
| 438 |
+
level = _comm_level(w)
|
| 439 |
+
ctx += f"\n--- {aname} [{level} ยท ๊ฐ๋ {w:.0%}] ---\n"
|
| 440 |
+
if w >= 0.30:
|
| 441 |
+
ctx += f"{output[:3000]}\nโ ๏ธ ์ ์์ด์ ํธ์ ์ฃผ์ฅ์ ๋ฐ๋์ ์ ๋ฐ ๊ฒํ ํ๊ณ ์๋ต์ ๋ฐ์ํ๋ผ.\n"
|
| 442 |
+
elif w >= 0.18:
|
| 443 |
+
ctx += f"{output[:2000]}\n"
|
| 444 |
+
elif w >= 0.10:
|
| 445 |
+
ctx += f"{output[:1500]}\n"
|
| 446 |
+
else:
|
| 447 |
+
ctx += f"{output[:800]}\n(๊ฒฝ๋ ์ฐธ์กฐ โ ํต์ฌ ๊ฒฐ๋ก ๋ง ์ฐธ๊ณ )\n"
|
| 448 |
+
|
| 449 |
+
return sys, f"{task_prompt}\n{ctx}"
|
| 450 |
+
|
| 451 |
+
|
| 452 |
+
def _run_proto_agi_pipeline(task_prompt, api_key, eval_model):
|
| 453 |
+
"""Proto-AGI Full AETHER ํ์ดํ๋ผ์ธ: ๆจโ็ซโๅโ้โๆฐด ์์ฐจ ์คํ
|
| 454 |
+
|
| 455 |
+
Returns: (final_output, agent_trace)
|
| 456 |
+
final_output: ๆฐด_์ฑ์ฐฐ์ ์ต์ข
๊ฒฐ๋ก (Judge์ ์ ๋ฌ)
|
| 457 |
+
agent_trace: {agent_name: response} ์ ์ฒด ๊ธฐ๋ก
|
| 458 |
+
"""
|
| 459 |
+
prev_outputs = {}
|
| 460 |
+
|
| 461 |
+
for aname in AGENT_ORDER:
|
| 462 |
+
info = PROTO_AGENTS[aname]
|
| 463 |
+
sys_prompt, usr_prompt = _build_agent_prompt(aname, info, task_prompt, prev_outputs)
|
| 464 |
+
resp = call_llm(usr_prompt, system=sys_prompt, api_key=api_key, model=eval_model)
|
| 465 |
+
# qwen3 thinking ํ๊ทธ๋ call_llm ๋ด๋ถ์์ ์ด๋ฏธ ์ ๊ฑฐ๋จ
|
| 466 |
+
prev_outputs[aname] = resp
|
| 467 |
+
|
| 468 |
+
# ์ต์ข
์ถ๋ ฅ: ๋ชจ๋ ์์ด์ ํธ ์๋ต ๊ฒฐํฉ (ๆฐด์ ๊ฒฐ๋ก ์ด ๋ง์ง๋ง)
|
| 469 |
+
combined = []
|
| 470 |
+
for aname in AGENT_ORDER:
|
| 471 |
+
elem = PROTO_AGENTS[aname]['element']
|
| 472 |
+
emoji = AGENT_EMOJIS.get(elem, "")
|
| 473 |
+
combined.append(f"{'='*40}\n{emoji} [{aname}] ์๋ต\n{'='*40}\n{prev_outputs[aname]}")
|
| 474 |
+
|
| 475 |
+
return "\n\n".join(combined), prev_outputs
|
| 476 |
+
|
| 477 |
+
|
| 478 |
+
def _execute_task(task, api_key, eval_model, proto_agi=False):
|
| 479 |
+
"""๊ณผ์ ์ ํ์ ๋ฐ๋ผ ์คํ ๋ถ๊ธฐ โ Proto-AGI ๋ชจ๋ ์ง์"""
|
| 480 |
+
if proto_agi:
|
| 481 |
+
# Proto-AGI: ๋ชจ๋ ๊ณผ์ ๋ฅผ ์คํ ํ์ดํ๋ผ์ธ์ผ๋ก ์ฒ๋ฆฌ
|
| 482 |
+
final_output, _ = _run_proto_agi_pipeline(task.prompt, api_key, eval_model)
|
| 483 |
+
return final_output
|
| 484 |
+
elif task.sub_dimension == "mutual_verification":
|
| 485 |
+
topic = task.prompt.replace("[์์-์๊ทน ์ฌ์ดํด] ", "").split("\n")[0]
|
| 486 |
+
return _run_mutual_verification(topic, api_key, eval_model)
|
| 487 |
+
elif task.sub_dimension == "feedback_incorporation":
|
| 488 |
+
return _run_feedback_incorporation(task.prompt, api_key, eval_model)
|
| 489 |
+
else:
|
| 490 |
+
return call_llm(task.prompt, api_key=api_key, model=eval_model)
|
| 491 |
|
| 492 |
# โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
|
| 493 |
# PART 5: LLM-as-Judge ์ฑ์
|
|
|
|
| 502 |
rubric = task.scoring_rubric
|
| 503 |
rubric_text = "\n".join([f" - {k} (x{v['weight']}): {v['desc']}" for k, v in rubric.items()])
|
| 504 |
expected = task.expected_behavior or "N/A"
|
| 505 |
+
# ๋ค์ค ๋ผ์ด๋๋ ์๋ต์ด ๊ธธ๋ฏ๋ก ๋ ๋ง์ด ํฌํจ
|
| 506 |
+
resp_limit = 6000 if _is_multi_round(task) else 3000
|
| 507 |
return f"""[๊ณผ์ ] {task.task_id} | {task.pillar} | {task.difficulty}
|
| 508 |
[ํ๋กฌํํธ] {task.prompt[:1500]}
|
| 509 |
[๊ธฐ๋] {expected[:500]}
|
| 510 |
+
[ํผํ๊ฐ ์๋ต] {response[:resp_limit]}
|
| 511 |
[๋ฃจ๋ธ๋ฆญ]
|
| 512 |
{rubric_text}
|
| 513 |
์ ๋ฃจ๋ธ๋ฆญ์ ๋ฐ๋ผ JSON์ผ๋ก ์ฑ์ ."""
|
| 514 |
|
| 515 |
def parse_judge_response(text, rubric_keys):
|
| 516 |
+
"""Judge ์๋ต์์ ์ ์ JSON ์ถ์ถ โ ๋ค์ค ํจํด ํ์ฑ"""
|
| 517 |
+
# Pattern 1: ํ์ค {"scores": {...}, "comment": ...}
|
| 518 |
try:
|
| 519 |
match = re.search(r'\{[^{}]*"scores"\s*:\s*\{[^{}]*\}[^{}]*\}', text, re.DOTALL)
|
| 520 |
if match:
|
|
|
|
| 526 |
return {"scores": scores, "comment": data.get("comment", "")}
|
| 527 |
except:
|
| 528 |
pass
|
| 529 |
+
|
| 530 |
+
# Pattern 2: ```json ๋ธ๋ก ๋ด๋ถ
|
| 531 |
+
try:
|
| 532 |
+
match = re.search(r'```json\s*(\{.*?\})\s*```', text, re.DOTALL)
|
| 533 |
+
if match:
|
| 534 |
+
data = json.loads(match.group(1))
|
| 535 |
+
scores = data.get("scores", {})
|
| 536 |
+
for k in rubric_keys:
|
| 537 |
+
if k not in scores:
|
| 538 |
+
scores[k] = 0.5
|
| 539 |
+
return {"scores": scores, "comment": data.get("comment", "")}
|
| 540 |
+
except:
|
| 541 |
+
pass
|
| 542 |
+
|
| 543 |
+
# Pattern 3: ๊ฐ๋ณ ํญ๋ชฉ ์ถ์ถ (key: 0.75 ํจํด)
|
| 544 |
+
try:
|
| 545 |
+
scores = {}
|
| 546 |
+
for k in rubric_keys:
|
| 547 |
+
m = re.search(rf'["\']?{k}["\']?\s*[:=]\s*([\d.]+)', text)
|
| 548 |
+
if m:
|
| 549 |
+
scores[k] = min(max(float(m.group(1)), 0), 1.0)
|
| 550 |
+
else:
|
| 551 |
+
scores[k] = 0.5
|
| 552 |
+
if any(v != 0.5 for v in scores.values()):
|
| 553 |
+
return {"scores": scores, "comment": "ํจํด3 ํ์ฑ"}
|
| 554 |
+
except:
|
| 555 |
+
pass
|
| 556 |
+
|
| 557 |
return {"scores": {k: 0.5 for k in rubric_keys}, "comment": "ํ์ฑ์คํจ"}
|
| 558 |
|
| 559 |
def compute_weighted_score(scores, rubric):
|
|
|
|
| 763 |
|
| 764 |
from concurrent.futures import ThreadPoolExecutor, as_completed
|
| 765 |
|
| 766 |
+
def _eval_single_task(task, run_id, eval_api_key, eval_model, judge_api_key, judge_model, state, proto_agi=False):
|
| 767 |
"""๋จ์ผ ๊ณผ์ ํ๊ฐ (๋ชจ๋ธํธ์ถ + Judge์ฑ์ ). ์์ปค ์ค๋ ๋์์ ์คํ."""
|
| 768 |
try:
|
| 769 |
+
# Step 1: ํผํ๊ฐ ๋ชจ๋ธ ํธ์ถ (Proto-AGI / ๋ค์ค ๋ผ์ด๋ ์๋ ๋ถ๊ธฐ)
|
| 770 |
+
model_response = _execute_task(task, eval_api_key, eval_model, proto_agi=proto_agi)
|
| 771 |
|
| 772 |
+
if model_response.startswith("[API_ERROR"):
|
| 773 |
_save_result(run_id, task.task_id, model_response, "{}", 0)
|
| 774 |
with state["lock"]:
|
| 775 |
state["done"] += 1
|
| 776 |
state["errors"].append(task.task_id)
|
| 777 |
return task.task_id, {"response": model_response, "judge": "{}", "score": 0}
|
| 778 |
|
| 779 |
+
# Step 2: Judge ์ฑ์ (๋ณ๋ API ํค/๋ชจ๋ธ)
|
| 780 |
judge_prompt = build_judge_prompt(task, model_response)
|
| 781 |
+
judge_raw = call_llm(judge_prompt, system=JUDGE_SYSTEM, api_key=judge_api_key,
|
| 782 |
model=judge_model, temperature=0.3)
|
| 783 |
|
| 784 |
rubric_keys = list(task.scoring_rubric.keys())
|
|
|
|
| 848 |
return out
|
| 849 |
|
| 850 |
|
| 851 |
+
def run_evaluation(eval_api_key, judge_api_key, eval_model, judge_model, pillar_filter, diff_filter,
|
| 852 |
+
max_tasks, n_workers, proto_agi, fresh_start, progress=gr.Progress()):
|
| 853 |
+
"""๋ฉ์ธ ํ๊ฐ โ ๊ธฐ๋ฅ๋ณ ๋ณ๋ ฌ ์คํ (Eval: Groq, Judge: Fireworks ๋ถ๋ฆฌ, Proto-AGI ์ง์)"""
|
| 854 |
+
eval_api_key = eval_api_key.strip() or os.getenv("GROQ_API_KEY", "")
|
| 855 |
+
judge_api_key = judge_api_key.strip() or os.getenv("FIREWORKS_API_KEY", "")
|
| 856 |
+
if not eval_api_key:
|
| 857 |
+
yield "โ ํผํ๊ฐ ๋ชจ๋ธ API Key๋ฅผ ์
๋ ฅํ์ธ์.", "", "", "", None
|
| 858 |
+
return
|
| 859 |
+
if not judge_api_key:
|
| 860 |
+
yield "โ Judge ๋ชจ๋ธ API Key๋ฅผ ์
๋ ฅํ์ธ์.", "", "", "", None
|
| 861 |
return
|
| 862 |
|
| 863 |
+
# Proto-AGI ํ์ฑํ ์ ์์ปค ์ ์๋ ์กฐ์ (๊ณผ์ ๋น 5ํ API ํธ์ถ)
|
| 864 |
+
n_workers = int(n_workers)
|
| 865 |
+
if proto_agi and n_workers > 3:
|
| 866 |
+
n_workers = 3 # 5 agents ร 3 workers = 15 ๋์ API ํธ์ถ
|
| 867 |
+
|
| 868 |
# โโ ๊ณผ์ ํํฐ๋ง โโ
|
| 869 |
tasks = ALL_TASKS[:]
|
| 870 |
if pillar_filter != "์ ์ฒด":
|
|
|
|
| 873 |
tasks = [t for t in tasks if t.difficulty == diff_filter]
|
| 874 |
tasks = tasks[:int(max_tasks)]
|
| 875 |
|
| 876 |
+
# run_id์ proto_agi ๋ชจ๋ ํฌํจ (์ฒดํฌํฌ์ธํธ ๋ถ๋ฆฌ)
|
| 877 |
+
mode_suffix = "_PAGI" if proto_agi else ""
|
| 878 |
+
run_id = _make_run_id(eval_model + mode_suffix)
|
| 879 |
if fresh_start:
|
| 880 |
_clear_run(run_id)
|
| 881 |
|
|
|
|
| 927 |
"pillar_done": {p: 0 for p in pillar_tasks},
|
| 928 |
}
|
| 929 |
|
| 930 |
+
mode_tag = '๐ <b>Proto-AGI ON</b> (ๆจโ็ซโๅโ้โๆฐด)' if proto_agi else '๐ค <b>๋จ์ผ LLM ๋ชจ๋</b>'
|
| 931 |
+
yield (CSS + f'<div style="background:{"#fff3e0" if proto_agi else "#e8f5e9"};padding:12px;border-radius:8px;margin:8px 0;">'
|
| 932 |
+
f'โก <b>๋ณ๋ ฌ ํ๊ฐ ์์!</b> {len(pending)}๊ฐ ๊ณผ์ ยท {n_pillars}๊ฐ ๊ธฐ๋ฅ ๋์ ยท {n_workers}๊ฐ ์์ปค<br>'
|
| 933 |
+
f'{mode_tag}</div>', _build_progress_table(results, tasks), "", "", None)
|
| 934 |
|
| 935 |
# โโ ThreadPoolExecutor ๋ณ๋ ฌ ์คํ โโ
|
| 936 |
with ThreadPoolExecutor(max_workers=n_workers) as executor:
|
| 937 |
futures = {}
|
| 938 |
for task in pending:
|
| 939 |
+
fut = executor.submit(_eval_single_task, task, run_id, eval_api_key,
|
| 940 |
+
eval_model, judge_api_key, judge_model, state, proto_agi)
|
| 941 |
futures[fut] = task
|
| 942 |
|
| 943 |
completed = set()
|
|
|
|
| 993 |
n_err = len(state["errors"])
|
| 994 |
err_msg = f" (โ ๏ธ {n_err}๊ฐ ์ค๋ฅ)" if n_err > 0 else ""
|
| 995 |
restore_msg = f" (๐พ {cached}๊ฐ ๋ณต์)" if cached > 0 else ""
|
| 996 |
+
mode_str = "๐Proto-AGI" if proto_agi else "๐ค๋จ์ผLLM"
|
| 997 |
|
| 998 |
+
display_model = f"{eval_model} [{mode_str}]"
|
| 999 |
+
summary = _build_final_summary(results, tasks, pillar_scores, aether, display_model, hf_status)
|
| 1000 |
table = _build_progress_table(results, tasks)
|
| 1001 |
detail = _build_detail_view(results, tasks)
|
| 1002 |
|
| 1003 |
+
yield (f"๐ ํ๊ฐ ์๋ฃ! {mode_str}{restore_msg}{err_msg} AETHER Score: {aether:.1f}",
|
| 1004 |
table, summary, detail, csv_path)
|
| 1005 |
|
| 1006 |
|
|
|
|
| 1013 |
|
| 1014 |
HEADER = """
|
| 1015 |
<div style="text-align:center;padding:16px 0;">
|
| 1016 |
+
<h1 style="margin:0;font-size:1.8em;">๐ AETHER-Bench v0.3.0</h1>
|
| 1017 |
+
<h2 style="margin:4px 0;color:#555;font-size:1.1em;">LLM ํ๊ฐ ์์คํ
+ Proto-AGI ์คํ ๋ฉํฐ์์ด์ ํธ</h2>
|
| 1018 |
+
<p style="color:#888;font-size:0.9em;max-width:700px;margin:8px auto;">
|
| 1019 |
120 Tasks ยท 5 Pillars ยท 19 Sub-dimensions ยท HAR Metric<br>
|
| 1020 |
+
๐ <b>Proto-AGI</b>: ๆจโ็ซโๅโ้โๆฐด ์คํ ํ์ดํ๋ผ์ธ + ๋ง๋ฐฉ์ง ์ํต ๋งคํธ๋ฆญ์ค<br>
|
| 1021 |
+
๐ค <b>๋จ์ผ LLM</b>: ์์ ์ํ ํ๊ฐ | CSV โ HuggingFace PRIVATE ๊ธฐ๋ก
|
| 1022 |
</p>
|
| 1023 |
<div style="display:flex;justify-content:center;gap:8px;margin-top:8px;flex-wrap:wrap;font-size:0.85em;">
|
| 1024 |
+
<span style="background:#e8f5e9;padding:2px 10px;border-radius:12px;">๐ณ ๆจ ๋ฐ์(ไป)</span>
|
| 1025 |
+
<span style="background:#ffebee;padding:2px 10px;border-radius:12px;">๐ฅ ็ซ ํํ(็ฆฎ)</span>
|
| 1026 |
+
<span style="background:#fff3e0;padding:2px 10px;border-radius:12px;">๐๏ธ ๅ ํตํฉ(ไฟก)</span>
|
| 1027 |
+
<span style="background:#f5f5f5;padding:2px 10px;border-radius:12px;">โ๏ธ ้ ์ฌํ(็พฉ)</span>
|
| 1028 |
+
<span style="background:#e3f2fd;padding:2px 10px;border-radius:12px;">๐ง ๆฐด ์ฑ์ฐฐ(ๆบ)</span>
|
| 1029 |
</div>
|
| 1030 |
</div>"""
|
| 1031 |
|
| 1032 |
def create_app():
|
| 1033 |
+
with gr.Blocks(title="AETHER-Bench + Proto-AGI", theme=gr.themes.Soft(),
|
| 1034 |
css=".gradio-container{max-width:1100px !important}") as app:
|
| 1035 |
gr.HTML(HEADER)
|
| 1036 |
|
| 1037 |
with gr.Row():
|
| 1038 |
+
eval_api_key = gr.Textbox(label="๐ ํผํ๊ฐ API Key (Groq)", type="password",
|
| 1039 |
+
placeholder="gsk_...", value=os.getenv("GROQ_API_KEY", ""), scale=3)
|
| 1040 |
+
judge_api_key = gr.Textbox(label="โ๏ธ Judge API Key (Fireworks)", type="password",
|
| 1041 |
+
placeholder="fw_...", value=os.getenv("FIREWORKS_API_KEY", ""), scale=3)
|
| 1042 |
|
| 1043 |
with gr.Row():
|
| 1044 |
+
eval_model = gr.Dropdown(
|
| 1045 |
+
choices=["qwen/qwen3-32b", "qwen-qwq-32b", "deepseek-r1-distill-llama-70b",
|
| 1046 |
+
"llama-3.3-70b-versatile", "meta-llama/llama-4-scout-17b-16e-instruct",
|
| 1047 |
+
"mistral-saba-24b", "gemma2-9b-it", "llama-3.1-8b-instant"],
|
| 1048 |
+
value="qwen/qwen3-32b", label="๐ค ํผํ๊ฐ ๋ชจ๋ธ (Groq)", allow_custom_value=True, scale=3)
|
| 1049 |
+
judge_model = gr.Dropdown(
|
| 1050 |
+
choices=["accounts/fireworks/models/kimi-k2p5",
|
| 1051 |
+
"qwen/qwen3-32b", "deepseek-r1-distill-llama-70b",
|
| 1052 |
+
"llama-3.3-70b-versatile"],
|
| 1053 |
+
value="accounts/fireworks/models/kimi-k2p5",
|
| 1054 |
+
label="โ๏ธ ์ฌํ ๋ชจ๋ธ (Fireworks/Groq)", allow_custom_value=True, scale=3)
|
| 1055 |
+
|
| 1056 |
+
# โโ Proto-AGI ํ ๊ธ โโ
|
| 1057 |
+
with gr.Row():
|
| 1058 |
+
proto_agi_toggle = gr.Checkbox(
|
| 1059 |
+
label="๐ Proto-AGI ํ์ฑํ (ๆจโ็ซโๅโ้โๆฐด ์คํ ํ์ดํ๋ผ์ธ)",
|
| 1060 |
+
value=True, scale=3)
|
| 1061 |
+
gr.HTML('''<div style="font-size:0.82em;color:#666;padding:8px;background:#fffde7;border-radius:8px;margin:auto 0;" id="pagi-info">
|
| 1062 |
+
<b>Proto-AGI ON:</b> ๊ณผ์ ๋น 5ํ ์์ฐจ API ํธ์ถ (๋ฐ์โํํโํตํฉโ์ฌํโ์ฑ์ฐฐ)<br>
|
| 1063 |
+
์์ยท์๊ทน + ๋ง๋ฐฉ์ง ์ํต ๋งคํธ๋ฆญ์ค + ๆฐด ๋ฉํ ์ฌ๊ฒํ | ์์ปค ์๋ ์ ํ 3๊ฐ<br>
|
| 1064 |
+
<b>Proto-AGI OFF:</b> ๊ณผ์ ๋น 1ํ API ํธ์ถ (์์ LLM ์ํ)
|
| 1065 |
+
</div>''', scale=3)
|
| 1066 |
|
| 1067 |
with gr.Row():
|
| 1068 |
pillar_dd = gr.Dropdown(PILLAR_CHOICES, value="์ ์ฒด", label="๊ธฐ๋ฅ ํํฐ", scale=2)
|
|
|
|
| 1073 |
with gr.Row():
|
| 1074 |
start_btn = gr.Button("โถ๏ธ ํ๊ฐ ์์ (์ด์ดํ๊ธฐ)", variant="primary", size="lg", scale=2)
|
| 1075 |
fresh_btn = gr.Button("๐ ์๋ก ์์", variant="secondary", size="lg", scale=2)
|
| 1076 |
+
gr.HTML('<p style="color:#888;font-size:0.8em;margin:auto 0;">โก ๋์ผ ๋ฐฑ์๋: Groq(ํผํ๊ฐ) + Fireworks(Judge)<br>โถ๏ธ ์ค๋จ์ ์ด์ด์ | ๐ ์ด๊ธฐํํ ์ฌ์์ | CSVโHF PRIVATE</p>')
|
| 1077 |
|
| 1078 |
with gr.Tabs():
|
| 1079 |
with gr.Tab("๐ ์งํ"):
|
|
|
|
| 1087 |
with gr.Tab("๐พ CSV"):
|
| 1088 |
csv_file = gr.File(label="ํ๊ฐ ๊ฒฐ๊ณผ CSV")
|
| 1089 |
|
| 1090 |
+
def _run_resume(eak, jak, em, jm, pagi, pf, df, mt, nw):
    """Forward the UI widget values to run_evaluation in resume mode.

    The UI wires proto_agi (pagi) right after the model dropdowns, while
    run_evaluation expects it just before fresh_start — hence the
    re-ordering here. fresh_start=False keeps any existing checkpoint
    for this run_id, so a previously interrupted run continues.
    """
    for update in run_evaluation(eak, jak, em, jm, pf, df, mt, nw, pagi, False):
        yield update
| 1092 |
+
def _run_fresh(eak, jak, em, jm, pagi, pf, df, mt, nw):
    """Forward the UI widget values to run_evaluation in fresh-start mode.

    Same argument re-ordering as the resume wrapper; fresh_start=True
    makes run_evaluation clear the run's checkpoint (_clear_run) before
    evaluating, so every selected task is re-run from scratch.
    """
    for update in run_evaluation(eak, jak, em, jm, pf, df, mt, nw, pagi, True):
        yield update
|
| 1094 |
+
|
| 1095 |
+
all_inputs = [eval_api_key, judge_api_key, eval_model, judge_model,
|
| 1096 |
+
proto_agi_toggle, pillar_dd, diff_dd, max_tasks, n_workers]
|
| 1097 |
|
| 1098 |
start_btn.click(
|
| 1099 |
fn=_run_resume,
|
| 1100 |
+
inputs=all_inputs,
|
| 1101 |
outputs=[progress_html, table_html, summary_html, detail_html, csv_file],
|
| 1102 |
)
|
| 1103 |
fresh_btn.click(
|
| 1104 |
fn=_run_fresh,
|
| 1105 |
+
inputs=all_inputs,
|
| 1106 |
outputs=[progress_html, table_html, summary_html, detail_html, csv_file],
|
| 1107 |
)
|
| 1108 |
|
| 1109 |
gr.Markdown("""---
|
| 1110 |
+
<center>AETHER-Bench v0.3.0 ยท Apache 2.0 ยท Ginigen AI (์ง๋์ AI)<br>
|
| 1111 |
+
๐ Proto-AGI ์คํ ํ์ดํ๋ผ์ธ + ๋์ผ ๋ฐฑ์๋: <b>Groq</b> (ํผํ๊ฐ) + <b>Fireworks</b> (Judge)<br>
|
| 1112 |
+
<code>HF_TOKEN</code> ์ค์ ์ PRIVATE ์๋ ๊ธฐ๋ก</center>""")
|
| 1113 |
return app
|
| 1114 |
|
| 1115 |
# โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
|
|
|
|
| 1120 |
# Startup diagnostics: tally tasks per pillar and print the benchmark banner.
# NOTE(review): an enclosing `if __name__ == "__main__":` guard (if any) is
# outside this view — confirm before relying on import-time behavior.
stats = {}
for t in ALL_TASKS:
    # Count how many of the generated tasks belong to each pillar key.
    stats[t.pillar] = stats.get(t.pillar, 0) + 1
print(f"AETHER-Bench v0.3.0 + Proto-AGI: {len(ALL_TASKS)} tasks loaded")
print(f" Proto-AGI: ๆจ_๋ฐ์โ็ซ_ํํโๅ_ํตํฉโ้_์ฌํโๆฐด_์ฑ์ฐฐ (5 agents)")
for p, n in stats.items():
    # PILLAR_INFO maps a pillar key to its display metadata (icon, name, weight).
    info = PILLAR_INFO[p]
    print(f" {info['icon']} {info['name']}: {n} ({int(info['weight']*100)}%)")
|