Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
|
@@ -908,156 +908,298 @@ def _parallel_progress_html(state, total):
|
|
| 908 |
return out
|
| 909 |
|
| 910 |
|
| 911 |
-
def run_evaluation(api_key, eval_model, judge_model, pillar_filter, diff_filter,
|
| 912 |
-
max_tasks, n_workers, proto_agi, fresh_start, progress=gr.Progress()):
|
| 913 |
-
"""λ©μΈ νκ° β β
νμμμ λ°©μ§: λΉ λ₯Έ yield μ£ΌκΈ° + μ΅μ νλ ν ν°/timeout"""
|
| 914 |
-
api_key = api_key.strip() or os.getenv("FIREWORKS_API_KEY", "")
|
| 915 |
-
if not api_key:
|
| 916 |
-
yield "β Fireworks API Keyλ₯Ό μ
λ ₯νμΈμ.", "", "", "", None
|
| 917 |
-
return
|
| 918 |
|
| 919 |
-
|
| 920 |
-
|
| 921 |
-
|
| 922 |
-
|
| 923 |
-
|
| 924 |
-
|
| 925 |
-
|
| 926 |
-
|
| 927 |
-
|
| 928 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 929 |
|
| 930 |
-
mode_suffix = "_PAGI" if proto_agi else ""
|
| 931 |
-
run_id = _make_run_id(eval_model + mode_suffix)
|
| 932 |
-
if fresh_start:
|
| 933 |
-
_clear_run(run_id)
|
| 934 |
|
| 935 |
-
|
| 936 |
-
|
| 937 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 938 |
|
| 939 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 940 |
|
| 941 |
-
|
| 942 |
-
yield (f"πΎ 체ν¬ν¬μΈνΈ 볡μ: {cached}/{total} μλ£ β {len(pending)}κ° λ¨μ",
|
| 943 |
-
_build_progress_table(results, tasks), "", "", None)
|
| 944 |
|
| 945 |
-
|
| 946 |
-
|
| 947 |
-
|
| 948 |
-
|
| 949 |
-
if pt: pillar_scores[p] = np.mean([results[t.task_id]["score"] for t in pt])
|
| 950 |
-
aether = calculate_aether_score(pillar_scores)
|
| 951 |
-
csv_str = generate_csv(results, eval_model)
|
| 952 |
-
csv_path = f"/tmp/aether_eval_{run_id}.csv"
|
| 953 |
-
with open(csv_path, "w", encoding="utf-8") as f: f.write(csv_str)
|
| 954 |
-
hf_status = upload_to_hf(csv_str, eval_model)
|
| 955 |
-
yield (f"π μ λΆ μΊμ! AETHER Score: {aether:.1f}",
|
| 956 |
-
_build_progress_table(results, tasks),
|
| 957 |
-
_build_final_summary(results, tasks, pillar_scores, aether, eval_model, hf_status),
|
| 958 |
-
_build_detail_view(results, tasks), csv_path)
|
| 959 |
-
return
|
| 960 |
-
|
| 961 |
-
pillar_tasks = {}
|
| 962 |
-
for t in pending:
|
| 963 |
-
pillar_tasks.setdefault(t.pillar, []).append(t)
|
| 964 |
-
|
| 965 |
-
state = {
|
| 966 |
-
"lock": threading.Lock(),
|
| 967 |
-
"done": 0,
|
| 968 |
-
"active": [],
|
| 969 |
-
"errors": [],
|
| 970 |
-
"pillar_total": {p: len(ts) for p, ts in pillar_tasks.items()},
|
| 971 |
-
"pillar_done": {p: 0 for p in pillar_tasks},
|
| 972 |
-
"start_time": time.time(),
|
| 973 |
-
"parse_ok": 0,
|
| 974 |
-
"parse_fail": 0,
|
| 975 |
-
}
|
| 976 |
-
|
| 977 |
-
mode_tag = 'π Proto-AGI ON' if proto_agi else 'π€ λ¨μΌ LLM'
|
| 978 |
-
yield (CSS + f'<div style="background:{"#fff3e0" if proto_agi else "#e8f5e9"};padding:12px;border-radius:8px;">'
|
| 979 |
-
f'β‘ <b>λ³λ ¬ νκ° μμ!</b> {len(pending)}κ° Β· {n_workers}μ컀 Β· {mode_tag}</div>',
|
| 980 |
-
_build_progress_table(results, tasks), "", "", None)
|
| 981 |
-
|
| 982 |
-
# ββ β
ν΅μ¬: ThreadPoolExecutor + λΉ λ₯Έ yield (0.3μ΄ κ°κ²©) ββ
|
| 983 |
-
with ThreadPoolExecutor(max_workers=n_workers) as executor:
|
| 984 |
-
futures = {}
|
| 985 |
-
for task in pending:
|
| 986 |
-
fut = executor.submit(_eval_single_task, task, run_id, api_key,
|
| 987 |
-
eval_model, judge_model, state, proto_agi)
|
| 988 |
-
futures[fut] = task
|
| 989 |
-
|
| 990 |
-
completed = set()
|
| 991 |
-
last_yield = time.time()
|
| 992 |
-
|
| 993 |
-
while len(completed) < len(futures):
|
| 994 |
-
newly_done = []
|
| 995 |
-
for fut in futures:
|
| 996 |
-
if fut in completed: continue
|
| 997 |
-
if fut.done():
|
| 998 |
-
completed.add(fut)
|
| 999 |
-
newly_done.append(fut)
|
| 1000 |
-
|
| 1001 |
-
for fut in newly_done:
|
| 1002 |
-
try:
|
| 1003 |
-
tid, data = fut.result()
|
| 1004 |
-
results[tid] = data
|
| 1005 |
-
task_obj = futures[fut]
|
| 1006 |
-
with state["lock"]:
|
| 1007 |
-
state["pillar_done"][task_obj.pillar] = state["pillar_done"].get(task_obj.pillar, 0) + 1
|
| 1008 |
-
except Exception as e:
|
| 1009 |
-
with state["lock"]:
|
| 1010 |
-
state["errors"].append(str(e)[:60])
|
| 1011 |
-
|
| 1012 |
-
# β
ν΅μ¬ λ³κ²½: 0.3μ΄λ§λ€ yield (SSE heartbeat μν )
|
| 1013 |
-
now = time.time()
|
| 1014 |
-
if now - last_yield >= 0.3 or newly_done:
|
| 1015 |
-
last_yield = now
|
| 1016 |
-
with state["lock"]:
|
| 1017 |
-
done_now = cached + state["done"]
|
| 1018 |
-
pct = min(int(done_now / total * 100), 100)
|
| 1019 |
-
progress(done_now / total, desc=f"{done_now}/{total} ({pct}%)")
|
| 1020 |
-
prog_html = CSS + _parallel_progress_html(state, len(pending))
|
| 1021 |
|
| 1022 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1023 |
|
| 1024 |
-
if len(completed) < len(futures):
|
| 1025 |
-
time.sleep(0.2) # β
1.0μ΄β0.2μ΄ (λΉ λ₯Έ ν΄λ§)
|
| 1026 |
|
| 1027 |
-
|
| 1028 |
-
|
|
|
|
| 1029 |
|
| 1030 |
pillar_scores = {}
|
| 1031 |
for p in PILLAR_INFO:
|
| 1032 |
pt = [t for t in tasks if t.pillar == p and t.task_id in results]
|
| 1033 |
-
if pt:
|
|
|
|
| 1034 |
|
| 1035 |
aether = calculate_aether_score(pillar_scores)
|
| 1036 |
|
| 1037 |
csv_str = generate_csv(results, eval_model)
|
|
|
|
| 1038 |
csv_path = f"/tmp/aether_eval_{run_id}.csv"
|
| 1039 |
with open(csv_path, "w", encoding="utf-8") as f:
|
| 1040 |
f.write(csv_str)
|
| 1041 |
|
| 1042 |
hf_status = upload_to_hf(csv_str, eval_model)
|
| 1043 |
|
| 1044 |
-
|
| 1045 |
-
err_msg = f" (β οΈ {n_err}κ° μ€λ₯)" if n_err > 0 else ""
|
| 1046 |
-
restore_msg = f" (πΎ {cached}κ° λ³΅μ)" if cached > 0 else ""
|
| 1047 |
mode_str = "πProto-AGI" if proto_agi else "π€λ¨μΌLLM"
|
| 1048 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1049 |
|
| 1050 |
-
|
| 1051 |
-
|
| 1052 |
-
table = _build_progress_table(results, tasks)
|
| 1053 |
-
detail = _build_detail_view(results, tasks)
|
| 1054 |
|
| 1055 |
-
|
| 1056 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1057 |
|
| 1058 |
|
| 1059 |
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 1060 |
-
# PART 11: Gradio App β β
|
| 1061 |
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 1062 |
|
| 1063 |
PILLAR_CHOICES = ["μ 체"] + list(PILLAR_INFO.keys())
|
|
@@ -1065,12 +1207,13 @@ DIFF_CHOICES = ["μ 체", "basic", "intermediate", "advanced", "expert", "fronti
|
|
| 1065 |
|
| 1066 |
HEADER = """
|
| 1067 |
<div style="text-align:center;padding:16px 0;">
|
| 1068 |
-
<h1 style="margin:0;font-size:1.8em;">π AETHER-Bench v0.3.
|
| 1069 |
<h2 style="margin:4px 0;color:#555;font-size:1.1em;">LLM νκ° μμ€ν
+ Proto-AGI μ€ν λ©ν°μμ΄μ νΈ</h2>
|
| 1070 |
<p style="color:#888;font-size:0.9em;max-width:700px;margin:8px auto;">
|
| 1071 |
120 Tasks Β· 5 Pillars Β· 19 Sub-dimensions Β· HAR Metric<br>
|
| 1072 |
π <b>Proto-AGI</b>: ζ¨βη«βεβιβζ°΄ μ€ν νμ΄νλΌμΈ + λ§λ°©μ§ μν΅ λ§€νΈλ¦μ€<br>
|
| 1073 |
-
π€ <b>λ¨μΌ LLM</b>: μμ μν νκ° | CSV β HuggingFace PRIVATE κΈ°λ‘
|
|
|
|
| 1074 |
</p>
|
| 1075 |
<div style="display:flex;justify-content:center;gap:8px;margin-top:8px;flex-wrap:wrap;font-size:0.85em;">
|
| 1076 |
<span style="background:#e8f5e9;padding:2px 10px;border-radius:12px;">π³ ζ¨ λ°μ(δ»)</span>
|
|
@@ -1081,6 +1224,7 @@ HEADER = """
|
|
| 1081 |
</div>
|
| 1082 |
</div>"""
|
| 1083 |
|
|
|
|
| 1084 |
def create_app():
|
| 1085 |
with gr.Blocks(title="AETHER-Bench + Proto-AGI", theme=gr.themes.Soft(),
|
| 1086 |
css=".gradio-container{max-width:1100px !important}") as app:
|
|
@@ -1115,7 +1259,12 @@ def create_app():
|
|
| 1115 |
with gr.Row():
|
| 1116 |
start_btn = gr.Button("βΆοΈ νκ° μμ (μ΄μ΄νκΈ°)", variant="primary", size="lg", scale=2)
|
| 1117 |
fresh_btn = gr.Button("π μλ‘ μμ", variant="secondary", size="lg", scale=2)
|
| 1118 |
-
gr.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1119 |
|
| 1120 |
with gr.Tabs():
|
| 1121 |
with gr.Tab("π μ§ν"):
|
|
@@ -1129,29 +1278,36 @@ def create_app():
|
|
| 1129 |
with gr.Tab("πΎ CSV"):
|
| 1130 |
csv_file = gr.File(label="νκ° κ²°κ³Ό CSV")
|
| 1131 |
|
| 1132 |
-
|
| 1133 |
-
|
| 1134 |
-
|
| 1135 |
-
|
|
|
|
|
|
|
| 1136 |
|
|
|
|
| 1137 |
all_inputs = [api_key, eval_model, judge_model,
|
| 1138 |
proto_agi_toggle, pillar_dd, diff_dd, max_tasks, n_workers]
|
| 1139 |
|
| 1140 |
start_btn.click(
|
| 1141 |
-
fn=
|
| 1142 |
inputs=all_inputs,
|
| 1143 |
-
outputs=[
|
| 1144 |
)
|
| 1145 |
fresh_btn.click(
|
| 1146 |
-
fn=
|
| 1147 |
inputs=all_inputs,
|
| 1148 |
-
outputs=[
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1149 |
)
|
| 1150 |
|
| 1151 |
gr.Markdown("""---
|
| 1152 |
-
<center>AETHER-Bench v0.3.
|
| 1153 |
π Proto-AGI μ€ν νμ΄νλΌμΈ | Fireworks: <b>kimi-k2p5</b> (νΌνκ°) + <b>kimi-k2p5</b> (Judge)<br>
|
| 1154 |
-
<code>HF_TOKEN</code> μ€μ μ PRIVATE μλ κΈ°λ‘</center>""")
|
| 1155 |
return app
|
| 1156 |
|
| 1157 |
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
|
@@ -1162,17 +1318,15 @@ if __name__ == "__main__":
|
|
| 1162 |
stats = {}
|
| 1163 |
for t in ALL_TASKS:
|
| 1164 |
stats[t.pillar] = stats.get(t.pillar, 0) + 1
|
| 1165 |
-
print(f"AETHER-Bench v0.3.
|
| 1166 |
print(f" Proto-AGI: ζ¨_λ°μβη«_ννβε_ν΅ν©βι_μ¬νβζ°΄_μ±μ°° (5 agents)")
|
|
|
|
| 1167 |
for p, n in stats.items():
|
| 1168 |
info = PILLAR_INFO[p]
|
| 1169 |
print(f" {info['icon']} {info['name']}: {n} ({int(info['weight']*100)}%)")
|
| 1170 |
|
| 1171 |
app = create_app()
|
| 1172 |
-
|
| 1173 |
-
app.queue(
|
| 1174 |
-
default_concurrency_limit=1, # λμ μ¬μ©μ 1λͺ
μ©
|
| 1175 |
-
)
|
| 1176 |
app.launch(
|
| 1177 |
server_name="0.0.0.0",
|
| 1178 |
server_port=7860,
|
|
|
|
| 908 |
return out
|
| 909 |
|
| 910 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 911 |
|
| 912 |
+
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 913 |
+
# PART 10-B: λ°±κ·ΈλΌμ΄λ νκ° μμ§ (μΈμ
νμμμ λ°©μ§)
|
| 914 |
+
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 915 |
+
# β
ν΅μ¬ λ³κ²½: generator(yield) β λ°±κ·ΈλΌμ΄λ μ€λ λ + Timer ν΄λ§
|
| 916 |
+
# - λ²νΌ ν΄λ¦ β μ¦μ λ¦¬ν΄ (SSE λκΉ μμ)
|
| 917 |
+
# - gr.Timer(2μ΄) β μν ν΄λ§ β UI κ°±μ (κ° ν΄λ§μ λ
립 μμ²)
|
| 918 |
+
# - νμ΄μ§ μλ‘κ³ μΉ¨ β Timer μλ μ¬κ° β μ§ν μν© μ¦μ νμ
|
| 919 |
+
# - DB 체ν¬ν¬μΈνΈ β μ΄λ€ μν©μμλ μ΄μ΄νκΈ° κ°λ₯
|
| 920 |
+
|
| 921 |
+
_EVAL_STATE = {
|
| 922 |
+
"running": False,
|
| 923 |
+
"stop_requested": False,
|
| 924 |
+
"finished": False,
|
| 925 |
+
"run_id": "",
|
| 926 |
+
"model": "",
|
| 927 |
+
"proto_agi": False,
|
| 928 |
+
"done": 0,
|
| 929 |
+
"total": 0,
|
| 930 |
+
"cached": 0,
|
| 931 |
+
"pending_count": 0,
|
| 932 |
+
"errors": [],
|
| 933 |
+
"active": [],
|
| 934 |
+
"parse_ok": 0,
|
| 935 |
+
"parse_fail": 0,
|
| 936 |
+
"start_time": 0,
|
| 937 |
+
"results": {},
|
| 938 |
+
"tasks": [],
|
| 939 |
+
"pillar_done": {},
|
| 940 |
+
"pillar_total": {},
|
| 941 |
+
"n_workers": 5,
|
| 942 |
+
"lock": threading.Lock(),
|
| 943 |
+
"message": "",
|
| 944 |
+
"csv_path": None,
|
| 945 |
+
"hf_status": "",
|
| 946 |
+
}
|
| 947 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 948 |
|
| 949 |
+
def _reset_eval_state():
|
| 950 |
+
"""νκ° μν μ΄κΈ°ν"""
|
| 951 |
+
global _EVAL_STATE
|
| 952 |
+
with _EVAL_STATE["lock"]:
|
| 953 |
+
_EVAL_STATE.update({
|
| 954 |
+
"running": False,
|
| 955 |
+
"stop_requested": False,
|
| 956 |
+
"finished": False,
|
| 957 |
+
"done": 0,
|
| 958 |
+
"cached": 0,
|
| 959 |
+
"pending_count": 0,
|
| 960 |
+
"errors": [],
|
| 961 |
+
"active": [],
|
| 962 |
+
"parse_ok": 0,
|
| 963 |
+
"parse_fail": 0,
|
| 964 |
+
"start_time": 0,
|
| 965 |
+
"results": {},
|
| 966 |
+
"tasks": [],
|
| 967 |
+
"pillar_done": {},
|
| 968 |
+
"pillar_total": {},
|
| 969 |
+
"message": "",
|
| 970 |
+
"csv_path": None,
|
| 971 |
+
"hf_status": "",
|
| 972 |
+
})
|
| 973 |
+
|
| 974 |
+
|
| 975 |
+
def _bg_evaluate(api_key, eval_model, judge_model, tasks, run_id,
|
| 976 |
+
n_workers, proto_agi):
|
| 977 |
+
"""λ°±κ·ΈλΌμ΄λ μ€λ λ: νκ° μ€ν β _EVAL_STATE μ
λ°μ΄νΈ β DB μ μ₯"""
|
| 978 |
+
global _EVAL_STATE
|
| 979 |
|
| 980 |
+
try:
|
| 981 |
+
results = dict(_load_all(run_id))
|
| 982 |
+
cached = sum(1 for t in tasks if t.task_id in results)
|
| 983 |
+
pending = [t for t in tasks if t.task_id not in results]
|
| 984 |
+
|
| 985 |
+
pillar_tasks = {}
|
| 986 |
+
for t in pending:
|
| 987 |
+
pillar_tasks.setdefault(t.pillar, []).append(t)
|
| 988 |
+
|
| 989 |
+
with _EVAL_STATE["lock"]:
|
| 990 |
+
_EVAL_STATE["results"] = results
|
| 991 |
+
_EVAL_STATE["cached"] = cached
|
| 992 |
+
_EVAL_STATE["pending_count"] = len(pending)
|
| 993 |
+
_EVAL_STATE["total"] = len(tasks)
|
| 994 |
+
_EVAL_STATE["pillar_total"] = {p: len(ts) for p, ts in pillar_tasks.items()}
|
| 995 |
+
_EVAL_STATE["pillar_done"] = {p: 0 for p in pillar_tasks}
|
| 996 |
+
_EVAL_STATE["start_time"] = time.time()
|
| 997 |
+
|
| 998 |
+
if not pending:
|
| 999 |
+
with _EVAL_STATE["lock"]:
|
| 1000 |
+
_EVAL_STATE["message"] = f"πΎ μ λΆ μΊμ μλ£! ({cached}κ°)"
|
| 1001 |
+
_finalize_results(tasks, results, eval_model, proto_agi)
|
| 1002 |
+
return
|
| 1003 |
+
|
| 1004 |
+
with _EVAL_STATE["lock"]:
|
| 1005 |
+
_EVAL_STATE["message"] = f"β‘ μμ! {len(pending)}κ° κ³Όμ Β· {n_workers}μ컀"
|
| 1006 |
+
|
| 1007 |
+
# ββ ThreadPoolExecutor ββ
|
| 1008 |
+
with ThreadPoolExecutor(max_workers=n_workers) as executor:
|
| 1009 |
+
futures = {}
|
| 1010 |
+
for task in pending:
|
| 1011 |
+
if _EVAL_STATE["stop_requested"]:
|
| 1012 |
+
break
|
| 1013 |
+
fut = executor.submit(_eval_single_task, task, run_id, api_key,
|
| 1014 |
+
eval_model, judge_model, _EVAL_STATE, proto_agi)
|
| 1015 |
+
futures[fut] = task
|
| 1016 |
+
|
| 1017 |
+
completed = set()
|
| 1018 |
+
while len(completed) < len(futures):
|
| 1019 |
+
if _EVAL_STATE["stop_requested"]:
|
| 1020 |
+
executor.shutdown(wait=False, cancel_futures=True)
|
| 1021 |
+
with _EVAL_STATE["lock"]:
|
| 1022 |
+
_EVAL_STATE["message"] = "βΉοΈ μ€λ¨λ¨ (DBμ μ μ₯λ κ²°κ³Όλ 보쑴)"
|
| 1023 |
+
_EVAL_STATE["running"] = False
|
| 1024 |
+
_EVAL_STATE["finished"] = True
|
| 1025 |
+
return
|
| 1026 |
+
|
| 1027 |
+
for fut in list(futures):
|
| 1028 |
+
if fut in completed:
|
| 1029 |
+
continue
|
| 1030 |
+
if fut.done():
|
| 1031 |
+
completed.add(fut)
|
| 1032 |
+
try:
|
| 1033 |
+
tid, data = fut.result()
|
| 1034 |
+
with _EVAL_STATE["lock"]:
|
| 1035 |
+
_EVAL_STATE["results"][tid] = data
|
| 1036 |
+
task_obj = futures[fut]
|
| 1037 |
+
_EVAL_STATE["pillar_done"][task_obj.pillar] = \
|
| 1038 |
+
_EVAL_STATE["pillar_done"].get(task_obj.pillar, 0) + 1
|
| 1039 |
+
except Exception as e:
|
| 1040 |
+
with _EVAL_STATE["lock"]:
|
| 1041 |
+
_EVAL_STATE["errors"].append(str(e)[:60])
|
| 1042 |
|
| 1043 |
+
time.sleep(0.5)
|
|
|
|
|
|
|
| 1044 |
|
| 1045 |
+
# ββ μλ£ β κ²°κ³Ό μ§κ³ ββ
|
| 1046 |
+
with _EVAL_STATE["lock"]:
|
| 1047 |
+
results = dict(_EVAL_STATE["results"])
|
| 1048 |
+
_finalize_results(tasks, results, eval_model, proto_agi)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1049 |
|
| 1050 |
+
except Exception as e:
|
| 1051 |
+
with _EVAL_STATE["lock"]:
|
| 1052 |
+
_EVAL_STATE["message"] = f"β μΉλͺ
μ μ€λ₯: {str(e)[:100]}"
|
| 1053 |
+
_EVAL_STATE["running"] = False
|
| 1054 |
+
_EVAL_STATE["finished"] = True
|
| 1055 |
|
|
|
|
|
|
|
| 1056 |
|
| 1057 |
+
def _finalize_results(tasks, results, eval_model, proto_agi):
|
| 1058 |
+
"""μ΅μ’
κ²°κ³Ό μ§κ³ + CSV + HF μ
λ‘λ"""
|
| 1059 |
+
global _EVAL_STATE
|
| 1060 |
|
| 1061 |
pillar_scores = {}
|
| 1062 |
for p in PILLAR_INFO:
|
| 1063 |
pt = [t for t in tasks if t.pillar == p and t.task_id in results]
|
| 1064 |
+
if pt:
|
| 1065 |
+
pillar_scores[p] = np.mean([results[t.task_id]["score"] for t in pt])
|
| 1066 |
|
| 1067 |
aether = calculate_aether_score(pillar_scores)
|
| 1068 |
|
| 1069 |
csv_str = generate_csv(results, eval_model)
|
| 1070 |
+
run_id = _EVAL_STATE["run_id"]
|
| 1071 |
csv_path = f"/tmp/aether_eval_{run_id}.csv"
|
| 1072 |
with open(csv_path, "w", encoding="utf-8") as f:
|
| 1073 |
f.write(csv_str)
|
| 1074 |
|
| 1075 |
hf_status = upload_to_hf(csv_str, eval_model)
|
| 1076 |
|
| 1077 |
+
elapsed = int(time.time() - _EVAL_STATE["start_time"]) if _EVAL_STATE["start_time"] else 0
|
|
|
|
|
|
|
| 1078 |
mode_str = "πProto-AGI" if proto_agi else "π€λ¨μΌLLM"
|
| 1079 |
+
cached = _EVAL_STATE["cached"]
|
| 1080 |
+
n_err = len(_EVAL_STATE["errors"])
|
| 1081 |
+
err_msg = f" (β οΈ {n_err}κ° μ€λ₯)" if n_err else ""
|
| 1082 |
+
restore_msg = f" (πΎ {cached}κ° λ³΅μ)" if cached else ""
|
| 1083 |
+
|
| 1084 |
+
with _EVAL_STATE["lock"]:
|
| 1085 |
+
_EVAL_STATE["csv_path"] = csv_path
|
| 1086 |
+
_EVAL_STATE["hf_status"] = hf_status
|
| 1087 |
+
_EVAL_STATE["message"] = f"π μλ£! {mode_str}{restore_msg}{err_msg} AETHER={aether:.1f} ({elapsed}μ΄)"
|
| 1088 |
+
_EVAL_STATE["running"] = False
|
| 1089 |
+
_EVAL_STATE["finished"] = True
|
| 1090 |
+
|
| 1091 |
+
|
| 1092 |
+
def _start_eval(api_key, eval_model, judge_model, proto_agi,
|
| 1093 |
+
pillar_filter, diff_filter, max_tasks, n_workers, fresh_start):
|
| 1094 |
+
"""λ²νΌ ν΄λ¦ νΈλ€λ¬ β μ¦μ λ¦¬ν΄ (λ°±κ·ΈλΌμ΄λ μ€λ λ μμ)"""
|
| 1095 |
+
global _EVAL_STATE
|
| 1096 |
|
| 1097 |
+
if _EVAL_STATE["running"]:
|
| 1098 |
+
return "β οΈ μ΄λ―Έ νκ°κ° μ§ν μ€μ
λλ€. μ€λ¨ ν λ€μ μμνμΈμ."
|
|
|
|
|
|
|
| 1099 |
|
| 1100 |
+
api_key = (api_key or "").strip() or os.getenv("FIREWORKS_API_KEY", "")
|
| 1101 |
+
if not api_key:
|
| 1102 |
+
return "β Fireworks API Keyλ₯Ό μ
λ ₯νμΈμ."
|
| 1103 |
+
|
| 1104 |
+
n_workers = int(n_workers)
|
| 1105 |
+
if proto_agi and n_workers > 5:
|
| 1106 |
+
n_workers = 5
|
| 1107 |
+
|
| 1108 |
+
tasks = ALL_TASKS[:]
|
| 1109 |
+
if pillar_filter != "μ 체":
|
| 1110 |
+
tasks = [t for t in tasks if t.pillar == pillar_filter]
|
| 1111 |
+
if diff_filter != "μ 체":
|
| 1112 |
+
tasks = [t for t in tasks if t.difficulty == diff_filter]
|
| 1113 |
+
tasks = tasks[:int(max_tasks)]
|
| 1114 |
+
|
| 1115 |
+
mode_suffix = "_PAGI" if proto_agi else ""
|
| 1116 |
+
run_id = _make_run_id(eval_model + mode_suffix)
|
| 1117 |
+
|
| 1118 |
+
if fresh_start:
|
| 1119 |
+
_clear_run(run_id)
|
| 1120 |
+
|
| 1121 |
+
_reset_eval_state()
|
| 1122 |
+
with _EVAL_STATE["lock"]:
|
| 1123 |
+
_EVAL_STATE["running"] = True
|
| 1124 |
+
_EVAL_STATE["run_id"] = run_id
|
| 1125 |
+
_EVAL_STATE["model"] = eval_model
|
| 1126 |
+
_EVAL_STATE["proto_agi"] = proto_agi
|
| 1127 |
+
_EVAL_STATE["tasks"] = tasks
|
| 1128 |
+
_EVAL_STATE["total"] = len(tasks)
|
| 1129 |
+
_EVAL_STATE["n_workers"] = n_workers
|
| 1130 |
+
_EVAL_STATE["message"] = "π νκ° μ€λΉ μ€..."
|
| 1131 |
+
|
| 1132 |
+
thread = threading.Thread(
|
| 1133 |
+
target=_bg_evaluate,
|
| 1134 |
+
args=(api_key, eval_model, judge_model, tasks, run_id, n_workers, proto_agi),
|
| 1135 |
+
daemon=True,
|
| 1136 |
+
)
|
| 1137 |
+
thread.start()
|
| 1138 |
+
|
| 1139 |
+
mode_tag = 'π Proto-AGI' if proto_agi else 'π€ λ¨μΌLLM'
|
| 1140 |
+
return f"β‘ {mode_tag} νκ° μμ! ({len(tasks)}κ° κ³Όμ , {n_workers}μ컀)"
|
| 1141 |
+
|
| 1142 |
+
|
| 1143 |
+
def _stop_eval():
|
| 1144 |
+
"""μ€λ¨ λ²νΌ νΈλ€λ¬"""
|
| 1145 |
+
global _EVAL_STATE
|
| 1146 |
+
if _EVAL_STATE["running"]:
|
| 1147 |
+
_EVAL_STATE["stop_requested"] = True
|
| 1148 |
+
return "βΉοΈ μ€λ¨ μμ²λ¨... (νμ¬ μ§ν μ€μΈ κ³Όμ μλ£ ν μ€λ¨)"
|
| 1149 |
+
return "βΉοΈ μ€ν μ€μΈ νκ°κ° μμ΅λλ€."
|
| 1150 |
+
|
| 1151 |
+
|
| 1152 |
+
def _poll_status():
|
| 1153 |
+
"""Timer μ½λ°± β 2μ΄λ§λ€ νΈμΆ β UI μ 체 κ°±μ """
|
| 1154 |
+
global _EVAL_STATE
|
| 1155 |
+
|
| 1156 |
+
with _EVAL_STATE["lock"]:
|
| 1157 |
+
running = _EVAL_STATE["running"]
|
| 1158 |
+
finished = _EVAL_STATE["finished"]
|
| 1159 |
+
tasks = _EVAL_STATE.get("tasks", [])
|
| 1160 |
+
results = dict(_EVAL_STATE.get("results", {}))
|
| 1161 |
+
message = _EVAL_STATE.get("message", "")
|
| 1162 |
+
csv_path = _EVAL_STATE.get("csv_path")
|
| 1163 |
+
|
| 1164 |
+
# μ무κ²λ μ νκ³ μμΌλ©΄ μ΅μ UI
|
| 1165 |
+
if not running and not finished and not results:
|
| 1166 |
+
return ("βΉοΈ βΆοΈ νκ° μμ λλ π μλ‘ μμμ λλ¬μ£ΌμΈμ.",
|
| 1167 |
+
"", "", "", None)
|
| 1168 |
+
|
| 1169 |
+
# μ§ν μ€μ΄κ±°λ μλ£
|
| 1170 |
+
if running:
|
| 1171 |
+
prog_html = CSS + _parallel_progress_html(_EVAL_STATE, _EVAL_STATE.get("pending_count", 0))
|
| 1172 |
+
elif finished:
|
| 1173 |
+
prog_html = f'<div style="background:#e8f5e9;padding:12px;border-radius:8px;">{message}</div>'
|
| 1174 |
+
else:
|
| 1175 |
+
prog_html = message
|
| 1176 |
+
|
| 1177 |
+
table_html = _build_progress_table(results, tasks) if tasks else ""
|
| 1178 |
+
|
| 1179 |
+
summary_html = ""
|
| 1180 |
+
detail_html = ""
|
| 1181 |
+
csv_out = None
|
| 1182 |
+
|
| 1183 |
+
if finished and tasks:
|
| 1184 |
+
pillar_scores = {}
|
| 1185 |
+
for p in PILLAR_INFO:
|
| 1186 |
+
pt = [t for t in tasks if t.pillar == p and t.task_id in results]
|
| 1187 |
+
if pt:
|
| 1188 |
+
pillar_scores[p] = np.mean([results[t.task_id]["score"] for t in pt])
|
| 1189 |
+
aether = calculate_aether_score(pillar_scores)
|
| 1190 |
+
|
| 1191 |
+
display_model = f'{_EVAL_STATE.get("model", "?")} [{"πProto-AGI" if _EVAL_STATE.get("proto_agi") else "π€λ¨μΌLLM"}]'
|
| 1192 |
+
hf_status = _EVAL_STATE.get("hf_status", "")
|
| 1193 |
+
summary_html = _build_final_summary(results, tasks, pillar_scores, aether,
|
| 1194 |
+
display_model, hf_status)
|
| 1195 |
+
detail_html = _build_detail_view(results, tasks)
|
| 1196 |
+
csv_out = csv_path
|
| 1197 |
+
|
| 1198 |
+
return (prog_html, table_html, summary_html, detail_html, csv_out)
|
| 1199 |
|
| 1200 |
|
| 1201 |
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 1202 |
+
# PART 11: Gradio App β β
Timer ν΄λ§ κΈ°λ° (μΈμ
λκΉ μμ λ°©μ§)
|
| 1203 |
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 1204 |
|
| 1205 |
PILLAR_CHOICES = ["μ 체"] + list(PILLAR_INFO.keys())
|
|
|
|
| 1207 |
|
| 1208 |
HEADER = """
|
| 1209 |
<div style="text-align:center;padding:16px 0;">
|
| 1210 |
+
<h1 style="margin:0;font-size:1.8em;">π AETHER-Bench v0.3.3</h1>
|
| 1211 |
<h2 style="margin:4px 0;color:#555;font-size:1.1em;">LLM νκ° μμ€ν
+ Proto-AGI μ€ν λ©ν°μμ΄μ νΈ</h2>
|
| 1212 |
<p style="color:#888;font-size:0.9em;max-width:700px;margin:8px auto;">
|
| 1213 |
120 Tasks Β· 5 Pillars Β· 19 Sub-dimensions Β· HAR Metric<br>
|
| 1214 |
π <b>Proto-AGI</b>: ζ¨βη«βεβιβζ°΄ μ€ν νμ΄νλΌμΈ + λ§λ°©μ§ μν΅ λ§€νΈλ¦μ€<br>
|
| 1215 |
+
π€ <b>λ¨μΌ LLM</b>: μμ μν νκ° | CSV β HuggingFace PRIVATE κΈ°λ‘<br>
|
| 1216 |
+
β‘ <b>v0.3.3</b>: λ°±κ·ΈλΌμ΄λ μ€ν β μΈμ
λκΉ/μλ‘κ³ μΉ¨ μμλ νκ° κ³μ μ§ν
|
| 1217 |
</p>
|
| 1218 |
<div style="display:flex;justify-content:center;gap:8px;margin-top:8px;flex-wrap:wrap;font-size:0.85em;">
|
| 1219 |
<span style="background:#e8f5e9;padding:2px 10px;border-radius:12px;">π³ ζ¨ λ°μ(δ»)</span>
|
|
|
|
| 1224 |
</div>
|
| 1225 |
</div>"""
|
| 1226 |
|
| 1227 |
+
|
| 1228 |
def create_app():
|
| 1229 |
with gr.Blocks(title="AETHER-Bench + Proto-AGI", theme=gr.themes.Soft(),
|
| 1230 |
css=".gradio-container{max-width:1100px !important}") as app:
|
|
|
|
| 1259 |
with gr.Row():
|
| 1260 |
start_btn = gr.Button("βΆοΈ νκ° μμ (μ΄μ΄νκΈ°)", variant="primary", size="lg", scale=2)
|
| 1261 |
fresh_btn = gr.Button("π μλ‘ μμ", variant="secondary", size="lg", scale=2)
|
| 1262 |
+
stop_btn = gr.Button("βΉοΈ μ€λ¨", variant="stop", size="lg", scale=1)
|
| 1263 |
+
gr.HTML('''<p style="color:#888;font-size:0.8em;margin:auto 0;">
|
| 1264 |
+
β‘ λ°±κ·ΈλΌμ΄λ μ€ν β νμ΄μ§ μλ‘κ³ μΉ¨ν΄λ νκ° κ³μ μ§ν<br>
|
| 1265 |
+
βΆοΈ μ€λ¨μ μ΄μ΄μ | π μ΄κΈ°νν μ¬μμ | βΉοΈ κΈ΄κΈ μ€λ¨</p>''')
|
| 1266 |
+
|
| 1267 |
+
status_msg = gr.Textbox(label="μν", interactive=False, max_lines=1)
|
| 1268 |
|
| 1269 |
with gr.Tabs():
|
| 1270 |
with gr.Tab("π μ§ν"):
|
|
|
|
| 1278 |
with gr.Tab("πΎ CSV"):
|
| 1279 |
csv_file = gr.File(label="νκ° κ²°κ³Ό CSV")
|
| 1280 |
|
| 1281 |
+
# ββ Timer: 2μ΄λ§λ€ UI κ°±μ (SSE λκΉκ³Ό 무κ΄) ββ
|
| 1282 |
+
timer = gr.Timer(value=2, active=True)
|
| 1283 |
+
timer.tick(
|
| 1284 |
+
fn=_poll_status,
|
| 1285 |
+
outputs=[progress_html, table_html, summary_html, detail_html, csv_file],
|
| 1286 |
+
)
|
| 1287 |
|
| 1288 |
+
# ββ λ²νΌ: μ¦μ λ¦¬ν΄ (generator μλ!) ββ
|
| 1289 |
all_inputs = [api_key, eval_model, judge_model,
|
| 1290 |
proto_agi_toggle, pillar_dd, diff_dd, max_tasks, n_workers]
|
| 1291 |
|
| 1292 |
start_btn.click(
|
| 1293 |
+
fn=lambda *args: _start_eval(*args, fresh_start=False),
|
| 1294 |
inputs=all_inputs,
|
| 1295 |
+
outputs=[status_msg],
|
| 1296 |
)
|
| 1297 |
fresh_btn.click(
|
| 1298 |
+
fn=lambda *args: _start_eval(*args, fresh_start=True),
|
| 1299 |
inputs=all_inputs,
|
| 1300 |
+
outputs=[status_msg],
|
| 1301 |
+
)
|
| 1302 |
+
stop_btn.click(
|
| 1303 |
+
fn=_stop_eval,
|
| 1304 |
+
outputs=[status_msg],
|
| 1305 |
)
|
| 1306 |
|
| 1307 |
gr.Markdown("""---
|
| 1308 |
+
<center>AETHER-Bench v0.3.3 Β· Apache 2.0 Β· Ginigen AI (μ§λμ AI)<br>
|
| 1309 |
π Proto-AGI μ€ν νμ΄νλΌμΈ | Fireworks: <b>kimi-k2p5</b> (νΌνκ°) + <b>kimi-k2p5</b> (Judge)<br>
|
| 1310 |
+
β‘ λ°±κ·ΈλΌμ΄λ μ€ν β μΈμ
λκΉ μμ λ°©μ§ | <code>HF_TOKEN</code> μ€μ μ PRIVATE μλ κΈ°λ‘</center>""")
|
| 1311 |
return app
|
| 1312 |
|
| 1313 |
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
|
|
|
| 1318 |
stats = {}
|
| 1319 |
for t in ALL_TASKS:
|
| 1320 |
stats[t.pillar] = stats.get(t.pillar, 0) + 1
|
| 1321 |
+
print(f"AETHER-Bench v0.3.3 + Proto-AGI: {len(ALL_TASKS)} tasks loaded")
|
| 1322 |
print(f" Proto-AGI: ζ¨_λ°μβη«_ννβε_ν΅ν©βι_μ¬νβζ°΄_μ±μ°° (5 agents)")
|
| 1323 |
+
print(f" β
Background thread + Timer polling (session-safe)")
|
| 1324 |
for p, n in stats.items():
|
| 1325 |
info = PILLAR_INFO[p]
|
| 1326 |
print(f" {info['icon']} {info['name']}: {n} ({int(info['weight']*100)}%)")
|
| 1327 |
|
| 1328 |
app = create_app()
|
| 1329 |
+
app.queue(default_concurrency_limit=2)
|
|
|
|
|
|
|
|
|
|
| 1330 |
app.launch(
|
| 1331 |
server_name="0.0.0.0",
|
| 1332 |
server_port=7860,
|