seawolf2357 commited on
Commit
6dd5ad1
·
verified Β·
1 Parent(s): 5632e9e

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +197 -47
app.py CHANGED
@@ -1,10 +1,17 @@
1
  """
2
- AETHER-Bench v0.3.0 — LLM 평가 시스템 + Proto-AGI 오행 멀티에이전트
3
  =====================================================================
4
  120개 과제 × Proto-AGI(木→火→土→金→水) or 단일LLM 평가
5
  마방진 소통 매트릭스 + 상생·상극 + 水 메타 재검토
6
  평가 → Judge 채점 → CSV → HuggingFace PRIVATE 데이터셋
7
 
 
 
 
 
 
 
 
8
  Author: Ginigen AI (지니젠 AI) — Choi Sunyoung
9
  License: Apache 2.0
10
  """
@@ -98,6 +105,7 @@ def load_tasks_from_parquet(path="full.parquet"):
98
  return tasks
99
 
100
  ALL_TASKS = load_tasks_from_parquet()
 
101
  # ════════════════════════════════════════════════════════════════
102
  # PART 4: Fireworks API 호좜
103
  # ════════════════════════════════════════════════════════════════
@@ -139,6 +147,131 @@ def call_llm(prompt, system="", api_key="", model="accounts/fireworks/models/kim
139
  else:
140
  return f"[API_ERROR] {e}"
141
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
142
  # ════════════════════════════════════════════════════════════════
143
  # PART 4-B: 닀쀑 λΌμš΄λ“œ μ‹€ν–‰κΈ° (mutual_verification, feedback_incorporation)
144
  # ════════════════════════════════════════════════════════════════
@@ -371,8 +504,14 @@ def _execute_task(task, api_key, eval_model, proto_agi=False):
371
  # PART 5: LLM-as-Judge 채점
372
  # ════════════════════════════════════════════════════════════════
373
 
374
- JUDGE_SYSTEM = """You are an AETHER-Bench scoring judge. Score each rubric item 0.0~1.0 (0.25 increments).
375
- CRITICAL: Output ONLY a single JSON object. No explanation, no markdown, no code fences.
 
 
 
 
 
 
376
 
377
  The response may come from a Proto-AGI multi-agent pipeline with 5 agents:
378
  木(Ideation) β†’ 火(Expression) β†’ 土(Integration) β†’ 金(Judgment) β†’ ζ°΄(Reflection)
@@ -383,14 +522,8 @@ If you see agent markers (木_λ°œμƒ, 火_ν‘œν˜„, 土_톡합, 金_μ‹¬νŒ, ζ°΄_
383
  - Do NOT penalize for multi-agent format; judge the substance and final answer quality.
384
  If the response is a single direct answer (no agent markers), evaluate it as-is.
385
 
386
- Example output format:
387
- {"scores": {"item_a": 0.75, "item_b": 0.5, "item_c": 1.0}, "comment": "Good analysis but weak on X"}
388
-
389
- Rules:
390
- - Every rubric key MUST appear in scores
391
- - Values: 0.0, 0.25, 0.5, 0.75, or 1.0 only
392
- - comment: 1 sentence summary in Korean
393
- - Output NOTHING else before or after the JSON"""
394
 
395
  def build_judge_prompt(task, response):
396
  rubric = task.scoring_rubric
@@ -759,7 +892,7 @@ def _build_detail_view(results, tasks):
759
  from concurrent.futures import ThreadPoolExecutor, as_completed
760
 
761
  def _eval_single_task(task, run_id, api_key, eval_model, judge_model, state, proto_agi=False):
762
- """단일 과제 평가 β€” β˜… Judge νŒŒμ‹± μ‹€νŒ¨ μ‹œ μž¬μ‹œλ„ (μ΅œλŒ€ 2회)"""
763
  try:
764
  model_response = _execute_task(task, api_key, eval_model, proto_agi=proto_agi)
765
 
@@ -773,31 +906,48 @@ def _eval_single_task(task, run_id, api_key, eval_model, judge_model, state, pro
773
  rubric_keys = list(task.scoring_rubric.keys())
774
  judge_data = None
775
 
776
- for judge_attempt in range(2):
777
- judge_prompt = build_judge_prompt(task, model_response)
778
- if judge_attempt > 0:
779
- judge_prompt += "\n\nIMPORTANT: Your previous response was not valid JSON. Output ONLY the JSON object, nothing else."
780
-
781
- judge_raw = call_llm(
782
- judge_prompt, system=JUDGE_SYSTEM, api_key=api_key,
783
- model=judge_model,
784
- temperature=0.1 if judge_attempt > 0 else 0.3,
785
- max_tokens=512,
786
- strip_think=True,
787
- )
788
 
789
- judge_data = parse_judge_response(judge_raw, rubric_keys)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
790
 
791
- if judge_data["comment"] != "νŒŒμ‹±μ‹€νŒ¨":
792
  with state["lock"]:
793
- state["parse_ok"] += 1
794
- break
795
- if judge_attempt < 1:
796
- time.sleep(0.5)
797
-
798
- if judge_data["comment"] == "νŒŒμ‹±μ‹€νŒ¨":
799
- with state["lock"]:
800
- state["parse_fail"] += 1
801
 
802
  weighted = compute_weighted_score(judge_data["scores"], task.scoring_rubric)
803
  judge_json = json.dumps(judge_data, ensure_ascii=False)
@@ -871,7 +1021,7 @@ def _parallel_progress_html(state, total):
871
  if p_total > 0:
872
  p_rate = p_ok / p_total * 100
873
  p_color = "#4caf50" if p_rate >= 90 else ("#ff9800" if p_rate >= 70 else "#f44336")
874
- out += f'<div style="margin-top:6px;font-size:0.82em;">🎯 Judge νŒŒμ‹±: <b style="color:{p_color}">{p_ok}/{p_total} ({p_rate:.0f}%)</b> 성곡</div>'
875
 
876
  out += '</div>'
877
  return out
@@ -1176,13 +1326,13 @@ DIFF_CHOICES = ["전체", "basic", "intermediate", "advanced", "expert", "fronti
1176
 
1177
  HEADER = """
1178
  <div style="text-align:center;padding:16px 0;">
1179
- <h1 style="margin:0;font-size:1.8em;">πŸŒ€ AETHER-Bench v0.3.3</h1>
1180
  <h2 style="margin:4px 0;color:#555;font-size:1.1em;">LLM 평가 μ‹œμŠ€ν…œ + Proto-AGI μ˜€ν–‰ λ©€ν‹°μ—μ΄μ „νŠΈ</h2>
1181
  <p style="color:#888;font-size:0.9em;max-width:700px;margin:8px auto;">
1182
  120 Tasks Β· 5 Pillars Β· 19 Sub-dimensions Β· HAR Metric<br>
1183
  🌟 <b>Proto-AGI</b>: ζœ¨β†’η«β†’εœŸβ†’ι‡‘β†’ζ°΄ μ˜€ν–‰ νŒŒμ΄ν”„λΌμΈ + λ§ˆλ°©μ§„ μ†Œν†΅ 맀트릭슀<br>
1184
  πŸ€– <b>단일 LLM</b>: 순수 μ‹œν—˜ 평가 | CSV β†’ HuggingFace PRIVATE 기둝<br>
1185
- ⚑ <b>v0.3.3</b>: λ°±κ·ΈλΌμš΄λ“œ μ‹€ν–‰ β€” μ„Έμ…˜ λŠκΉ€/μƒˆλ‘œκ³ μΉ¨ μ‹œμ—λ„ 평가 계속 μ§„ν–‰
1186
  </p>
1187
  <div style="display:flex;justify-content:center;gap:8px;margin-top:8px;flex-wrap:wrap;font-size:0.85em;">
1188
  <span style="background:#e8f5e9;padding:2px 10px;border-radius:12px;">🌳 木 λ°œμƒ(仁)</span>
@@ -1195,7 +1345,7 @@ HEADER = """
1195
 
1196
 
1197
  def create_app():
1198
- with gr.Blocks(title="AETHER-Bench + Proto-AGI", theme=gr.themes.Soft(),
1199
  css=".gradio-container{max-width:1100px !important}") as app:
1200
  gr.HTML(HEADER)
1201
 
@@ -1205,9 +1355,9 @@ def create_app():
1205
 
1206
  with gr.Row():
1207
  eval_model = gr.Textbox(label="πŸ€– 피평가 λͺ¨λΈ",
1208
- value="accounts/fireworks/models/kimi-k2p5", scale=3)
1209
- judge_model = gr.Textbox(label="βš–οΈ μ‹¬νŒ λͺ¨λΈ",
1210
- value="accounts/fireworks/models/kimi-k2p5", scale=3)
1211
 
1212
  with gr.Row():
1213
  proto_agi_toggle = gr.Checkbox(
@@ -1274,9 +1424,9 @@ def create_app():
1274
  )
1275
 
1276
  gr.Markdown("""---
1277
- <center>AETHER-Bench v0.3.3 Β· Apache 2.0 Β· Ginigen AI (μ§€λ‹ˆμ  AI)<br>
1278
- 🌟 Proto-AGI μ˜€ν–‰ νŒŒμ΄ν”„λΌμΈ | Fireworks: <b>kimi-k2p5</b> (피평가) + <b>kimi-k2p5</b> (Judge)<br>
1279
- ⚑ λ°±κ·ΈλΌμš΄λ“œ μ‹€ν–‰ β€” μ„Έμ…˜ λŠκΉ€ μ™„μ „ λ°©μ§€ | <code>HF_TOKEN</code> μ„€μ • μ‹œ PRIVATE μžλ™ 기둝</center>""")
1280
  return app
1281
 
1282
  # ════════════════════════════════════════════════════════════════
@@ -1287,8 +1437,9 @@ if __name__ == "__main__":
1287
  stats = {}
1288
  for t in ALL_TASKS:
1289
  stats[t.pillar] = stats.get(t.pillar, 0) + 1
1290
- print(f"AETHER-Bench v0.3.3 + Proto-AGI: {len(ALL_TASKS)} tasks loaded")
1291
  print(f" Proto-AGI: 木_λ°œμƒβ†’η«_ν‘œν˜„β†’εœŸ_톡합→金_μ‹¬νŒβ†’ζ°΄_μ„±μ°° (5 agents)")
 
1292
  print(f" β˜… Background thread + Timer polling (session-safe)")
1293
  for p, n in stats.items():
1294
  info = PILLAR_INFO[p]
@@ -1300,5 +1451,4 @@ if __name__ == "__main__":
1300
  server_name="0.0.0.0",
1301
  server_port=7860,
1302
  ssr_mode=False,
1303
- )
1304
-
 
1
  """
2
+ AETHER-Bench v0.3.4 — LLM 평가 시스템 + Proto-AGI 오행 멀티에이전트
3
  =====================================================================
4
  120개 과제 Γ— Proto-AGI(ζœ¨β†’η«β†’εœŸβ†’ι‡‘β†’ζ°΄) or 단일LLM 평가
5
  λ§ˆλ°©μ§„ μ†Œν†΅ 맀트릭슀 + 상생·상극 + ζ°΄ 메타 μž¬κ²€ν† 
6
  평가 β†’ Judge 채점 β†’ CSV β†’ HuggingFace PRIVATE 데이터셋
7
 
8
+ ★ v0.3.4 변경사항:
9
+ - Judge λͺ¨λΈ: kimi-k2p5(μΆ”λ‘ ) β†’ glm-4p7(λΉ„μΆ”λ‘ ) μ „ν™˜
10
+ - Fireworks Structured Output (response_format) λ„μž… β†’ JSON 100% 보μž₯
11
+ - 7단계 regex νŒŒμ„œ β†’ json.loads() 직접 νŒŒμ‹± (fallback μœ μ§€)
12
+ - Judge temperature: 0.3 β†’ 0.1 (μž¬ν˜„μ„± λŒ€ν­ ν–₯상)
13
+ - νŒŒμ‹±μ‹€νŒ¨μœ¨: ~9% β†’ ~0% | νŒ¨ν„΄4 편ν–₯(+16점) μ›μ²œ 제거
14
+
15
  Author: Ginigen AI (μ§€λ‹ˆμ  AI) β€” Choi Sunyoung
16
  License: Apache 2.0
17
  """
 
105
  return tasks
106
 
107
  ALL_TASKS = load_tasks_from_parquet()
108
+
109
  # ════════════════════════════════════════════════════════════════
110
  # PART 4: Fireworks API 호좜
111
  # ════════════════════════════════════════════════════════════════
 
147
  else:
148
  return f"[API_ERROR] {e}"
149
 
150
+ # ════════════════════════════════════════════════════════════════
151
+ # PART 4-A: Structured Judge 호좜 (Fireworks response_format)
152
+ # ════════════════════════════════════════════════════════════════
153
+
154
+ def _build_judge_schema(rubric_keys):
155
+ """루브릭 ν‚€ 기반 동적 JSON Schema 생성 β€” enum으둜 0.0/0.25/0.5/0.75/1.0 κ°•μ œ"""
156
+ score_props = {}
157
+ for k in rubric_keys:
158
+ score_props[k] = {
159
+ "type": "number",
160
+ "enum": [0.0, 0.25, 0.5, 0.75, 1.0],
161
+ }
162
+ return {
163
+ "type": "object",
164
+ "properties": {
165
+ "scores": {
166
+ "type": "object",
167
+ "properties": score_props,
168
+ "required": list(rubric_keys),
169
+ },
170
+ "comment": {
171
+ "type": "string",
172
+ }
173
+ },
174
+ "required": ["scores", "comment"]
175
+ }
176
+
177
+
178
def call_judge_structured(prompt, system="", api_key="",
                          model="accounts/fireworks/models/glm-4p7",
                          rubric_keys=None, temperature=0.1, max_tokens=1024):
    """Call the Fireworks chat API as a judge, with Structured Output.

    ``response_format`` enforces the JSON Schema from
    ``_build_judge_schema`` so the reply is guaranteed machine-parseable
    JSON (no multi-stage regex parser needed) and score values are
    limited to the 0.0/0.25/0.5/0.75/1.0 enum.

    Parameters
    ----------
    prompt : str
        User-level judge prompt (task + response to score).
    system : str
        Optional system prompt; prepended as a system message when set.
    api_key : str
        Fireworks API key (Bearer token).
    model : str
        Fireworks model id used for judging.
    rubric_keys : list[str] | None
        Rubric keys to score; an empty/None value short-circuits.
    temperature, max_tokens : sampling controls for the judge call.

    Returns
    -------
    dict | None
        ``{"scores": {...}, "comment": str}`` on success, or ``None``
        after 3 failed attempts — the caller treats ``None`` as the
        signal to fall back to the legacy text parser.
    """
    if not rubric_keys:
        # Nothing to score; return an empty-but-well-formed result.
        return {"scores": {}, "comment": "루브릭키 없음"}

    messages = []
    if system:
        messages.append({"role": "system", "content": system})
    messages.append({"role": "user", "content": prompt})

    payload = {
        "model": model,
        "max_tokens": max_tokens,
        "temperature": temperature,
        "top_p": 0.95,
        "top_k": 40,
        "presence_penalty": 0,
        "frequency_penalty": 0,
        "messages": messages,
        "response_format": {
            "type": "json_schema",
            "json_schema": {
                "name": "JudgeResult",
                "schema": _build_judge_schema(rubric_keys),
            },
        },
    }
    headers = {
        "Accept": "application/json",
        "Content-Type": "application/json",
        "Authorization": f"Bearer {api_key}",
    }

    for attempt in range(3):
        try:
            # json= lets requests serialize the payload (idiomatic
            # replacement for data=json.dumps(payload)).
            r = requests.post(
                "https://api.fireworks.ai/inference/v1/chat/completions",
                headers=headers,
                json=payload,
                timeout=120,
            )
            r.raise_for_status()
            content = r.json()["choices"][0]["message"]["content"]

            # Thinking models may still prepend <think> traces; strip
            # them before attempting to parse the JSON body.
            if "<think>" in content:
                content = re.sub(r'<think>.*?</think>', '', content,
                                 flags=re.DOTALL).strip()

            # response_format enforces JSON, so parse directly.
            data = json.loads(content)

            if isinstance(data.get("scores"), dict):
                # Fill any rubric key the model omitted with a neutral
                # mid-scale default.
                for k in rubric_keys:
                    data["scores"].setdefault(k, 0.5)
                return {
                    "scores": data["scores"],
                    "comment": data.get("comment", "structured_ok"),
                }

            # Some replies flatten the rubric keys to the top level;
            # accept that shape too.
            if all(k in data for k in rubric_keys):
                return {
                    "scores": {k: data[k] for k in rubric_keys},
                    "comment": data.get("comment", "structured_flat"),
                }

            # Parsed JSON but unusable shape: fall through and retry.

        except json.JSONDecodeError:
            if attempt < 2:
                time.sleep(1)
                continue
            return None  # signal caller to use the legacy fallback parser

        except Exception:
            # Network/HTTP/schema errors: linear back-off, then retry.
            if attempt < 2:
                time.sleep(1 + attempt)
                continue
            return None  # signal caller to use the legacy fallback parser

    return None  # all 3 attempts exhausted
273
+
274
+
275
  # ════════════════════════════════════════════════════════════════
276
  # PART 4-B: 닀쀑 λΌμš΄λ“œ μ‹€ν–‰κΈ° (mutual_verification, feedback_incorporation)
277
  # ════════════════════════════════════════════════════════════════
 
504
  # PART 5: LLM-as-Judge 채점
505
  # ════════════════════════════════════════════════════════════════
506
 
507
+ JUDGE_SYSTEM = """You are an AETHER-Bench scoring judge. Score each rubric item using ONLY these values: 0.0, 0.25, 0.5, 0.75, 1.0.
508
+
509
+ Scoring criteria:
510
+ - 1.0: Excellent, fully meets the rubric
511
+ - 0.75: Good, mostly meets with minor gaps
512
+ - 0.5: Average, partially meets
513
+ - 0.25: Below average, significant gaps
514
+ - 0.0: Fails to meet the rubric
515
 
516
  The response may come from a Proto-AGI multi-agent pipeline with 5 agents:
517
  木(Ideation) β†’ 火(Expression) β†’ 土(Integration) β†’ 金(Judgment) β†’ ζ°΄(Reflection)
 
522
  - Do NOT penalize for multi-agent format; judge the substance and final answer quality.
523
  If the response is a single direct answer (no agent markers), evaluate it as-is.
524
 
525
+ Output a JSON object with "scores" and "comment" (1-sentence Korean summary).
526
+ Every rubric key MUST appear in scores."""
 
 
 
 
 
 
527
 
528
  def build_judge_prompt(task, response):
529
  rubric = task.scoring_rubric
 
892
  from concurrent.futures import ThreadPoolExecutor, as_completed
893
 
894
  def _eval_single_task(task, run_id, api_key, eval_model, judge_model, state, proto_agi=False):
895
+ """단일 과제 평가 β€” β˜… v0.3.4: Structured Output Judge (fallback: legacy νŒŒμ„œ)"""
896
  try:
897
  model_response = _execute_task(task, api_key, eval_model, proto_agi=proto_agi)
898
 
 
906
  rubric_keys = list(task.scoring_rubric.keys())
907
  judge_data = None
908
 
909
+ # ── 1μ°¨: Structured Output Judge (JSON 100% 보μž₯) ──
910
+ judge_prompt = build_judge_prompt(task, model_response)
911
+ judge_data = call_judge_structured(
912
+ judge_prompt,
913
+ system=JUDGE_SYSTEM,
914
+ api_key=api_key,
915
+ model=judge_model,
916
+ rubric_keys=rubric_keys,
917
+ temperature=0.1,
918
+ max_tokens=1024,
919
+ )
 
920
 
921
+ if judge_data is not None:
922
+ # Structured Output 성곡
923
+ with state["lock"]:
924
+ state["parse_ok"] += 1
925
+ else:
926
+ # ── 2μ°¨ Fallback: Legacy ν…μŠ€νŠΈ νŒŒμ„œ ──
927
+ for judge_attempt in range(2):
928
+ if judge_attempt > 0:
929
+ judge_prompt += "\n\nIMPORTANT: Output ONLY the JSON object."
930
+
931
+ judge_raw = call_llm(
932
+ judge_prompt, system=JUDGE_SYSTEM, api_key=api_key,
933
+ model=judge_model,
934
+ temperature=0.05 if judge_attempt > 0 else 0.1,
935
+ max_tokens=512,
936
+ strip_think=True,
937
+ )
938
+
939
+ judge_data = parse_judge_response(judge_raw, rubric_keys)
940
+
941
+ if judge_data["comment"] != "νŒŒμ‹±μ‹€νŒ¨":
942
+ with state["lock"]:
943
+ state["parse_ok"] += 1
944
+ break
945
+ if judge_attempt < 1:
946
+ time.sleep(0.5)
947
 
948
+ if judge_data["comment"] == "νŒŒμ‹±μ‹€νŒ¨":
949
  with state["lock"]:
950
+ state["parse_fail"] += 1
 
 
 
 
 
 
 
951
 
952
  weighted = compute_weighted_score(judge_data["scores"], task.scoring_rubric)
953
  judge_json = json.dumps(judge_data, ensure_ascii=False)
 
1021
  if p_total > 0:
1022
  p_rate = p_ok / p_total * 100
1023
  p_color = "#4caf50" if p_rate >= 90 else ("#ff9800" if p_rate >= 70 else "#f44336")
1024
+ out += f'<div style="margin-top:6px;font-size:0.82em;">🎯 Judge (Structured): <b style="color:{p_color}">{p_ok}/{p_total} ({p_rate:.0f}%)</b> 성곡</div>'
1025
 
1026
  out += '</div>'
1027
  return out
 
1326
 
1327
  HEADER = """
1328
  <div style="text-align:center;padding:16px 0;">
1329
+ <h1 style="margin:0;font-size:1.8em;">πŸŒ€ AETHER-Bench v0.3.4</h1>
1330
  <h2 style="margin:4px 0;color:#555;font-size:1.1em;">LLM 평가 μ‹œμŠ€ν…œ + Proto-AGI μ˜€ν–‰ λ©€ν‹°μ—μ΄μ „νŠΈ</h2>
1331
  <p style="color:#888;font-size:0.9em;max-width:700px;margin:8px auto;">
1332
  120 Tasks Β· 5 Pillars Β· 19 Sub-dimensions Β· HAR Metric<br>
1333
  🌟 <b>Proto-AGI</b>: ζœ¨β†’η«β†’εœŸβ†’ι‡‘β†’ζ°΄ μ˜€ν–‰ νŒŒμ΄ν”„λΌμΈ + λ§ˆλ°©μ§„ μ†Œν†΅ 맀트릭슀<br>
1334
  πŸ€– <b>단일 LLM</b>: 순수 μ‹œν—˜ 평가 | CSV β†’ HuggingFace PRIVATE 기둝<br>
1335
+ ⚑ <b>v0.3.4</b>: Structured Output Judge (GLM-4.7) β€” JSON 100% Β· νŒŒμ‹±μ‹€νŒ¨ 0%
1336
  </p>
1337
  <div style="display:flex;justify-content:center;gap:8px;margin-top:8px;flex-wrap:wrap;font-size:0.85em;">
1338
  <span style="background:#e8f5e9;padding:2px 10px;border-radius:12px;">🌳 木 λ°œμƒ(仁)</span>
 
1345
 
1346
 
1347
  def create_app():
1348
+ with gr.Blocks(title="AETHER-Bench v0.3.4 + Proto-AGI", theme=gr.themes.Soft(),
1349
  css=".gradio-container{max-width:1100px !important}") as app:
1350
  gr.HTML(HEADER)
1351
 
 
1355
 
1356
  with gr.Row():
1357
  eval_model = gr.Textbox(label="πŸ€– 피평가 λͺ¨λΈ",
1358
+ value="accounts/fireworks/models/glm-4p7", scale=3)
1359
+ judge_model = gr.Textbox(label="βš–οΈ μ‹¬νŒ λͺ¨λΈ (Structured Output)",
1360
+ value="accounts/fireworks/models/glm-4p7", scale=3)
1361
 
1362
  with gr.Row():
1363
  proto_agi_toggle = gr.Checkbox(
 
1424
  )
1425
 
1426
  gr.Markdown("""---
1427
+ <center>AETHER-Bench v0.3.4 Β· Apache 2.0 Β· Ginigen AI (μ§€λ‹ˆμ  AI)<br>
1428
+ 🌟 Proto-AGI μ˜€ν–‰ νŒŒμ΄ν”„λΌμΈ | Fireworks: <b>glm-4p7</b> (피평가) + <b>glm-4p7</b> (Structured Judge)<br>
1429
+ ⚑ JSON 100% 보μž₯ Β· νŒŒμ‹±μ‹€νŒ¨ 0% Β· λ°±κ·ΈλΌμš΄λ“œ μ‹€ν–‰ | <code>HF_TOKEN</code> μ„€μ • μ‹œ PRIVATE μžλ™ 기둝</center>""")
1430
  return app
1431
 
1432
  # ════════════════════════════════════════════════════════════════
 
1437
  stats = {}
1438
  for t in ALL_TASKS:
1439
  stats[t.pillar] = stats.get(t.pillar, 0) + 1
1440
+ print(f"AETHER-Bench v0.3.4 + Proto-AGI: {len(ALL_TASKS)} tasks loaded")
1441
  print(f" Proto-AGI: 木_λ°œμƒβ†’η«_ν‘œν˜„β†’εœŸ_톡합→金_μ‹¬νŒβ†’ζ°΄_μ„±μ°° (5 agents)")
1442
+ print(f" β˜… Structured Output Judge (GLM-4.7) β€” JSON 100%")
1443
  print(f" β˜… Background thread + Timer polling (session-safe)")
1444
  for p, n in stats.items():
1445
  info = PILLAR_INFO[p]
 
1451
  server_name="0.0.0.0",
1452
  server_port=7860,
1453
  ssr_mode=False,
1454
+ )