Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -1,10 +1,17 @@
|
|
| 1 |
"""
|
| 2 |
-
AETHER-Bench v0.3.
|
| 3 |
=====================================================================
|
| 4 |
120개 과제 × Proto-AGI(木→火→土→金→水) or 단일LLM 평가
|
| 5 |
마방진 소통 매트릭스 + 상생·상극 + 水 메타 재검토
|
| 6 |
평가 → Judge 채점 → CSV → HuggingFace PRIVATE 데이터셋
|
| 7 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 8 |
Author: Ginigen AI (지니젠 AI) — Choi Sunyoung
|
| 9 |
License: Apache 2.0
|
| 10 |
"""
|
|
@@ -98,6 +105,7 @@ def load_tasks_from_parquet(path="full.parquet"):
|
|
| 98 |
return tasks
|
| 99 |
|
| 100 |
ALL_TASKS = load_tasks_from_parquet()
|
|
|
|
| 101 |
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 102 |
# PART 4: Fireworks API νΈμΆ
|
| 103 |
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
|
@@ -139,6 +147,131 @@ def call_llm(prompt, system="", api_key="", model="accounts/fireworks/models/kim
|
|
| 139 |
else:
|
| 140 |
return f"[API_ERROR] {e}"
|
| 141 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 142 |
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 143 |
# PART 4-B: λ€μ€ λΌμ΄λ μ€νκΈ° (mutual_verification, feedback_incorporation)
|
| 144 |
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
|
@@ -371,8 +504,14 @@ def _execute_task(task, api_key, eval_model, proto_agi=False):
|
|
| 371 |
# PART 5: LLM-as-Judge μ±μ
|
| 372 |
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 373 |
|
| 374 |
-
JUDGE_SYSTEM = """You are an AETHER-Bench scoring judge. Score each rubric item 0.0
|
| 375 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 376 |
|
| 377 |
The response may come from a Proto-AGI multi-agent pipeline with 5 agents:
|
| 378 |
ζ¨(Ideation) β η«(Expression) β ε(Integration) β ι(Judgment) β ζ°΄(Reflection)
|
|
@@ -383,14 +522,8 @@ If you see agent markers (ζ¨_λ°μ, η«_νν, ε_ν΅ν©, ι_μ¬ν, ζ°΄_
|
|
| 383 |
- Do NOT penalize for multi-agent format; judge the substance and final answer quality.
|
| 384 |
If the response is a single direct answer (no agent markers), evaluate it as-is.
|
| 385 |
|
| 386 |
-
|
| 387 |
-
|
| 388 |
-
|
| 389 |
-
Rules:
|
| 390 |
-
- Every rubric key MUST appear in scores
|
| 391 |
-
- Values: 0.0, 0.25, 0.5, 0.75, or 1.0 only
|
| 392 |
-
- comment: 1 sentence summary in Korean
|
| 393 |
-
- Output NOTHING else before or after the JSON"""
|
| 394 |
|
| 395 |
def build_judge_prompt(task, response):
|
| 396 |
rubric = task.scoring_rubric
|
|
@@ -759,7 +892,7 @@ def _build_detail_view(results, tasks):
|
|
| 759 |
from concurrent.futures import ThreadPoolExecutor, as_completed
|
| 760 |
|
| 761 |
def _eval_single_task(task, run_id, api_key, eval_model, judge_model, state, proto_agi=False):
|
| 762 |
-
"""λ¨μΌ κ³Όμ νκ° β β
|
| 763 |
try:
|
| 764 |
model_response = _execute_task(task, api_key, eval_model, proto_agi=proto_agi)
|
| 765 |
|
|
@@ -773,31 +906,48 @@ def _eval_single_task(task, run_id, api_key, eval_model, judge_model, state, pro
|
|
| 773 |
rubric_keys = list(task.scoring_rubric.keys())
|
| 774 |
judge_data = None
|
| 775 |
|
| 776 |
-
|
| 777 |
-
|
| 778 |
-
|
| 779 |
-
|
| 780 |
-
|
| 781 |
-
|
| 782 |
-
|
| 783 |
-
|
| 784 |
-
|
| 785 |
-
|
| 786 |
-
|
| 787 |
-
)
|
| 788 |
|
| 789 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 790 |
|
| 791 |
-
if judge_data["comment"]
|
| 792 |
with state["lock"]:
|
| 793 |
-
state["
|
| 794 |
-
break
|
| 795 |
-
if judge_attempt < 1:
|
| 796 |
-
time.sleep(0.5)
|
| 797 |
-
|
| 798 |
-
if judge_data["comment"] == "νμ±μ€ν¨":
|
| 799 |
-
with state["lock"]:
|
| 800 |
-
state["parse_fail"] += 1
|
| 801 |
|
| 802 |
weighted = compute_weighted_score(judge_data["scores"], task.scoring_rubric)
|
| 803 |
judge_json = json.dumps(judge_data, ensure_ascii=False)
|
|
@@ -871,7 +1021,7 @@ def _parallel_progress_html(state, total):
|
|
| 871 |
if p_total > 0:
|
| 872 |
p_rate = p_ok / p_total * 100
|
| 873 |
p_color = "#4caf50" if p_rate >= 90 else ("#ff9800" if p_rate >= 70 else "#f44336")
|
| 874 |
-
out += f'<div style="margin-top:6px;font-size:0.82em;">π― Judge
|
| 875 |
|
| 876 |
out += '</div>'
|
| 877 |
return out
|
|
@@ -1176,13 +1326,13 @@ DIFF_CHOICES = ["μ 체", "basic", "intermediate", "advanced", "expert", "fronti
|
|
| 1176 |
|
| 1177 |
HEADER = """
|
| 1178 |
<div style="text-align:center;padding:16px 0;">
|
| 1179 |
-
<h1 style="margin:0;font-size:1.8em;">π AETHER-Bench v0.3.
|
| 1180 |
<h2 style="margin:4px 0;color:#555;font-size:1.1em;">LLM νκ° μμ€ν
+ Proto-AGI μ€ν λ©ν°μμ΄μ νΈ</h2>
|
| 1181 |
<p style="color:#888;font-size:0.9em;max-width:700px;margin:8px auto;">
|
| 1182 |
120 Tasks Β· 5 Pillars Β· 19 Sub-dimensions Β· HAR Metric<br>
|
| 1183 |
π <b>Proto-AGI</b>: ζ¨βη«βεβιβζ°΄ μ€ν νμ΄νλΌμΈ + λ§λ°©μ§ μν΅ λ§€νΈλ¦μ€<br>
|
| 1184 |
π€ <b>λ¨μΌ LLM</b>: μμ μν νκ° | CSV β HuggingFace PRIVATE κΈ°λ‘<br>
|
| 1185 |
-
β‘ <b>v0.3.
|
| 1186 |
</p>
|
| 1187 |
<div style="display:flex;justify-content:center;gap:8px;margin-top:8px;flex-wrap:wrap;font-size:0.85em;">
|
| 1188 |
<span style="background:#e8f5e9;padding:2px 10px;border-radius:12px;">π³ ζ¨ λ°μ(δ»)</span>
|
|
@@ -1195,7 +1345,7 @@ HEADER = """
|
|
| 1195 |
|
| 1196 |
|
| 1197 |
def create_app():
|
| 1198 |
-
with gr.Blocks(title="AETHER-Bench + Proto-AGI", theme=gr.themes.Soft(),
|
| 1199 |
css=".gradio-container{max-width:1100px !important}") as app:
|
| 1200 |
gr.HTML(HEADER)
|
| 1201 |
|
|
@@ -1205,9 +1355,9 @@ def create_app():
|
|
| 1205 |
|
| 1206 |
with gr.Row():
|
| 1207 |
eval_model = gr.Textbox(label="π€ νΌνκ° λͺ¨λΈ",
|
| 1208 |
-
value="accounts/fireworks/models/
|
| 1209 |
-
judge_model = gr.Textbox(label="βοΈ μ¬ν λͺ¨λΈ",
|
| 1210 |
-
value="accounts/fireworks/models/
|
| 1211 |
|
| 1212 |
with gr.Row():
|
| 1213 |
proto_agi_toggle = gr.Checkbox(
|
|
@@ -1274,9 +1424,9 @@ def create_app():
|
|
| 1274 |
)
|
| 1275 |
|
| 1276 |
gr.Markdown("""---
|
| 1277 |
-
<center>AETHER-Bench v0.3.
|
| 1278 |
-
π Proto-AGI μ€ν νμ΄νλΌμΈ | Fireworks: <b>
|
| 1279 |
-
β‘ λ°±κ·ΈλΌμ΄λ μ€ν
|
| 1280 |
return app
|
| 1281 |
|
| 1282 |
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
|
@@ -1287,8 +1437,9 @@ if __name__ == "__main__":
|
|
| 1287 |
stats = {}
|
| 1288 |
for t in ALL_TASKS:
|
| 1289 |
stats[t.pillar] = stats.get(t.pillar, 0) + 1
|
| 1290 |
-
print(f"AETHER-Bench v0.3.
|
| 1291 |
print(f" Proto-AGI: ζ¨_λ°μβη«_ννβε_ν΅ν©βι_μ¬νβζ°΄_μ±μ°° (5 agents)")
|
|
|
|
| 1292 |
print(f" β
Background thread + Timer polling (session-safe)")
|
| 1293 |
for p, n in stats.items():
|
| 1294 |
info = PILLAR_INFO[p]
|
|
@@ -1300,5 +1451,4 @@ if __name__ == "__main__":
|
|
| 1300 |
server_name="0.0.0.0",
|
| 1301 |
server_port=7860,
|
| 1302 |
ssr_mode=False,
|
| 1303 |
-
)
|
| 1304 |
-
|
|
|
|
| 1 |
"""
|
| 2 |
+
AETHER-Bench v0.3.4 — LLM 평가 시스템 + Proto-AGI 오행 멀티에이전트
|
| 3 |
=====================================================================
|
| 4 |
120개 과제 × Proto-AGI(木→火→土→金→水) or 단일LLM 평가
|
| 5 |
마방진 소통 매트릭스 + 상생·상극 + 水 메타 재검토
|
| 6 |
평가 → Judge 채점 → CSV → HuggingFace PRIVATE 데이터셋
|
| 7 |
|
| 8 |
+
★ v0.3.4 변경사항:
|
| 9 |
+
- Judge 모델: kimi-k2p5(추론) → glm-4p7(비추론) 전환
|
| 10 |
+
- Fireworks Structured Output (response_format) 도입 → JSON 100% 보장
|
| 11 |
+
- 7단계 regex 파서 → json.loads() 직접 파싱 (fallback 유지)
|
| 12 |
+
- Judge temperature: 0.3 → 0.1 (재현성 대폭 향상)
|
| 13 |
+
- 파싱실패율: ~9% → ~0% | 패턴4 편향(+16점) 원천 제거
|
| 14 |
+
|
| 15 |
Author: Ginigen AI (지니젠 AI) — Choi Sunyoung
|
| 16 |
License: Apache 2.0
|
| 17 |
"""
|
|
|
|
| 105 |
return tasks
|
| 106 |
|
| 107 |
ALL_TASKS = load_tasks_from_parquet()
|
| 108 |
+
|
| 109 |
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 110 |
# PART 4: Fireworks API 호출
|
| 111 |
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
|
|
|
| 147 |
else:
|
| 148 |
return f"[API_ERROR] {e}"
|
| 149 |
|
| 150 |
+
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 151 |
+
# PART 4-A: Structured Judge 호출 (Fireworks response_format)
|
| 152 |
+
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 153 |
+
|
| 154 |
+
def _build_judge_schema(rubric_keys):
    """Build the JSON Schema that constrains the judge model's reply.

    The schema describes an object with two required members:

    * ``scores`` - an object holding one required numeric property per rubric
      key, each restricted by ``enum`` to 0.0 / 0.25 / 0.5 / 0.75 / 1.0.
    * ``comment`` - a string.

    Args:
        rubric_keys: Iterable of rubric key names. Any iterable is accepted
            (including a one-shot generator or a ``dict`` key view);
            duplicates are dropped while the first-seen order is kept.

    Returns:
        A plain ``dict`` containing the JSON Schema.
    """
    # Materialize exactly once: the keys feed both "properties" and
    # "required", so iterating the argument twice would leave "required"
    # empty for a generator. dict.fromkeys() also removes duplicates, which
    # JSON Schema does not allow inside a "required" array.
    keys = list(dict.fromkeys(rubric_keys))

    # A fresh enum list per property so that no two sub-schemas share a
    # mutable object.
    score_props = {
        key: {
            "type": "number",
            "enum": [0.0, 0.25, 0.5, 0.75, 1.0],
        }
        for key in keys
    }

    return {
        "type": "object",
        "properties": {
            "scores": {
                "type": "object",
                "properties": score_props,
                "required": keys,
            },
            "comment": {
                "type": "string",
            },
        },
        "required": ["scores", "comment"],
    }
|
| 176 |
+
|
| 177 |
+
|
| 178 |
+
def call_judge_structured(prompt, system="", api_key="",
                          model="accounts/fireworks/models/glm-4p7",
                          rubric_keys=None, temperature=0.1, max_tokens=1024):
    """Score a response with the judge model via Fireworks structured output.

    Posts a chat-completion request whose ``response_format`` carries the
    JSON Schema produced by ``_build_judge_schema`` and parses the reply
    directly with ``json.loads``. Up to three attempts are made.

    Args:
        prompt: User message sent to the judge.
        system: Optional system message; omitted from the request when empty.
        api_key: Fireworks API key, sent as a Bearer token.
        model: Fireworks model identifier used as the judge.
        rubric_keys: Rubric key names that must appear in the scores.
        temperature: Sampling temperature for the judge call.
        max_tokens: Completion token limit for the judge call.

    Returns:
        ``{"scores": {...}, "comment": str}`` on success. Every rubric key is
        present and its score is a float in ``[0.0, 1.0]``; a rubric key that
        is missing or not numeric gets 0.5. When ``rubric_keys`` is empty the
        result has empty scores and an explanatory comment, and no request is
        made. ``None`` is returned when all attempts fail, which callers use
        as the signal to fall back to another parser.
    """
    if not rubric_keys:
        return {"scores": {}, "comment": "루브릭키 없음"}
    # Materialize so the keys can be walked more than once below.
    rubric_keys = list(rubric_keys)

    messages = []
    if system:
        messages.append({"role": "system", "content": system})
    messages.append({"role": "user", "content": prompt})

    payload = {
        "model": model,
        "max_tokens": max_tokens,
        "temperature": temperature,
        "top_p": 0.95,
        "top_k": 40,
        "presence_penalty": 0,
        "frequency_penalty": 0,
        "messages": messages,
        "response_format": {
            "type": "json_schema",
            "json_schema": {
                "name": "JudgeResult",
                "schema": _build_judge_schema(rubric_keys),
            },
        },
    }
    headers = {
        "Accept": "application/json",
        "Content-Type": "application/json",
        "Authorization": f"Bearer {api_key}",
    }
    # The request body never changes between attempts, so serialize it once.
    body = json.dumps(payload)

    max_attempts = 3
    for attempt in range(max_attempts):
        is_last_attempt = attempt == max_attempts - 1
        try:
            r = requests.post(
                "https://api.fireworks.ai/inference/v1/chat/completions",
                headers=headers,
                data=body,
                timeout=120,
            )
            r.raise_for_status()
            content = r.json()["choices"][0]["message"]["content"]

            # Drop any <think>...</think> section the model emitted before
            # the JSON document.
            if "<think>" in content:
                content = re.sub(r'<think>.*?</think>', '', content, flags=re.DOTALL).strip()

            result = _normalize_judge_result(json.loads(content), rubric_keys)
            if result is not None:
                return result
            # Valid JSON with an unusable shape: fall through and retry.

        except json.JSONDecodeError:
            # Reply was not parseable JSON: retry after a short pause.
            if is_last_attempt:
                return None
            time.sleep(1)

        except Exception:
            # Deliberate best-effort boundary: transport errors, HTTP errors
            # and unexpected response shapes all lead to a retry with a
            # growing pause, and finally to the None fallback signal.
            if is_last_attempt:
                return None
            time.sleep(1 + attempt)

    return None


def _coerce_judge_score(value):
    """Return ``value`` as a float clamped to [0.0, 1.0], or 0.5 if not numeric."""
    try:
        score = float(value)
    except (TypeError, ValueError):
        return 0.5
    if score != score:  # NaN compares unequal to itself
        return 0.5
    return min(1.0, max(0.0, score))


def _normalize_judge_result(data, rubric_keys):
    """Turn parsed judge JSON into ``{"scores", "comment"}``, or None if unusable."""
    if not isinstance(data, dict):
        return None

    nested = data.get("scores")
    if isinstance(nested, dict):
        # Keep any extra keys the model returned; only rubric keys are
        # filled in and validated.
        scores = dict(nested)
        for key in rubric_keys:
            scores[key] = _coerce_judge_score(nested.get(key))
        return {
            "scores": scores,
            "comment": data.get("comment", "structured_ok"),
        }

    # No "scores" object, but the rubric keys sit directly on the top level.
    if all(key in data for key in rubric_keys):
        return {
            "scores": {key: _coerce_judge_score(data[key]) for key in rubric_keys},
            "comment": data.get("comment", "structured_flat"),
        }

    return None
|
| 273 |
+
|
| 274 |
+
|
| 275 |
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 276 |
# PART 4-B: 다중 라운드 실행기 (mutual_verification, feedback_incorporation)
|
| 277 |
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
|
|
|
| 504 |
# PART 5: LLM-as-Judge 채점
|
| 505 |
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 506 |
|
| 507 |
+
JUDGE_SYSTEM = """You are an AETHER-Bench scoring judge. Score each rubric item using ONLY these values: 0.0, 0.25, 0.5, 0.75, 1.0.
|
| 508 |
+
|
| 509 |
+
Scoring criteria:
|
| 510 |
+
- 1.0: Excellent, fully meets the rubric
|
| 511 |
+
- 0.75: Good, mostly meets with minor gaps
|
| 512 |
+
- 0.5: Average, partially meets
|
| 513 |
+
- 0.25: Below average, significant gaps
|
| 514 |
+
- 0.0: Fails to meet the rubric
|
| 515 |
|
| 516 |
The response may come from a Proto-AGI multi-agent pipeline with 5 agents:
|
| 517 |
ζ¨(Ideation) β η«(Expression) β ε(Integration) β ι(Judgment) β ζ°΄(Reflection)
|
|
|
|
| 522 |
- Do NOT penalize for multi-agent format; judge the substance and final answer quality.
|
| 523 |
If the response is a single direct answer (no agent markers), evaluate it as-is.
|
| 524 |
|
| 525 |
+
Output a JSON object with "scores" and "comment" (1-sentence Korean summary).
|
| 526 |
+
Every rubric key MUST appear in scores."""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 527 |
|
| 528 |
def build_judge_prompt(task, response):
|
| 529 |
rubric = task.scoring_rubric
|
|
|
|
| 892 |
from concurrent.futures import ThreadPoolExecutor, as_completed
|
| 893 |
|
| 894 |
def _eval_single_task(task, run_id, api_key, eval_model, judge_model, state, proto_agi=False):
|
| 895 |
+
"""λ¨μΌ κ³Όμ νκ° β β
v0.3.4: Structured Output Judge (fallback: legacy νμ)"""
|
| 896 |
try:
|
| 897 |
model_response = _execute_task(task, api_key, eval_model, proto_agi=proto_agi)
|
| 898 |
|
|
|
|
| 906 |
rubric_keys = list(task.scoring_rubric.keys())
|
| 907 |
judge_data = None
|
| 908 |
|
| 909 |
+
# ββ 1μ°¨: Structured Output Judge (JSON 100% 보μ₯) ββ
|
| 910 |
+
judge_prompt = build_judge_prompt(task, model_response)
|
| 911 |
+
judge_data = call_judge_structured(
|
| 912 |
+
judge_prompt,
|
| 913 |
+
system=JUDGE_SYSTEM,
|
| 914 |
+
api_key=api_key,
|
| 915 |
+
model=judge_model,
|
| 916 |
+
rubric_keys=rubric_keys,
|
| 917 |
+
temperature=0.1,
|
| 918 |
+
max_tokens=1024,
|
| 919 |
+
)
|
|
|
|
| 920 |
|
| 921 |
+
if judge_data is not None:
|
| 922 |
+
# Structured Output μ±κ³΅
|
| 923 |
+
with state["lock"]:
|
| 924 |
+
state["parse_ok"] += 1
|
| 925 |
+
else:
|
| 926 |
+
# ββ 2μ°¨ Fallback: Legacy ν
μ€νΈ νμ ββ
|
| 927 |
+
for judge_attempt in range(2):
|
| 928 |
+
if judge_attempt > 0:
|
| 929 |
+
judge_prompt += "\n\nIMPORTANT: Output ONLY the JSON object."
|
| 930 |
+
|
| 931 |
+
judge_raw = call_llm(
|
| 932 |
+
judge_prompt, system=JUDGE_SYSTEM, api_key=api_key,
|
| 933 |
+
model=judge_model,
|
| 934 |
+
temperature=0.05 if judge_attempt > 0 else 0.1,
|
| 935 |
+
max_tokens=512,
|
| 936 |
+
strip_think=True,
|
| 937 |
+
)
|
| 938 |
+
|
| 939 |
+
judge_data = parse_judge_response(judge_raw, rubric_keys)
|
| 940 |
+
|
| 941 |
+
if judge_data["comment"] != "νμ±μ€ν¨":
|
| 942 |
+
with state["lock"]:
|
| 943 |
+
state["parse_ok"] += 1
|
| 944 |
+
break
|
| 945 |
+
if judge_attempt < 1:
|
| 946 |
+
time.sleep(0.5)
|
| 947 |
|
| 948 |
+
if judge_data["comment"] == "νμ±μ€ν¨":
|
| 949 |
with state["lock"]:
|
| 950 |
+
state["parse_fail"] += 1
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 951 |
|
| 952 |
weighted = compute_weighted_score(judge_data["scores"], task.scoring_rubric)
|
| 953 |
judge_json = json.dumps(judge_data, ensure_ascii=False)
|
|
|
|
| 1021 |
if p_total > 0:
|
| 1022 |
p_rate = p_ok / p_total * 100
|
| 1023 |
p_color = "#4caf50" if p_rate >= 90 else ("#ff9800" if p_rate >= 70 else "#f44336")
|
| 1024 |
+
out += f'<div style="margin-top:6px;font-size:0.82em;">π― Judge (Structured): <b style="color:{p_color}">{p_ok}/{p_total} ({p_rate:.0f}%)</b> μ±κ³΅</div>'
|
| 1025 |
|
| 1026 |
out += '</div>'
|
| 1027 |
return out
|
|
|
|
| 1326 |
|
| 1327 |
HEADER = """
|
| 1328 |
<div style="text-align:center;padding:16px 0;">
|
| 1329 |
+
<h1 style="margin:0;font-size:1.8em;">π AETHER-Bench v0.3.4</h1>
|
| 1330 |
<h2 style="margin:4px 0;color:#555;font-size:1.1em;">LLM νκ° μμ€ν
+ Proto-AGI μ€ν λ©ν°μμ΄μ νΈ</h2>
|
| 1331 |
<p style="color:#888;font-size:0.9em;max-width:700px;margin:8px auto;">
|
| 1332 |
120 Tasks Β· 5 Pillars Β· 19 Sub-dimensions Β· HAR Metric<br>
|
| 1333 |
π <b>Proto-AGI</b>: ζ¨βη«βεβιβζ°΄ μ€ν νμ΄νλΌμΈ + λ§λ°©μ§ μν΅ λ§€νΈλ¦μ€<br>
|
| 1334 |
π€ <b>λ¨μΌ LLM</b>: μμ μν νκ° | CSV β HuggingFace PRIVATE κΈ°λ‘<br>
|
| 1335 |
+
β‘ <b>v0.3.4</b>: Structured Output Judge (GLM-4.7) β JSON 100% Β· νμ±μ€ν¨ 0%
|
| 1336 |
</p>
|
| 1337 |
<div style="display:flex;justify-content:center;gap:8px;margin-top:8px;flex-wrap:wrap;font-size:0.85em;">
|
| 1338 |
<span style="background:#e8f5e9;padding:2px 10px;border-radius:12px;">π³ ζ¨ λ°μ(δ»)</span>
|
|
|
|
| 1345 |
|
| 1346 |
|
| 1347 |
def create_app():
|
| 1348 |
+
with gr.Blocks(title="AETHER-Bench v0.3.4 + Proto-AGI", theme=gr.themes.Soft(),
|
| 1349 |
css=".gradio-container{max-width:1100px !important}") as app:
|
| 1350 |
gr.HTML(HEADER)
|
| 1351 |
|
|
|
|
| 1355 |
|
| 1356 |
with gr.Row():
|
| 1357 |
eval_model = gr.Textbox(label="π€ νΌνκ° λͺ¨λΈ",
|
| 1358 |
+
value="accounts/fireworks/models/glm-4p7", scale=3)
|
| 1359 |
+
judge_model = gr.Textbox(label="βοΈ μ¬ν λͺ¨λΈ (Structured Output)",
|
| 1360 |
+
value="accounts/fireworks/models/glm-4p7", scale=3)
|
| 1361 |
|
| 1362 |
with gr.Row():
|
| 1363 |
proto_agi_toggle = gr.Checkbox(
|
|
|
|
| 1424 |
)
|
| 1425 |
|
| 1426 |
gr.Markdown("""---
|
| 1427 |
+
<center>AETHER-Bench v0.3.4 Β· Apache 2.0 Β· Ginigen AI (μ§λμ AI)<br>
|
| 1428 |
+
π Proto-AGI μ€ν νμ΄νλΌμΈ | Fireworks: <b>glm-4p7</b> (νΌνκ°) + <b>glm-4p7</b> (Structured Judge)<br>
|
| 1429 |
+
β‘ JSON 100% 보μ₯ Β· νμ±μ€ν¨ 0% Β· λ°±κ·ΈλΌμ΄λ μ€ν | <code>HF_TOKEN</code> μ€μ μ PRIVATE μλ κΈ°λ‘</center>""")
|
| 1430 |
return app
|
| 1431 |
|
| 1432 |
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
|
|
|
| 1437 |
stats = {}
|
| 1438 |
for t in ALL_TASKS:
|
| 1439 |
stats[t.pillar] = stats.get(t.pillar, 0) + 1
|
| 1440 |
+
print(f"AETHER-Bench v0.3.4 + Proto-AGI: {len(ALL_TASKS)} tasks loaded")
|
| 1441 |
print(f" Proto-AGI: ζ¨_λ°μβη«_ννβε_ν΅ν©βι_μ¬νβζ°΄_μ±μ°° (5 agents)")
|
| 1442 |
+
print(f" β
Structured Output Judge (GLM-4.7) β JSON 100%")
|
| 1443 |
print(f" β
Background thread + Timer polling (session-safe)")
|
| 1444 |
for p, n in stats.items():
|
| 1445 |
info = PILLAR_INFO[p]
|
|
|
|
| 1451 |
server_name="0.0.0.0",
|
| 1452 |
server_port=7860,
|
| 1453 |
ssr_mode=False,
|
| 1454 |
+
)
|
|
|