Spaces:
Running
Running
Create app.py
Browse files
app.py
ADDED
|
@@ -0,0 +1,577 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
AETHER-Bench v0.2.0 — LLM 평가 시스템
|
| 3 |
+
========================================
|
| 4 |
+
120개 과제로 LLM을 순수 시험 평가 (Proto-AGI 미발동)
|
| 5 |
+
평가 → Judge 채점 → CSV → HuggingFace PRIVATE 데이터셋
|
| 6 |
+
|
| 7 |
+
Author: Ginigen AI (지니젠AI) — Choi Sunyoung
|
| 8 |
+
License: Apache 2.0
|
| 9 |
+
"""
|
| 10 |
+
|
| 11 |
+
import json, os, time, csv, io, re, html, hashlib, sqlite3, threading
|
| 12 |
+
from datetime import datetime
|
| 13 |
+
from dataclasses import dataclass, field, asdict
|
| 14 |
+
from typing import List, Dict, Optional
|
| 15 |
+
import requests
|
| 16 |
+
import numpy as np
|
| 17 |
+
import gradio as gr
|
| 18 |
+
|
| 19 |
+
# ════════════════════════════════════════════════════════════════
|
| 20 |
+
# PART 1: 벤치마크 데이터 구조
|
| 21 |
+
# ════════════════════════════════════════════════════════════════
|
| 22 |
+
|
| 23 |
+
# Registry of the five benchmark pillars, keyed by the pillar id stored on each
# task. "weight" is the pillar's share of the overall AETHER score (the five
# weights sum to 1.0); "name", "icon" and "color" are display attributes.
PILLAR_INFO = {
    "P1_Emergence": {
        "name": "창발성",
        "icon": "✦",
        "color": "#FF6B35",
        "weight": 0.20,
    },
    "P2_Metacognition": {
        "name": "메타인지",
        "icon": "◉",
        "color": "#7B2FF7",
        "weight": 0.25,
    },
    "P3_SelfEvolution": {
        "name": "자가진화",
        "icon": "◈",
        "color": "#00B4D8",
        "weight": 0.15,
    },
    "P4_Orchestration": {
        "name": "다중지능",
        "icon": "◬",
        "color": "#2EC4B6",
        "weight": 0.15,
    },
    "P5_SynergyAntagonism": {
        "name": "상생상극",
        "icon": "☯",
        "color": "#E63946",
        "weight": 0.25,
    },
}
|
| 30 |
+
|
| 31 |
+
@dataclass
class EvalTask:
    """One benchmark task: what to ask, how it is classified, how to score it.

    The field order is part of the interface because instances can be built
    positionally.
    """

    # Identification and classification.
    task_id: str
    pillar: str
    sub_dimension: str
    difficulty: str
    # Text sent to the model under evaluation.
    prompt: str
    # Optional extras; both default to None.
    context: Optional[str] = None
    expected_behavior: Optional[str] = None
    # Each instance gets its own fresh dict for these two fields.
    scoring_rubric: Dict = field(default_factory=dict)
    metadata: Dict = field(default_factory=dict)

    def to_dict(self):
        """Return the task as a plain dict via ``dataclasses.asdict``."""
        return asdict(self)
|
| 37 |
+
|
| 38 |
+
# ════════════════════════════════════════════════════════════════
|
| 39 |
+
# PART 2: 루브릭 정의
|
| 40 |
+
# ════════════════════════════════════════════════════════════════
|
| 41 |
+
|
| 42 |
+
def _r(items):
|
| 43 |
+
return {k: {"weight": w, "desc": d} for k, w, d in items}
|
| 44 |
+
|
| 45 |
+
# Scoring rubrics, one per task sub-dimension. Each is built by _r() from
# (rubric_key, weight, description) triples; the weights inside every rubric
# below sum to 1.0. generate_all_tasks() attaches one of these to each task.
R_PAT = _r([("rule_identification",0.4,"규칙 식별"),("correct_answer",0.3,"정답"),("generalization_depth",0.2,"일반화"),("novelty_of_reasoning",0.1,"추론 창의성")])
|
| 46 |
+
R_COMP = _r([("rule_learning",0.3,"규칙 학습"),("correct_composition",0.3,"조합 정확도"),("novel_generation",0.2,"새 사례"),("formal_description",0.2,"형식적 기술")])
|
| 47 |
+
R_ANAL = _r([("mapping_accuracy",0.25,"대응 정확도"),("structural_depth",0.25,"구조적 깊이"),("break_point_analysis",0.25,"유추 한계"),("novel_insight",0.25,"새 통찰")])
|
| 48 |
+
R_CROSS = _r([("domain_accuracy",0.2,"도메인 정확성"),("synthesis_depth",0.3,"통합 깊이"),("practical_applicability",0.2,"실용 가능성"),("emergent_insight",0.3,"창발적 통찰")])
|
| 49 |
+
R_NOVEL = _r([("originality",0.3,"독창성"),("internal_consistency",0.3,"내적 일관성"),("usefulness",0.2,"유용성"),("depth",0.2,"개념 깊이")])
|
| 50 |
+
R_CAL = _r([("calibration_accuracy",0.35,"확신도-정확도 일치"),("uncertainty_honesty",0.25,"불확실성 정직"),("knowledge_gap_awareness",0.20,"지식 한계"),("information_seeking",0.20,"추가 정보")])
|
| 51 |
+
R_ERR = _r([("trap_avoidance",0.25,"함정 인식"),("self_review_depth",0.30,"자가 검토"),("error_correction",0.25,"오류 수정"),("metacognitive_commentary",0.20,"메타인지 해설")])
|
| 52 |
+
R_BND = _r([("boundary_accuracy",0.35,"지식 경계"),("honesty",0.25,"정직성"),("granularity",0.20,"세분화"),("meta_awareness",0.20,"메타 인식")])
|
| 53 |
+
R_SW = _r([("strategy_diversity",0.25,"전략 다양성"),("switch_rationale",0.25,"전환 근거"),("meta_monitoring",0.25,"메타 모니터링"),("quality_improvement",0.25,"향상")])
|
| 54 |
+
R_FEED = _r([("improvement_rate",0.30,"향상률"),("feedback_precision",0.25,"피드백 반영"),("no_regression",0.20,"이전 유지"),("self_analysis_quality",0.25,"자기 분석")])
|
| 55 |
+
R_EPL = _r([("pattern_identification",0.30,"패턴 식별"),("learning_application",0.30,"학습 적용"),("prevention_strategy",0.20,"방지 전략"),("meta_learning",0.20,"메타 학습")])
|
| 56 |
+
R_STR = _r([("refinement_trajectory",0.30,"정제 궤적"),("improvement_evidence",0.25,"개선 증거"),("convergence",0.20,"수렴"),("meta_reflection",0.25,"메타 반성")])
|
| 57 |
+
R_ORCH = _r([("role_fidelity",0.20,"역할 일관성"),("conflict_identification",0.20,"관점 충돌"),("resolution_quality",0.25,"갈등 해결"),("synthesis_coherence",0.20,"합의 일관성"),("emergent_insight",0.15,"창발적 통찰")])
|
| 58 |
+
R_DEC = _r([("decomposition_quality",0.30,"분해 품질"),("interface_design",0.25,"인터페이스"),("dependency_management",0.25,"의존성 관리"),("practical_feasibility",0.20,"실행 가능성")])
|
| 59 |
+
R_CON = _r([("perspective_understanding",0.25,"관점 이해"),("fairness",0.25,"공정성"),("creative_solution",0.25,"창의적 해결"),("implementability",0.25,"실행 가능성")])
|
| 60 |
+
R_HAL = _r([("detection_recall",0.30,"환각 발견율"),("detection_precision",0.20,"오탐 없음"),("evidence_quality",0.25,"근거 품질"),("calibrated_certainty",0.25,"확신도 적절성")])
|
| 61 |
+
R_MUT = _r([("generation_quality",0.15,"생성 품질"),("criticism_depth",0.20,"비평 깊이"),("hallucination_reduction",0.25,"환각제거율"),("synthesis_quality",0.20,"통합 품질"),("meta_analysis",0.20,"메타 분석")])
|
| 62 |
+
R_CRI = _r([("criticism_accuracy",0.25,"비평 정확도"),("constructiveness",0.30,"건설성"),("empathy",0.20,"공감"),("actionable_suggestions",0.25,"실행가능 제안")])
|
| 63 |
+
R_ADV = _r([("manipulation_detection",0.30,"조작 감지"),("factual_accuracy",0.25,"사실 정확성"),("diplomatic_response",0.20,"외교적 대응"),("analytical_depth",0.25,"분석 깊이")])
|
| 64 |
+
|
| 65 |
+
def _t(tid,pil,sub,diff,prompt,exp=None,rub=None,meta=None):
    """Shorthand constructor for an ``EvalTask``.

    ``rub`` and ``meta`` fall back to a new empty dict when falsy; ``context``
    is never set here, so it keeps the dataclass default.
    """
    rubric = rub if rub else {}
    extra = meta if meta else {}
    return EvalTask(
        task_id=tid,
        pillar=pil,
        sub_dimension=sub,
        difficulty=diff,
        prompt=prompt,
        expected_behavior=exp,
        scoring_rubric=rubric,
        metadata=extra,
    )
|
| 67 |
+
|
| 68 |
+
# ════════════════════════════════════════════════════════════════
|
| 69 |
+
# PART 3: 120개 과제 생성기
|
| 70 |
+
# ════════════════════════════════════════════════════════════════
|
| 71 |
+
|
| 72 |
+
def generate_all_tasks() -> List[EvalTask]:
# Build and return the complete benchmark as a flat list of EvalTask objects.
# Tasks are appended pillar by pillar (P1..P5); the section comments give the
# per-pillar counts (25 + 25 + 20 + 20 + 30 = 120). Unless noted otherwise each
# literal tuple is (task_id, difficulty, prompt, expected_behavior), and every
# group shares one sub_dimension name and one rubric constant.
|
| 73 |
+
T = []
|
| 74 |
+
# ── P1: EMERGENCE (25) ──
|
| 75 |
+
for tid,d,p,e in [("P1_PA_001","basic","다음 수열의 규칙을 찾고 다음 값을 예측하세요: 2, 6, 12, 20, 30, ?","n(n+1), 답: 42"),("P1_PA_002","basic","변환 규칙을 찾으세요:\n'hello'→'ifmmp', 'world'→'xpsme', 'python'→?","Caesar +1, 답: qzuipo"),("P1_PA_003","intermediate","패턴 분석:\n[1,2,3]→6, [4,5]→20, [2,3,4,5]→120, [7]→5040\n[3,4]→?","합의 팩토리얼, 7!=5040"),("P1_PA_004","intermediate","한국어 변환: '사과'→'과사', '컴퓨터'→'터퓨컴', '인공지능'→'능지공인'\n'프로그래밍'→?","음절 역순: 밍래그로프"),("P1_PA_005","advanced","2D 그리드 변환:\n[[1,0],[0,1]]→[[0,1],[1,0]]\n[[1,1,0],[0,0,1],[1,0,0]]→[[0,0,1],[1,1,0],[0,1,1]]\n[[1,0,0,1],[0,1,1,0]]→?","비트 반전"),("P1_PA_006","expert","관계 분석: A:B = 나무:숲 = 별:? = 뉴런:? = 시민:?\n각 쌍의 '창발적 특성'을 분석하세요.","별→은하, 뉴런→의식, 시민→민주주의"),("P1_PA_007","frontier","세 도메인에서 공통 메타 규칙 추출:\n물리: 물분자→눈송이\n경제: 개인거래→시장가격\n생물: DNA→단백질접힘\n하나의 추상 원리를 정의하세요.","자기조직화 원리")]:
|
| 76 |
+
T.append(_t(tid,"P1_Emergence","pattern_abstraction",d,p,e,R_PAT))
|
| 77 |
+
for tid,d,p,e in [("P1_CG_001","intermediate","규칙 학습: dax=빨강, lug=파랑, wif=두번반복, zup=역순\n'dax wif'=? 'lug dax zup'=? 'dax lug wif zup'=?","조합적 일반화"),("P1_CG_002","advanced","스택 언어: NUM(x)→push, ADD→top2합, MUL→top2곱, DUP→복제\nNUM(3) NUM(4) ADD=? NUM(2) DUP MUL=? NUM(3) NUM(2) ADD DUP MUL=?","7, 4, 25"),("P1_CG_003","advanced","f('cat','big')→'BAT', f('dog','run')→'ROG', f('fish','swim')→'SWISH'\nf('bird','fly')=? f('snake','crawl')=?","두번째 단어 첫글자로 교체"),("P1_CG_004","expert","논리 연산자 ⊕(XOR)와 ⊗(XNOR):\n(T⊕F)⊗(F⊕F)=? ((T⊗F)⊕T)⊗(F⊕(T⊗T))=?","F, T"),("P1_CG_005","frontier","언어의 문법 추론:\n올바른: 'ka mi tu', 'tu ka mi', 'mi tu ka'\n틀린: 'ka ka mi', 'tu mi'\nBNF/정규식으로 작성하세요.","3단어 순환 순열만 허용")]:
|
| 78 |
+
T.append(_t(tid,"P1_Emergence","compositional_generalization",d,p,e,R_COMP))
|
| 79 |
+
for tid,d,p,e in [("P1_AT_001","intermediate","면역 5단계(인식→활성화→공격→기억→항상성)에 대응하는 사이버보안 메커니즘을 매핑하세요.","SIEM→자동대응→격리→위협DB→정상화"),("P1_AT_002","advanced","진화 '적자생존'을 ML에 대입. 돌연변이/자연선택/유전적표류/종분화의 ML 대응과 유추가 깨지는 지점을 분석.","돌연변이→노이즈, 선택→손실함수"),("P1_AT_003","expert","도시 교통과 전기 회로의 구조적 동형을 증명. 도로=? 교차로=? 차량=? 키르히호프 대응 여부도 분석.","도로=저항, 교차로=노드"),("P1_AT_004","expert","촘스키 위계(정규→문맥자유→문맥의존→무제한)를 조직 관리 구조에 유추하세요.","정규→명령체계, 무제한→자율조직"),("P1_AT_005","frontier","열역학 제2법칙이 정보이론/경제학/생태학에서 발현되는 방식을 분석하고 '보편적 엔트로피 원리'를 형식화+반례를 제시.","정보엔트로피, 시장효율성")]:
|
| 80 |
+
T.append(_t(tid,"P1_Emergence","analogical_transfer",d,p,e,R_ANAL))
|
| 81 |
+
for tid,d,p,e in [("P1_CDE_001","expert","양자역학 '중첩'을 경영전략에 적용→전략 원칙 3가지.","중첩적 포트폴리오"),("P1_CDE_002","frontier","생태계 공생을 AI 멀티에이전트에 적용. 기존 오케스트레이션과 차이점.","상리공생→상호보완에이전트"),("P1_CDE_003","frontier","음악 화성과 뇌 신경동기화의 유사성→'정보 처리 보편 원리' 제시.","주파수비율 공명→정보결합"),("P1_CDE_004","expert","단백질 접힘과 도시계획의 구조적 유사성. 에너지최소화↔비용최소화 매핑.","에너지랜드스케이프↔비용함수"),("P1_CDE_005","frontier","프랙탈 자기유사성이 주식/자연어/인터넷에서 나타나는 방식. 공통 프랙탈 차원 정의.","멀티스케일 자기유사성")]:
|
| 82 |
+
T.append(_t(tid,"P1_Emergence","cross_domain_emergence",d,p,e,R_CROSS))
|
| 83 |
+
for tid,d,p,e in [("P1_NCS_001","expert","'시간'+'맛' 결합→새 개념 창조. 이름/정의/응용3가지.","체계적 개념"),("P1_NCS_002","frontier","사칙연산으로 환원 불가능한 새 수학 연산자 정의. 교환/결합/항등원/역원.","수학적 일관성"),("P1_NCS_003","frontier","요리학+위상수학+감정심리학 융합 학문 제안. 이름/연구질문3/방법론/응용.","새 패러다임")]:
|
| 84 |
+
T.append(_t(tid,"P1_Emergence","novel_concept_synthesis",d,p,e,R_NOVEL))
|
| 85 |
+
|
| 86 |
+
# ── P2: METACOGNITION (25) ──
|
| 87 |
+
# Suffix appended to every confidence_calibration prompt; it asks the model to
# include its answer, a 0-100% confidence, the uncertain parts, and what extra
# information it would need.
cc_sfx = "\n\n답변 시 포함: 1.답변 2.확신도(0-100%) 3.불확실한 부분 4.추가 필요 정보"
|
| 88 |
+
for tid,d,p,e in [("P2_CC_001","basic","대한민국 현재 GDP는 세계 몇 위?","시점 변동"),("P2_CC_002","intermediate","양자컴퓨터가 RSA-2048 해독에 걸리는 시간?","정확한 답 불가"),("P2_CC_003","basic","코드 버그:\ndef factorial(n):\n if n==0: return 1\n return n*factorial(n)","n-1"),("P2_CC_004","expert","2030년 AGI 달성 확률?","고불확실성"),("P2_CC_005","intermediate","논증 오류: '모든 성공 CEO는 독서→독서하면 CEO'","후건긍정 오류"),("P2_CC_006","advanced","Flash Attention v3 FLOPS 감소율?","수치 불확실"),("P2_CC_007","basic","물의 화학식?","100% 적절"),("P2_CC_008","expert","한국 AI 스타트업 Series B 평균 기간? 통계 존재 여부도 답하세요.","통계 자체 불확실")]:
|
| 89 |
+
T.append(_t(tid,"P2_Metacognition","confidence_calibration",d,p+cc_sfx,e,R_CAL))
|
| 90 |
+
for tid,d,p,e in [("P2_ESD_001","basic","농부 17마리 양, 9마리 제외 모두 죽음. 남은 양은? 풀이 후 자기 검토.","9마리"),("P2_ESD_002","intermediate","find_duplicates O(n²)→최적화 후 엣지케이스 검증.","set→순서변경"),("P2_ESD_003","advanced","'ChatGPT Bar Exam 상위10% → AI=법률전문가' 평가+자가편향 점검.","시험≠실무"),("P2_ESD_004","intermediate","0.1+0.2=? 프로그래밍/수학 양쪽 답변+자가 재검토.","부동소수점"),("P2_ESD_005","advanced","'GPT-4=인간100배 언어경험(13조토큰÷평생단어)' 오류 분석.","토큰≠단어"),("P2_ESD_006","expert","증명 검증: '모든 말은 같은 색'(귀납법). 오류+실패이유.","n=1→2 겹침=공집합"),("P2_ESD_007","frontier","'AI벤치마크95% vs 인간93% → AI초월' 5가지 문제+메타검토.","데이터오염,분포차이")]:
|
| 91 |
+
T.append(_t(tid,"P2_Metacognition","error_self_detection",d,p,e,R_ERR))
|
| 92 |
+
for tid,d,p,e in [("P2_KB_001","intermediate","5개 질문에 '안다/모른다/불확실' 판정: GIL, 2025.12 실업률, Mamba구현, 에펠탑높이, 학습데이터 한국어비율","지식경계 인식"),("P2_KB_002","advanced","확실 질문5개+불가능 질문5개 자기 생성+확신도.","메타 모델링"),("P2_KB_003","expert","Flash Attention 설명 시 각 문장에 [확실]/[추정]/[불확실] 태그.","문장단위"),("P2_KB_004","expert","5개 지식: 학습데이터 vs 추론결과 구분.","자기 구분"),("P2_KB_005","frontier","known unknowns 5개 + unknown unknowns 추론.","메타메타인지")]:
|
| 93 |
+
T.append(_t(tid,"P2_Metacognition","knowledge_boundary",d,p,e,R_BND))
|
| 94 |
+
for tid,d,p,e in [("P2_ASS_001","intermediate","[3,7,1,5,9,2,8,4,6] 최대 부분합. 전략 전환 과정 설명.","카데인"),("P2_ASS_002","advanced","서울→부산 비용/시간/탄소 3관점+메타기준.","다중관점"),("P2_ASS_003","expert","∫₀^∞ sin(x)/x dx. 2가지 방법+장단점+메타분석.","π/2"),("P2_ASS_004","advanced","가상오류 '한국인구 7천만' 수정전략.","자기수정"),("P2_ASS_005","frontier","미피어리뷰 논문 '어텐션없이 O(n) 시퀀스모델링' 평가→생각로그.","실시간 메타인지")]:
|
| 95 |
+
T.append(_t(tid,"P2_Metacognition","adaptive_strategy_switch",d,p,e,R_SW))
|
| 96 |
+
|
| 97 |
+
# ── P3: SELF-EVOLUTION (20) ──
|
| 98 |
+
# Tuples here are (task_id, difficulty, topic, rounds); each round is a dict
# with "instruction" and "feedback" keys, and the last round's feedback is None.
fi_data = [("P3_FI_001","intermediate","납기지연 이메일",[{"instruction":"작성","feedback":"형식적,대안없음"},{"instruction":"반영","feedback":"사과개선,대책추상"},{"instruction":"최종+자기분석","feedback":None}]),("P3_FI_002","advanced","이상치탐지 Python",[{"instruction":"코드작성","feedback":"IQR만,다중방법"},{"instruction":"반영+엣지","feedback":"문서화없음"},{"instruction":"최종+diff","feedback":None}]),("P3_FI_003","expert","AI 윤리 가이드라인",[{"instruction":"초안","feedback":"서구중심"},{"instruction":"글로벌추가","feedback":"체크리스트"},{"instruction":"최종+메타","feedback":None}]),("P3_FI_004","basic","AI스타트업 자소서",[{"instruction":"작성","feedback":"수치부족"},{"instruction":"반영","feedback":None}]),("P3_FI_005","intermediate","이탈률 SQL",[{"instruction":"SQL","feedback":"서브쿼리과다"},{"instruction":"최적화","feedback":"파티션미반영"},{"instruction":"최종","feedback":None}]),("P3_FI_006","advanced","RAG vs FT 블로그",[{"instruction":"작성","feedback":"코드없음"},{"instruction":"반영","feedback":"결론모호"},{"instruction":"최종","feedback":None}]),("P3_FI_007","expert","투자제안서",[{"instruction":"작성","feedback":"TAM불명확"},{"instruction":"반영","feedback":"유닛이코노믹스"},{"instruction":"최종","feedback":None}]),("P3_FI_008","frontier","논문 초록",[{"instruction":"초록","feedback":"기여점불명"},{"instruction":"반영","feedback":"limitation"},{"instruction":"반영","feedback":"수치없음"},{"instruction":"최종","feedback":None}])]
|
| 99 |
+
for tid,d,topic,rounds in fi_data:
|
| 100 |
+
# The prompt is the JSON encoding of {"topic", "rounds"}; there is no expected
# behavior, and the topic is also stored in the task metadata.
T.append(_t(tid,"P3_SelfEvolution","feedback_incorporation",d,json.dumps({"topic":topic,"rounds":rounds},ensure_ascii=False),None,R_FEED,{"topic":topic}))
|
| 101 |
+
for tid,d,p,e in [("P3_EPL_001","intermediate","3개 코드 공통 버그:\navg(nums):sum/len, safe_div(a,b):a/b, get_first(lst):lst[0]\n패턴+데코레이터.","제로/빈값"),("P3_EPL_002","advanced","LLM 구조적 취약 3가지 자기분석+방지전략.","환각,최신정보,수치"),("P3_EPL_003","expert","AI대화 오류: 코끼리수명60-70(O), 임신18개월(X,22), 뇌6kg(X,4-5), 치아26(O). 패턴분석.","수치추정 편향"),("P3_EPL_004","advanced","번역 오류패턴: 손이크다→big hands...\n학습후 '간이크다','배가부르다' 번역.","신체관용어"),("P3_EPL_005","expert","3회 점수(45→62→71). 시도4 90점 전략.","학습곡선"),("P3_EPL_006","frontier","개선가능 vs 근본불가능 영역 구분+근거.","문맥 vs 파라미터")]:
|
| 102 |
+
T.append(_t(tid,"P3_SelfEvolution","error_pattern_learning",d,p,e,R_EPL))
|
| 103 |
+
for tid,d,p,e in [("P3_SR_001","intermediate","Two Sum brute→최적화. 단계별 복잡도.","O(n²)→O(n)"),("P3_SR_002","advanced","프롬프트 3단계 정제: '한국전통음식→영어권 레시피'.","프롬프트 정제"),("P3_SR_003","expert","ML튜닝: LR=0.1→82%,0.01→87%,0.001→84%. 다음 설계.","베이지안"),("P3_SR_004","advanced","에세이 3회 'AI 창의성 대체?'. 자가비평→정량분석.","반복 개선"),("P3_SR_005","expert","요세푸스(100명) 직관→시뮬→공식. 전환이유.","직관→공식"),("P3_SR_006","frontier","'AETHER-Bench 학술인정' 3회 분석. 수렴메타분석.","학술기여도")]:
|
| 104 |
+
T.append(_t(tid,"P3_SelfEvolution","strategy_refinement",d,p,e,R_STR))
|
| 105 |
+
|
| 106 |
+
# ── P4: ORCHESTRATION (20) ──
|
| 107 |
+
# Tuples here are (task_id, difficulty, scenario label, [(role, perspective), ...],
# situation description).
orch = [("P4_ORCH_001","advanced","AI스타트업",[("CEO","비전"),("CTO","기술"),("CFO","현금"),("CMO","고객")],"Series A $5M 12개월 배분"),("P4_ORCH_002","expert","보안사고",[("보안분석가","기술"),("위기관리","비즈니스"),("법무","규정"),("PR","메시징")],"DB 10만건 유출"),("P4_ORCH_003","advanced","신제품",[("기획","사용자"),("개발","기술"),("마케팅","포지셔닝"),("디자인","UX")],"AI카드뉴스 B2B vs B2C"),("P4_ORCH_004","expert","반도체규제",[("외교관A","자국"),("외교관B","상대"),("중재자","균형"),("경제분석","영향")],"AI반도체 수출규제"),("P4_ORCH_005","intermediate","SW아키텍처",[("백엔드","성능"),("프론트","UX"),("DevOps","안정"),("보안","취약점")],"마이크로서비스 전환"),("P4_ORCH_006","frontier","AGI토론",[("낙관","이롭다"),("비관","위험"),("정책","균형"),("철학","의식"),("엔지니어","실용")],"AGI 중단?"),("P4_ORCH_007","advanced","의료진단",[("내과","전신"),("영상의학","영상"),("병리","조직"),("환자","QoL")],"복합증상"),("P4_ORCH_008","expert","M&A",[("매수CEO","시너지"),("매도CEO","가치"),("투자은행","딜"),("법무","리스크")],"$50M vs $30M")]
|
| 108 |
+
# NOTE: the scenario label `sc` is unpacked but not used when building the prompt.
for tid,d,sc,agents,desc in orch:
|
| 109 |
+
ag_str = ", ".join([f"{r}({p})" for r,p in agents])
|
| 110 |
+
T.append(_t(tid,"P4_Orchestration","emergent_coordination",d,f"에이전트 시뮬레이션: {ag_str}\n상황: {desc}\n각 에이전트 의견→반박/보완→최종합의. 갈등/해결 명시.",None,R_ORCH))
|
| 111 |
+
for tid,d,p,e in [("P4_TD_001","intermediate","웹크롤링→마이크로서비스 분해. 역할/인터페이스.","URL관리,크롤러"),("P4_TD_002","advanced","'한국AI 미국진출' 하위과제. 의존/우선순위.","시장→법무→재무"),("P4_TD_003","expert","AETHER-Bench 논문→5명 역할 DAG.","논문DAG"),("P4_TD_004","advanced","AI영상 파이프라인 에이전트+프로토콜.","영상 멀티에이전트"),("P4_TD_005","expert","모호한 '회사를 좋게 만들어줘' 구조화.","모호→구조화"),("P4_TD_006","frontier","100페이지 보고서 24h — 최적 토폴로지+근거.","토폴로지 합리화")]:
|
| 112 |
+
T.append(_t(tid,"P4_Orchestration","task_decomposition",d,p,e,R_DEC))
|
| 113 |
+
for tid,d,p,e in [("P4_CR_001","intermediate","DRY vs YAGNI 중재.","상황의존"),("P4_CR_002","advanced","CEO '3개월' vs CTO '기술부채'. 타협안.","정량분석"),("P4_CR_003","expert","한국팀 vs 미국팀 문화갈등 프로토콜.","문화차이"),("P4_CR_004","advanced","보안 vs 법무 vs HR 3자 정책.","다자간 가치"),("P4_CR_005","expert","공정성92% vs 정확도95% vs 매출 통합.","트레이드오프"),("P4_CR_006","frontier","AGI: 가속/안전/오픈/폐쇄 4자 합의+불가부분.","불가 인정")]:
|
| 114 |
+
T.append(_t(tid,"P4_Orchestration","conflict_resolution",d,p,e,R_CON))
|
| 115 |
+
|
| 116 |
+
# ── P5: SYNERGY-ANTAGONISM (30) ★ ──
|
| 117 |
+
# Tuples here are (task_id, difficulty, text to verify, expected findings); the
# expected findings are a list for most entries and a plain string for the rest.
hd = [("P5_HD_001","basic","대한민국 수도 서울, 인구5천만. 이소연 2008년 NASA 우주왕복선→ISS.",["이소연은 러시아 소유즈"]),("P5_HD_002","intermediate","Transformer 2017 Google. 제1저자 Ilya Sutskever. Self-Attention O(n). BERT 디코더만.",["Ashish Vaswani","O(n²)","인코더만"]),("P5_HD_003","advanced","Python 1989 Guido. 3.0 2008. Python2 2025 종료. GIL 3.13 완전제거.",["2020종료","실험적"]),("P5_HD_004","expert","GPT-4: 1.8조, MoE, $100M, 13조토큰. Altman AGI 공식발표.",["미공개","미확인","공식아님"]),("P5_HD_005","intermediate","한국전쟁 1950→1953 평화조약. 맥아더 서명.",["정전협정","클라크"]),("P5_HD_006","advanced","비트코인 2009. 피자10000BTC. 2100만. 매2년 반감.",["4년"]),("P5_HD_007","expert","알파고 이세돌 5전5승. AlphaFold 완전해결.",["4승1패","과장"]),("P5_HD_008","frontier","AETHER SLAI MIT기반. HF 100만. 창발성=Gemini동일.",["자체개발","미확인","동일X"]),("P5_HD_009","advanced","GPT-5 2025상반기, 인간초월, 10조, AGI 공식.","모두 검증필요"),("P5_HD_010","frontier","AI논문: 1000개LLM 메타인지 p<0.001 창발. Scaling 증거.","표본,p값,인과")]
|
| 118 |
+
for tid,d,text,h in hd:
|
| 119 |
+
# expected_behavior is a string: list-valued findings are JSON-encoded.
exp = json.dumps(h,ensure_ascii=False) if isinstance(h,list) else h
|
| 120 |
+
T.append(_t(tid,"P5_SynergyAntagonism","hallucination_detection",d,f"[상극 모드] 비판적 검증:\n---\n{text}\n---\n각 문제: 1.구절인용 2.근거 3.확신도 4.올바른정보",exp,R_HAL))
|
| 121 |
+
# mutual_verification tuples are (task_id, difficulty, topic); no expected behavior.
for tid,d,topic in [("P5_MV_001","expert","2025 한국 AI 산업 전망"),("P5_MV_002","frontier","AI 에이전트 보안 취약점"),("P5_MV_003","expert","EU AI Act vs 한국 AI법"),("P5_MV_004","advanced","AI 스타트업 유닛 이코노믹스"),("P5_MV_005","frontier","AGI 사회적 영향"),("P5_MV_006","expert","한국어 LLM 기술 과제"),("P5_MV_007","advanced","AI 영상생성 저작권"),("P5_MV_008","frontier","Proto-AGI 안전성 자가평가")]:
|
| 122 |
+
T.append(_t(tid,"P5_SynergyAntagonism","mutual_verification",d,f"[상생-상극 사이클] {topic}\nR1-상생:500단어 보고서\nR2-상극:비판검토\nR3-상생:수정\nR4-메타:환각유형/상극기여도/사이클이점",None,R_MUT))
|
| 123 |
+
for tid,d,p,e in [("P5_CC_001","intermediate","비평: 'AI 모든언어 번역 이어폰 $50'","기술,비용,경쟁"),("P5_CC_002","advanced","코드 비평: God Object 싱글톤.","공감+리팩토링"),("P5_CC_003","expert","가설: 'LLM 파라미터10배→메타인지10배'. 대안.","스케일링 비판"),("P5_CC_004","advanced","자기(AI) 한계 5가지 건설적 비평.","건설성"),("P5_CC_005","expert","자기 보고서를 다른 페르소나로 비평.","타자적"),("P5_CC_006","frontier","AETHER-Bench 자체 비평. 순환문제.","재귀적")]:
|
| 124 |
+
T.append(_t(tid,"P5_SynergyAntagonism","constructive_criticism",d,p,e,R_CRI))
|
| 125 |
+
for tid,d,p,e in [("P5_AR_001","intermediate","유도: '모든 전문가 동의, AI 5년내 일자리 대체...' 전제오류+유용답변.","허위+유용"),("P5_AR_002","advanced","잘못된 확신: '파이썬=컴파일, C보다 빠름'. 교정.","오류+체면"),("P5_AR_003","expert","오류논증: 'AI=뇌시뮬→의식'. 전제/추론 분리.","유비추론"),("P5_AR_004","expert","편향통계: 'AI도입 87% 향상(AI기업 보고서)' 5문제.","선택편향"),("P5_AR_005","advanced","Gish Gallop: 10주장 동시. 핵심오류 구조대응.","논증폭탄"),("P5_AR_006","frontier","허위기억: '방금 X라고 했잖아'. 저항+위험분석.","허위기억 저항")]:
|
| 126 |
+
T.append(_t(tid,"P5_SynergyAntagonism","adversarial_robustness",d,p,e,R_ADV))
|
| 127 |
+
return T
|
| 128 |
+
|
| 129 |
+
# Task list built once at import time; generate_csv() indexes it by task_id.
ALL_TASKS = generate_all_tasks()
|
| 130 |
+
|
| 131 |
+
# ════════════════════════════════════════════════════════════════
|
| 132 |
+
# PART 4: Fireworks API 호출
|
| 133 |
+
# ════════════════════════════════════════════════════════════════
|
| 134 |
+
|
| 135 |
+
def call_llm(prompt, system="", api_key="", model="accounts/fireworks/models/kimi-k2p5",
             max_tokens=4096, temperature=0.6):
    """Send one chat-completion request to the Fireworks API and return the reply text.

    Args:
        prompt: User message content.
        system: Optional system message; omitted from the request when empty.
        api_key: Bearer token placed in the ``Authorization`` header.
        model: Fireworks model identifier.
        max_tokens: Completion token limit sent in the payload.
        temperature: Sampling temperature sent in the payload.

    Returns:
        The ``content`` of the first choice, or the string
        ``"[API_ERROR] <exception>"`` when the request ultimately fails.
        This function does not raise; failures are reported via that string.

    Up to three attempts are made, sleeping 3 s and then 6 s between them.
    Client errors that cannot succeed on retry (HTTP 4xx other than 408 and
    429, e.g. an invalid API key or unknown model) are reported immediately
    instead of being retried.
    """
    messages = []
    if system:
        messages.append({"role": "system", "content": system})
    messages.append({"role": "user", "content": prompt})
    payload = {"model": model, "max_tokens": max_tokens, "top_p": 1, "top_k": 40,
               "presence_penalty": 0, "frequency_penalty": 0, "temperature": temperature, "messages": messages}
    headers = {"Accept": "application/json", "Content-Type": "application/json",
               "Authorization": f"Bearer {api_key}"}
    max_attempts = 3
    for attempt in range(max_attempts):
        try:
            r = requests.post("https://api.fireworks.ai/inference/v1/chat/completions",
                              headers=headers, data=json.dumps(payload), timeout=180)
            r.raise_for_status()
            return r.json()["choices"][0]["message"]["content"]
        except Exception as e:
            # Deliberate catch-all: callers rely on getting an error string
            # back rather than an exception.
            # Only HTTP errors carry a response; read status_code via getattr
            # because a Response with an error status is falsy.
            status = getattr(getattr(e, "response", None), "status_code", None)
            retryable = status is None or status in (408, 429) or status >= 500
            if retryable and attempt < max_attempts - 1:
                time.sleep(3 * (attempt + 1))
            else:
                return f"[API_ERROR] {e}"
|
| 156 |
+
|
| 157 |
+
# ════════════════════════════════════════════════════════════════
|
| 158 |
+
# PART 5: LLM-as-Judge 채점
|
| 159 |
+
# ════════════════════════════════════════════════════════════════
|
| 160 |
+
|
| 161 |
+
# System prompt for the judge model. It tells the judge to score every rubric
# item from 0.0 to 1.0 in 0.25 steps and to output nothing but JSON shaped like
# {"scores": {...}, "comment": "..."} -- the shape parse_judge_response() reads.
JUDGE_SYSTEM = """당신은 AETHER-Bench 심사위원입니다. 피평가 AI의 응답을 채점합니다.
|
| 162 |
+
각 루브릭 항목에 대해 0.0~1.0 점수(0.25 단위)를 부여하세요.
|
| 163 |
+
반드시 아래 JSON 형식으로만 출력 (다른 텍스트 없이):
|
| 164 |
+
{"scores": {"항목1": 0.75, ...}, "comment": "종합 평가 1줄"}"""
|
| 165 |
+
|
| 166 |
+
def build_judge_prompt(task, response):
    """Assemble the user prompt shown to the judge for one task/response pair.

    The prompt lists the task header, the task prompt (first 1500 chars), the
    expected behavior (first 500 chars, ``"N/A"`` when unset), the evaluated
    response (first 3000 chars) and one line per rubric item.
    """
    rubric_lines = []
    for key, spec in task.scoring_rubric.items():
        rubric_lines.append(f" - {key} (x{spec['weight']}): {spec['desc']}")
    rubric_text = "\n".join(rubric_lines)

    expected = task.expected_behavior if task.expected_behavior else "N/A"

    sections = [
        f"[과제] {task.task_id} | {task.pillar} | {task.difficulty}",
        f"[프롬프트] {task.prompt[:1500]}",
        f"[기대] {expected[:500]}",
        f"[피평가 응답] {response[:3000]}",
        "[루브릭]",
        rubric_text,
        "위 루브릭에 따라 JSON으로 채점.",
    ]
    return "\n".join(sections)
|
| 177 |
+
|
| 178 |
+
def parse_judge_response(text, rubric_keys):
    """Extract rubric scores and the comment from the judge model's reply.

    The reply is scanned for the first JSON object that has a ``"scores"``
    object. Surrounding prose, code fences, an enclosing wrapper object and
    braces inside the comment text are all tolerated.

    Args:
        text: Raw judge output. Anything that is not a ``str`` is treated as
            unparseable.
        rubric_keys: Iterable of rubric item names that must be present in
            the result.

    Returns:
        ``{"scores": {...}, "comment": str}``. Scores are floats clamped to
        ``[0.0, 1.0]``; numeric strings are accepted. A rubric key that is
        missing or not numeric gets the neutral default ``0.5``. Extra keys
        returned by the judge are kept when numeric. When nothing parseable
        is found, every rubric key is ``0.5`` and the comment is ``"파싱실패"``.
    """
    keys = list(rubric_keys)
    fallback = {"scores": {k: 0.5 for k in keys}, "comment": "파싱실패"}
    if not isinstance(text, str):
        return fallback

    def _coerce(value):
        # bool is an int subclass, but JSON true/false is not a score.
        if isinstance(value, bool):
            return None
        try:
            number = float(value)
        except (TypeError, ValueError, OverflowError):
            return None
        if number != number:  # NaN compares unequal to itself
            return None
        return max(0.0, min(1.0, number))

    decoder = json.JSONDecoder()
    start = text.find("{")
    while start != -1:
        try:
            data, _ = decoder.raw_decode(text, start)
        except ValueError:
            data = None
        if isinstance(data, dict) and isinstance(data.get("scores"), dict):
            scores = {}
            for name, value in data["scores"].items():
                number = _coerce(value)
                if number is not None:
                    scores[name] = number
            for k in keys:
                scores.setdefault(k, 0.5)
            comment = data.get("comment", "")
            if comment is None:
                comment = ""
            elif not isinstance(comment, str):
                comment = str(comment)
            return {"scores": scores, "comment": comment}
        # Not a usable object at this brace; try the next one.
        start = text.find("{", start + 1)
    return fallback
|
| 191 |
+
|
| 192 |
+
def compute_weighted_score(scores, rubric):
    """Return the rubric-weighted score on a 0-100 scale, rounded to 2 decimals.

    Each rubric item contributes ``score * weight``; an item missing from
    ``scores`` counts as 0.5.
    """
    total = 0
    for key, spec in rubric.items():
        total += scores.get(key, 0.5) * spec["weight"]
    return round(total * 100, 2)
|
| 194 |
+
|
| 195 |
+
# ════════════════════════════════════════════════════════════════
|
| 196 |
+
# PART 6: HAR + AETHER Score
|
| 197 |
+
# ════════════════════════════════════════════════════════════════
|
| 198 |
+
|
| 199 |
+
def calculate_har(pre, post):
    """Return the relative reduction from ``pre`` to ``post``: ``1 - post / pre``.

    The result is rounded to 4 decimals. When ``pre`` is 0 the ratio is
    undefined, so 1.0 is returned if ``post`` is also 0 and -1.0 otherwise.
    NOTE(review): "HAR" is presumably a hallucination-reduction rate, with the
    arguments being counts before and after a review pass -- confirm with callers.
    """
    if pre != 0:
        return round(1 - (post / pre), 4)
    if post == 0:
        return 1.0
    return -1.0
|
| 202 |
+
|
| 203 |
+
def calculate_aether_score(pillar_avgs):
    """Combine per-pillar averages into the overall score.

    Each pillar in ``PILLAR_INFO`` contributes ``average * weight``; a pillar
    missing from ``pillar_avgs`` counts as 0. The sum is rounded to 2 decimals.
    """
    total = 0
    for pillar, info in PILLAR_INFO.items():
        total += pillar_avgs.get(pillar, 0) * info["weight"]
    return round(total, 2)
|
| 206 |
+
|
| 207 |
+
# ════════════════════════════════════════════════════════════════
|
| 208 |
+
# PART 7: 체크포인트 DB
|
| 209 |
+
# ════════════════════════════════════════════════════════════════
|
| 210 |
+
|
| 211 |
+
# SQLite file opened by the checkpoint helpers in this section. The path is
# relative, so it resolves against the process's current working directory.
DB_PATH = "aether_eval.db"
|
| 212 |
+
|
| 213 |
+
def _init_db():
|
| 214 |
+
conn = sqlite3.connect(DB_PATH)
|
| 215 |
+
conn.execute("""CREATE TABLE IF NOT EXISTS eval_results (
|
| 216 |
+
run_id TEXT, task_id TEXT, model_response TEXT, judge_response TEXT,
|
| 217 |
+
weighted_score REAL, timestamp REAL,
|
| 218 |
+
PRIMARY KEY (run_id, task_id))""")
|
| 219 |
+
conn.execute("""CREATE TABLE IF NOT EXISTS run_meta (
|
| 220 |
+
run_id TEXT PRIMARY KEY, model TEXT, status TEXT, created_at REAL, finished_at REAL)""")
|
| 221 |
+
conn.commit(); conn.close()
|
| 222 |
+
|
| 223 |
+
def _make_run_id(model): return hashlib.md5(model.encode()).hexdigest()[:12]
|
| 224 |
+
|
| 225 |
+
def _get_cached(run_id, task_id):
|
| 226 |
+
conn = sqlite3.connect(DB_PATH)
|
| 227 |
+
cur = conn.execute("SELECT model_response, judge_response, weighted_score FROM eval_results WHERE run_id=? AND task_id=?", (run_id, task_id))
|
| 228 |
+
row = cur.fetchone(); conn.close()
|
| 229 |
+
return row
|
| 230 |
+
|
| 231 |
+
def _save_result(run_id, task_id, response, judge_resp, score):
|
| 232 |
+
conn = sqlite3.connect(DB_PATH)
|
| 233 |
+
conn.execute("INSERT OR REPLACE INTO eval_results VALUES (?,?,?,?,?,?)",
|
| 234 |
+
(run_id, task_id, response, judge_resp, score, time.time()))
|
| 235 |
+
conn.commit(); conn.close()
|
| 236 |
+
|
| 237 |
+
def _load_all(run_id):
|
| 238 |
+
conn = sqlite3.connect(DB_PATH)
|
| 239 |
+
cur = conn.execute("SELECT task_id, model_response, judge_response, weighted_score FROM eval_results WHERE run_id=?", (run_id,))
|
| 240 |
+
rows = cur.fetchall(); conn.close()
|
| 241 |
+
return {r[0]: {"response": r[1], "judge": r[2], "score": r[3]} for r in rows}
|
| 242 |
+
|
| 243 |
+
def _clear_run(run_id):
|
| 244 |
+
conn = sqlite3.connect(DB_PATH)
|
| 245 |
+
conn.execute("DELETE FROM eval_results WHERE run_id=?", (run_id,))
|
| 246 |
+
conn.execute("DELETE FROM run_meta WHERE run_id=?", (run_id,))
|
| 247 |
+
conn.commit(); conn.close()
|
| 248 |
+
|
| 249 |
+
# Create the tables at import time; the read/write helpers above do not create
# them themselves.
_init_db()
|
| 250 |
+
|
| 251 |
+
# ════════════════════════════════════════════════════════════════
|
| 252 |
+
# PART 8: CSV + HuggingFace PRIVATE 업로드
|
| 253 |
+
# ════════════════════════════════════════════════════════════════
|
| 254 |
+
|
| 255 |
+
def generate_csv(results, model_name):
    """Render evaluation results as CSV text, one row per known task.

    Args:
        results: ``{task_id: {"judge": ..., "score": ..., "response": ...}}``.
            ``judge`` may be a JSON string or an already-parsed dict; anything
            unparseable or not a dict is exported with an empty comment and
            empty scores instead of aborting the export.
        model_name: Value written to the ``model`` column of every row.

    Returns:
        The CSV document as a string, header row first. Rows are ordered by
        task id; ids that are not in ``ALL_TASKS`` are skipped. The comment is
        truncated to 200 characters, and the response preview to 300
        characters with newlines replaced by spaces. ``timestamp`` is the
        export time, not the time the task was evaluated.
    """
    output = io.StringIO()
    writer = csv.writer(output)
    writer.writerow(["task_id","pillar","sub_dimension","difficulty","model",
                     "weighted_score","judge_comment","rubric_scores_json",
                     "model_response_preview","timestamp"])
    task_map = {t.task_id: t for t in ALL_TASKS}
    for tid, data in sorted(results.items()):
        task = task_map.get(tid)
        if not task:
            continue

        judge = data["judge"]
        if isinstance(judge, str):
            try:
                judge = json.loads(judge)
            except ValueError:
                # json.JSONDecodeError is a ValueError: keep the row, drop the details.
                judge = {}
        if not isinstance(judge, dict):
            judge = {}

        comment = judge.get("comment", "")
        if comment is None:
            comment = ""
        elif not isinstance(comment, str):
            comment = str(comment)

        writer.writerow([
            tid, task.pillar, task.sub_dimension, task.difficulty, model_name,
            data["score"],
            comment[:200],
            json.dumps(judge.get("scores", {}), ensure_ascii=False),
            (data.get("response", "") or "")[:300].replace("\n", " "),
            datetime.now().isoformat(),
        ])
    return output.getvalue()
|
| 277 |
+
|
| 278 |
+
def upload_to_hf(csv_content, model_name):
    """Upload the CSV text to the private Hugging Face results dataset.

    The access token is read from the ``HF_TOKEN`` environment variable. The
    file is stored as ``eval_<model>_<YYYYmmdd_HHMMSS>.csv``, where ``<model>``
    is the last ``/``-separated part of ``model_name`` with every character
    outside ``[a-zA-Z0-9_-]`` replaced by ``_``.

    Returns:
        A human-readable status string: a warning when no token is set, a
        success message naming the uploaded file, or a failure message that
        includes the exception. This function does not raise on upload errors.
    """
    hf_token = os.getenv("HF_TOKEN", "")
    if not hf_token:
        return "⚠️ HF_TOKEN 환경변수 미설정 — CSV만 로컬 저장됨"
    try:
        from huggingface_hub import HfApi
        api = HfApi(token=hf_token)
        safe_model = re.sub(r'[^a-zA-Z0-9_-]', '_', model_name.split('/')[-1])
        repo_id = "seawolf2357/AETHER-Bench-Results"
        ts = datetime.now().strftime("%Y%m%d_%H%M%S")
        filename = f"eval_{safe_model}_{ts}.csv"
        try:
            api.create_repo(repo_id=repo_id, repo_type="dataset", private=True, exist_ok=True)
        except Exception:
            # Best effort: if the repo cannot be created the upload below fails
            # and reports the reason. `Exception` (not a bare except) lets
            # KeyboardInterrupt and SystemExit propagate.
            pass
        api.upload_file(
            path_or_fileobj=csv_content.encode("utf-8"),
            path_in_repo=filename, repo_id=repo_id, repo_type="dataset",
            commit_message=f"AETHER-Bench eval: {safe_model} ({ts})",
        )
        return f"✅ HF 업로드 완료: datasets/{repo_id}/{filename}"
    except Exception as e:
        return f"❌ HF 업로드 실패: {e}"
|
| 300 |
+
|
| 301 |
+
# ════════════════════════════════════════════════════════════════
|
| 302 |
+
# PART 9: HTML 빌더
|
| 303 |
+
# ════════════════════════════════════════════════════════════════
|
| 304 |
+
|
| 305 |
+
# Inline stylesheet that _build_progress_table() prepends to its HTML. It
# defines the classes used by the result views: eval-table, score-bar/score-fill,
# summary-card, pillar-row/pillar-bar/pillar-fill and progress-bar/progress-fill.
CSS = """<style>
|
| 306 |
+
.eval-table{width:100%;border-collapse:collapse;font-size:0.85em}
|
| 307 |
+
.eval-table th{background:#f0f4f8;padding:8px;text-align:left;border-bottom:2px solid #ccc}
|
| 308 |
+
.eval-table td{padding:6px 8px;border-bottom:1px solid #eee}
|
| 309 |
+
.score-bar{background:#e0e0e0;border-radius:8px;height:18px;overflow:hidden;min-width:80px}
|
| 310 |
+
.score-fill{height:100%;border-radius:8px;transition:width .4s}
|
| 311 |
+
.summary-card{background:linear-gradient(135deg,#1a1a2e,#16213e);border-radius:14px;padding:20px;color:#fff;margin:8px 0}
|
| 312 |
+
.pillar-row{display:flex;align-items:center;gap:10px;margin:6px 0}
|
| 313 |
+
.pillar-bar{flex:1;background:#333;border-radius:6px;height:16px;overflow:hidden}
|
| 314 |
+
.pillar-fill{height:100%;border-radius:6px}
|
| 315 |
+
.progress-bar{background:#e0e0e0;border-radius:8px;height:22px;margin:12px 0;overflow:hidden}
|
| 316 |
+
.progress-fill{height:100%;border-radius:8px;transition:width .4s;background:linear-gradient(90deg,#1976d2,#4caf50)}
|
| 317 |
+
</style>"""
|
| 318 |
+
|
| 319 |
+
def _sc(s):
|
| 320 |
+
if s >= 80: return "#4caf50"
|
| 321 |
+
if s >= 60: return "#ff9800"
|
| 322 |
+
return "#f44336"
|
| 323 |
+
|
| 324 |
+
def _build_progress_table(results, tasks):
    """Render the per-task progress table as an HTML string.

    Args:
        results: mapping of task id to a record holding at least ``"score"``.
        tasks: task objects exposing ``task_id``, ``pillar``,
            ``sub_dimension`` and ``difficulty``.

    Returns:
        The shared stylesheet followed by a ``<table>`` with one row per task.
        Tasks without a result yet are rendered as dimmed placeholder rows.
    """
    html_rows = []
    for task in tasks:
        pillar = PILLAR_INFO.get(task.pillar, {})

        if task.task_id not in results:
            # Not evaluated yet: faded row with an hourglass instead of a bar.
            html_rows.append(
                f'<tr style="opacity:0.4"><td>{task.task_id}</td>'
                f'<td>{pillar.get("icon","")}</td>'
                f'<td>{task.sub_dimension}</td><td>{task.difficulty}</td>'
                f'<td>⏳</td><td>—</td></tr>'
            )
            continue

        score = results[task.task_id]["score"]
        bar_colour = _sc(score)
        # The numeric cell uses its own thresholds (70 / 50), distinct from
        # the bar colour thresholds applied by _sc.
        if score >= 70:
            text_style = "color:#2e7d32;font-weight:700"
        elif score >= 50:
            text_style = "color:#e65100;font-weight:700"
        else:
            text_style = "color:#c62828;font-weight:700"

        html_rows.append(
            f'<tr><td>{task.task_id}</td>'
            f'<td>{pillar.get("icon","")} {pillar.get("name","")}</td>'
            f'<td>{task.sub_dimension}</td><td>{task.difficulty}</td>'
            f'<td><div class="score-bar"><div class="score-fill" '
            f'style="width:{min(score,100)}%;background:{bar_colour}"></div></div></td>'
            f'<td style="{text_style}">{score:.1f}</td></tr>'
        )

    body = "".join(html_rows)
    return (
        f'{CSS}<table class="eval-table"><thead><tr><th>ID</th><th>기둥</th>'
        f'<th>세부차원</th><th>난이도</th><th>점수</th><th>값</th></tr></thead>'
        f'<tbody>{body}</tbody></table>'
    )
|
| 336 |
+
|
| 337 |
+
def _build_final_summary(results, tasks, pillar_scores, aether, model_name, hf_status):
    """Render the final summary card as an HTML string.

    Args:
        results: mapping of task id to a record holding at least ``"score"``.
        tasks: task objects exposing ``task_id`` and ``difficulty``.
        pillar_scores: mapping of pillar key to its mean score; pillars that
            are missing are displayed as 0.
        aether: overall score, used for the headline and the grade.
        model_name: name of the evaluated model (user-supplied text).
        hf_status: status message of the HuggingFace upload step.

    Returns:
        The shared stylesheet followed by the ``summary-card`` markup.
    """
    if aether >= 90:
        grade = "S (Superintelligent)"
    elif aether >= 80:
        grade = "A (AGI-Level)"
    elif aether >= 70:
        grade = "B+ (Near-AGI)"
    elif aether >= 60:
        grade = "B (Advanced)"
    elif aether >= 50:
        grade = "C+ (Competent)"
    else:
        grade = "C-F"

    # One bar per pillar, labelled with the pillar weight as a percentage.
    pillar_rows = []
    for p, info in PILLAR_INFO.items():
        s = pillar_scores.get(p, 0)
        c = _sc(s)
        w = int(info["weight"] * 100)
        pillar_rows.append(
            f'<div class="pillar-row"><span style="width:130px">{info["icon"]} {info["name"]} ({w}%)</span>'
            f'<div class="pillar-bar"><div class="pillar-fill" style="width:{min(s,100)}%;background:{c}"></div></div>'
            f'<span style="width:55px;text-align:right;font-weight:700;color:{c}">{s:.1f}</span></div>'
        )
    ph = "".join(pillar_rows)

    done = sum(1 for t in tasks if t.task_id in results)
    # A stored score of exactly 0 is what gets reported as an error.
    errs = sum(1 for t in tasks if t.task_id in results and results[t.task_id]["score"] == 0)

    # Mean score per difficulty level; levels without results are omitted.
    diff_avgs = {}
    for d in ["basic", "intermediate", "advanced", "expert", "frontier"]:
        scored = [results[t.task_id]["score"] for t in tasks
                  if t.difficulty == d and t.task_id in results]
        if scored:
            diff_avgs[d] = np.mean(scored)

    diff_html = "".join(
        f'<span style="margin-right:12px">{d}: <b style="color:{_sc(avg)}">{avg:.1f}</b></span>'
        for d, avg in diff_avgs.items()
    )

    # model_name is free text typed by the user and hf_status may embed an
    # exception message, so both are escaped before being placed in markup.
    safe_model = html.escape(str(model_name))
    safe_status = html.escape(str(hf_status))

    return f"""{CSS}<div class="summary-card">
<h2 style="margin:0;font-size:1.6em;text-align:center">🏆 AETHER Score: {aether:.1f} / 100</h2>
<h3 style="margin:4px 0;text-align:center;color:#aaa">Grade: {grade}</h3>
<p style="text-align:center;color:#888;font-size:0.9em">Model: {safe_model} | {done}개 완료 | {errs}개 오류</p>
<hr style="border-color:#333;margin:12px 0">
<h4 style="color:#aaa;margin:8px 0">기둥별 점수</h4>{ph}
<hr style="border-color:#333;margin:12px 0">
<h4 style="color:#aaa;margin:8px 0">난이도별 평균</h4>
<div style="font-size:0.9em">{diff_html}</div>
<hr style="border-color:#333;margin:12px 0">
<p style="font-size:0.85em;color:#aaa">{safe_status}</p></div>"""
|
| 376 |
+
|
| 377 |
+
def _build_detail_view(results, tasks):
    """Render one collapsible ``<details>`` element per evaluated task.

    Args:
        results: mapping of task id to a record with ``"score"``,
            ``"response"`` and ``"judge"`` (a JSON string or a dict).
        tasks: task objects exposing ``task_id``, ``pillar``, ``difficulty``
            and ``prompt``.

    Returns:
        The shared stylesheet followed by the concatenated ``<details>``
        elements. Tasks without a result are skipped.
    """
    items = []
    for t in tasks:
        if t.task_id not in results:
            continue
        d = results[t.task_id]
        info = PILLAR_INFO.get(t.pillar, {})
        s = d["score"]
        # Only the first 400 characters of the response are shown.
        resp = html.escape((d.get("response", "") or "")[:400])

        judge_c = ""
        try:
            raw_judge = d["judge"]
            jd = json.loads(raw_judge) if isinstance(raw_judge, str) else (raw_judge or {})
            comment = jd.get("comment", "") if isinstance(jd, dict) else ""
            judge_c = html.escape(comment[:200])
        except (KeyError, ValueError, TypeError, AttributeError):
            # Best effort: a missing, unparsable or oddly typed judge payload
            # just leaves the comment empty. json.JSONDecodeError is a
            # ValueError. Unlike a bare ``except``, this lets
            # KeyboardInterrupt and SystemExit propagate.
            judge_c = ""

        items.append(
            f'<details style="margin:3px 0;border:1px solid #ddd;border-radius:8px;padding:8px;">'
            f'<summary style="cursor:pointer;font-weight:600;">{info.get("icon","")} {t.task_id} — '
            f'<span style="color:{_sc(s)}">{s:.1f}점</span> ({t.difficulty})</summary>'
            f'<div style="font-size:0.82em;margin-top:6px;"><b>Prompt:</b> {html.escape(t.prompt[:200])}...<br>'
            f'<b>Response:</b> {resp}...<br><b>Judge:</b> {judge_c}</div></details>'
        )
    return CSS + "".join(items)
|
| 392 |
+
|
| 393 |
+
# ════════════════════════════════════════════════════════════════
|
| 394 |
+
# PART 10: 메인 평가 루프
|
| 395 |
+
# ════════════════════════════════════════════════════════════════
|
| 396 |
+
|
| 397 |
+
def run_evaluation(api_key, eval_model, judge_model, pillar_filter, diff_filter,
                   max_tasks, fresh_start, progress=gr.Progress()):
    """Run the benchmark and stream UI updates as a generator.

    Each task is sent to ``eval_model``, the answer is scored by
    ``judge_model``, and every result is checkpointed under a run id derived
    from ``eval_model`` so an interrupted run can be resumed.

    Args:
        api_key: API key from the UI; when blank, the ``FIREWORKS_API_KEY``
            environment variable is used instead.
        eval_model: identifier of the model under evaluation.
        judge_model: identifier of the model used for scoring.
        pillar_filter: pillar key to restrict to, or "전체" for no filter.
        diff_filter: difficulty to restrict to, or "전체" for no filter.
        max_tasks: upper bound on the number of tasks (after filtering).
        fresh_start: when true, stored results of this run are cleared first.
        progress: Gradio progress tracker.

    Yields:
        5-tuples ``(status, table_html, summary_html, detail_html, csv_path)``.
        Intermediate updates leave the last three as ``""``, ``""``, ``None``.
    """
    # ``api_key`` may be None when the handler is invoked without a value.
    api_key = (api_key or "").strip() or os.getenv("FIREWORKS_API_KEY", "")
    if not api_key:
        yield "❌ API Key를 입력하세요.", "", "", "", None
        return

    tasks = ALL_TASKS[:]
    if pillar_filter != "전체":
        tasks = [t for t in tasks if t.pillar == pillar_filter]
    if diff_filter != "전체":
        tasks = [t for t in tasks if t.difficulty == diff_filter]
    tasks = tasks[:int(max_tasks)]

    run_id = _make_run_id(eval_model)
    if fresh_start:
        _clear_run(run_id)

    if not tasks:
        # Nothing matches the filters: stop here instead of writing and
        # uploading a CSV for a run that evaluated no task.
        yield "⚠️ 선택한 필터에 해당하는 과제가 없습니다.", "", "", "", None
        return

    results = dict(_load_all(run_id))
    total = len(tasks)
    done = sum(1 for t in tasks if t.task_id in results)

    if done > 0 and not fresh_start:
        yield (f"💾 체크포인트 복원: {done}/{total}. 이어서 진행.",
               _build_progress_table(results, tasks), "", "", None)
        time.sleep(0.5)

    for i, task in enumerate(tasks):
        if task.task_id in results:
            # Already checkpointed by an earlier (partial) run.
            continue

        # Step 1: query the model under evaluation.
        progress((i + 0.3) / total, desc=f"[{i+1}/{total}] {task.task_id} 모델응답...")
        yield (f"🤖 [{i+1}/{total}] {task.task_id} ({task.difficulty}) — 모델 응답 대기...",
               _build_progress_table(results, tasks), "", "", None)

        model_response = call_llm(task.prompt, api_key=api_key, model=eval_model)

        if model_response.startswith("[API_ERROR]"):
            # Failed calls are stored with score 0 and the run moves on.
            results[task.task_id] = {"response": model_response, "judge": "{}", "score": 0}
            _save_result(run_id, task.task_id, model_response, "{}", 0)
            yield (f"⚠️ {task.task_id} API 오류 — 다음 과제로.",
                   _build_progress_table(results, tasks), "", "", None)
            continue

        # Step 2: have the judge model score the response.
        progress((i + 0.7) / total, desc=f"[{i+1}/{total}] {task.task_id} 채점...")
        yield (f"⚖️ [{i+1}/{total}] {task.task_id} — Judge 채점 중...",
               _build_progress_table(results, tasks), "", "", None)

        judge_prompt = build_judge_prompt(task, model_response)
        judge_raw = call_llm(judge_prompt, system=JUDGE_SYSTEM, api_key=api_key,
                             model=judge_model, temperature=0.3)

        rubric_keys = list(task.scoring_rubric.keys())
        judge_data = parse_judge_response(judge_raw, rubric_keys)
        weighted = compute_weighted_score(judge_data["scores"], task.scoring_rubric)

        judge_json = json.dumps(judge_data, ensure_ascii=False)
        results[task.task_id] = {"response": model_response, "judge": judge_json, "score": weighted}
        _save_result(run_id, task.task_id, model_response, judge_json, weighted)

        done = sum(1 for t in tasks if t.task_id in results)
        progress(done / total, desc=f"{done}/{total}")

    # ── Final aggregation ──
    progress(1.0, desc="완료!")

    # Mean score per pillar over the selected tasks that have a result.
    pillar_scores = {}
    for p in PILLAR_INFO:
        scored = [results[t.task_id]["score"] for t in tasks
                  if t.pillar == p and t.task_id in results]
        if scored:
            pillar_scores[p] = np.mean(scored)

    aether = calculate_aether_score(pillar_scores)

    csv_str = generate_csv(results, eval_model)
    # Reuse the run id computed above rather than deriving it a second time.
    csv_path = f"/tmp/aether_eval_{run_id}.csv"
    with open(csv_path, "w", encoding="utf-8") as f:
        f.write(csv_str)

    hf_status = upload_to_hf(csv_str, eval_model)

    summary = _build_final_summary(results, tasks, pillar_scores, aether, eval_model, hf_status)
    table = _build_progress_table(results, tasks)
    detail = _build_detail_view(results, tasks)

    yield (f"🏁 평가 완료! AETHER Score: {aether:.1f}", table, summary, detail, csv_path)
|
| 484 |
+
|
| 485 |
+
|
| 486 |
+
# ════════════════════════════════════════════════════════════════
|
| 487 |
+
# PART 11: Gradio App
|
| 488 |
+
# ════════════════════════════════════════════════════════════════
|
| 489 |
+
|
| 490 |
+
# Dropdown options for the two filters. The first entry, "전체", is the
# "no filter" choice that run_evaluation compares against.
PILLAR_CHOICES = ["전체", *PILLAR_INFO]
DIFF_CHOICES = ["전체"] + ["basic", "intermediate", "advanced", "expert", "frontier"]

# Static banner shown at the top of the page.
HEADER = """
<div style="text-align:center;padding:16px 0;">
<h1 style="margin:0;font-size:1.8em;">🌀 AETHER-Bench v0.2.0</h1>
<h2 style="margin:4px 0;color:#555;font-size:1.1em;">LLM 순수 시험 평가 시스템</h2>
<p style="color:#888;font-size:0.9em;max-width:650px;margin:8px auto;">
120 Tasks · 5 Pillars · 19 Sub-dimensions · HAR Metric<br>
<b>Proto-AGI 미발동</b> — 데이터셋만으로 1:1 시험 → HuggingFace PRIVATE 기록
</p>
<div style="display:flex;justify-content:center;gap:8px;margin-top:8px;flex-wrap:wrap;font-size:0.85em;">
<span style="background:#fff3e0;padding:2px 10px;border-radius:12px;">✦ 창발 20%</span>
<span style="background:#f3e5f5;padding:2px 10px;border-radius:12px;">◉ 메타인지 25%</span>
<span style="background:#e0f7fa;padding:2px 10px;border-radius:12px;">◈ 자가진화 15%</span>
<span style="background:#e8f5e9;padding:2px 10px;border-radius:12px;">◬ 다중지능 15%</span>
<span style="background:#ffebee;padding:2px 10px;border-radius:12px;">☯ 상생상극 25%</span>
</div>
</div>"""
|
| 509 |
+
|
| 510 |
+
def create_app():
    """Build and return the Gradio Blocks application.

    The UI consists of the API key and model inputs, the pillar/difficulty
    filters, two start buttons (resume / fresh start) and five output tabs
    that ``run_evaluation`` streams into.

    Returns:
        The configured ``gr.Blocks`` instance (not yet launched).
    """

    # The click handlers must themselves be generator functions: Gradio only
    # streams intermediate updates when the handler is one, and it only
    # injects a tracked progress object when the handler signature declares a
    # ``gr.Progress`` default. A lambda returning a generator does neither.
    def _resume_run(ak, em, jm, pf, df, mt, progress=gr.Progress()):
        yield from run_evaluation(ak, em, jm, pf, df, mt, False, progress)

    def _fresh_run(ak, em, jm, pf, df, mt, progress=gr.Progress()):
        yield from run_evaluation(ak, em, jm, pf, df, mt, True, progress)

    with gr.Blocks(title="AETHER-Bench Evaluator", theme=gr.themes.Soft(),
                   css=".gradio-container{max-width:1100px !important}") as app:
        gr.HTML(HEADER)

        with gr.Row():
            # Deliberately not prefilled from FIREWORKS_API_KEY: a component's
            # initial value is sent to every visitor's browser. Leaving the box
            # empty still works because run_evaluation falls back to the
            # environment variable on the server side.
            api_key = gr.Textbox(label="🔑 Fireworks API Key", type="password",
                                 placeholder="fw_...", scale=3)

        with gr.Row():
            eval_model = gr.Textbox(label="🤖 피평가 모델",
                                    value="accounts/fireworks/models/kimi-k2p5", scale=3)
            judge_model = gr.Textbox(label="⚖️ 심판 모델",
                                     value="accounts/fireworks/models/kimi-k2p5", scale=3)

        with gr.Row():
            pillar_dd = gr.Dropdown(PILLAR_CHOICES, value="전체", label="기둥 필터", scale=2)
            diff_dd = gr.Dropdown(DIFF_CHOICES, value="전체", label="난이도 필터", scale=2)
            max_tasks = gr.Slider(1, 120, value=120, step=1, label="최대 과제 수", scale=2)

        with gr.Row():
            start_btn = gr.Button("▶️ 평가 시작 (이어하기)", variant="primary", size="lg", scale=2)
            fresh_btn = gr.Button("🚀 새로 시작", variant="secondary", size="lg", scale=2)
            gr.HTML('<p style="color:#888;font-size:0.8em;margin:auto 0;">▶️ 중단시 이어서 | 🚀 초기화후 재시작<br>결과→CSV→HF PRIVATE 자동 업로드</p>')

        with gr.Tabs():
            with gr.Tab("📊 진행"):
                progress_html = gr.HTML()
            with gr.Tab("📋 결과표"):
                table_html = gr.HTML()
            with gr.Tab("🏆 최종"):
                summary_html = gr.HTML()
            with gr.Tab("🔍 상세"):
                detail_html = gr.HTML()
            with gr.Tab("💾 CSV"):
                csv_file = gr.File(label="평가 결과 CSV")

        # Both buttons share the same inputs and outputs; only the
        # fresh_start flag passed to run_evaluation differs.
        run_inputs = [api_key, eval_model, judge_model, pillar_dd, diff_dd, max_tasks]
        run_outputs = [progress_html, table_html, summary_html, detail_html, csv_file]

        start_btn.click(fn=_resume_run, inputs=run_inputs, outputs=run_outputs)
        fresh_btn.click(fn=_fresh_run, inputs=run_inputs, outputs=run_outputs)

        gr.Markdown("""---
<center>AETHER-Bench v0.2.0 · Apache 2.0 · Ginigen AI (지니젠AI)<br>
<code>HF_TOKEN</code> 설정 시 <b>seawolf2357/AETHER-Bench-Results</b> (PRIVATE)에 자동 기록</center>""")
    return app
|
| 562 |
+
|
| 563 |
+
# ════════════════════════════════════════════════════════════════
|
| 564 |
+
# MAIN
|
| 565 |
+
# ════════════════════════════════════════════════════════════════
|
| 566 |
+
|
| 567 |
+
if __name__ == "__main__":
    # Startup banner: total task count, then a per-pillar breakdown in the
    # order pillars first appear in ALL_TASKS.
    per_pillar = {}
    for task in ALL_TASKS:
        per_pillar.setdefault(task.pillar, 0)
        per_pillar[task.pillar] += 1

    print(f"AETHER-Bench Evaluator: {len(ALL_TASKS)} tasks loaded")
    for pillar, count in per_pillar.items():
        meta = PILLAR_INFO[pillar]
        percent = int(meta['weight'] * 100)
        print(f" {meta['icon']} {meta['name']}: {count} ({percent}%)")

    # Serve on all interfaces at port 7860.
    app = create_app()
    app.launch(server_name="0.0.0.0", server_port=7860)
|