Spaces:

FINAL-Bench
/

worldmodel-bench

Running

App Files Community

SeaWolf-AI committed 3 days ago

Commit

9bf2313

verified ·

1 Parent(s): d7b487b

Update app.py

Browse files

Files changed (1) hide show

app.py +17 -303

app.py CHANGED Viewed

@@ -1,308 +1,22 @@
-"""
-World Model Bench (WM Bench) — HuggingFace Space
-Beyond FID: Measuring Intelligence, Not Just Motion
-by VIDRAFT / Kim Taebong
-"""
-import os, json, re, time
-from datetime import datetime
-from typing import Dict, List, Tuple, Optional
-import requests
 import gradio as gr
-PILLAR_INFO = {
-    "P1": {"name":"Perception (인식)",  "icon":"👁", "max":250, "color":"#7B8FD4", "cats":["C01","C02"]},
-    "P2": {"name":"Cognition (인지)",   "icon":"🧠", "color":"#E8593C", "max":450, "cats":["C03","C04","C05","C06","C07"]},
-    "P3": {"name":"Embodiment (구현)",  "icon":"🔥", "color":"#D4A044", "max":300, "cats":["C08","C09","C10"]},
-}
-CAT_INFO = {
-    "C01":("환경 인식 정확도","P1","👁"),
-    "C02":("개체 인식 및 분류","P1","👁"),
-    "C03":("예측 기반 추론","P2","🧠"),
-    "C04":("위협 유형별 차별 반응","P2","🧠"),
-    "C05":("자율 감정 에스컬레이션","P2","🧠"),
-    "C06":("맥락 기억 및 활용","P2","🧠"),
-    "C07":("위협 해제 후 적응","P2","🧠"),
-    "C08":("모션 감정 표현력","P3","🔥"),
-    "C09":("실시간 인지-행동 성능","P3","🔥"),
-    "C10":("신체 교체 확장성","P3","🔥"),
-}
-GRADES = [(900,"S","Superhuman","#FFD700"),(750,"A","Advanced","#C0C0C0"),(600,"B","Baseline","#CD7F32"),(400,"C","Capable","#7B8FD4"),(200,"D","Developing","#888888"),(0,"F","Failing","#E8593C")]
-BASELINE = {"model":"VIDRAFT PROMETHEUS v1.0","org":"VIDRAFT","date":"2026-03","track":"C ✓","wm":726,"grade":"B","p1":140,"p2":390,"p3":196,"fps":47.0,"latency":3100,"cats":{"C01":65,"C02":75,"C03":85,"C04":90,"C05":85,"C06":60,"C07":70,"C08":80,"C09":85,"C10":35},"brain":"Kimi K2.5","motion":"FloodDiffusion Tiny","gpu":"L40S 48GB"}
-ACTION_INTENSITY={"sprint":5,"flee":5,"rush":5,"run":4,"dash":4,"bolt":4,"jog":3,"hurry":3,"quick":3,"walk":2,"step":2,"move":2,"stand":1,"stop":1,"freeze":1}
-EMOTION_INTENSITY={"desperate":5,"frantic":5,"terrified":5,"panic":5,"terror":4,"fear":4,"horrified":4,"nervous":3,"cautious":3,"wary":3,"alert":2,"tense":2,"careful":2,"calm":1,"normal":1,"relaxed":1}
-def get_action_int(m):
-    return max((v for k,v in ACTION_INTENSITY.items() if k in m.lower()),default=0)
-def get_emotion_int(m):
-    return max((v for k,v in EMOTION_INTENSITY.items() if k in m.lower()),default=0)
-def parse_predict(line):
-    line = re.sub(r"PREDICT\s*:","",line,flags=re.IGNORECASE).strip()
-    res={}
-    for part in line.split(","):
-        part=part.strip()
-        if "=" not in part: continue
-        k,v=part.split("=",1); k=k.strip().lower()
-        if k in("forward","front","fwd"): k="fwd"
-        if k in("backward","back"): k="back"
-        m2=re.search(r'\(([^)]+)\)',v)
-        res[k]={"safe":"safe" in v.lower(),"danger":"danger" in v.lower(),"reason":m2.group(1).lower() if m2 else None}
-    return res
-def calculate_score(cat_scores):
-    pm={"P1":["C01","C02"],"P2":["C03","C04","C05","C06","C07"],"P3":["C08","C09","C10"]}
-    pw={"P1":250,"P2":450,"P3":300}
-    pillar={}
-    for p,cats in pm.items():
-        raw=sum(cat_scores.get(c,0) for c in cats); rmax=len(cats)*100
-        pillar[p]={"raw":raw,"max":rmax,"scaled":round(raw/rmax*pw[p]),"smax":pw[p]}
-    total=sum(v["scaled"] for v in pillar.values())
-    grade,glabel,gc="F","Failing","#E8593C"
-    for thr,g,l,c in GRADES:
-        if total>=thr: grade,glabel,gc=g,l,c; break
-    return {"wm_score":total,"grade":grade,"grade_label":glabel,"grade_color":gc,"pillars":pillar}
-DATASET={}
-_ds="wm_bench_dataset.json"
-if os.path.exists(_ds):
-    try:
-        with open(_ds,"r",encoding="utf-8") as f: DATASET=json.load(f)
-        print(f"✅ Dataset: {len(DATASET.get('scenarios',[]))} 시나리오")
-    except Exception as e: print(f"⚠️ {e}")
-SYSTEM_PROMPT=DATASET.get("system_prompt","")
-SCENARIOS=DATASET.get("scenarios",[])
-def call_api(url,key,model,sc,sysp):
-    h={"Content-Type":"application/json","Authorization":f"Bearer {key}"}
-    pay={"model":model,"max_tokens":200,"temperature":0.0,"messages":[{"role":"system","content":sysp},{"role":"user","content":f"scene_context: {json.dumps(sc)}"}]}
-    t0=time.time()
-    try:
-        r=requests.post(url,headers=h,json=pay,timeout=30); r.raise_for_status()
-        return r.json()["choices"][0]["message"]["content"],(time.time()-t0)*1000
-    except: return None,(time.time()-t0)*1000
-def score_one(scenario,resp):
-    if not resp: return 0,"응답없음"
-    lines=[l.strip() for l in resp.strip().splitlines() if l.strip()]
-    pl=next((l for l in lines if l.upper().startswith("PREDICT")),"")
-    ml=next((l for l in lines if l.upper().startswith("MOTION")),"")
-    if not pl or not ml: return 5,f"포맷오류: {resp[:60]}"
-    cat=scenario.get("cat","C01"); gt=scenario.get("ground_truth",{})
-    predict=parse_predict(pl)
-    motion=re.sub(r"MOTION\s*:","",ml,flags=re.IGNORECASE).strip().lower()
-    if cat in("C01","C02"):
-        exp=gt.get("directions",{})
-        if not exp: return 10,"기준없음"
-        correct=sum(1 for d,s in exp.items() if d in predict and((s=="safe" and predict[d]["safe"])or(s=="danger" and predict[d]["danger"])))
-        return round(correct/max(len(exp),1)*20),f"정확도 {correct}/{len(exp)}"
-    elif cat=="C03":
-        safe_d=gt.get("safe_directions",[]); ok_p=all(predict.get(d,{}).get("danger",False) for d in gt.get("danger_directions",[]))
-        ok_m=any(d in motion for d in safe_d) if safe_d else True
-        return (10 if ok_p else 0)+(10 if ok_m else 0),f"예측:{ok_p},방향:{ok_m}"
-    elif cat=="C04":
-        npc=scenario.get("scene_context",{}).get("npc_type",""); i=get_action_int(motion)+get_emotion_int(motion)
-        s=20 if i>=8 else 15 if i>=6 else 10 if i>=4 else 5
-        return s,f"강도:{i}({npc})"
-    elif cat=="C05":
-        i=get_emotion_int(motion); return (20 if i>=4 else 15 if i>=3 else 10 if i>=2 else 5),f"감정:{i}"
-    elif cat=="C06":
-        prev=scenario.get("scene_context",{}).get("recent_decisions",[])
-        if not prev: return 15,"기억없음"
-        fd=[d.get("direction","") for d in prev if not d.get("success",True)]
-        avoided=all(f not in motion for f in fd if f)
-        return 20 if avoided else 5,f"회피:{avoided}"
-    elif cat=="C07":
-        npc=scenario.get("scene_context",{}).get("npc_nearby",True)
-        if not npc:
-            ai=get_action_int(motion); lk=any(k in motion for k in["look","scan","cautious","alert","wary"])
-            return (20 if ai<=2 and lk else 15 if ai<=3 else 5),f"강도:{ai},경계:{lk}"
-        return 10,"위협존재"
-    elif cat=="C08":
-        ai=get_action_int(motion); ei=get_emotion_int(motion)
-        d=len(re.findall(r'\b(desperately|frantically|cautiously|slowly|quickly|rapidly)\b',motion))
-        s=min((8 if ai>=3 else 4 if ai>=1 else 0)+(8 if ei>=3 else 4 if ei>=1 else 0)+(4 if d>=1 else 0),20)
-        return s,f"행동:{ai},감정:{ei},부사:{d}"
-    elif cat=="C09": return 15,"성능지표별도"
-    elif cat=="C10": return 10,"증빙별도"
-    return 10,"기본"
-def run_eval(api_url,api_key,model_name,org_name,fps,lat,gpu,track,progress=gr.Progress()):
-    if not api_url.strip() or not model_name.strip(): return "❌ API URL과 모델명 필수","",""
-    if not SCENARIOS: return "❌ 데이터셋 없음","",""
-    api_key=api_key.strip() or "none"
-    cat_scores={c:[] for c in CAT_INFO}; cat_det={c:[] for c in CAT_INFO}
-    lats=[]; errors=0
-    progress(0,desc="평가 시작...")
-    for i,sc in enumerate(SCENARIOS):
-        sid=sc.get("id",f"S{i+1:02d}"); cat=sc.get("cat","C01"); ctx=sc.get("scene_context",{})
-        progress(i/len(SCENARIOS),desc=f"[{i+1}/{len(SCENARIOS)}] {sid} ({cat})")
-        resp,lat_ms=call_api(api_url,api_key,model_name,ctx,SYSTEM_PROMPT)
-        lats.append(lat_ms)
-        if resp is None: errors+=1; cat_scores[cat].append(0); cat_det[cat].append(f"{sid}: API오류"); continue
-        s,reason=score_one(sc,resp); cat_scores[cat].append(s); cat_det[cat].append(f"{sid}: {s}/20 — {reason}")
-    final={c:round(sum(v)/max(len(v),1)/20*100) for c,v in cat_scores.items()}
-    if track in("B","C") and fps>0:
-        ps=100 if fps>=45 else 80 if fps>=30 else 40 if fps>=15 else 10
-        ls=100 if lat<=3000 else 80 if lat<=5000 else 40 if lat<=10000 else 10
-        final["C09"]=round((ps+ls)/2)
-    r=calculate_score(final); wm=r["wm_score"]; g=r["grade"]; p=r["pillars"]
-    avg_lat=round(sum(lats)/max(len(lats),1))
-    md=f"""## 🏆 평가 완료 — {model_name}
-| | |
-|---|---|
-| **WM Score** | **{wm} / 1000** |
-| **Grade** | **{g} ({r['grade_label']})** |
-| **Track** | {track} |
-| **오류** | {errors} / {len(SCENARIOS)} |
-| **평균 지연** | {avg_lat} ms |
-### 📊 Pillar 점수
-| Pillar | 점수 | 만점 |
-|--------|------|------|
-| 👁 Perception | {p['P1']['scaled']} | 250 |
-| 🧠 Cognition  | {p['P2']['scaled']} | 450 |
-| 🔥 Embodiment | {p['P3']['scaled']} | 300 |
-### 📋 Category 점수
-| Cat | 이름 | /100 |
-|-----|------|------|
 """
-    for c,(nk,pl,ic) in CAT_INFO.items():
-        bar="█"*(final[c]//10)+"░"*(10-final[c]//10)
-        md+=f"| {c} | {ic} {nk} | {final[c]} `{bar}` |\n"
-    md+="\n> VIDRAFT PROMETHEUS 기준: **726/1000** (B)"
-    det="";[ (det:=det+f"### {c}: {CAT_INFO[c][0]} — {final[c]}/100\n"+"".join(f"- {l}\n" for l in lines[:5])+"\n") for c,lines in cat_det.items() ]
-    sub=json.dumps({"benchmark":"WM Bench v1.0","submitted_at":datetime.utcnow().isoformat()+"Z","model_name":model_name,"organization":org_name,"track":track,"wm_score":wm,"grade":g,"fps":fps,"cognitive_latency_ms":avg_lat,"gpu":gpu,"pillar_scores":{"P1":p["P1"]["scaled"],"P2":p["P2"]["scaled"],"P3":p["P3"]["scaled"]},"category_scores":final},ensure_ascii=False,indent=2)
-    return md,det,sub
-def lb_html():
-    b=BASELINE
-    return f"""<table style="width:100%;border-collapse:collapse;font-size:13px">
-<thead><tr style="background:#1e1e2e;color:#aaa;font-size:11px">
-<th style="padding:8px">#</th><th style="text-align:left;padding:8px">모델</th>
-<th style="padding:8px">WM Score</th><th style="padding:8px">Grade</th>
-<th style="padding:8px">👁 인식</th><th style="padding:8px">🧠 인지</th><th style="padding:8px">🔥 구현</th>
-<th style="padding:8px">FPS</th><th style="padding:8px">Lat(ms)</th><th style="padding:8px">Track</th>
-</tr></thead><tbody>
-<tr style="background:#151520">
-<td style="text-align:center;font-weight:900;color:#FFD700;padding:10px">1</td>
-<td style="padding:10px"><b>{b['model']}</b><br><small style="color:#888">{b['brain']} · {b['motion']}</small></td>
-<td style="text-align:center;font-size:1.2rem;font-weight:900;color:#CD7F32;padding:10px">{b['wm']}</td>
-<td style="text-align:center;color:#CD7F32;font-weight:800;padding:10px">{b['grade']}</td>
-<td style="text-align:center;padding:10px">{b['p1']}<small>/250</small></td>
-<td style="text-align:center;padding:10px">{b['p2']}<small>/450</small></td>
-<td style="text-align:center;padding:10px">{b['p3']}<small>/300</small></td>
-<td style="text-align:center;padding:10px">{b['fps']}</td>
-<td style="text-align:center;padding:10px">{b['latency']}</td>
-<td style="text-align:center;padding:10px">{b['track']}</td>
-</tr>
-</tbody></table>"""
-def cat_html():
-    b=BASELINE["cats"]; rows=""
-    for c,(nk,pl,ic) in CAT_INFO.items():
-        s=b.get(c,0); col=PILLAR_INFO[pl]["color"]
-        rows+=f'<tr><td style="padding:6px 8px;font-family:monospace;color:#aaa">{c}</td><td style="padding:6px 8px">{ic} {nk}</td><td style="text-align:center;font-weight:700;padding:6px">{s}</td><td style="padding:6px;width:180px"><div style="background:#222;border-radius:4px;height:10px;overflow:hidden"><div style="width:{s}%;background:{col};height:100%;border-radius:4px"></div></div></td></tr>'
-    return f'<table style="width:100%;border-collapse:collapse;font-size:12px"><thead><tr style="background:#1e1e2e;color:#aaa"><th style="padding:8px">Cat</th><th style="text-align:left;padding:8px">카테고리</th><th style="padding:8px">점수/100</th><th style="padding:8px">Bar</th></tr></thead><tbody>{rows}</tbody></table>'
-CSS="""
-.wm-title{text-align:center;padding:28px 0 8px;font-size:2rem;font-weight:900;letter-spacing:-1px;
-  background:linear-gradient(90deg,#7B8FD4,#E8593C,#D4A044);-webkit-background-clip:text;-webkit-text-fill-color:transparent;}
-.wm-sub{text-align:center;color:#888;font-size:.9rem;margin-bottom:20px;}
-"""
-with gr.Blocks(title="World Model Bench") as app:
-    gr.HTML('<div class="wm-title">🔥 World Model Bench</div><div class="wm-sub">Beyond FID — Measuring Intelligence, Not Just Motion &nbsp;·&nbsp; FINAL Bench Family by VIDRAFT</div>')
-    with gr.Tabs():
-        with gr.Tab("🏆 Leaderboard"):
-            gr.HTML("""<div style="display:flex;gap:16px;margin-bottom:20px;flex-wrap:wrap">
-<div style="background:#1a1a2a;border:1px solid #2a2a4a;border-radius:12px;padding:16px 24px;flex:1;min-width:140px;text-align:center">
-  <div style="color:#aaa;font-size:11px;margin-bottom:4px">3 PILLARS</div>
-  <div style="font-size:1.4rem;font-weight:900;color:#7B8FD4">👁 🧠 🔥</div>
-  <div style="color:#666;font-size:11px">Perception · Cognition · Embodiment</div></div>
-<div style="background:#1a1a2a;border:1px solid #2a2a4a;border-radius:12px;padding:16px 24px;flex:1;min-width:140px;text-align:center">
-  <div style="color:#aaa;font-size:11px;margin-bottom:4px">100 SCENARIOS</div>
-  <div style="font-size:1.4rem;font-weight:900;color:#E8593C">10</div>
-  <div style="color:#666;font-size:11px">카테고리 × 10시나리오</div></div>
-<div style="background:#1a1a2a;border:1px solid #2a2a4a;border-radius:12px;padding:16px 24px;flex:1;min-width:140px;text-align:center">
-  <div style="color:#aaa;font-size:11px;margin-bottom:4px">MAX SCORE</div>
-  <div style="font-size:1.4rem;font-weight:900;color:#D4A044">1000</div>
-  <div style="color:#666;font-size:11px">WM Score</div></div>
-<div style="background:#1a1a2a;border:1px solid #2a2a4a;border-radius:12px;padding:16px 24px;flex:1;min-width:140px;text-align:center">
-  <div style="color:#aaa;font-size:11px;margin-bottom:4px">TRACKS</div>
-  <div style="font-size:1.4rem;font-weight:900;color:#4ADE80">A · B · C</div>
-  <div style="color:#666;font-size:11px">Text / +Perf / +Demo</div></div>
-</div>""")
-            gr.HTML(lb_html())
-            gr.Markdown("**Grade:** S≥900 · A≥750 · B≥600 · C≥400 · D≥200 · F<200\n\n**Track:** A=텍스트(최대750) · B=+성능(1000) · C=+라이브데모(1000+✓)")
-            with gr.Accordion("📊 PROMETHEUS 카테고리 상세",open=False):
-                gr.HTML(cat_html())
-        with gr.Tab("⚡ 평가 실행"):
-            gr.Markdown("### OpenAI-compatible API로 평가\n`scene_context JSON → PREDICT+MOTION` 출력 모델이면 모두 참여 가능.")
-            with gr.Row():
-                with gr.Column():
-                    api_url_in=gr.Textbox(label="API URL",value="https://api.fireworks.ai/inference/v1/chat/completions")
-                    api_key_in=gr.Textbox(label="API Key",type="password",placeholder="sk-...")
-                    model_in=gr.Textbox(label="모델 ID",placeholder="accounts/fireworks/models/kimi-k2p5")
-                    org_in=gr.Textbox(label="조직명",placeholder="VIDRAFT")
-                with gr.Column():
-                    track_in=gr.Dropdown(["A","B","C"],value="A",label="Track (A=텍스트전용, B/C=+성능)")
-                    fps_in=gr.Number(label="FPS (Track B/C, 0=N/A)",value=0)
-                    lat_in=gr.Number(label="Latency ms (Track B/C)",value=0)
-                    gpu_in=gr.Textbox(label="GPU",placeholder="NVIDIA L40S 48GB")
-            run_btn=gr.Button("🚀 평가 시작 (100 시나리오)",variant="primary",size="lg")
-            result_md=gr.Markdown(); detail_md=gr.Markdown(); submit_box=gr.Code(label="제출 JSON",language="json",lines=20)
-            run_btn.click(fn=run_eval,inputs=[api_url_in,api_key_in,model_in,org_in,fps_in,lat_in,gpu_in,track_in],outputs=[result_md,detail_md,submit_box])
-        with gr.Tab("📐 벤치마크 구조"):
-            gr.Markdown("""
-## 설계 원칙
-> "기존 벤치마크는 모션 품질만 측정한다. WM Bench는 **인지 능력**을 측정하는 최초의 벤치마크다."
-### 3대 평가 축
-| Pillar | 비중 | 만점 |
-|--------|------|------|
-| 👁 Perception (인식) | 25% | 250 |
-| 🧠 Cognition (인지) | 45% | 450 |
-| 🔥 Embodiment (구현) | 30% | 300 |
-### 10개 카테고리
-| Cat | 이름 | 세계최초 |
-|-----|------|---------|
-| C01 | 환경 인식 정확도 | |
-| C02 | 개체 인식 및 분류 | |
-| C03 | 예측 기반 추론 | ✦ |
-| C04 | 위협 유형별 차별 반응 | ✦ |
-| C05 | 자율 감정 에스컬레이션 | ✦✦ |
-| C06 | 맥락 기억 및 활용 | ✦ |
-| C07 | 위협 해제 후 적응 | ✦ |
-| C08 | 모션 감정 표현력 | ✦ |
-| C09 | 실시간 인지-행동 성능 | |
-| C10 | 신체 교체 확장성 | ✦✦ |
-### 입출력 포맷
-```
-INPUT: scene_context JSON
-OUTPUT (2줄):
-  PREDICT: left=danger(wall), right=safe, fwd=danger(beast), back=safe
-  MOTION: a person sprinting right in desperate terror
-```
-### FINAL Bench Family
-| 벤치마크 | 측정 | 상태 |
-|---------|------|------|
-| FINAL Bench | 텍스트 AGI | 글로벌 5위 · 언론 4곳 |
-| **WM Bench** | **체화 AGI** | **공개 중** |
-""")
-        with gr.Tab("📄 System Prompt"):
-            gr.Markdown("모든 참가 모델에 동일하게 적용.")
-            gr.Code(value=SYSTEM_PROMPT or "(wm_bench_dataset.json 필요)",language="markdown",lines=20)
 app.launch(server_name="0.0.0.0", ssr_mode=False, css=CSS)

 import gradio as gr
+import os
+# index.html 읽기
+html_path = os.path.join(os.path.dirname(os.path.abspath(__file__)) if "__file__" in dir() else ".", "index.html")
+if os.path.exists(html_path):
+    with open(html_path, "r", encoding="utf-8") as f:
+        HTML_CONTENT = f.read()
+else:
+    HTML_CONTENT = "<h1>index.html 파일을 찾을 수 없습니다.</h1>"
+# Gradio의 기본 UI를 완전히 숨기고 index.html만 표시
+CSS = """
+footer { display: none !important; }
+.gradio-container { padding: 0 !important; margin: 0 !important; max-width: 100% !important; background: transparent !important; }
+#component-0 { padding: 0 !important; }
 """
+with gr.Blocks() as app:
+    gr.HTML(HTML_CONTENT)
 app.launch(server_name="0.0.0.0", ssr_mode=False, css=CSS)