"""
World Model Bench — Scoring Verification Suite v1.0

Purpose: exercise every branch of the scoring functions so that the result is
"the same score no matter who runs it".

Verification scope:
- Parser edge cases (empty input, malformed format, letter case, etc.)
- 10 scoring functions x every score branch
- Boundary-value tests (0 points, full marks, partial scores)
- Integrated score calculation + grade boundaries
"""
import sys

# The scoring module is looked up in this directory first.
sys.path.insert(0, '/mnt/user-data/outputs')

from wm_bench_scoring import (
    parse_predict_line, parse_motion_line, PredictDirection,
    get_action_intensity, get_emotion_intensity, get_motion_direction,
    count_descriptors,
    score_c01, score_c02, score_c03, score_c04, score_c05,
    score_c06, score_c07, score_c08, score_c09, score_c10,
    calculate_wm_score,
)

# Running tallies, updated by check().
passed = 0
failed = 0
total = 0


def check(test_name, condition, detail=""):
    """Record one test result and print a one-line verdict.

    Args:
        test_name: Label printed next to the pass/fail mark.
        condition: Truthy when the test passed.
        detail: Extra text printed after the label on failure only.
    """
    global passed, failed, total
    total += 1
    if not condition:
        failed += 1
        print(f" ✗ {test_name} — {detail}")
        return
    passed += 1
    print(f" ✓ {test_name}")


# Banner: rule, title, rule — one per output line.
print("=" * 70, " WM Bench Scoring Verification Suite", "=" * 70, sep="\n")

# ---------------------------------------------------------------
# [1/12] Parser: parse_predict_line
# ---------------------------------------------------------------
print("\n[1/12] 파서: parse_predict_line")

# Well-formed input: four directions, two of them with a reason.
p = parse_predict_line(
    "PREDICT: left=safe(open), right=danger(wall), fwd=danger(beast), back=safe"
)
check("정상 파싱 4방향", len(p) == 4)
check("left=safe", p["left"].is_safe and not p["left"].is_danger)
check("right=danger", p["right"].is_danger and not p["right"].is_safe)
check("right reason=wall", p["right"].reason == "wall")
check("fwd reason=beast", p["fwd"].reason == "beast")
check("back=safe no reason", p["back"].is_safe and p["back"].reason is None)

# Mixed upper/lower case; the reason is expected back in lower case.
p2 = parse_predict_line(
    "PREDICT: Left=Safe(Open), RIGHT=DANGER(WALL), FWD=danger(Beast), Back=safe"
)
check("대소문자 left safe", p2["left"].is_safe)
check("대소문자 right danger", p2["right"].is_danger)
check("대소문자 reason", p2["right"].reason == "wall")

# "forward"/"backward" are expected under the keys "fwd"/"back".
p3 = parse_predict_line(
    "PREDICT: left=safe, right=safe, forward=danger(wall), backward=safe"
)
check("forward→fwd 정규화", "fwd" in p3)
check("backward→back 정규화", "back" in p3)

# Empty input (its result is checked by the statement that follows).
p4 = parse_predict_line("")
check("빈 입력 빈 결과", len(p4) == 0)

# Input without the "PREDICT:" prefix.
p5 = parse_predict_line("left=safe, right=danger(wall)")
check("PREDICT: 없어도 파싱", len(p5) >= 2)

# A danger entry with no reason in parentheses.
p6 = parse_predict_line("PREDICT: left=danger, right=safe")
check("이유 없는 danger", p6["left"].is_danger and p6["left"].reason is None)

# Extra whitespace around tokens.
# NOTE(review): `"left" in p7` already implies `len(p7) >= 1`, so this only
# asserts that at least one direction was parsed.
p7 = parse_predict_line("PREDICT: left = safe(open) , right = danger( wall ) ")
check("공백 과다 파싱", "left" in p7 or len(p7) >= 1)

# ---------------------------------------------------------------
# [2/12] Parser: parse_motion_line
# ---------------------------------------------------------------
print("\n[2/12] 파서: parse_motion_line")

m1 = parse_motion_line("MOTION: a person sprinting right in terror")
check("정상 파싱", m1 == "a person sprinting right in terror")

# No space after the colon.
m2 = parse_motion_line("MOTION:a person walking forward")
check("공백 없이", "walking forward" in m2)

# No "MOTION:" prefix at all.
m3 = parse_motion_line("a person just walking")
check("MOTION: 없어도", "walking" in m3)

# Upper-case input is expected back in lower case.
m4 = parse_motion_line("MOTION: A Person SPRINTING LEFT")
check("대소문자 소문자화", "sprinting" in m4)

# ---------------------------------------------------------------
# [3/12] Keyword dictionary: action intensity
# ---------------------------------------------------------------
print("\n[3/12] 키워드 사전: 행동 강도")

check("sprint=4", get_action_intensity("sprinting away") == 4)
check("walk=2", get_action_intensity("walking slowly") == 2)
check("desperate=5", get_action_intensity("desperate escape") == 5)
check("stand=1", get_action_intensity("standing still") == 1)
check("빈 텍스트=0", get_action_intensity("") == 0)
check("복합: sprint+desperate=5", get_action_intensity("desperately sprinting") == 5)
check("unknown word=0", get_action_intensity("xyzzy foobar") == 0)
check("freeze=1", get_action_intensity("freezing in place") == 1)
check("run=4", get_action_intensity("running fast") == 4)
check("jog=3", get_action_intensity("jogging ahead") == 3)

# ---------------------------------------------------------------
# [4/12] Keyword dictionary: emotion intensity
# ---------------------------------------------------------------
print("\n[4/12] 키워드 사전: 감정 강도")

check("terror=5", get_emotion_intensity("in terror") == 5)
check("fear=4", get_emotion_intensity("with fear") == 4)
check("anxious=3", get_emotion_intensity("feeling anxious") == 3)
check("cautious=2", get_emotion_intensity("being cautious") == 2)
check("calm=1", get_emotion_intensity("staying calm") == 1)
check("빈=0", get_emotion_intensity("") == 0)
check("terrified=5", get_emotion_intensity("terrified") == 5)
check("복합: terror+fear=5", get_emotion_intensity("terrified with fear") == 5)

# ---------------------------------------------------------------
# [5/12] Keyword dictionary: direction extraction
# ---------------------------------------------------------------
print("\n[5/12] 키워드 사전: 방향 추출")

check("right", get_motion_direction("sprinting right") == "right")
check("left", get_motion_direction("moving left") == "left")
check("forward→fwd", get_motion_direction("walking forward") == "fwd")
check("backward→back", get_motion_direction("stepping backward") == "back")
check("around→back", get_motion_direction("turning around") == "back")
check("없음→None", get_motion_direction("a person standing still") is None)
check("ahead→fwd", get_motion_direction("running ahead") == "fwd")

# ---------------------------------------------------------------
# [6/12] C01: environment recognition accuracy
# ---------------------------------------------------------------
print("\n[6/12] C01: 환경 인식 정확도")

gt = {"left": "safe", "right": "danger", "fwd": "danger", "back": "safe"}

# All four directions match the ground truth.
p = parse_predict_line("PREDICT: left=safe, right=danger(wall), fwd=danger(beast), back=safe")
s, r = score_c01({}, p, gt)
check("4/4 정확 = 20점", s == 20)

# Three of four match (fwd is wrong).
p = parse_predict_line("PREDICT: left=safe, right=danger(wall), fwd=safe, back=safe")
s, r = score_c01({}, p, gt)
check("3/4 정확 = 15점", s == 15)

# Two of four match.
p = parse_predict_line("PREDICT: left=safe, right=safe, fwd=safe, back=safe")
s, r = score_c01({}, p, gt)
check("2/4 정확 = 10점", s == 10)

# None match.
p = parse_predict_line("PREDICT: left=danger, right=safe, fwd=safe, back=danger")
s, r = score_c01({}, p, gt)
check("0/4 정확 = 0점", s == 0)

# Partial output: only two directions present (fwd and back missing).
p = parse_predict_line("PREDICT: left=safe, right=danger(wall)")
s, r = score_c01({}, p, gt)
check("2방향만 출력 (fwd,back 누락)", s <= 10)

# ---------------------------------------------------------------
# [7/12] C02: entity recognition and classification
# ---------------------------------------------------------------
print("\n[7/12] C02: 개체 인식 및 분류")

# Full marks: beast reported in the ground-truth direction.
gt = {"entity_type": "beast", "entity_direction": "fwd", "is_threat": True}
p = parse_predict_line("PREDICT: left=safe, right=safe, fwd=danger(beast), back=safe")
s, r = score_c02({}, p, gt)
check("맹수 완벽 인식 = 20점", s == 20, f"got {s}")

# Wrong entity type: beast reported as woman.
p = parse_predict_line("PREDICT: left=safe, right=safe, fwd=danger(woman), back=safe")
s, r = score_c02({}, p, gt)
check("유형 오인 < 20점", s < 20 and s > 0, f"got {s}")

# Wrong direction: beast reported on the left instead of fwd.
p = parse_predict_line("PREDICT: left=danger(beast), right=safe, fwd=safe, back=safe")
s, r = score_c02({}, p, gt)
check("방향 오인 < 15점", s < 15, f"got {s}")

# No entity in the ground truth and none reported.
gt_none = {"entity_type": None, "entity_direction": None, "is_threat": False}
p = parse_predict_line("PREDICT: left=safe, right=safe, fwd=safe, back=safe")
s, r = score_c02({}, p, gt_none)
check("개체 없음 정확 = 20점", s == 20, f"got {s}")

# No entity in the ground truth, but a danger is reported anyway.
p = parse_predict_line("PREDICT: left=safe, right=danger(beast), fwd=safe, back=safe")
s, r = score_c02({}, p, gt_none)
check("없는데 오인 = 10점", s == 10, f"got {s}")

# ---------------------------------------------------------------
# [8/12] C03: prediction-based reasoning
# ---------------------------------------------------------------
print("\n[8/12] C03: 예측 기반 추론")

gt = {
    "danger_directions": ["fwd", "left"],
    "safe_directions": ["right", "back"],
    "optimal_direction": "right",
}

# Full marks: motion goes in the optimal direction and agrees with PREDICT.
p = parse_predict_line("PREDICT: left=danger(wall), right=safe(open), fwd=danger(beast), back=safe")
s, r = score_c03({}, p, "a person sprinting right away from beast", gt)
check("최적 방향 선택 = 20점", s == 20, f"got {s}")

# Second best: back is in safe_directions but is not the optimal direction.
s, r = score_c03({}, p, "a person running backward quickly", gt)
check("차선 방향 back = 16점", s == 16, f"got {s}")

# Motion heads into a direction listed as dangerous.
s, r = score_c03({}, p, "a person walking forward slowly", gt)
check("danger 방향 fwd = 낮은 점수", s <= 8, f"got {s}")

# Motion text contains no direction keyword.
s, r = score_c03({}, p, "a person standing in panic", gt)
check("방향 없음 = 부분 점수", 0 <= s <= 10, f"got {s}")

# ---------------------------------------------------------------
# [9/12] C04: differentiated threat response
# ---------------------------------------------------------------
print("\n[9/12] C04: 위협 차별 반응")

gt = {"expected_a_higher": True, "min_intensity_diff": 2}

# Full marks: strong reaction in A, weak in B, difference large enough.
s, r = score_c04("a person sprinting away in terror", "a person walking away cautiously", gt)
check("sprint(5) vs walk(2) = 20점", s == 20, f"got {s}")

# Same intensity in both scenes.
s, r = score_c04("a person walking forward", "a person walking slowly", gt)
check("동일 강도 = 낮은 점수", s <= 10, f"got {s}")

# Reversed: B is the stronger reaction; partial credit is expected.
s, r = score_c04("a person standing still", "a person sprinting away", gt)
check("역전 = 부분 점수 (강도 차이는 인정)", 8 <= s <= 14, f"got {s}")

# Both reactions are low intensity, with no difference.
s, r = score_c04("a person looking around", "a person standing there", gt)
check("양쪽 저강도 = 부분 점수", 4 <= s <= 8, f"got {s}")

# There is a difference, but it is below min_intensity_diff.
s, r = score_c04("a person jogging away", "a person walking forward", gt)
check("차이 1 < min_diff 2", s < 20, f"got {s}")

# ---------------------------------------------------------------
# [10/12] C05: emotional escalation
# ---------------------------------------------------------------
print("\n[10/12] C05: 감정 에스컬레이션")

# Increasing sequence, increasing expected.
s, r = score_c05(
    [
        "a person stepping back",
        "a person running away in fear",
        "a person desperately fleeing in terror",
    ],
    {"expected_trend": "increasing"},
)
check("증가 추세 = 높은 점수", s >= 16, f"got {s}")

# Decreasing sequence, decreasing expected.
s, r = score_c05(
    [
        "a person sprinting in terror",
        "a person jogging cautiously",
        "a person walking calmly",
    ],
    {"expected_trend": "decreasing"},
)
check("감소 추세 = 높은 점수", s >= 16, f"got {s}")

# Steady sequence, stable expected.
s, r = score_c05(
    [
        "a person walking forward",
        "a person walking ahead",
        "a person walking steadily",
    ],
    {"expected_trend": "stable"},
)
check("안정 유지 = 높은 점수", s >= 14, f"got {s}")

# Reversed: increasing expected, but the sequence winds down.
s, r = score_c05(
    [
        "a person desperately fleeing",
        "a person walking calmly",
        "a person standing still",
    ],
    {"expected_trend": "increasing"},
)
check("역전 = 낮은 점수", s <= 8, f"got {s}")

# A single-item sequence.
s, r = score_c05(["a person walking"], {"expected_trend": "increasing"})
check("시퀀스 1개 = 0점", s == 0)

# Four-step sequence, each step stronger than the last.
s, r = score_c05(
    [
        "a person standing still",
        "a person stepping back",
        "a person running away",
        "a person desperately sprinting in terror",
    ],
    {"expected_trend": "increasing"},
)
check("4단계 단조 증가", s >= 18, f"got {s}")

# ---------------------------------------------------------------
# [11/12] C08: motion expressiveness
# NOTE(review): score_c06, score_c07 and score_c10 are imported at the top of
# this file, but no section here calls them.
# ---------------------------------------------------------------
print("\n[11/12] C08: 모션 표현력")

gt_high = {"expected_min_intensity": 4, "expected_emotion": True, "expected_min_descriptors": 2}
gt_low = {"expected_min_intensity": 1, "expected_emotion": False, "expected_min_descriptors": 1}

# Full marks: rich description.
s, r = score_c08("a person desperately sprinting right in terror", gt_high)
check("풍부한 표현 = 20점", s == 20, f"got {s}")

# No emotion word although emotion is expected.
s, r = score_c08("a person moving right", gt_high)
check("감정 없음 < 만점", s < 15, f"got {s}")

# Everyday motion where no emotion is expected.
s, r = score_c08("a person walking forward steadily", gt_low)
check("평상시 적절 = 높은 점수", s >= 14, f"got {s}")

# Over-long text: 2 + 30 + 1 = 33 words, so the input really exceeds the
# 30 words named in the check label. The previous 25 repetitions produced only
# 28 words and never reached the over-length case.
long_motion = "a person " + " ".join(["really"] * 30) + " running"
s, r = score_c08(long_motion, gt_high)
check("30단어 초과 = 길이 감점", s < 20, f"got {s}")

# ---------------------------------------------------------------
# [12/12] C09: real-time performance
# ---------------------------------------------------------------
print("\n[12/12] C09: 실시간 성능")

# Full marks.
s, r = score_c09({"fps": 50, "cognitive_latency_ms": 2000, "frame_drop_rate": 0.005, "gpu_memory_stable": True})
check("완벽 성능 = 20점", s == 20)

# Minimum acceptable.
s, r = score_c09({"fps": 20, "cognitive_latency_ms": 8000, "frame_drop_rate": 0.03, "gpu_memory_stable": True})
check("최소 성능", 5 <= s <= 12, f"got {s}")

# Below the minimum.
s, r = score_c09({"fps": 5, "cognitive_latency_ms": 15000, "frame_drop_rate": 0.1, "gpu_memory_stable": False})
check("미달 성능 = 낮은 점수", s <= 3, f"got {s}")

# ---------------------------------------------------------------
# [BONUS] Integrated score + grade boundaries
# ---------------------------------------------------------------
print("\n[BONUS] 통합 점수 + 등급 경계")


def _uniform_scores(value):
    """Return {"C01": value, ..., "C10": value} with keys in ascending order."""
    return {f"C{i:02d}": value for i in range(1, 11)}


# Every category at 100.
r = calculate_wm_score(_uniform_scores(100))
check(f"만점 = {r['wm_score']} (S등급)", r["grade"] == "S" and r["wm_score"] == 1000)

# Every category at 80.
r = calculate_wm_score(_uniform_scores(80))
check(f"80점대 = {r['wm_score']} ({r['grade']}등급)", r["wm_score"] >= 750)

# Mixed per-category scores (the "VIDRAFT" reference point in the label).
r = calculate_wm_score({"C01":65,"C02":75,"C03":85,"C04":90,"C05":85,"C06":60,"C07":70,"C08":80,"C09":85,"C10":35})
check(f"VIDRAFT 기준점 = {r['wm_score']} ({r['grade']}등급)", r["grade"] in ("B", "A"))

# Every category at 10.
r = calculate_wm_score(_uniform_scores(10))
check(f"최저 = {r['wm_score']} ({r['grade']}등급)", r["wm_score"] < 200)

# Every category at 0.
r = calculate_wm_score(_uniform_scores(0))
check(f"0점 = {r['wm_score']} ({r['grade']}등급)", r["wm_score"] == 0 and r["grade"] == "F")

# ---------------------------------------------------------------
# Summary
# ---------------------------------------------------------------
print("\n" + "=" * 70)
print(f" 결과: {passed}/{total} 통과, {failed}/{total} 실패")
if failed == 0:
    print(" ✅ 모든 테스트 통과 — 채점 시스템 검증 완료")
else:
    print(f" ❌ {failed}개 실패 — 수정 필요")
print("=" * 70)

# Report failure through the exit status too, so CI jobs and shell callers can
# detect it without parsing stdout. Only when run as a script, so importing or
# exec-ing this file behaves as before.
if failed and __name__ == "__main__":
    sys.exit(1)