"""
World Model Bench — Scoring Verification Suite v1.0

목적: 채점 함수의 모든 분기를 테스트하여
     "누가 돌려도 같은 점수" 보장

검증 범위:
  - 파서 엣지케이스 (빈 입력, 잘못된 포맷, 대소문자 등)
  - 10개 채점 함수 × 모든 점수 분기
  - 경계값 테스트 (0점, 만점, 부분 점수)
  - 통합 점수 계산 + 등급 경계
"""

import sys
sys.path.insert(0, '/mnt/user-data/outputs')

from wm_bench_scoring import (
    parse_predict_line, parse_motion_line, PredictDirection,
    get_action_intensity, get_emotion_intensity, get_motion_direction,
    count_descriptors,
    score_c01, score_c02, score_c03, score_c04, score_c05,
    score_c06, score_c07, score_c08, score_c09, score_c10,
    calculate_wm_score,
)

# Running tallies shared by every check() call in this script.
passed = 0
failed = 0
total = 0


def check(test_name, condition, detail=""):
    """Record one test outcome and print a pass/fail line.

    Args:
        test_name: Label printed next to the result mark.
        condition: Truthy when the test passed.
        detail: Optional diagnostic text, shown only when the test failed.

    Side effects:
        Increments the module-level ``total`` counter and either ``passed``
        or ``failed``; writes one line to stdout.
    """
    global passed, failed, total
    total += 1
    if condition:
        passed += 1
        print(f"  ✓ {test_name}")
        return
    failed += 1
    # Append the separator only when there is something to show after it, so
    # failures without a detail do not end in a dangling " — ".
    suffix = "" if detail == "" else f" — {detail}"
    print(f"  ✗ {test_name}{suffix}")


print("=" * 70)
print("  WM Bench Scoring Verification Suite")
print("=" * 70)


# ═══════════════════════════════════════════════════════════════
print("\n[1/12] 파서: parse_predict_line")
# ═══════════════════════════════════════════════════════════════

# Well-formed input: four directions, three of them carrying a reason.
full = parse_predict_line(
    "PREDICT: left=safe(open), right=danger(wall), fwd=danger(beast), back=safe"
)
check("정상 파싱 4방향", len(full) == 4)
left = full["left"]
check("left=safe", left.is_safe and not left.is_danger)
right = full["right"]
check("right=danger", right.is_danger and not right.is_safe)
check("right reason=wall", right.reason == "wall")
check("fwd reason=beast", full["fwd"].reason == "beast")
back = full["back"]
check("back=safe no reason", back.is_safe and back.reason is None)

# Mixed upper/lower case in keys, states and reasons.
mixed = parse_predict_line(
    "PREDICT: Left=Safe(Open), RIGHT=DANGER(WALL), FWD=danger(Beast), Back=safe"
)
check("대소문자 left safe", mixed["left"].is_safe)
check("대소문자 right danger", mixed["right"].is_danger)
check("대소문자 reason", mixed["right"].reason == "wall")

# Long direction names are expected under their short keys.
long_names = parse_predict_line(
    "PREDICT: left=safe, right=safe, forward=danger(wall), backward=safe"
)
for label, short_key in (
    ("forward→fwd 정규화", "fwd"),
    ("backward→back 정규화", "back"),
):
    check(label, short_key in long_names)

# Empty input should produce an empty result.
check("빈 입력 빈 결과", len(parse_predict_line("")) == 0)

# Input without the "PREDICT:" prefix should still be parsed.
check("PREDICT: 없어도 파싱", len(parse_predict_line("left=safe, right=danger(wall)")) >= 2)

# A danger entry that carries no reason.
bare = parse_predict_line("PREDICT: left=danger, right=safe")["left"]
check("이유 없는 danger", bare.is_danger and bare.reason is None)

# Excess whitespace around every token.
spaced = parse_predict_line("PREDICT:  left = safe(open) ,  right = danger( wall ) ")
check("공백 과다 파싱", "left" in spaced or len(spaced) >= 1)


# ═══════════════════════════════════════════════════════════════
print("\n[2/12] 파서: parse_motion_line")
# ═══════════════════════════════════════════════════════════════

# Regular input: expect exactly the text that follows the "MOTION:" prefix.
check(
    "정상 파싱",
    parse_motion_line("MOTION: a person sprinting right in terror")
    == "a person sprinting right in terror",
)

# (label, raw line, fragment that must appear in the parsed result):
# no space after the prefix, no prefix at all, and upper-case input.
for label, raw_line, fragment in (
    ("공백 없이", "MOTION:a person walking forward", "walking forward"),
    ("MOTION: 없어도", "a person just walking", "walking"),
    ("대소문자 소문자화", "MOTION: A Person SPRINTING LEFT", "sprinting"),
):
    check(label, fragment in parse_motion_line(raw_line))


# ═══════════════════════════════════════════════════════════════
print("\n[3/12] 키워드 사전: 행동 강도")
# ═══════════════════════════════════════════════════════════════

# (label, motion text, expected action intensity)
for label, phrase, level in (
    ("sprint=4", "sprinting away", 4),
    ("walk=2", "walking slowly", 2),
    ("desperate=5", "desperate escape", 5),
    ("stand=1", "standing still", 1),
    ("빈 텍스트=0", "", 0),
    ("복합: sprint+desperate=5", "desperately sprinting", 5),
    ("unknown word=0", "xyzzy foobar", 0),
    ("freeze=1", "freezing in place", 1),
    ("run=4", "running fast", 4),
    ("jog=3", "jogging ahead", 3),
):
    check(label, get_action_intensity(phrase) == level)


# ═══════════════════════════════════════════════════════════════
print("\n[4/12] 키워드 사전: 감정 강도")
# ═══════════════════════════════════════════════════════════════

# (label, motion text, expected emotion intensity)
for label, phrase, level in (
    ("terror=5", "in terror", 5),
    ("fear=4", "with fear", 4),
    ("anxious=3", "feeling anxious", 3),
    ("cautious=2", "being cautious", 2),
    ("calm=1", "staying calm", 1),
    ("빈=0", "", 0),
    ("terrified=5", "terrified", 5),
    ("복합: terror+fear=5", "terrified with fear", 5),
):
    check(label, get_emotion_intensity(phrase) == level)


# ═══════════════════════════════════════════════════════════════
print("\n[5/12] 키워드 사전: 방향 추출")
# ═══════════════════════════════════════════════════════════════

# (label, motion text, expected direction key)
for label, phrase, heading in (
    ("right", "sprinting right", "right"),
    ("left", "moving left", "left"),
    ("forward→fwd", "walking forward", "fwd"),
    ("backward→back", "stepping backward", "back"),
    ("around→back", "turning around", "back"),
):
    check(label, get_motion_direction(phrase) == heading)

# Text without any direction keyword must yield None (identity check).
check("없음→None", get_motion_direction("a person standing still") is None)
check("ahead→fwd", get_motion_direction("running ahead") == "fwd")


# ═══════════════════════════════════════════════════════════════
print("\n[6/12] C01: 환경 인식 정확도")
# ═══════════════════════════════════════════════════════════════

# Ground truth shared by every C01 case below.
gt = {"left": "safe", "right": "danger", "fwd": "danger", "back": "safe"}

# Every check reports the actual score on failure ("got N"), matching the
# convention of the other scoring sections; the second element returned by
# score_c01 is not needed here and is discarded.

# Full marks: all four directions match the ground truth.
p = parse_predict_line("PREDICT: left=safe, right=danger(wall), fwd=danger(beast), back=safe")
s, _ = score_c01({}, p, gt)
check("4/4 정확 = 20점", s == 20, f"got {s}")

# 3/4: only fwd is wrong.
p = parse_predict_line("PREDICT: left=safe, right=danger(wall), fwd=safe, back=safe")
s, _ = score_c01({}, p, gt)
check("3/4 정확 = 15점", s == 15, f"got {s}")

# 2/4: everything predicted safe, so right and fwd are wrong.
p = parse_predict_line("PREDICT: left=safe, right=safe, fwd=safe, back=safe")
s, _ = score_c01({}, p, gt)
check("2/4 정확 = 10점", s == 10, f"got {s}")

# 0/4: every direction is the opposite of the ground truth.
p = parse_predict_line("PREDICT: left=danger, right=safe, fwd=safe, back=danger")
s, _ = score_c01({}, p, gt)
check("0/4 정확 = 0점", s == 0, f"got {s}")

# Partial output: only two directions reported (fwd and back are missing).
p = parse_predict_line("PREDICT: left=safe, right=danger(wall)")
s, _ = score_c01({}, p, gt)
check("2방향만 출력 (fwd,back 누락)", s <= 10, f"got {s}")


# ═══════════════════════════════════════════════════════════════
print("\n[7/12] C02: 개체 인식 및 분류")
# ═══════════════════════════════════════════════════════════════

# Two ground truths: a threatening beast ahead, and no entity at all.
beast_ahead = {"entity_type": "beast", "entity_direction": "fwd", "is_threat": True}
nothing_there = {"entity_type": None, "entity_direction": None, "is_threat": False}

# Full marks: the beast is reported with the right type and direction.
score, _ = score_c02(
    {},
    parse_predict_line("PREDICT: left=safe, right=safe, fwd=danger(beast), back=safe"),
    beast_ahead,
)
check("맹수 완벽 인식 = 20점", score == 20, f"got {score}")

# Wrong type: the beast is reported as a woman.
score, _ = score_c02(
    {},
    parse_predict_line("PREDICT: left=safe, right=safe, fwd=danger(woman), back=safe"),
    beast_ahead,
)
check("유형 오인 < 20점", 0 < score < 20, f"got {score}")

# Wrong direction: the beast is reported on the left instead of ahead.
score, _ = score_c02(
    {},
    parse_predict_line("PREDICT: left=danger(beast), right=safe, fwd=safe, back=safe"),
    beast_ahead,
)
check("방향 오인 < 15점", score < 15, f"got {score}")

# No entity present and none reported.
score, _ = score_c02(
    {},
    parse_predict_line("PREDICT: left=safe, right=safe, fwd=safe, back=safe"),
    nothing_there,
)
check("개체 없음 정확 = 20점", score == 20, f"got {score}")

# No entity present, yet a danger is reported.
score, _ = score_c02(
    {},
    parse_predict_line("PREDICT: left=safe, right=danger(beast), fwd=safe, back=safe"),
    nothing_there,
)
check("없는데 오인 = 10점", score == 10, f"got {score}")


# ═══════════════════════════════════════════════════════════════
print("\n[8/12] C03: 예측 기반 추론")
# ═══════════════════════════════════════════════════════════════

# Scenario: danger ahead and to the left; right is the optimal escape.
scenario = {
    "danger_directions": ["fwd", "left"],
    "safe_directions": ["right", "back"],
    "optimal_direction": "right",
}

# One prediction consistent with the scenario, reused for every motion below.
prediction = parse_predict_line(
    "PREDICT: left=danger(wall), right=safe(open), fwd=danger(beast), back=safe"
)

# Full marks: motion heads in the optimal direction.
score, _ = score_c03({}, prediction, "a person sprinting right away from beast", scenario)
check("최적 방향 선택 = 20점", score == 20, f"got {score}")

# Second best: back is safe but not the optimal direction.
score, _ = score_c03({}, prediction, "a person running backward quickly", scenario)
check("차선 방향 back = 16점", score == 16, f"got {score}")

# Motion heads into a danger direction.
score, _ = score_c03({}, prediction, "a person walking forward slowly", scenario)
check("danger 방향 fwd = 낮은 점수", score <= 8, f"got {score}")

# Motion text contains no direction keyword.
score, _ = score_c03({}, prediction, "a person standing in panic", scenario)
check("방향 없음 = 부분 점수", 0 <= score <= 10, f"got {score}")


# ═══════════════════════════════════════════════════════════════
print("\n[9/12] C04: 위협 차별 반응")
# ═══════════════════════════════════════════════════════════════

# Scene A is expected to trigger the stronger reaction, by at least 2 levels.
expectation = {"expected_a_higher": True, "min_intensity_diff": 2}

# Full marks: strong reaction in A, mild in B, large enough gap.
score, _ = score_c04(
    "a person sprinting away in terror",
    "a person walking away cautiously",
    expectation,
)
check("sprint(5) vs walk(2) = 20점", score == 20, f"got {score}")

# Same intensity in both scenes.
score, _ = score_c04(
    "a person walking forward",
    "a person walking slowly",
    expectation,
)
check("동일 강도 = 낮은 점수", score <= 10, f"got {score}")

# Reversed: B is the stronger one (the gap itself still earns partial credit).
score, _ = score_c04(
    "a person standing still",
    "a person sprinting away",
    expectation,
)
check("역전 = 부분 점수 (강도 차이는 인정)", 8 <= score <= 14, f"got {score}")

# Low-intensity reaction on both sides (no gap).
score, _ = score_c04(
    "a person looking around",
    "a person standing there",
    expectation,
)
check("양쪽 저강도 = 부분 점수", 4 <= score <= 8, f"got {score}")

# A gap exists but is smaller than the required minimum.
score, _ = score_c04(
    "a person jogging away",
    "a person walking forward",
    expectation,
)
check("차이 1 < min_diff 2", score < 20, f"got {score}")


# ═══════════════════════════════════════════════════════════════
print("\n[10/12] C05: 감정 에스컬레이션")
# ═══════════════════════════════════════════════════════════════

# Every check reports the actual score on failure ("got N"); the second
# element returned by score_c05 is not needed here and is discarded.

# Rising sequence with an "increasing" expectation.
s, _ = score_c05([
    "a person stepping back",
    "a person running away in fear",
    "a person desperately fleeing in terror",
], {"expected_trend": "increasing"})
check("증가 추세 = 높은 점수", s >= 16, f"got {s}")

# Falling sequence with a "decreasing" expectation.
s, _ = score_c05([
    "a person sprinting in terror",
    "a person jogging cautiously",
    "a person walking calmly",
], {"expected_trend": "decreasing"})
check("감소 추세 = 높은 점수", s >= 16, f"got {s}")

# Flat sequence with a "stable" expectation.
s, _ = score_c05([
    "a person walking forward",
    "a person walking ahead",
    "a person walking steadily",
], {"expected_trend": "stable"})
check("안정 유지 = 높은 점수", s >= 14, f"got {s}")

# Reversed: an increase is expected but the sequence falls.
s, _ = score_c05([
    "a person desperately fleeing",
    "a person walking calmly",
    "a person standing still",
], {"expected_trend": "increasing"})
check("역전 = 낮은 점수", s <= 8, f"got {s}")

# A single-element sequence is expected to score 0.
s, _ = score_c05(["a person walking"], {"expected_trend": "increasing"})
check("시퀀스 1개 = 0점", s == 0, f"got {s}")

# Four-step sequence, each step stronger than the one before.
s, _ = score_c05([
    "a person standing still",
    "a person stepping back",
    "a person running away",
    "a person desperately sprinting in terror",
], {"expected_trend": "increasing"})
check("4단계 단조 증가", s >= 18, f"got {s}")


# ═══════════════════════════════════════════════════════════════
print("\n[11/12] C08: 모션 표현력")
# ═══════════════════════════════════════════════════════════════

# Expectations for a high-tension scene and for an everyday scene.
gt_high = {"expected_min_intensity": 4, "expected_emotion": True, "expected_min_descriptors": 2}
gt_low = {"expected_min_intensity": 1, "expected_emotion": False, "expected_min_descriptors": 1}

# Full marks: rich description with strong action and emotion.
s, _ = score_c08("a person desperately sprinting right in terror", gt_high)
check("풍부한 표현 = 20점", s == 20, f"got {s}")

# No emotion in the text although emotion is expected.
s, _ = score_c08("a person moving right", gt_high)
check("감정 없음 < 만점", s < 15, f"got {s}")

# Everyday scene where no emotion is expected.
s, _ = score_c08("a person walking forward steadily", gt_low)
check("평상시 적절 = 높은 점수", s >= 14, f"got {s}")

# Over-long text: "a person" (2) + 30 fillers + "running" (1) = 33 words, so
# the input genuinely exceeds 30 words; with 25 fillers it was only 28 words
# and never crossed the limit named in the label.
# NOTE(review): the 30-word limit is taken from the check label — confirm it
# against score_c08. This text also lacks emotion, so `s < 20` alone does not
# isolate the length penalty.
long_motion = "a person " + " ".join(["really"] * 30) + " running"
s, _ = score_c08(long_motion, gt_high)
check("30단어 초과 = 길이 감점", s < 20, f"got {s}")


# ═══════════════════════════════════════════════════════════════
print("\n[12/12] C09: 실시간 성능")
# ═══════════════════════════════════════════════════════════════

# Every check reports the actual score on failure ("got N"); the second
# element returned by score_c09 is not needed here and is discarded.

# Metrics expected to earn full marks.
s, _ = score_c09({
    "fps": 50,
    "cognitive_latency_ms": 2000,
    "frame_drop_rate": 0.005,
    "gpu_memory_stable": True,
})
check("완벽 성능 = 20점", s == 20, f"got {s}")

# Minimum acceptable performance: expect a mid-range score.
s, _ = score_c09({
    "fps": 20,
    "cognitive_latency_ms": 8000,
    "frame_drop_rate": 0.03,
    "gpu_memory_stable": True,
})
check("최소 성능", 5 <= s <= 12, f"got {s}")

# Below the minimum on every metric: expect a very low score.
s, _ = score_c09({
    "fps": 5,
    "cognitive_latency_ms": 15000,
    "frame_drop_rate": 0.1,
    "gpu_memory_stable": False,
})
check("미달 성능 = 낮은 점수", s <= 3, f"got {s}")


# ═══════════════════════════════════════════════════════════════
print("\n[BONUS] 통합 점수 + 등급 경계")
# ═══════════════════════════════════════════════════════════════

# Criterion keys "C01" … "C10", used to build uniform score sheets.
criterion_ids = [f"C{n:02d}" for n in range(1, 11)]

# S-grade boundary: 100 on every criterion.
report = calculate_wm_score(dict.fromkeys(criterion_ids, 100))
check(
    f"만점 = {report['wm_score']} (S등급)",
    report["grade"] == "S" and report["wm_score"] == 1000,
)

# A-grade boundary: 80 on every criterion.
report = calculate_wm_score(dict.fromkeys(criterion_ids, 80))
check(f"80점대 = {report['wm_score']} ({report['grade']}등급)", report["wm_score"] >= 750)

# B grade: the mixed VIDRAFT reference sheet.
report = calculate_wm_score({
    "C01": 65, "C02": 75, "C03": 85, "C04": 90, "C05": 85,
    "C06": 60, "C07": 70, "C08": 80, "C09": 85, "C10": 35,
})
check(
    f"VIDRAFT 기준점 = {report['wm_score']} ({report['grade']}등급)",
    report["grade"] in ("B", "A"),
)

# F grade: 10 on every criterion.
report = calculate_wm_score(dict.fromkeys(criterion_ids, 10))
check(f"최저 = {report['wm_score']} ({report['grade']}등급)", report["wm_score"] < 200)

# All zeros.
report = calculate_wm_score(dict.fromkeys(criterion_ids, 0))
check(
    f"0점 = {report['wm_score']} ({report['grade']}등급)",
    report["wm_score"] == 0 and report["grade"] == "F",
)


# ═══════════════════════════════════════════════════════════════
# Final summary: totals, an overall verdict line and a closing rule.
print("\n" + "=" * 70)
print(f"  결과: {passed}/{total} 통과, {failed}/{total} 실패")
if failed == 0:
    print("  ✅ 모든 테스트 통과 — 채점 시스템 검증 완료")
else:
    print(f"  ❌ {failed}개 실패 — 수정 필요")
print("=" * 70)

# Report failure through the exit status as well, so shells and CI jobs can
# detect a broken scorer without parsing the printed summary.
if failed:
    sys.exit(1)