import os
import torch
import torch.nn as nn
import torch.nn.functional as F
import timm
import numpy as np
from PIL import Image

import gradio as gr
from torchvision import transforms

from transformers import (
    CLIPModel,
    CLIPProcessor,
    BlipProcessor,
    BlipForConditionalGeneration,
)

# =========================================
# 0. 경로 / 디바이스 설정
# =========================================
CLIP_EMBED_PATH = "multimodal_assets/clip_text_embeds.pt"
MODEL_WEIGHTS_PATH = "models/convnext_base_merged_ema.pth"

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(" Device:", device)

# =========================================
# 1. 병합 클래스 이름 & CLIP 텍스트 임베딩 로드
# =========================================
print(" CLIP 텍스트 임베딩 로드 중...")
clip_data = torch.load(CLIP_EMBED_PATH)

merged_class_names = clip_data["class_names"]  # 17개 병합 클래스 이름
clip_prompts = clip_data["prompts"]
text_embeds = clip_data["text_embeds"]  # [17, D]
clip_model_name = clip_data["clip_model_name"]

# 텍스트 임베딩을 디바이스로 올리기
text_embeds = text_embeds.to(device)

print("병합 클래스 수:", len(merged_class_names))
print("병합 클래스 목록:", merged_class_names)

# =========================================
# 2. ConvNeXt-Base 분류 모델 로드
# =========================================
print(" ConvNeXt-Base 모델 로드 중 (timm)...")
num_classes = len(merged_class_names)

convnext_model = timm.create_model(
    "convnext_base",
    pretrained=False,
    num_classes=num_classes,
)
state_dict = torch.load(MODEL_WEIGHTS_PATH, map_location="cpu")
convnext_model.load_state_dict(state_dict)
convnext_model.to(device)
convnext_model.eval()

print(" ConvNeXt-Base 학습 가중치 로드 완료")

# ConvNeXt용 전처리 (검증용)
mean = (0.485, 0.456, 0.406)
std  = (0.229, 0.224, 0.225)

val_transform = transforms.Compose([
    transforms.Resize(256),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize(mean, std),
])

# =========================================
# 3. CLIP 모델 로드
# =========================================
print(f" CLIP 모델 로드 중... ({clip_model_name})")
clip_model = CLIPModel.from_pretrained(clip_model_name)
clip_processor = CLIPProcessor.from_pretrained(clip_model_name)

clip_model.to(device)
clip_model.eval()

# =========================================
# 4. BLIP 캡션 모델 로드
# =========================================
print(" BLIP 캡션 모델 로드 중... (Salesforce/blip-image-captioning-base)")
blip_model_name = "Salesforce/blip-image-captioning-base"
blip_processor = BlipProcessor.from_pretrained(blip_model_name)
blip_model = BlipForConditionalGeneration.from_pretrained(blip_model_name).to(device)
blip_model.eval()

# =========================================
# 5. 세부 메뉴 후보 / 칼로리 정보 정의
# =========================================

# 원래 27개 메뉴(세부 메뉴)
fine_grained_menus = [
    "간장돼불덮밥",
    "고추치킨카레동",
    "공기밥",
    "김치어묵우동",
    "닭강정",
    "돈까스오므라이스",
    "돈까스우동세트",
    "돈까스카레동",
    "등심돈까스",
    "마그마새우튀김알밥",
    "마그마치킨마요",
    "베이컨 알리오올리오",
    "삼겹된장짜글이",
    "삼겹살강된장비빔밥",
    "새우튀김알밥",
    "새우튀김우동",
    "소떡소떡",
    "신라면(계란)",
    "신라면(계란+치즈)",
    "양념치킨오므라이스",
    "어묵우동",
    "에비카레동",
    "오므라이스",
    "쫑쫑이덮밥",
    "치킨마요",
    "케네디소시지",
    "케네디소시지오므라이스",
]

# 병합 대분류 → 세부 메뉴 후보
merged_to_fine = {
    "오므라이스류": ["오므라이스", "돈까스오므라이스", "케네디소시지오므라이스"],
    "치킨마요류": ["치킨마요", "마그마치킨마요"],
    "새우튀김알밥류": ["새우튀김알밥", "마그마새우튀김알밥"],
    "라면류": ["신라면(계란)", "신라면(계란+치즈)"],
}

# 대표 세부 메뉴 (사용자가 선택 안 했을 때 기본값)
default_detail = {
    "오므라이스류": "오므라이스",
    "치킨마요류": "치킨마요",
    "새우튀김알밥류": "새우튀김알밥",
    "라면류": "신라면(계란)",
}

# 아주 대략적인 칼로리 테이블
calorie_table = {
    "간장돼불덮밥": 800,
    "고추치킨카레동": 900,
    "공기밥": 300,
    "김치어묵우동": 500,
    "닭강정": 450,
    "돈까스오므라이스": 950,
    "돈까스우동세트": 900,
    "돈까스카레동": 900,
    "등심돈까스": 700,
    "마그마새우튀김알밥": 800,
    "마그마치킨마요": 850,
    "베이컨 알리오올리오": 800,
    "삼겹된장짜글이": 750,
    "삼겹살강된장비빔밥": 800,
    "새우튀김알밥": 750,
    "새우튀김우동": 550,
    "소떡소떡": 450,
    "신라면(계란)": 570,
    "신라면(계란+치즈)": 630,
    "양념치킨오므라이스": 950,
    "어묵우동": 450,
    "에비카레동": 800,
    "오므라이스": 730,
    "쫑쫑이덮밥": 700,
    "치킨마요": 800,
    "케네디소시지": 280,
    "케네디소시지오므라이스": 1000,
}

# =========================================
# 6. 유틸 함수들
# =========================================

def predict_convnext(image: Image.Image):
    """ConvNeXt-Base로 병합 대분류 예측"""
    convnext_model.eval()
    img_t = val_transform(image).unsqueeze(0).to(device)

    with torch.no_grad():
        logits = convnext_model(img_t)
        probs = F.softmax(logits, dim=-1).cpu().numpy()[0]

    top1_idx = int(np.argmax(probs))
    top1_prob = float(probs[top1_idx])

    # Top-3도 보고싶으면:
    top3_idx = np.argsort(probs)[::-1][:3]
    top3 = [(merged_class_names[i], float(probs[i])) for i in top3_idx]

    return merged_class_names[top1_idx], top1_prob, top3


def recommend_with_clip(image: Image.Image, top_k=3):
    """CLIP으로 병합 대분류 기준 유사 메뉴 Top-K"""
    clip_model.eval()

    inputs = clip_processor(images=image, return_tensors="pt").to(device)

    with torch.no_grad():
        img_feat = clip_model.get_image_features(**inputs)
        img_feat = img_feat / img_feat.norm(dim=-1, keepdim=True)

        sims = (img_feat @ text_embeds.T).squeeze(0)  # [17]
        topk = sims.topk(top_k)

    indices = topk.indices.tolist()
    scores = topk.values.tolist()
    result = [(merged_class_names[i], float(s)) for i, s in zip(indices, scores)]
    return result


def generate_caption(image: Image.Image):
    """BLIP으로 이미지 캡션 생성"""
    blip_model.eval()
    inputs = blip_processor(images=image, return_tensors="pt").to(device)
    with torch.no_grad():
        out = blip_model.generate(**inputs, max_new_tokens=20)
    caption = blip_processor.decode(out[0], skip_special_tokens=True)
    return caption


def calorie_comment(menu_name: str, activity: str):
    kcal = calorie_table.get(menu_name)
    if kcal is None:
        return "이 메뉴에 대한 칼로리 정보가 등록되어 있지 않습니다."

    base = f"예상 칼로리: 약 {kcal} kcal.\n"

    if activity == "거의 안 움직임":
        if kcal >= 900:
            return base + "오늘 활동량을 고려하면 꽤 높은 칼로리라서, 자주 먹기엔 부담될 수 있어요."
        elif kcal >= 600:
            return base + "적당한 편이지만, 간식이나 다른 식사와 함께라면 총량을 조금 신경 쓰면 좋겠어요."
        else:
            return base + "가벼운 편이라 큰 부담 없이 먹어도 괜찮은 수준이에요."
    elif activity == "보통 활동":
        if kcal >= 1000:
            return base + "활동량을 고려해도 꽤 든든한 한 끼라서, 다른 끼니는 조금 가볍게 구성하면 좋아요."
        elif kcal >= 700:
            return base + "하루 한 끼 메인으로 먹기 좋은 정도의 칼로리예요."
        else:
            return base + "조금 가벼운 편이라, 배가 빨리 꺼질 수는 있어요."
    else:  # 많이 움직임
        if kcal >= 1000:
            return base + "활동량이 많다면 이 정도 칼로리는 충분히 잘 쓰일 거예요!"
        elif kcal >= 700:
            return base + "운동 전후 한 끼로 적당한 수준의 에너지 공급이 될 것 같아요."
        else:
            return base + "활동량에 비해 조금 가벼운 편이라, 간단한 간식을 더 곁들여도 좋겠어요."


# =========================================
# 7. Gradio 웹앱 메인 함수
# =========================================

def analyze_menu(image, activity_level, detail_menu_choice):
    """
    image: 업로드된 이미지 (PIL)
    activity_level: 활동량 (라디오 버튼)
    detail_menu_choice: 사용자가 선택한 세부 메뉴 (드롭다운)
    """
    if image is None:
        return "이미지를 업로드해 주세요.", "", "", ""

    # 1) ConvNeXt로 병합 대분류 예측
    big_cls, big_prob, top3_conv = predict_convnext(image)

    # 2) 해당 대분류에 세부 후보가 있는지 확인
    fine_candidates = merged_to_fine.get(big_cls, [])

    # 3) 세부 메뉴 결정 로직
    if detail_menu_choice is not None and detail_menu_choice != "선택 안 함 (모델에 맡기기)":
        final_menu = detail_menu_choice
        detail_info = f"사용자가 직접 선택한 세부 메뉴: **{final_menu}**"
    else:
        # 사용자가 직접 선택 안 한 경우
        if big_cls in default_detail:
            final_menu = default_detail[big_cls]
            detail_info = (
                f"예측 대분류: **{big_cls}** (신뢰도: {big_prob*100:.2f}%)\n"
                f"세부 메뉴는 선택하지 않아, 대표 메뉴 **'{final_menu}'** 기준으로 칼로리를 안내합니다.\n"
                f"(선택 메뉴를 바꾸면 칼로리 문장이 달라질 수 있어요)"
            )
        else:
            # 대분류 자체가 이미 최종 메뉴인 경우
            final_menu = big_cls
            detail_info = f"예측 메뉴: **{final_menu}** (신뢰도: {big_prob*100:.2f}%)"

    # 4) CLIP Top-3 유사 병합 메뉴
    clip_top3 = recommend_with_clip(image, top_k=3)
    clip_text_lines = []
    for name, score in clip_top3:
        clip_text_lines.append(f"- {name} (유사도: {score:.4f})")
    clip_text = "\n".join(clip_text_lines)

    # 5) BLIP 캡션 생성
    caption = generate_caption(image)

    # 6) 칼로리 코멘트
    kcal_text = calorie_comment(final_menu, activity_level)

    # 7) 안내 문구 (세부 후보 보여주기)
    if fine_candidates:
        candidate_text = (
            f"이 이미지는 **'{big_cls}'**(으)로 분류되었습니다.\n\n"
            f"이 대분류에 해당하는 세부 메뉴 후보:\n" +
            "\n".join([f"- {m}" for m in fine_candidates]) +
            "\n\n위 드롭다운에서 세부 메뉴를 직접 선택하면 칼로리 안내가 더 정확해집니다."
        )
    else:
        candidate_text = f"이 이미지는 **'{big_cls}'**(으)로 분류되었고, 별도의 세부 메뉴 분기는 없는 카테고리입니다."

    # 최종 요약 메시지
    summary = (
        f"###  최종 메뉴 분석\n"
        f"- 예측 대분류: **{big_cls}** (신뢰도: {big_prob*100:.2f}%)\n"
        f"- 최종 기준 메뉴: **{final_menu}**\n"
        f"- 활동량: **{activity_level}**\n\n"
        f"###  세부 메뉴 정보\n{detail_info}\n\n"
        f"###  ConvNeXt Top-3 (병합 클래스 기준)\n" +
        "\n".join([f"- {name} ({p*100:.2f}%)" for name, p in top3_conv]) +
        "\n\n"
        f"###  CLIP 유사 메뉴 Top-3 (병합 클래스 기준)\n{clip_text}\n\n"
        f"###  BLIP 캡션 (영어)\n> {caption}\n\n"
        f"###  칼로리 & 활동량 코멘트\n{kcal_text}\n\n"
        f"---\n"
        f"{candidate_text}"
    )

    return summary, caption, clip_text, kcal_text


# =========================================
# 8. Gradio 인터페이스 정의
# =========================================

with gr.Blocks() as demo:
    gr.Markdown("##  학식 스캐너")

    with gr.Row():
        with gr.Column():
            img_input = gr.Image(type="pil", label="메뉴 사진 업로드")

            activity_input = gr.Radio(
                choices=["거의 안 움직임", "보통 활동", "많이 움직임"],
                value="보통 활동",
                label="오늘 활동량",
            )

            detail_menu_input = gr.Dropdown(
                choices=["선택 안 함 (모델에 맡기기)"] + fine_grained_menus,
                value="선택 안 함 (모델에 맡기기)",
                label="세부 메뉴 (선택하면 칼로리 계산에 사용)",
            )

            run_btn = gr.Button("분석 실행 ")

        with gr.Column():
            summary_output = gr.Markdown(label="분석 결과 요약")
            caption_output = gr.Textbox(label="BLIP 캡션 (영어)", lines=2)
            clip_output = gr.Textbox(label="CLIP 유사 병합 메뉴 Top-3", lines=4)
            kcal_output = gr.Textbox(label="칼로리 코멘트", lines=3)

    run_btn.click(
        fn=analyze_menu,
        inputs=[img_input, activity_input, detail_menu_input],
        outputs=[summary_output, caption_output, clip_output, kcal_output],
    )

demo.launch()