Upload app.py
app.py
ADDED
@@ -0,0 +1,242 @@
import os
import torch
import torch.nn as nn
import torch.nn.functional as F
import timm
import numpy as np
from PIL import Image
from torchvision import transforms
import gradio as gr

from transformers import (
    CLIPModel,
    CLIPProcessor,
    BlipProcessor,
    BlipForConditionalGeneration,
)
# =========================================
# 0. Paths / device setup
# =========================================
CLIP_EMBED_PATH = "multimodal_assets/clip_text_embeds.pt"
MODEL_WEIGHTS_PATH = "models/convnext_base_merged_ema.pth"

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Device:", device)
# =========================================
# 1. Load the CLIP text embeddings
# =========================================
print("Loading CLIP text embeddings...")
clip_data = torch.load(CLIP_EMBED_PATH, map_location="cpu")

merged_class_names = clip_data["class_names"]
clip_prompts = clip_data["prompts"]
text_embeds = clip_data["text_embeds"]
clip_model_name = clip_data["clip_model_name"]

text_embeds = text_embeds.to(device)

print("Number of merged classes:", len(merged_class_names))
print("Merged class list:", merged_class_names)
# =========================================
# 2. Load the ConvNeXt-Base classifier
# =========================================
print("Loading ConvNeXt-Base model (timm)...")
num_classes = len(merged_class_names)

convnext_model = timm.create_model(
    "convnext_base",
    pretrained=False,
    num_classes=num_classes,
)

state_dict = torch.load(MODEL_WEIGHTS_PATH, map_location="cpu")
convnext_model.load_state_dict(state_dict)
convnext_model.to(device)
convnext_model.eval()

print("ConvNeXt-Base trained weights loaded")

# ImageNet normalization statistics
mean = (0.485, 0.456, 0.406)
std = (0.229, 0.224, 0.225)

val_transform = transforms.Compose([
    transforms.Resize(256),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize(mean, std),
])
# =========================================
# 3. Load the CLIP model
# =========================================
print(f"Loading CLIP model... ({clip_model_name})")
clip_model = CLIPModel.from_pretrained(clip_model_name)
clip_processor = CLIPProcessor.from_pretrained(clip_model_name)
clip_model.to(device)
clip_model.eval()
# =========================================
# 4. Load the BLIP captioning model
# =========================================
print("Loading BLIP model...")
blip_model_name = "Salesforce/blip-image-captioning-base"
blip_processor = BlipProcessor.from_pretrained(blip_model_name)
blip_model = BlipForConditionalGeneration.from_pretrained(blip_model_name).to(device)
blip_model.eval()
# =========================================
# 5. Fine-grained menu / calorie tables
# =========================================
# merged_to_fine and default_detail are looked up with the merged class
# names predicted by the classifier; calorie_table is looked up with the
# fine-grained menu names.
fine_grained_menus = [
    "๊ฐ์ฅ๋ผ๋ถ๋ฎ๋ฐฅ", "๊ณ ์ถ์นํจ์นด๋ ๋", "๊ณต๊ธฐ๋ฐฅ", "๊น์น์ด๋ฌต์ฐ๋", "๋ญ๊ฐ์ ",
    "๋๊น์ค์ค๋ฏ๋ผ์ด์ค", "๋๊น์ค์ฐ๋์ธํธ", "๋๊น์ค์นด๋ ๋", "๋ฑ์ฌ๋๊น์ค",
    "๋ง๊ทธ๋ง์์ฐํ๊น์๋ฐฅ", "๋ง๊ทธ๋ง์นํจ๋ง์", "๋ฒ ์ด์ปจ ์๋ฆฌ์ค์ฌ๋ฆฌ์ค", "์ผ๊ฒน๋์ฅ์ง๊ธ์ด",
    "์ผ๊ฒน์ด๊ฐ๋์ฅ๋น๋น๋ฐฅ", "์์ฐํ๊น์๋ฐฅ", "์์ฐํ๊น์ฐ๋", "์๋ก์๋ก",
    "์ ๋ผ๋ฉด(๊ณ๋)", "์ ๋ผ๋ฉด(๊ณ๋+์น์ฆ)", "์๋์นํจ์ค๋ฏ๋ผ์ด์ค", "์ด๋ฌต์ฐ๋",
    "์๋น์นด๋ ๋", "์ค๋ฏ๋ผ์ด์ค", "์ซ์ซ์ด๋ฎ๋ฐฅ", "์นํจ๋ง์", "์ผ๋ค๋์์์ง",
    "์ผ๋ค๋์์์ง์ค๋ฏ๋ผ์ด์ค",
]

merged_to_fine = {
    "์ค๋ฏ๋ผ์ด์ค๋ฅ": ["์ค๋ฏ๋ผ์ด์ค", "๋๊น์ค์ค๋ฏ๋ผ์ด์ค", "์ผ๋ค๋์์์ง์ค๋ฏ๋ผ์ด์ค"],
    "์นํจ๋ง์๋ฅ": ["์นํจ๋ง์", "๋ง๊ทธ๋ง์นํจ๋ง์"],
    "์์ฐํ๊น์๋ฐฅ๋ฅ": ["์์ฐํ๊น์๋ฐฅ", "๋ง๊ทธ๋ง์์ฐํ๊น์๋ฐฅ"],
    "๋ผ๋ฉด๋ฅ": ["์ ๋ผ๋ฉด(๊ณ๋)", "์ ๋ผ๋ฉด(๊ณ๋+์น์ฆ)"],
}

default_detail = {
    "์ค๋ฏ๋ผ์ด์ค๋ฅ": "์ค๋ฏ๋ผ์ด์ค",
    "์นํจ๋ง์๋ฅ": "์นํจ๋ง์",
    "์์ฐํ๊น์๋ฐฅ๋ฅ": "์์ฐํ๊น์๋ฐฅ",
    "๋ผ๋ฉด๋ฅ": "์ ๋ผ๋ฉด(๊ณ๋)",
}

calorie_table = {
    "๊ฐ์ฅ๋ผ๋ถ๋ฎ๋ฐฅ": 800, "๊ณ ์ถ์นํจ์นด๋ ๋": 900, "๊ณต๊ธฐ๋ฐฅ": 300,
    "๊น์น์ด๋ฌต์ฐ๋": 500, "๋ญ๊ฐ์ ": 450, "๋๊น์ค์ค๋ฏ๋ผ์ด์ค": 950,
    "๋๊น์ค์ฐ๋์ธํธ": 900, "๋๊น์ค์นด๋ ๋": 900, "๋ฑ์ฌ๋๊น์ค": 700,
    "๋ง๊ทธ๋ง์์ฐํ๊น์๋ฐฅ": 800, "๋ง๊ทธ๋ง์นํจ๋ง์": 850,
    "๋ฒ ์ด์ปจ ์๋ฆฌ์ค์ฌ๋ฆฌ์ค": 800, "์ผ๊ฒน๋์ฅ์ง๊ธ์ด": 750,
    "์ผ๊ฒน์ด๊ฐ๋์ฅ๋น๋น๋ฐฅ": 800, "์์ฐํ๊น์๋ฐฅ": 750, "์์ฐํ๊น์ฐ๋": 550,
    "์๋ก์๋ก": 450, "์ ๋ผ๋ฉด(๊ณ๋)": 570, "์ ๋ผ๋ฉด(๊ณ๋+์น์ฆ)": 630,
    "์๋์นํจ์ค๋ฏ๋ผ์ด์ค": 950, "์ด๋ฌต์ฐ๋": 450, "์๋น์นด๋ ๋": 800,
    "์ค๋ฏ๋ผ์ด์ค": 730, "์ซ์ซ์ด๋ฎ๋ฐฅ": 700, "์นํจ๋ง์": 800,
    "์ผ๋ค๋์์์ง": 280, "์ผ๋ค๋์์์ง์ค๋ฏ๋ผ์ด์ค": 1000,
}
# =========================================
# 6. Model helper functions
# =========================================

def predict_convnext(image: Image.Image):
    # Top-1 prediction over the merged classes with the fine-tuned ConvNeXt.
    img_t = val_transform(image).unsqueeze(0).to(device)
    with torch.no_grad():
        logits = convnext_model(img_t)
        probs = F.softmax(logits, dim=-1).cpu().numpy()[0]
    top1 = int(np.argmax(probs))
    top1_prob = float(probs[top1])
    return merged_class_names[top1], top1_prob

def recommend_with_clip(image: Image.Image):
    # Top-3 classes by similarity between the CLIP image embedding
    # and the precomputed text embeddings.
    inputs = clip_processor(images=image, return_tensors="pt").to(device)
    with torch.no_grad():
        img_feat = clip_model.get_image_features(**inputs)
        img_feat = img_feat / img_feat.norm(dim=-1, keepdim=True)
        sims = (img_feat @ text_embeds.T).squeeze(0)
    topk = sims.topk(3)
    result = [(merged_class_names[i], float(s)) for i, s in zip(topk.indices.tolist(), topk.values.tolist())]
    return result

def generate_caption(image: Image.Image):
    # Short natural-language caption from BLIP.
    inputs = blip_processor(images=image, return_tensors="pt").to(device)
    with torch.no_grad():
        out = blip_model.generate(**inputs, max_new_tokens=20)
    return blip_processor.decode(out[0], skip_special_tokens=True)

def calorie_comment(menu_name: str, activity: str):
    kcal = calorie_table.get(menu_name, None)
    if kcal is None:
        return "No calorie information available"
    return f"{menu_name}: about {kcal} kcal"
# =========================================
# 7. Web app main logic
# =========================================

def analyze_menu(image, activity_level, detail_menu_choice):
    if image is None:
        return "Please upload an image.", "", "", ""

    # 1) ConvNeXt coarse classification
    big_cls, prob = predict_convnext(image)

    # 2) Decide the fine-grained menu
    fine_candidates = merged_to_fine.get(big_cls, [])
    if detail_menu_choice != "No selection (let the model decide)":
        final_menu = detail_menu_choice
    else:
        final_menu = default_detail.get(big_cls, big_cls)

    # 3) CLIP Top-3
    clip_top3 = recommend_with_clip(image)
    clip_text = "\n".join([f"- {n} ({s:.4f})" for n, s in clip_top3])

    # 4) BLIP caption
    caption = generate_caption(image)

    # 5) Calories
    kcal = calorie_comment(final_menu, activity_level)

    # 6) Output
    summary = (
        f"### Final menu analysis\n"
        f"- Predicted coarse class: **{big_cls}** ({prob*100:.2f}%)\n"
        f"- Final fine-grained menu: **{final_menu}**\n\n"
        f"### CLIP Top-3\n{clip_text}\n\n"
        f"### BLIP caption\n> {caption}\n\n"
        f"### Calorie info\n{kcal}"
    )
    return summary, caption, clip_text, kcal
# =========================================
# 8. Gradio interface
# =========================================

with gr.Blocks() as demo:
    gr.Markdown("## Meal Scanner")

    with gr.Row():
        with gr.Column():
            img_input = gr.Image(type="pil", label="Upload a menu photo")

            activity_input = gr.Radio(
                choices=["Mostly sedentary", "Moderately active", "Very active"],
                value="Moderately active",
                label="Today's activity level",
            )

            detail_menu_input = gr.Dropdown(
                choices=["No selection (let the model decide)"] + fine_grained_menus,
                value="No selection (let the model decide)",
                label="Fine-grained menu selection",
            )

            btn = gr.Button("Analyze")

        with gr.Column():
            summary_output = gr.Markdown()
            caption_output = gr.Textbox(label="BLIP caption")
            clip_output = gr.Textbox(label="CLIP Top-3")
            kcal_output = gr.Textbox(label="Calories")

    btn.click(
        analyze_menu,
        inputs=[img_input, activity_input, detail_menu_input],
        outputs=[summary_output, caption_output, clip_output, kcal_output],
    )

demo.launch()