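"""학식 스캐너 (cafeteria menu scanner) - Gradio demo app.

Pipeline: a fine-tuned ConvNeXt-Base classifies a photo into one of the
merged menu categories, CLIP ranks the most similar categories against
precomputed text embeddings, BLIP generates an English caption, and a
rough calorie table turns the result into an activity-aware comment.
"""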
import os
import torch
import torch.nn as nn
import torch.nn.functional as F
import timm
import numpy as np
from PIL import Image
import gradio as gr
from torchvision import transforms
from transformers import (
    CLIPModel,
    CLIPProcessor,
    BlipProcessor,
    BlipForConditionalGeneration,
)

# =========================================
# 0. Paths / device setup
# =========================================
CLIP_EMBED_PATH = "multimodal_assets/clip_text_embeds.pt"
MODEL_WEIGHTS_PATH = "models/convnext_base_merged_ema.pth"
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Device:", device)

# =========================================
# 1. Load merged class names & CLIP text embeddings
# =========================================
print("Loading CLIP text embeddings...")
# map_location="cpu" keeps the load device-agnostic; the embeddings are
# moved to `device` explicitly below.
clip_data = torch.load(CLIP_EMBED_PATH, map_location="cpu")
merged_class_names = clip_data["class_names"]  # 17 merged class names
clip_prompts = clip_data["prompts"]
text_embeds = clip_data["text_embeds"]  # [17, D]
clip_model_name = clip_data["clip_model_name"]
# Move the text embeddings to the target device
text_embeds = text_embeds.to(device)
print("Number of merged classes:", len(merged_class_names))
print("Merged classes:", merged_class_names)

# =========================================
# 2. Load the ConvNeXt-Base classifier
# =========================================
print("Loading ConvNeXt-Base model (timm)...")
num_classes = len(merged_class_names)
convnext_model = timm.create_model(
    "convnext_base",
    pretrained=False,
    num_classes=num_classes,
)
state_dict = torch.load(MODEL_WEIGHTS_PATH, map_location="cpu")
convnext_model.load_state_dict(state_dict)
convnext_model.to(device)
convnext_model.eval()
print(" ConvNeXt-Base ํ•™์Šต ๊ฐ€์ค‘์น˜ ๋กœ๋“œ ์™„๋ฃŒ")

# Preprocessing for ConvNeXt (validation-style)
mean = (0.485, 0.456, 0.406)
std = (0.229, 0.224, 0.225)
val_transform = transforms.Compose([
    transforms.Resize(256),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize(mean, std),
])
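
# Note: Resize(256) + CenterCrop(224) with ImageNet statistics assumes the
# checkpoint was fine-tuned with this standard eval pipeline; adjust these
# transforms if training used a different input size.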

# =========================================
# 3. Load the CLIP model
# =========================================
print(f"Loading CLIP model... ({clip_model_name})")
clip_model = CLIPModel.from_pretrained(clip_model_name)
clip_processor = CLIPProcessor.from_pretrained(clip_model_name)
clip_model.to(device)
clip_model.eval()

# =========================================
# 4. Load the BLIP captioning model
# =========================================
print("Loading BLIP captioning model... (Salesforce/blip-image-captioning-base)")
blip_model_name = "Salesforce/blip-image-captioning-base"
blip_processor = BlipProcessor.from_pretrained(blip_model_name)
blip_model = BlipForConditionalGeneration.from_pretrained(blip_model_name).to(device)
blip_model.eval()

# =========================================
# 5. Fine-grained menu candidates / calorie info
# =========================================
# The original 27 fine-grained menus
fine_grained_menus = [
"๊ฐ„์žฅ๋ผ๋ถˆ๋ฎ๋ฐฅ",
"๊ณ ์ถ”์น˜ํ‚จ์นด๋ ˆ๋™",
"๊ณต๊ธฐ๋ฐฅ",
"๊น€์น˜์–ด๋ฌต์šฐ๋™",
"๋‹ญ๊ฐ•์ •",
"๋ˆ๊นŒ์Šค์˜ค๋ฏ€๋ผ์ด์Šค",
"๋ˆ๊นŒ์Šค์šฐ๋™์„ธํŠธ",
"๋ˆ๊นŒ์Šค์นด๋ ˆ๋™",
"๋“ฑ์‹ฌ๋ˆ๊นŒ์Šค",
"๋งˆ๊ทธ๋งˆ์ƒˆ์šฐํŠ€๊น€์•Œ๋ฐฅ",
"๋งˆ๊ทธ๋งˆ์น˜ํ‚จ๋งˆ์š”",
"๋ฒ ์ด์ปจ ์•Œ๋ฆฌ์˜ค์˜ฌ๋ฆฌ์˜ค",
"์‚ผ๊ฒน๋œ์žฅ์งœ๊ธ€์ด",
"์‚ผ๊ฒน์‚ด๊ฐ•๋œ์žฅ๋น„๋น”๋ฐฅ",
"์ƒˆ์šฐํŠ€๊น€์•Œ๋ฐฅ",
"์ƒˆ์šฐํŠ€๊น€์šฐ๋™",
"์†Œ๋–ก์†Œ๋–ก",
"์‹ ๋ผ๋ฉด(๊ณ„๋ž€)",
"์‹ ๋ผ๋ฉด(๊ณ„๋ž€+์น˜์ฆˆ)",
"์–‘๋…์น˜ํ‚จ์˜ค๋ฏ€๋ผ์ด์Šค",
"์–ด๋ฌต์šฐ๋™",
"์—๋น„์นด๋ ˆ๋™",
"์˜ค๋ฏ€๋ผ์ด์Šค",
"์ซ‘์ซ‘์ด๋ฎ๋ฐฅ",
"์น˜ํ‚จ๋งˆ์š”",
"์ผ€๋„ค๋””์†Œ์‹œ์ง€",
"์ผ€๋„ค๋””์†Œ์‹œ์ง€์˜ค๋ฏ€๋ผ์ด์Šค",
]
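
# These names double as data keys: they must match the calorie_table
# entries below (and the labels used at training time) character for
# character.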

# Merged category → fine-grained menu candidates
merged_to_fine = {
    "오므라이스류": ["오므라이스", "돈까스오므라이스", "케네디소시지오므라이스"],
    "치킨마요류": ["치킨마요", "마그마치킨마요"],
    "새우튀김알밥류": ["새우튀김알밥", "마그마새우튀김알밥"],
    "라면류": ["신라면(계란)", "신라면(계란+치즈)"],
}

# Representative fine-grained menu (default when the user selects nothing)
default_detail = {
    "오므라이스류": "오므라이스",
    "치킨마요류": "치킨마요",
    "새우튀김알밥류": "새우튀김알밥",
    "라면류": "신라면(계란)",
}

# Very rough calorie table (kcal per serving)
calorie_table = {
    "간장라불덮밥": 800,
    "고추치킨카레동": 900,
    "공기밥": 300,
    "김치어묵우동": 500,
    "닭강정": 450,
    "돈까스오므라이스": 950,
    "돈까스우동세트": 900,
    "돈까스카레동": 900,
    "등심돈까스": 700,
    "마그마새우튀김알밥": 800,
    "마그마치킨마요": 850,
    "베이컨 알리오올리오": 800,
    "삼겹된장짜글이": 750,
    "삼겹살강된장비빔밥": 800,
    "새우튀김알밥": 750,
    "새우튀김우동": 550,
    "소떡소떡": 450,
    "신라면(계란)": 570,
    "신라면(계란+치즈)": 630,
    "양념치킨오므라이스": 950,
    "어묵우동": 450,
    "에비카레동": 800,
    "오므라이스": 730,
    "쫑쫑이덮밥": 700,
    "치킨마요": 800,
    "케네디소시지": 280,
    "케네디소시지오므라이스": 1000,
}
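
# Optional sanity check: every fine-grained menu should have a calorie
# entry, so calorie_comment never falls back to "no info" for known menus.
missing_kcal = [m for m in fine_grained_menus if m not in calorie_table]
if missing_kcal:
    print("Warning: menus without calorie info:", missing_kcal)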

# =========================================
# 6. Utility functions
# =========================================
def predict_convnext(image: Image.Image):
    """Predict the merged category with ConvNeXt-Base."""
    # Force 3-channel RGB; uploads may arrive as RGBA or grayscale
    img_t = val_transform(image.convert("RGB")).unsqueeze(0).to(device)
    with torch.no_grad():
        logits = convnext_model(img_t)
        probs = F.softmax(logits, dim=-1).cpu().numpy()[0]
    top1_idx = int(np.argmax(probs))
    top1_prob = float(probs[top1_idx])
    # Top-3 predictions as well
    top3_idx = np.argsort(probs)[::-1][:3]
    top3 = [(merged_class_names[i], float(probs[i])) for i in top3_idx]
    return merged_class_names[top1_idx], top1_prob, top3

def recommend_with_clip(image: Image.Image, top_k=3):
    """Top-K similar menus (merged categories) via CLIP."""
    inputs = clip_processor(images=image, return_tensors="pt").to(device)
    with torch.no_grad():
        img_feat = clip_model.get_image_features(**inputs)
        img_feat = img_feat / img_feat.norm(dim=-1, keepdim=True)
        sims = (img_feat @ text_embeds.T).squeeze(0)  # [17]
    topk = sims.topk(top_k)
    indices = topk.indices.tolist()
    scores = topk.values.tolist()
    result = [(merged_class_names[i], float(s)) for i, s in zip(indices, scores)]
    return result
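
# The returned scores are cosine similarities (both sides L2-normalized),
# so they lie in [-1, 1]; higher means more similar.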

def generate_caption(image: Image.Image):
    """Generate an image caption with BLIP."""
    inputs = blip_processor(images=image, return_tensors="pt").to(device)
    with torch.no_grad():
        out = blip_model.generate(**inputs, max_new_tokens=20)
    caption = blip_processor.decode(out[0], skip_special_tokens=True)
    return caption

def calorie_comment(menu_name: str, activity: str):
    """Return a Korean calorie comment for the menu, adjusted by activity level."""
    kcal = calorie_table.get(menu_name)
    if kcal is None:
        return "이 메뉴에 대한 칼로리 정보가 등록되어 있지 않습니다."
    base = f"예상 칼로리: 약 {kcal} kcal.\n"
    if activity == "거의 안 움직임":
        if kcal >= 900:
            return base + "오늘 활동량을 고려하면 꽤 높은 칼로리라서, 자주 먹기엔 부담될 수 있어요."
        elif kcal >= 600:
            return base + "적당한 편이지만, 간식이나 다른 식사와 함께라면 총량을 조금 신경 쓰면 좋겠어요."
        else:
            return base + "가벼운 편이라 큰 부담 없이 먹어도 괜찮은 수준이에요."
    elif activity == "보통 활동":
        if kcal >= 1000:
            return base + "활동량을 고려해도 꽤 든든한 한 끼라서, 다른 끼니는 조금 가볍게 구성하면 좋아요."
        elif kcal >= 700:
            return base + "하루 한 끼 메인으로 먹기 좋은 정도의 칼로리예요."
        else:
            return base + "조금 가벼운 편이라, 배가 빨리 꺼질 수는 있어요."
    else:  # "많이 움직임"
        if kcal >= 1000:
            return base + "활동량이 많다면 이 정도 칼로리는 충분히 잘 쓰일 거예요!"
        elif kcal >= 700:
            return base + "운동 전후 한 끼로 적당한 수준의 에너지 공급이 될 것 같아요."
        else:
            return base + "활동량에 비해 조금 가벼운 편이라, 간단한 간식을 더 곁들여도 좋겠어요."
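
# Example: calorie_comment("오므라이스", "보통 활동")
# -> "예상 칼로리: 약 730 kcal.\n하루 한 끼 메인으로 먹기 좋은 정도의 칼로리예요."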

# =========================================
# 7. Gradio web app main function
# =========================================
def analyze_menu(image, activity_level, detail_menu_choice):
    """
    image: uploaded image (PIL)
    activity_level: activity level (radio button)
    detail_menu_choice: fine-grained menu selected by the user (dropdown)
    """
    if image is None:
        return "이미지를 업로드해 주세요.", "", "", ""

    # 1) Predict the merged category with ConvNeXt
    big_cls, big_prob, top3_conv = predict_convnext(image)

    # 2) Check whether this category has fine-grained candidates
    fine_candidates = merged_to_fine.get(big_cls, [])

    # 3) Decide the fine-grained menu
    if detail_menu_choice is not None and detail_menu_choice != "선택 안 함 (모델에 맡기기)":
        final_menu = detail_menu_choice
        detail_info = f"사용자가 직접 선택한 세부 메뉴: **{final_menu}**"
    else:
        # The user made no explicit selection
        if big_cls in default_detail:
            final_menu = default_detail[big_cls]
            detail_info = (
                f"예측 대분류: **{big_cls}** (신뢰도: {big_prob*100:.2f}%)\n"
                f"세부 메뉴는 선택하지 않아, 대표 메뉴 **'{final_menu}'** 기준으로 칼로리를 안내합니다.\n"
                f"(선택 메뉴를 바꾸면 칼로리 문장이 달라질 수 있어요)"
            )
        else:
            # The merged category itself is already the final menu
            final_menu = big_cls
            detail_info = f"예측 메뉴: **{final_menu}** (신뢰도: {big_prob*100:.2f}%)"

    # 4) CLIP Top-3 similar merged menus
    clip_top3 = recommend_with_clip(image, top_k=3)
    clip_text_lines = []
    for name, score in clip_top3:
        clip_text_lines.append(f"- {name} (유사도: {score:.4f})")
    clip_text = "\n".join(clip_text_lines)

    # 5) Generate the BLIP caption
    caption = generate_caption(image)

    # 6) Calorie comment
    kcal_text = calorie_comment(final_menu, activity_level)

    # 7) Guidance text (show the fine-grained candidates)
    if fine_candidates:
        candidate_text = (
            f"이 이미지는 **'{big_cls}'**(으)로 분류되었습니다.\n\n"
            f"이 대분류에 해당하는 세부 메뉴 후보:\n" +
            "\n".join([f"- {m}" for m in fine_candidates]) +
            "\n\n위 드롭다운에서 세부 메뉴를 직접 선택하면 칼로리 안내가 더 정확해집니다."
        )
    else:
        candidate_text = f"이 이미지는 **'{big_cls}'**(으)로 분류되었고, 별도의 세부 메뉴 분기는 없는 카테고리입니다."

    # Final summary message
    summary = (
        f"### 최종 메뉴 분석\n"
        f"- 예측 대분류: **{big_cls}** (신뢰도: {big_prob*100:.2f}%)\n"
        f"- 최종 기준 메뉴: **{final_menu}**\n"
        f"- 활동량: **{activity_level}**\n\n"
        f"### 세부 메뉴 정보\n{detail_info}\n\n"
        f"### ConvNeXt Top-3 (병합 클래스 기준)\n" +
        "\n".join([f"- {name} ({p*100:.2f}%)" for name, p in top3_conv]) +
        "\n\n"
        f"### CLIP 유사 메뉴 Top-3 (병합 클래스 기준)\n{clip_text}\n\n"
        f"### BLIP 캡션 (영어)\n> {caption}\n\n"
        f"### 칼로리 & 활동량 코멘트\n{kcal_text}\n\n"
        f"---\n"
        f"{candidate_text}"
    )

    return summary, caption, clip_text, kcal_text

# =========================================
# 8. Gradio interface definition
# =========================================
with gr.Blocks() as demo:
    gr.Markdown("## 학식 스캐너")
    with gr.Row():
        with gr.Column():
            img_input = gr.Image(type="pil", label="메뉴 사진 업로드")
            activity_input = gr.Radio(
                choices=["거의 안 움직임", "보통 활동", "많이 움직임"],
                value="보통 활동",
                label="오늘 활동량",
            )
            detail_menu_input = gr.Dropdown(
                choices=["선택 안 함 (모델에 맡기기)"] + fine_grained_menus,
                value="선택 안 함 (모델에 맡기기)",
                label="세부 메뉴 (선택하면 칼로리 계산에 사용)",
            )
            run_btn = gr.Button("분석 실행")
        with gr.Column():
            summary_output = gr.Markdown(label="분석 결과 요약")
            caption_output = gr.Textbox(label="BLIP 캡션 (영어)", lines=2)
            clip_output = gr.Textbox(label="CLIP 유사 병합 메뉴 Top-3", lines=4)
            kcal_output = gr.Textbox(label="칼로리 코멘트", lines=3)

    run_btn.click(
        fn=analyze_menu,
        inputs=[img_input, activity_input, detail_menu_input],
        outputs=[summary_output, caption_output, clip_output, kcal_output],
    )

demo.launch()