Spaces:
Sleeping
Sleeping
| import os | |
| import torch | |
| import torch.nn as nn | |
| import torch.nn.functional as F | |
| import timm | |
| import numpy as np | |
| from PIL import Image | |
| import gradio as gr | |
| from torchvision import transforms | |
| from transformers import ( | |
| CLIPModel, | |
| CLIPProcessor, | |
| BlipProcessor, | |
| BlipForConditionalGeneration, | |
| ) | |
# =========================================
# 0. Paths / device setup
# =========================================
CLIP_EMBED_PATH = "multimodal_assets/clip_text_embeds.pt"
MODEL_WEIGHTS_PATH = "models/convnext_base_merged_ema.pth"

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(" Device:", device)

# =========================================
# 1. Merged class names & CLIP text-embedding load
# =========================================
print(" CLIP ํ ์คํธ ์๋ฒ ๋ฉ ๋ก๋ ์ค...")
# Fix: map_location="cpu" so a checkpoint saved from a CUDA process still
# loads on a CPU-only machine (the original call would raise). Tensors are
# moved to the inference device explicitly below.
# NOTE(review): torch.load unpickles arbitrary objects — only load trusted
# checkpoint files.
clip_data = torch.load(CLIP_EMBED_PATH, map_location="cpu")
merged_class_names = clip_data["class_names"]   # merged class names (17 expected)
clip_prompts = clip_data["prompts"]             # prompts used to build the embeds
text_embeds = clip_data["text_embeds"]          # [num_classes, D] text features
clip_model_name = clip_data["clip_model_name"]  # HF model id used for the embeds

# Move the text embeddings onto the inference device once, up front.
text_embeds = text_embeds.to(device)

print("๋ณํฉ ํด๋์ค ์:", len(merged_class_names))
print("๋ณํฉ ํด๋์ค ๋ชฉ๋ก:", merged_class_names)
# =========================================
# 2. ConvNeXt-Base classifier load
# =========================================
print(" ConvNeXt-Base ๋ชจ๋ธ ๋ก๋ ์ค (timm)...")
num_classes = len(merged_class_names)

# Build the architecture only (pretrained=False); the weights come from the
# fine-tuned checkpoint loaded right below.
convnext_model = timm.create_model(
    "convnext_base",
    pretrained=False,
    num_classes=num_classes,
)
state_dict = torch.load(MODEL_WEIGHTS_PATH, map_location="cpu")
convnext_model.load_state_dict(state_dict)
convnext_model.to(device)
convnext_model.eval()
print(" ConvNeXt-Base ํ์ต ๊ฐ์ค์น ๋ก๋ ์๋ฃ")

# Validation-time preprocessing for ConvNeXt (standard ImageNet mean/std,
# resize-256 then center-crop-224 — presumably matching training; confirm
# against the training pipeline).
mean = (0.485, 0.456, 0.406)
std = (0.229, 0.224, 0.225)
val_transform = transforms.Compose([
    transforms.Resize(256),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize(mean, std),
])
# =========================================
# 3. CLIP model load (image encoder used for similarity ranking)
# =========================================
print(f" CLIP ๋ชจ๋ธ ๋ก๋ ์ค... ({clip_model_name})")
clip_model = CLIPModel.from_pretrained(clip_model_name)
clip_processor = CLIPProcessor.from_pretrained(clip_model_name)
clip_model.to(device)
clip_model.eval()

# =========================================
# 4. BLIP captioning model load (English image captions)
# =========================================
print(" BLIP ์บก์ ๋ชจ๋ธ ๋ก๋ ์ค... (Salesforce/blip-image-captioning-base)")
blip_model_name = "Salesforce/blip-image-captioning-base"
blip_processor = BlipProcessor.from_pretrained(blip_model_name)
blip_model = BlipForConditionalGeneration.from_pretrained(blip_model_name).to(device)
blip_model.eval()
# =========================================
# 5. Fine-grained menu candidates / calorie data
# =========================================
# The original 27 fine-grained menus (before classes were merged).
fine_grained_menus = [
    "๊ฐ์ฅ๋ผ๋ถ๋ฎ๋ฐฅ",
    "๊ณ ์ถ์นํจ์นด๋ ๋",
    "๊ณต๊ธฐ๋ฐฅ",
    "๊น์น์ด๋ฌต์ฐ๋",
    "๋ญ๊ฐ์ ",
    "๋๊น์ค์ค๋ฏ๋ผ์ด์ค",
    "๋๊น์ค์ฐ๋์ธํธ",
    "๋๊น์ค์นด๋ ๋",
    "๋ฑ์ฌ๋๊น์ค",
    "๋ง๊ทธ๋ง์์ฐํ๊น์๋ฐฅ",
    "๋ง๊ทธ๋ง์นํจ๋ง์",
    "๋ฒ ์ด์ปจ ์๋ฆฌ์ค์ฌ๋ฆฌ์ค",
    "์ผ๊ฒน๋์ฅ์ง๊ธ์ด",
    "์ผ๊ฒน์ด๊ฐ๋์ฅ๋น๋น๋ฐฅ",
    "์์ฐํ๊น์๋ฐฅ",
    "์์ฐํ๊น์ฐ๋",
    "์๋ก์๋ก",
    "์ ๋ผ๋ฉด(๊ณ๋)",
    "์ ๋ผ๋ฉด(๊ณ๋+์น์ฆ)",
    "์๋ ์นํจ์ค๋ฏ๋ผ์ด์ค",
    "์ด๋ฌต์ฐ๋",
    "์๋น์นด๋ ๋",
    "์ค๋ฏ๋ผ์ด์ค",
    "์ซ์ซ์ด๋ฎ๋ฐฅ",
    "์นํจ๋ง์",
    "์ผ๋ค๋์์์ง",
    "์ผ๋ค๋์์์ง์ค๋ฏ๋ผ์ด์ค",
]

# Merged coarse class -> fine-grained menu candidates shown to the user.
merged_to_fine = {
    "์ค๋ฏ๋ผ์ด์ค๋ฅ": ["์ค๋ฏ๋ผ์ด์ค", "๋๊น์ค์ค๋ฏ๋ผ์ด์ค", "์ผ๋ค๋์์์ง์ค๋ฏ๋ผ์ด์ค"],
    "์นํจ๋ง์๋ฅ": ["์นํจ๋ง์", "๋ง๊ทธ๋ง์นํจ๋ง์"],
    "์์ฐํ๊น์๋ฐฅ๋ฅ": ["์์ฐํ๊น์๋ฐฅ", "๋ง๊ทธ๋ง์์ฐํ๊น์๋ฐฅ"],
    "๋ผ๋ฉด๋ฅ": ["์ ๋ผ๋ฉด(๊ณ๋)", "์ ๋ผ๋ฉด(๊ณ๋+์น์ฆ)"],
}

# Representative fine-grained menu per coarse class — the default when the
# user makes no explicit dropdown selection.
default_detail = {
    "์ค๋ฏ๋ผ์ด์ค๋ฅ": "์ค๋ฏ๋ผ์ด์ค",
    "์นํจ๋ง์๋ฅ": "์นํจ๋ง์",
    "์์ฐํ๊น์๋ฐฅ๋ฅ": "์์ฐํ๊น์๋ฐฅ",
    "๋ผ๋ฉด๋ฅ": "์ ๋ผ๋ฉด(๊ณ๋)",
}

# Very rough calorie table (kcal per serving), keyed by fine-grained menu.
calorie_table = {
    "๊ฐ์ฅ๋ผ๋ถ๋ฎ๋ฐฅ": 800,
    "๊ณ ์ถ์นํจ์นด๋ ๋": 900,
    "๊ณต๊ธฐ๋ฐฅ": 300,
    "๊น์น์ด๋ฌต์ฐ๋": 500,
    "๋ญ๊ฐ์ ": 450,
    "๋๊น์ค์ค๋ฏ๋ผ์ด์ค": 950,
    "๋๊น์ค์ฐ๋์ธํธ": 900,
    "๋๊น์ค์นด๋ ๋": 900,
    "๋ฑ์ฌ๋๊น์ค": 700,
    "๋ง๊ทธ๋ง์์ฐํ๊น์๋ฐฅ": 800,
    "๋ง๊ทธ๋ง์นํจ๋ง์": 850,
    "๋ฒ ์ด์ปจ ์๋ฆฌ์ค์ฌ๋ฆฌ์ค": 800,
    "์ผ๊ฒน๋์ฅ์ง๊ธ์ด": 750,
    "์ผ๊ฒน์ด๊ฐ๋์ฅ๋น๋น๋ฐฅ": 800,
    "์์ฐํ๊น์๋ฐฅ": 750,
    "์์ฐํ๊น์ฐ๋": 550,
    "์๋ก์๋ก": 450,
    "์ ๋ผ๋ฉด(๊ณ๋)": 570,
    "์ ๋ผ๋ฉด(๊ณ๋+์น์ฆ)": 630,
    "์๋ ์นํจ์ค๋ฏ๋ผ์ด์ค": 950,
    "์ด๋ฌต์ฐ๋": 450,
    "์๋น์นด๋ ๋": 800,
    "์ค๋ฏ๋ผ์ด์ค": 730,
    "์ซ์ซ์ด๋ฎ๋ฐฅ": 700,
    "์นํจ๋ง์": 800,
    "์ผ๋ค๋์์์ง": 280,
    "์ผ๋ค๋์์์ง์ค๋ฏ๋ผ์ด์ค": 1000,
}
# =========================================
# 6. Utility functions
# =========================================
def predict_convnext(image: Image.Image):
    """Classify *image* into a merged coarse class with ConvNeXt-Base.

    Returns:
        tuple: (top-1 class name, top-1 probability,
                list of (class name, probability) pairs for the top 3).
    """
    convnext_model.eval()
    tensor = val_transform(image).unsqueeze(0).to(device)
    with torch.no_grad():
        output = convnext_model(tensor)
        confidence = F.softmax(output, dim=-1).cpu().numpy()[0]
    best_idx = int(np.argmax(confidence))
    # Descending order of probability; keep only the three best classes.
    ranking = np.argsort(confidence)[::-1][:3]
    top3 = []
    for idx in ranking:
        top3.append((merged_class_names[idx], float(confidence[idx])))
    return merged_class_names[best_idx], float(confidence[best_idx]), top3
def recommend_with_clip(image: Image.Image, top_k=3):
    """Rank merged classes by CLIP image/text similarity.

    Returns a list of (merged class name, similarity score) pairs for the
    *top_k* most similar classes.
    """
    clip_model.eval()
    processed = clip_processor(images=image, return_tensors="pt").to(device)
    with torch.no_grad():
        features = clip_model.get_image_features(**processed)
        # L2-normalize so the dot product below is cosine similarity.
        features = features / features.norm(dim=-1, keepdim=True)
        similarity = (features @ text_embeds.T).squeeze(0)  # [num_classes]
    best = similarity.topk(top_k)
    return [
        (merged_class_names[idx], float(score))
        for idx, score in zip(best.indices.tolist(), best.values.tolist())
    ]
def generate_caption(image: Image.Image):
    """Produce a short English caption for *image* with BLIP."""
    blip_model.eval()
    encoded = blip_processor(images=image, return_tensors="pt").to(device)
    with torch.no_grad():
        # Short captions only; 20 new tokens is plenty for one sentence.
        token_ids = blip_model.generate(**encoded, max_new_tokens=20)
    return blip_processor.decode(token_ids[0], skip_special_tokens=True)
def calorie_comment(menu_name: str, activity: str):
    """Compose a calorie line plus an activity-aware comment for a menu.

    Unknown menus (missing from calorie_table) get a "not registered"
    message. Otherwise the first tier whose kcal threshold the menu reaches
    supplies the comment; the final tier has threshold 0, so exactly one
    comment is always produced.
    """
    kcal = calorie_table.get(menu_name)
    if kcal is None:
        return "์ด ๋ฉ๋ด์ ๋ํ ์นผ๋ก๋ฆฌ ์ ๋ณด๊ฐ ๋ฑ๋ก๋์ด ์์ง ์์ต๋๋ค."

    header = f"์์ ์นผ๋ก๋ฆฌ: ์ฝ {kcal} kcal.\n"

    # (minimum kcal, message) tiers per activity level, highest first.
    low_activity = [
        (900, "์ค๋ ํ๋๋์ ๊ณ ๋ คํ๋ฉด ๊ฝค ๋์ ์นผ๋ก๋ฆฌ๋ผ์, ์์ฃผ ๋จน๊ธฐ์ ๋ถ๋ด๋ ์ ์์ด์."),
        (600, "์ ๋นํ ํธ์ด์ง๋ง, ๊ฐ์์ด๋ ๋ค๋ฅธ ์์ฌ์ ํจ๊ป๋ผ๋ฉด ์ด๋์ ์กฐ๊ธ ์ ๊ฒฝ ์ฐ๋ฉด ์ข๊ฒ ์ด์."),
        (0, "๊ฐ๋ฒผ์ด ํธ์ด๋ผ ํฐ ๋ถ๋ด ์์ด ๋จน์ด๋ ๊ด์ฐฎ์ ์์ค์ด์์."),
    ]
    mid_activity = [
        (1000, "ํ๋๋์ ๊ณ ๋ คํด๋ ๊ฝค ๋ ๋ ํ ํ ๋ผ๋ผ์, ๋ค๋ฅธ ๋ผ๋๋ ์กฐ๊ธ ๊ฐ๋ณ๊ฒ ๊ตฌ์ฑํ๋ฉด ์ข์์."),
        (700, "ํ๋ฃจ ํ ๋ผ ๋ฉ์ธ์ผ๋ก ๋จน๊ธฐ ์ข์ ์ ๋์ ์นผ๋ก๋ฆฌ์์."),
        (0, "์กฐ๊ธ ๊ฐ๋ฒผ์ด ํธ์ด๋ผ, ๋ฐฐ๊ฐ ๋นจ๋ฆฌ ๊บผ์ง ์๋ ์์ด์."),
    ]
    high_activity = [
        (1000, "ํ๋๋์ด ๋ง๋ค๋ฉด ์ด ์ ๋ ์นผ๋ก๋ฆฌ๋ ์ถฉ๋ถํ ์ ์ฐ์ผ ๊ฑฐ์์!"),
        (700, "์ด๋ ์ ํ ํ ๋ผ๋ก ์ ๋นํ ์์ค์ ์๋์ง ๊ณต๊ธ์ด ๋ ๊ฒ ๊ฐ์์."),
        (0, "ํ๋๋์ ๋นํด ์กฐ๊ธ ๊ฐ๋ฒผ์ด ํธ์ด๋ผ, ๊ฐ๋จํ ๊ฐ์์ ๋ ๊ณ๋ค์ฌ๋ ์ข๊ฒ ์ด์."),
    ]

    if activity == "๊ฑฐ์ ์ ์์ง์":
        tiers = low_activity
    elif activity == "๋ณดํต ํ๋":
        tiers = mid_activity
    else:
        # "๋ง์ด ์์ง์" (and any unexpected value, matching the original else).
        tiers = high_activity

    for minimum, message in tiers:
        if kcal >= minimum:
            return header + message
# =========================================
# 7. Gradio web-app main function
# =========================================
def analyze_menu(image, activity_level, detail_menu_choice):
    """Analyze an uploaded food photo and build the four UI outputs.

    Args:
        image: uploaded image (PIL.Image) or None.
        activity_level: activity-level string from the radio buttons.
        detail_menu_choice: fine-grained menu chosen in the dropdown, or the
            "leave it to the model" sentinel option.

    Returns:
        tuple: (summary markdown, BLIP caption, CLIP top-3 text,
                calorie comment text).
    """
    if image is None:
        # No image uploaded: prompt the user, leave the other outputs blank.
        return "์ด๋ฏธ์ง๋ฅผ ์ ๋ก๋ํด ์ฃผ์ธ์.", "", "", ""
    # 1) Predict the merged coarse class with ConvNeXt.
    big_cls, big_prob, top3_conv = predict_convnext(image)
    # 2) Fine-grained candidates for this coarse class (empty if none).
    fine_candidates = merged_to_fine.get(big_cls, [])
    # 3) Decide the final menu used for the calorie comment.
    if detail_menu_choice is not None and detail_menu_choice != "์ ํ ์ ํจ (๋ชจ๋ธ์ ๋งก๊ธฐ๊ธฐ)":
        # An explicit user selection always wins over the model prediction.
        final_menu = detail_menu_choice
        detail_info = f"์ฌ์ฉ์๊ฐ ์ง์ ์ ํํ ์ธ๋ถ ๋ฉ๋ด: **{final_menu}**"
    else:
        # User made no explicit selection.
        if big_cls in default_detail:
            # Coarse class has sub-menus: fall back to its representative menu.
            final_menu = default_detail[big_cls]
            detail_info = (
                f"์์ธก ๋๋ถ๋ฅ: **{big_cls}** (์ ๋ขฐ๋: {big_prob*100:.2f}%)\n"
                f"์ธ๋ถ ๋ฉ๋ด๋ ์ ํํ์ง ์์, ๋ํ ๋ฉ๋ด **'{final_menu}'** ๊ธฐ์ค์ผ๋ก ์นผ๋ก๋ฆฌ๋ฅผ ์๋ดํฉ๋๋ค.\n"
                f"(์ ํ ๋ฉ๋ด๋ฅผ ๋ฐ๊พธ๋ฉด ์นผ๋ก๋ฆฌ ๋ฌธ์ฅ์ด ๋ฌ๋ผ์ง ์ ์์ด์)"
            )
        else:
            # The coarse class is itself already a final menu.
            final_menu = big_cls
            detail_info = f"์์ธก ๋ฉ๋ด: **{final_menu}** (์ ๋ขฐ๋: {big_prob*100:.2f}%)"
    # 4) Top-3 similar merged menus according to CLIP.
    clip_top3 = recommend_with_clip(image, top_k=3)
    clip_text_lines = []
    for name, score in clip_top3:
        clip_text_lines.append(f"- {name} (์ ์ฌ๋: {score:.4f})")
    clip_text = "\n".join(clip_text_lines)
    # 5) BLIP caption (English).
    caption = generate_caption(image)
    # 6) Calorie comment for the chosen menu and activity level.
    kcal_text = calorie_comment(final_menu, activity_level)
    # 7) Guidance text listing the fine-grained candidates, if any exist.
    if fine_candidates:
        candidate_text = (
            f"์ด ์ด๋ฏธ์ง๋ **'{big_cls}'**(์ผ)๋ก ๋ถ๋ฅ๋์์ต๋๋ค.\n\n"
            f"์ด ๋๋ถ๋ฅ์ ํด๋นํ๋ ์ธ๋ถ ๋ฉ๋ด ํ๋ณด:\n" +
            "\n".join([f"- {m}" for m in fine_candidates]) +
            "\n\n์ ๋๋กญ๋ค์ด์์ ์ธ๋ถ ๋ฉ๋ด๋ฅผ ์ง์ ์ ํํ๋ฉด ์นผ๋ก๋ฆฌ ์๋ด๊ฐ ๋ ์ ํํด์ง๋๋ค."
        )
    else:
        candidate_text = f"์ด ์ด๋ฏธ์ง๋ **'{big_cls}'**(์ผ)๋ก ๋ถ๋ฅ๋์๊ณ , ๋ณ๋์ ์ธ๋ถ ๋ฉ๋ด ๋ถ๊ธฐ๋ ์๋ ์นดํ ๊ณ ๋ฆฌ์ ๋๋ค."
    # Final summary message (markdown rendered in the UI).
    summary = (
        f"### ์ต์ข ๋ฉ๋ด ๋ถ์\n"
        f"- ์์ธก ๋๋ถ๋ฅ: **{big_cls}** (์ ๋ขฐ๋: {big_prob*100:.2f}%)\n"
        f"- ์ต์ข ๊ธฐ์ค ๋ฉ๋ด: **{final_menu}**\n"
        f"- ํ๋๋: **{activity_level}**\n\n"
        f"### ์ธ๋ถ ๋ฉ๋ด ์ ๋ณด\n{detail_info}\n\n"
        f"### ConvNeXt Top-3 (๋ณํฉ ํด๋์ค ๊ธฐ์ค)\n" +
        "\n".join([f"- {name} ({p*100:.2f}%)" for name, p in top3_conv]) +
        "\n\n"
        f"### CLIP ์ ์ฌ ๋ฉ๋ด Top-3 (๋ณํฉ ํด๋์ค ๊ธฐ์ค)\n{clip_text}\n\n"
        f"### BLIP ์บก์ (์์ด)\n> {caption}\n\n"
        f"### ์นผ๋ก๋ฆฌ & ํ๋๋ ์ฝ๋ฉํธ\n{kcal_text}\n\n"
        f"---\n"
        f"{candidate_text}"
    )
    return summary, caption, clip_text, kcal_text
# =========================================
# 8. Gradio interface definition
# =========================================
with gr.Blocks() as demo:
    gr.Markdown("## ํ์ ์ค์บ๋")
    with gr.Row():
        with gr.Column():
            # Input column: photo, activity level, optional fine-grained menu.
            img_input = gr.Image(type="pil", label="๋ฉ๋ด ์ฌ์ง ์ ๋ก๋")
            activity_input = gr.Radio(
                choices=["๊ฑฐ์ ์ ์์ง์", "๋ณดํต ํ๋", "๋ง์ด ์์ง์"],
                value="๋ณดํต ํ๋",
                label="์ค๋ ํ๋๋",
            )
            detail_menu_input = gr.Dropdown(
                choices=["์ ํ ์ ํจ (๋ชจ๋ธ์ ๋งก๊ธฐ๊ธฐ)"] + fine_grained_menus,
                value="์ ํ ์ ํจ (๋ชจ๋ธ์ ๋งก๊ธฐ๊ธฐ)",
                label="์ธ๋ถ ๋ฉ๋ด (์ ํํ๋ฉด ์นผ๋ก๋ฆฌ ๊ณ์ฐ์ ์ฌ์ฉ)",
            )
            run_btn = gr.Button("๋ถ์ ์คํ ")
        with gr.Column():
            # Output column: markdown summary plus the three text outputs.
            summary_output = gr.Markdown(label="๋ถ์ ๊ฒฐ๊ณผ ์์ฝ")
            caption_output = gr.Textbox(label="BLIP ์บก์ (์์ด)", lines=2)
            clip_output = gr.Textbox(label="CLIP ์ ์ฌ ๋ณํฉ ๋ฉ๋ด Top-3", lines=4)
            kcal_output = gr.Textbox(label="์นผ๋ก๋ฆฌ ์ฝ๋ฉํธ", lines=3)
    # Wire the button to the analysis function (must be registered inside
    # the Blocks context).
    run_btn.click(
        fn=analyze_menu,
        inputs=[img_input, activity_input, detail_menu_input],
        outputs=[summary_output, caption_output, clip_output, kcal_output],
    )

demo.launch()