# VLproject1 / app.py — CLIP image-text matching demo (Hugging Face Space)
# 0. Install required libraries (when running in Colab, uncomment and run this first)
# !pip install gradio transformers torch pillow
import gradio as gr
import torch
import spaces
from PIL import Image
from transformers import CLIPProcessor, CLIPModel
# 1. Load the CLIP model and its processor.
# The first run may take a while because the weights are downloaded.
model_name = "openai/clip-vit-base-patch32"
try:
    model = CLIPModel.from_pretrained(model_name)
    processor = CLIPProcessor.from_pretrained(model_name)
    print(f"모델 로드 완료: {model_name}")
except Exception as e:
    print(f"모델 로드 중 오류 발생: {e}")
    # Re-raise: the app cannot work without a model/processor, and swallowing
    # the error here would only resurface as a confusing NameError below.
    raise

# Use CUDA when available, otherwise fall back to CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)
# 2. Inference function.
@spaces.GPU
def predict(image, text_options):
    """Score each comma-separated text candidate against the image with CLIP.

    Returns a {label: probability} dict suitable for a Gradio Label
    component, or None when no image was supplied.
    """
    if image is None:
        return None

    # Split the comma-separated string into candidates, dropping blanks.
    candidates = [part.strip() for part in text_options.split(",") if part.strip()]
    if not candidates:
        return {"Error": "텍스트 후보를 입력해주세요."}

    # Preprocess the image and tokenize all candidate texts in one batch.
    batch = processor(
        text=candidates,
        images=image,
        return_tensors="pt",
        padding=True,
    ).to(device)

    # Inference only — no gradients needed.
    with torch.no_grad():
        outputs = model(**batch)

    # Turn the image-to-text logits into a probability distribution.
    scores = outputs.logits_per_image.softmax(dim=1).cpu().numpy()[0]

    # Gradio's Label component consumes a {label: probability} mapping.
    return dict(zip(candidates, (float(s) for s in scores)))
# 3. Build the Gradio interface.
# Components are created as named variables first for readability.
_image_input = gr.Image(type="pil", label="이미지 업로드")
_text_input = gr.Textbox(
    label="후보 텍스트 (쉼표로 구분)",
    placeholder="예: soccer player, baseball player, referee",
    value="cat, dog, car",
)

iface = gr.Interface(
    fn=predict,
    inputs=[_image_input, _text_input],
    outputs=gr.Label(num_top_classes=3, label="매칭 결과"),
    title="CLIP 이미지-텍스트 매칭기",
    description="이미지를 업로드하고, 그 이미지가 무엇인지 설명하는 단어들을 쉼표(,)로 구분해 적어주세요. AI가 가장 적절한 설명을 찾아줍니다.",
)
# ๋ฉ”์ธ ์‹คํ–‰ ๋ธ”๋ก ์ˆ˜์ • (์˜คํƒ€ ์ˆ˜์ •: _= -> ==)
if __name__ == "__main__":
# Colab์—์„œ ์‹คํ–‰ ์‹œ share=True๋ฅผ ํ•˜๋ฉด ์™ธ๋ถ€์—์„œ ์ ‘์† ๊ฐ€๋Šฅํ•œ ๋งํฌ๊ฐ€ ์ƒ์„ฑ๋ฉ๋‹ˆ๋‹ค.
iface.launch(share=True)