# sanvo's picture
# Upload folder using huggingface_hub
# 2582729 verified
"""Gradio interactive demo for Vietnamese image captioning.
Upload an image to get a Vietnamese caption generated through the
BLIP → CLIP → NLLB pipeline.
Usage:
python app.py
"""
import gradio as gr
from PIL import Image
from generate_captions import load_blip_model, generate_captions
from rank_captions import load_clip_model, rank_captions
from translate import load_nllb_model, translate_to_vietnamese
# Load all models once at import time; caption_image() reads these
# module-level names on every call instead of reloading anything.
print("Loading models for Vietnamese Image Captioning...")
# The BLIP loader returns (model, processor, device); that device is the one
# the two other models are moved to below.
blip_model, blip_processor, device = load_blip_model()
# The third value returned by the CLIP loader is discarded
# (presumably its own device choice -- confirm in rank_captions).
clip_model, clip_processor, _ = load_clip_model()
clip_model = clip_model.to(device)
# Same pattern for NLLB: ignore the loader's third value and move the model
# to the device obtained from the BLIP loader.
nllb_model, nllb_tokenizer, _ = load_nllb_model()
nllb_model = nllb_model.to(device)
print("All models loaded.\n")
def caption_image(image, num_captions=5):
    """Generate a Vietnamese caption for an uploaded image.

    Runs the three pipeline stages in order: candidate English captions are
    generated, ranked against the image, and the top-ranked caption is
    translated to Vietnamese.

    Args:
        image: PIL Image from Gradio, or None when nothing was uploaded.
        num_captions: Number of candidate captions to request. Coerced to
            ``int`` because a Gradio slider may deliver its value as a float.

    Returns:
        A 3-tuple of strings: the Vietnamese caption, the best English
        caption, and one line per ranked caption formatted as
        ``"<rank>. [<score>] <caption>"``. When no image is given, or no
        candidate caption is produced, the first element is a short message
        and the other two are empty strings.
    """
    if image is None:
        return "No image provided.", "", ""

    image = image.convert("RGB")
    # The UI slider can hand over a float (e.g. 5.0); the caption generator
    # is given a whole number of candidates.
    num_captions = int(num_captions)

    # Stage 1: BLIP generates English captions
    english_captions = generate_captions(
        image, blip_model, blip_processor, device,
        num_captions=num_captions,
    )
    if not english_captions:
        # Nothing to rank or translate; report it instead of failing on
        # english_captions[0] further down.
        return "No caption could be generated.", "", ""

    # Stage 2: CLIP ranks captions
    ranked = rank_captions(image, english_captions, clip_model, clip_processor, device)
    # Fall back to the first raw candidate if ranking returned nothing.
    best_english = ranked[0][0] if ranked else english_captions[0]

    # Stage 3: NLLB translates to Vietnamese
    vietnamese = translate_to_vietnamese(best_english, nllb_model, nllb_tokenizer, device)

    # Format ranked captions for display: one "<rank>. [<score>] <caption>"
    # line per (caption, score) pair.
    ranked_text = "\n".join(
        f"{i+1}. [{score:.3f}] {cap}"
        for i, (cap, score) in enumerate(ranked)
    )
    return vietnamese, best_english, ranked_text
def _build_demo():
    """Assemble the Gradio interface that wraps ``caption_image``.

    Inputs are an image upload (delivered as a PIL image) and a slider for
    the number of candidate captions; outputs are three text boxes matching
    the three values ``caption_image`` returns.
    """
    image_input = gr.Image(type="pil", label="Upload Image")
    count_slider = gr.Slider(
        minimum=3,
        maximum=10,
        value=5,
        step=1,
        label="Number of Candidate Captions",
    )

    vietnamese_box = gr.Textbox(label="Vietnamese Caption (Chú thích tiếng Việt)")
    english_box = gr.Textbox(label="Best English Caption")
    ranking_box = gr.Textbox(label="All Ranked Captions (CLIP scores)", lines=6)

    # Markdown shown under the title; adjacent literals are concatenated.
    pipeline_description = (
        "Upload an image to generate a Vietnamese caption.\n\n"
        "**Pipeline:** BLIP (caption generation) → CLIP (ranking) → NLLB (translation)\n\n"
        "1. **BLIP** generates multiple English captions\n"
        "2. **CLIP** ranks them by image-text similarity\n"
        "3. **NLLB** translates the best caption to Vietnamese"
    )

    return gr.Interface(
        fn=caption_image,
        inputs=[image_input, count_slider],
        outputs=[vietnamese_box, english_box, ranking_box],
        title="Vietnamese Multi-Modal Image Captioning",
        description=pipeline_description,
        theme=gr.themes.Soft(),
    )


# Module-level handle to the interface; launched below when run as a script.
demo = _build_demo()


if __name__ == "__main__":
    # share=False: do not request a public share link.
    demo.launch(share=False)