Spaces:
Sleeping
Sleeping
| import gradio as gr | |
| from transformers import BlipProcessor, BlipForConditionalGeneration, NllbTokenizer, AutoModelForSeq2SeqLM | |
| from PIL import Image | |
| import torch | |
# Checkpoints used by the two-stage pipeline (caption in English, then translate).
_CAPTION_CHECKPOINT = "Salesforce/blip-image-captioning-base"
_TRANSLATION_CHECKPOINT = "facebook/nllb-200-distilled-600M"

# Stage 1: BLIP processor + model produce an English caption for an image.
blip_processor = BlipProcessor.from_pretrained(_CAPTION_CHECKPOINT)
blip_model = BlipForConditionalGeneration.from_pretrained(_CAPTION_CHECKPOINT)

# Stage 2: NLLB tokenizer + seq2seq model translate English -> Vietnamese.
translator_tokenizer = NllbTokenizer.from_pretrained(_TRANSLATION_CHECKPOINT)
translator_model = AutoModelForSeq2SeqLM.from_pretrained(_TRANSLATION_CHECKPOINT)

# Target language code, and the vocabulary ID of that code looked up once
# at import time so each request can reuse it.
lang_code = "vie_Latn"
lang_token_id = translator_tokenizer.convert_tokens_to_ids(lang_code)
def caption_translate(image):
    """Describe an image in Vietnamese.

    The image is first captioned in English by the BLIP model; that caption
    is then translated to Vietnamese by the NLLB model.

    Args:
        image: The picture to describe, as accepted by ``blip_processor``
            (the interface in this file supplies a PIL image), or ``None``
            when the request carries no image.

    Returns:
        The Vietnamese caption as a string, or an empty string when
        ``image`` is ``None``.
    """
    # Gradio hands the function None when the form is submitted without an
    # upload; there is nothing to caption, so return early instead of
    # letting the processor raise on a missing image.
    if image is None:
        return ""

    # Step 1: Get English caption
    blip_inputs = blip_processor(image, return_tensors="pt")
    caption_ids = blip_model.generate(**blip_inputs)
    eng_caption = blip_processor.decode(caption_ids[0], skip_special_tokens=True)

    # Step 2: Translate to Vietnamese
    translator_inputs = translator_tokenizer(
        eng_caption, return_tensors="pt", src_lang="eng_Latn"
    )
    translated = translator_model.generate(
        **translator_inputs,
        # Forcing the first generated token to the target-language code is
        # what selects the output language.
        forced_bos_token_id=lang_token_id,
        max_length=100,
    )
    return translator_tokenizer.decode(translated[0], skip_special_tokens=True)
# Build the web UI: a single PIL image goes in, the Vietnamese caption
# comes out as plain text. The title and description are user-facing strings.
iface = gr.Interface(
    caption_translate,
    gr.Image(type="pil"),
    "text",
    title="🧠 AI Mô Tả Hình Ảnh Bằng Tiếng Việt",
    description="Upload ảnh, hệ thống sẽ mô tả nội dung bằng tiếng Việt bằng cách kết hợp 2 mô hình: caption → translate",
)

# Start serving the interface.
iface.launch()