# app.py
# import gradio as gr
# from transformers import BlipProcessor, BlipForConditionalGeneration
# from gtts import gTTS
# import io
# from PIL import Image
# # -------------------------------
# # Load BLIP image-captioning model (large checkpoint)
# # -------------------------------
# processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-large")
# model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-large")
# # -------------------------------
# # Generate caption function
# # -------------------------------
# # def generate_caption_tts(image):
# # caption = generate_caption(model, processor, image)
# # audio_file = text_to_audio_file(caption)
# # return caption, audio_file # return file path, not BytesIO
# # -------------------------------
# # Convert text to speech using pyttsx3 (offline TTS)
# # -------------------------------
# import tempfile
# import pyttsx3
# def text_to_audio_file(text):
# # Create a temporary file
# tmp_file = tempfile.NamedTemporaryFile(suffix=".mp3", delete=False)
# tmp_path = tmp_file.name
# tmp_file.close()
# engine = pyttsx3.init()
# engine.save_to_file(text, tmp_path)
# engine.runAndWait()
# return tmp_path
# def generate_caption_from_image(model, processor, image):
# # image: PIL.Image
# inputs = processor(images=image, return_tensors="pt")
# out = model.generate(**inputs)
# caption = processor.decode(out[0], skip_special_tokens=True)
# return caption
# # -------------------------------
# # Gradio interface: Caption + Audio
# # -------------------------------
# def generate_caption_tts(image):
# caption = generate_caption_from_image(model, processor, image) # uses global model/processor
# # audio_file = text_to_audio_file(caption)
# return caption
# interface = gr.Interface(
# fn=generate_caption_tts,
# inputs=gr.Image(type="numpy"),
# outputs=[gr.Textbox(label="Generated Caption")],
# title="Image Captioning for Visually Impaired",
# description="Upload an image, get a caption and audio description."
# )
# interface.launch()
# # demo.launch(share=True)
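# ----------------------
# Active app: BLIP2 image captioning + translation (Hindi/French/Spanish) + VQA,
# served through a Gradio Blocks UI. The commented-out BLIP/TTS prototype above
# is kept for reference only.
# ----------------------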
import gradio as gr
from transformers import Blip2Processor, Blip2ForConditionalGeneration, pipeline
from PIL import Image
import torch
import streamlit as st
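# NOTE: Streamlit is imported only for its cache_resource decorator and status
# messages; the user-facing interface below is built entirely with Gradio.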
# ----------------------
# Cached Model Loaders
# ----------------------
@st.cache_resource
def load_caption_model():
    # BLIP2 with the OPT-2.7B language model, used for image captioning
    processor = Blip2Processor.from_pretrained("Salesforce/blip2-opt-2.7b")
    model = Blip2ForConditionalGeneration.from_pretrained("Salesforce/blip2-opt-2.7b")
    return processor, model
@st.cache_resource
def load_vqa_model():
    # BLIP2 with FLAN-T5-XL, used for visual question answering.
    # float16 + device_map="auto" reduces memory usage (requires `accelerate`).
    processor = Blip2Processor.from_pretrained("Salesforce/blip2-flan-t5-xl")
    model = Blip2ForConditionalGeneration.from_pretrained(
        "Salesforce/blip2-flan-t5-xl", torch_dtype=torch.float16, device_map="auto"
    )
    return processor, model
@st.cache_resource
def load_translation_models():
    # English -> target-language MarianMT pipelines (Helsinki-NLP opus-mt)
    return {
        "Hindi": pipeline("translation", model="Helsinki-NLP/opus-mt-en-hi"),
        "French": pipeline("translation", model="Helsinki-NLP/opus-mt-en-fr"),
        "Spanish": pipeline("translation", model="Helsinki-NLP/opus-mt-en-es"),
    }
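# To support another language, add an opus-mt pipeline entry above and list the
# language name in the "Translate To" dropdown defined in the Gradio UI below.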
# ----------------------
# Load All Models with Spinner
# ----------------------
with st.spinner("Loading BLIP2 models... please wait ⏳"):
    caption_processor, caption_model = load_caption_model()
    vqa_processor, vqa_model = load_vqa_model()
    translation_models = load_translation_models()
st.success("✅ Models are ready!")
# ----------------------
# Caption + Translate Function
# ----------------------
def generate_caption_translate(image, target_lang):
    # Caption the image in English with BLIP2
    inputs = caption_processor(images=image, return_tensors="pt")
    out = caption_model.generate(**inputs, max_new_tokens=50)
    english_caption = caption_processor.decode(out[0], skip_special_tokens=True)
    # Translate the caption if a pipeline exists for the requested language
    if target_lang in translation_models:
        translated = translation_models[target_lang](english_caption)[0]['translation_text']
    else:
        translated = "Translation not available"
    return english_caption, translated
# ----------------------
# VQA Function
# ----------------------
def vqa(image, question):
    # Encode the image/question pair and match the model's device and dtype
    # (the VQA model is loaded in float16, so inputs are cast to avoid a dtype mismatch)
    inputs = vqa_processor(images=image, text=question, return_tensors="pt").to(vqa_model.device, vqa_model.dtype)
    out = vqa_model.generate(**inputs, max_new_tokens=100)
    answer = vqa_processor.decode(out[0], skip_special_tokens=True)
    return answer
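# A hedged sketch (not wired into the UI): the BLIP-2 FLAN-T5 model card suggests a
# "Question: ... Answer:" prompt template, which often yields more direct answers.
# def vqa_with_prompt(image, question):
#     prompt = f"Question: {question} Answer:"
#     inputs = vqa_processor(images=image, text=prompt, return_tensors="pt").to(vqa_model.device, vqa_model.dtype)
#     out = vqa_model.generate(**inputs, max_new_tokens=100)
#     return vqa_processor.decode(out[0], skip_special_tokens=True)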
# ----------------------
# Gradio UI
# ----------------------
with gr.Blocks(title="BLIP2 Vision App") as demo:
    gr.Markdown("## 🖼️ BLIP2: Image Captioning + Translation + Question Answering")
    with gr.Tab("Caption + Translate"):
        with gr.Row():
            img_in = gr.Image(type="pil", label="Upload Image")
            lang_in = gr.Dropdown(["Hindi", "French", "Spanish"], label="Translate To")
        eng_out = gr.Textbox(label="English Caption")
        trans_out = gr.Textbox(label="Translated Caption")
        btn1 = gr.Button("Generate Caption & Translate")
        btn1.click(generate_caption_translate, inputs=[img_in, lang_in], outputs=[eng_out, trans_out])
    with gr.Tab("Visual Question Answering (VQA)"):
        with gr.Row():
            img_vqa = gr.Image(type="pil", label="Upload Image")
            q_in = gr.Textbox(label="Ask a Question about the Image")
        ans_out = gr.Textbox(label="Answer")
        btn2 = gr.Button("Ask")
        btn2.click(vqa, inputs=[img_vqa, q_in], outputs=ans_out)

demo.launch()
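# NOTE (assumption): if several users hit the Space at once, wrapping launch as
# demo.queue().launch() lets Gradio queue the long-running model calls.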