# # app.py -- previous BLIP-based version, kept commented out for reference
# import gradio as gr
# from transformers import BlipProcessor, BlipForConditionalGeneration
# from gtts import gTTS
# import io
# from PIL import Image
# # -------------------------------
# # Load BLIP-base model (lighter version)
# # -------------------------------
# processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
# model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")
# # -------------------------------
# # Generate caption function (earlier draft)
# # -------------------------------
# # def generate_caption_tts(image):
# #     caption = generate_caption(model, processor, image)
# #     audio_file = text_to_audio_file(caption)
# #     return caption, audio_file  # return file path, not BytesIO
# # -------------------------------
# # Convert text to speech using pyttsx3 (the gTTS import above went unused)
# # -------------------------------
# import tempfile
# import pyttsx3
# def text_to_audio_file(text):
#     # Create a temporary file and have pyttsx3 write the audio into it
#     tmp_file = tempfile.NamedTemporaryFile(suffix=".mp3", delete=False)
#     tmp_path = tmp_file.name
#     tmp_file.close()
#     engine = pyttsx3.init()
#     engine.save_to_file(text, tmp_path)
#     engine.runAndWait()
#     return tmp_path
# def generate_caption_from_image(model, processor, image):
#     # image: PIL.Image
#     inputs = processor(images=image, return_tensors="pt")
#     out = model.generate(**inputs)
#     caption = processor.decode(out[0], skip_special_tokens=True)
#     return caption
# # -------------------------------
# # Gradio interface: Caption + Audio
# # -------------------------------
# def generate_caption_tts(image):
#     caption = generate_caption_from_image(model, processor, image)  # uses global model/processor
#     # audio_file = text_to_audio_file(caption)
#     return caption
# interface = gr.Interface(
#     fn=generate_caption_tts,
#     inputs=gr.Image(type="numpy"),
#     outputs=[gr.Textbox(label="Generated Caption")],
#     title="Image Captioning for Visually Impaired",
#     description="Upload an image, get a caption and audio description."
# )
# interface.launch()
# # demo.launch(share=True)
import gradio as gr
from transformers import AutoProcessor, AutoModelForCausalLM
import torch
from PIL import Image
# Load small LLaVA model
processor = AutoProcessor.from_pretrained("LLaVA/LLaVA-7B-llm-small")
model = AutoModelForCausalLM.from_pretrained(
    "LLaVA/LLaVA-7B-llm-small",
    torch_dtype=torch.float16,
    device_map="auto"  # automatically use GPU if available
)

def generate_caption(image):
    # Convert to a PIL image if a file path was passed in
    if isinstance(image, str):
        image = Image.open(image).convert("RGB")
    # Prepare inputs and move them to the model's device
    inputs = processor(images=image, return_tensors="pt").to(model.device)
    # Generate the caption tokens
    outputs = model.generate(**inputs, max_new_tokens=50)
    # Decode the tokens into text
    caption = processor.decode(outputs[0], skip_special_tokens=True)
    return caption

# Gradio Interface
interface = gr.Interface(
    fn=generate_caption,
    inputs=gr.Image(type="pil"),
    outputs=gr.Textbox(label="Generated Caption"),
    title="LLaVA Image Captioning"
)
interface.launch()
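
# -------------------------------
# Optional: restoring audio output (a minimal sketch, not wired in)
# -------------------------------
# The commented-out BLIP version above paired each caption with a spoken
# audio file. One minimal way to add that back here, assuming gTTS is
# installed (pip install gTTS), would be:
#
# import tempfile
# from gtts import gTTS
#
# def generate_caption_tts(image):
#     caption = generate_caption(image)
#     tmp_file = tempfile.NamedTemporaryFile(suffix=".mp3", delete=False)
#     gTTS(caption).save(tmp_file.name)  # write the spoken caption to disk
#     return caption, tmp_file.name
#
# # and swap the interface outputs accordingly:
# # outputs=[gr.Textbox(label="Generated Caption"), gr.Audio(label="Spoken Caption")]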