# NOTE: removed non-Python page-scrape artifacts (HF Space header, blame hashes,
# line-number gutter) that preceded the actual source and broke parsing.
# app.py
import gradio as gr
from transformers import BlipProcessor, BlipForConditionalGeneration
from gtts import gTTS
import io
from PIL import Image
# -------------------------------
# Load the BLIP-large captioning model.
# (Earlier comment said "base / lighter version", but both checkpoint names
# below are the *large* variant — the comment was stale, the code is the truth.)
# -------------------------------
processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-large")
model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-large")
# -------------------------------
# Generate caption function
# -------------------------------
# def generate_caption_tts(image):
# caption = generate_caption(model, processor, image)
# audio_file = text_to_audio_file(caption)
# return caption, audio_file # return file path, not BytesIO
# -------------------------------
# Convert text to speech using gTTS
# -------------------------------
import tempfile
import pyttsx3
def text_to_audio_file(text):
    """Synthesize *text* to speech and return the path to a temporary .mp3 file.

    Fix: the section header above and the module-level import both say gTTS,
    but this body previously used pyttsx3, which requires a local speech
    engine (e.g. espeak) that a hosted Space does not have. gTTS writes an
    mp3 directly (needs network access to Google's TTS endpoint).

    The caller is responsible for deleting the returned temp file.
    """
    # Reserve a temp file name; delete=False so the file survives close()
    # and gTTS can write to the path afterwards.
    tmp_file = tempfile.NamedTemporaryFile(suffix=".mp3", delete=False)
    tmp_path = tmp_file.name
    tmp_file.close()
    gTTS(text=text).save(tmp_path)
    return tmp_path
def generate_caption_from_image(model, processor, image):
    """Run the BLIP captioning model on one image and return the decoded text.

    image is expected to be something the processor accepts (PIL image or
    array-like); model/processor are the globals loaded at module import.
    """
    # Encode the image into model-ready PyTorch tensors.
    encoded = processor(images=image, return_tensors="pt")
    generated_ids = model.generate(**encoded)
    # Decode the first (only) sequence, dropping BOS/EOS and padding tokens.
    return processor.decode(generated_ids[0], skip_special_tokens=True)
# -------------------------------
# Gradio interface: Caption + Audio
# -------------------------------
def generate_caption_tts(image):
    """Gradio callback: caption the uploaded image.

    Despite the name, the TTS step is currently disabled (see the
    commented-out call below); only the caption string is returned.
    Uses the module-level model/processor globals.
    """
    description = generate_caption_from_image(model, processor, image)
    # audio_path = text_to_audio_file(description)  # TTS intentionally off for now
    return description
# Build the Gradio UI: one image input -> one caption textbox.
# Fix: the description previously promised "a caption and audio description",
# but the callback returns only the caption (audio is disabled) — the
# user-facing text now matches actual behavior.
interface = gr.Interface(
    fn=generate_caption_tts,
    inputs=gr.Image(type="numpy"),
    outputs=[gr.Textbox(label="Generated Caption")],
    title="Image Captioning for Visually Impaired",
    description="Upload an image and get a text caption describing it.",
)
interface.launch()
# # demo.launch(share=True)
# import gradio as gr
# from transformers import AutoProcessor, AutoModelForCausalLM
# import torch
# from PIL import Image
# # Load small LLaVA model
# processor = AutoProcessor.from_pretrained("llava/LLaVA-7B-llm-small")
# model = AutoModelForCausalLM.from_pretrained(
# "llava/LLaVA-7B-llm-small",
# torch_dtype=torch.float16,
# device_map="auto" # Automatically use GPU if available
# )
# def generate_caption(image):
# # Convert to PIL if needed
# if isinstance(image, str):
# image = Image.open(image).convert("RGB")
# # Prepare inputs
# inputs = processor(images=image, return_tensors="pt").to(model.device)
# # Generate output
# outputs = model.generate(**inputs, max_new_tokens=50)
# # Decode result
# caption = processor.decode(outputs[0], skip_special_tokens=True)
# return caption
# # Gradio Interface
# interface = gr.Interface(
# fn=generate_caption,
# inputs=gr.Image(type="pil"),
# outputs=gr.Textbox(label="Generated Caption"),
# title="LLaVA Image Captioning"
# )
# interface.launch()
# (end of file — stray scrape artifact removed)