# app.py
import tempfile

import gradio as gr
from gtts import gTTS
from transformers import BlipProcessor, BlipForConditionalGeneration

# -------------------------------
# Load the BLIP-large captioning model
# ("Salesforce/blip-image-captioning-base" is a lighter alternative)
# -------------------------------
processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-large")
model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-large")
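
# Optional GPU usage (a sketch, not wired into the app as written; inputs
# would also need .to(device) inside generate_caption_from_image):
# import torch
# device = "cuda" if torch.cuda.is_available() else "cpu"
# model = model.to(device)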

# -------------------------------
# Convert text to speech using gTTS
# -------------------------------
def text_to_audio_file(text):
    # Gradio's audio output expects a file path, not a BytesIO object,
    # so write the speech to a temporary .mp3 file and return its path
    tmp_file = tempfile.NamedTemporaryFile(suffix=".mp3", delete=False)
    tmp_path = tmp_file.name
    tmp_file.close()

    # gTTS replaces the earlier pyttsx3 approach: pyttsx3 needs a local
    # speech engine and is unreliable on headless servers, while gTTS
    # only needs network access
    gTTS(text=text, lang="en").save(tmp_path)

    return tmp_path


# -------------------------------
# Generate caption function
# -------------------------------
def generate_caption_from_image(model, processor, image):
    # image: PIL.Image
    inputs = processor(images=image, return_tensors="pt")
    out = model.generate(**inputs)
    caption = processor.decode(out[0], skip_special_tokens=True)
    return caption
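
# BLIP also supports conditional captioning: seeding generation with a text
# prefix steers the caption. A sketch (the prefix string follows the BLIP
# model card; this path is not used by the app):
# inputs = processor(images=image, text="a photography of", return_tensors="pt")
# caption = processor.decode(model.generate(**inputs)[0], skip_special_tokens=True)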
# -------------------------------
# Gradio interface: Caption + Audio
# -------------------------------
def generate_caption_tts(image):
    caption = generate_caption_from_image(model, processor, image)  # uses global model/processor
    audio_file = text_to_audio_file(caption)
    return caption, audio_file


interface = gr.Interface(
    fn=generate_caption_tts,
    inputs=gr.Image(type="pil"),
    outputs=[
        gr.Textbox(label="Generated Caption"),
        gr.Audio(label="Audio Description", type="filepath"),
    ],
    title="Image Captioning for Visually Impaired",
    description="Upload an image, get a caption and audio description.",
)


interface.launch()
# interface.launch(share=True)  # pass share=True for a public link
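
# Quick sanity check without the UI ("example.jpg" is a hypothetical path):
# from PIL import Image
# caption, audio_path = generate_caption_tts(Image.open("example.jpg").convert("RGB"))
# print(caption, audio_path)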



# --------------------------------------------------
# Alternative: LLaVA-based captioning (kept for reference, not run).
# The original checkpoint name did not resolve on the Hub; this sketch
# assumes the public "llava-hf/llava-1.5-7b-hf" checkpoint and its
# chat-style prompt format.
# --------------------------------------------------
# import torch
# from transformers import AutoProcessor, LlavaForConditionalGeneration
# from PIL import Image

# processor = AutoProcessor.from_pretrained("llava-hf/llava-1.5-7b-hf")
# model = LlavaForConditionalGeneration.from_pretrained(
#     "llava-hf/llava-1.5-7b-hf",
#     torch_dtype=torch.float16,
#     device_map="auto",  # automatically use a GPU if available
# )

# def generate_caption(image):
#     # Convert to PIL if a file path is passed
#     if isinstance(image, str):
#         image = Image.open(image).convert("RGB")

#     # LLaVA expects an instruction prompt alongside the image
#     prompt = "USER: <image>\nDescribe this image in one sentence. ASSISTANT:"
#     inputs = processor(images=image, text=prompt, return_tensors="pt").to(model.device)

#     outputs = model.generate(**inputs, max_new_tokens=50)
#     caption = processor.decode(outputs[0], skip_special_tokens=True)
#     return caption

# interface = gr.Interface(
#     fn=generate_caption,
#     inputs=gr.Image(type="pil"),
#     outputs=gr.Textbox(label="Generated Caption"),
#     title="LLaVA Image Captioning",
# )

# interface.launch()