Spaces:

gopalagra
/

blind-image-captioning

Running

File size: 3,205 Bytes

# # app.py
# import gradio as gr
# from transformers import BlipProcessor, BlipForConditionalGeneration
# from gtts import gTTS
# import io
# from PIL import Image

# # -------------------------------
# # Load BLIP-base model (lighter version)
# # -------------------------------
# processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
# model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")

# # -------------------------------
# # Generate caption function
# # -------------------------------
# # def generate_caption_tts(image):
# #     caption = generate_caption(model, processor, image)
# #     audio_file = text_to_audio_file(caption)
# #     return caption, audio_file  # return file path, not BytesIO


# # -------------------------------
# # Convert text to speech using gTTS
# # -------------------------------
# import tempfile
# import pyttsx3

# def text_to_audio_file(text):
#     # Create a temporary file
#     tmp_file = tempfile.NamedTemporaryFile(suffix=".mp3", delete=False)
#     tmp_path = tmp_file.name
#     tmp_file.close()

#     engine = pyttsx3.init()
#     engine.save_to_file(text, tmp_path)
#     engine.runAndWait()

#     return tmp_path

# def generate_caption_from_image(model, processor, image):
#     # image: PIL.Image
#     inputs = processor(images=image, return_tensors="pt")
#     out = model.generate(**inputs)
#     caption = processor.decode(out[0], skip_special_tokens=True)
#     return caption
# # -------------------------------
# # Gradio interface: Caption + Audio
# # -------------------------------
# def generate_caption_tts(image):
#     caption = generate_caption_from_image(model, processor, image)  # uses global model/processor
#     # audio_file = text_to_audio_file(caption)
#     return caption 



# interface = gr.Interface(
#     fn=generate_caption_tts,
#     inputs=gr.Image(type="numpy"),
#     outputs=[gr.Textbox(label="Generated Caption")],
#     title="Image Captioning for Visually Impaired",
#     description="Upload an image, get a caption and audio description."
# )


# interface.launch()
# # demo.launch(share=True)



import gradio as gr
from transformers import AutoProcessor, AutoModelForCausalLM
import torch
from PIL import Image

# Load small LLaVA model
processor = AutoProcessor.from_pretrained("llava/LLaVA-7B-llm-small")
model = AutoModelForCausalLM.from_pretrained(
    "llava/LLaVA-7B-llm-small",
    torch_dtype=torch.float16,
    device_map="auto"  # Automatically use GPU if available
)

def generate_caption(image):
    # Convert to PIL if needed
    if isinstance(image, str):
        image = Image.open(image).convert("RGB")
    
    # Prepare inputs
    inputs = processor(images=image, return_tensors="pt").to(model.device)
    
    # Generate output
    outputs = model.generate(**inputs, max_new_tokens=50)
    
    # Decode result
    caption = processor.decode(outputs[0], skip_special_tokens=True)
    return caption

# Gradio Interface
interface = gr.Interface(
    fn=generate_caption,
    inputs=gr.Image(type="pil"),
    outputs=gr.Textbox(label="Generated Caption"),
    title="LLaVA Image Captioning"
)

interface.launch()