# app.py — note: paste artifacts (file size, commit hashes, line-number gutter) removed; they were not valid Python.
# # app.py
# import gradio as gr
# from transformers import BlipProcessor, BlipForConditionalGeneration
# from gtts import gTTS
# import io
# from PIL import Image
# # -------------------------------
# # Load BLIP-base model (lighter version)
# # -------------------------------
# processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
# model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")
# # -------------------------------
# # Generate caption function
# # -------------------------------
# # def generate_caption_tts(image):
# # caption = generate_caption(model, processor, image)
# # audio_file = text_to_audio_file(caption)
# # return caption, audio_file # return file path, not BytesIO
# # -------------------------------
# # Convert text to speech using gTTS
# # -------------------------------
# import tempfile
# import pyttsx3
# def text_to_audio_file(text):
# # Create a temporary file
# tmp_file = tempfile.NamedTemporaryFile(suffix=".mp3", delete=False)
# tmp_path = tmp_file.name
# tmp_file.close()
# engine = pyttsx3.init()
# engine.save_to_file(text, tmp_path)
# engine.runAndWait()
# return tmp_path
# def generate_caption_from_image(model, processor, image):
# # image: PIL.Image
# inputs = processor(images=image, return_tensors="pt")
# out = model.generate(**inputs)
# caption = processor.decode(out[0], skip_special_tokens=True)
# return caption
# # -------------------------------
# # Gradio interface: Caption + Audio
# # -------------------------------
# def generate_caption_tts(image):
# caption = generate_caption_from_image(model, processor, image) # uses global model/processor
# # audio_file = text_to_audio_file(caption)
# return caption
# interface = gr.Interface(
# fn=generate_caption_tts,
# inputs=gr.Image(type="numpy"),
# outputs=[gr.Textbox(label="Generated Caption")],
# title="Image Captioning for Visually Impaired",
# description="Upload an image, get a caption and audio description."
# )
# interface.launch()
# # demo.launch(share=True)
import gradio as gr
import torch
from PIL import Image
from transformers import AutoModelForCausalLM, AutoProcessor, LlavaForConditionalGeneration
# -------------------------------
# Load LLaVA-1.5 7B vision-language model
# -------------------------------
# FIX: the original repo id "llava/LLaVA-7B-llm-small" does not exist on the
# Hugging Face Hub, and LLaVA checkpoints are multimodal (vision tower +
# language model) — they are not registered for AutoModelForCausalLM and must
# be loaded via LlavaForConditionalGeneration.
MODEL_ID = "llava-hf/llava-1.5-7b-hf"
processor = AutoProcessor.from_pretrained(MODEL_ID)
model = LlavaForConditionalGeneration.from_pretrained(
    MODEL_ID,
    torch_dtype=torch.float16,  # halve memory; fp16 inference realistically needs a GPU
    device_map="auto",          # place weights on GPU automatically when available
)
def generate_caption(image, prompt="USER: <image>\nDescribe this image. ASSISTANT:"):
    """Generate a natural-language caption for an image using LLaVA.

    Parameters
    ----------
    image : PIL.Image.Image | str
        The image to caption, or a filesystem path to an image file.
    prompt : str, optional
        LLaVA chat prompt; must contain the ``<image>`` placeholder token.
        Defaults to a generic "describe this image" instruction, so existing
        single-argument callers keep working.

    Returns
    -------
    str
        The generated caption text (prompt tokens excluded).
    """
    # Accept a file path as well as an in-memory image.
    if isinstance(image, str):
        image = Image.open(image).convert("RGB")
    # FIX: LLaVA processors require a text prompt containing the <image>
    # token; the original processor(images=...) call alone raises at runtime.
    inputs = processor(text=prompt, images=image, return_tensors="pt").to(model.device)
    outputs = model.generate(**inputs, max_new_tokens=50)
    # FIX: outputs[0] contains the input prompt tokens followed by the new
    # tokens; decoding it whole would echo the prompt back to the user.
    # Decode only the newly generated portion.
    new_tokens = outputs[0][inputs["input_ids"].shape[-1]:]
    return processor.decode(new_tokens, skip_special_tokens=True).strip()
# -------------------------------
# Gradio UI: upload an image, get back a generated caption.
# -------------------------------
image_input = gr.Image(type="pil")                      # hand the upload over as a PIL image
caption_output = gr.Textbox(label="Generated Caption")  # single text field for the result

interface = gr.Interface(
    fn=generate_caption,
    inputs=image_input,
    outputs=caption_output,
    title="LLaVA Image Captioning",
)
interface.launch()