Spaces:
Sleeping
Sleeping
File size: 4,215 Bytes
880b908 2f5c91e 880b908 2f5c91e 880b908 fee2e0a 880b908 fa5cf58 fee2e0a 034b2f2 880b908 739fb9a 880b908 739fb9a 880b908 2f5c91e 880b908 c11c555 28cca05 880b908 ec1090e |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 |
# # app.py
# import gradio as gr
# from transformers import BlipProcessor, BlipForConditionalGeneration
# from gtts import gTTS
# import io
# from PIL import Image
# # -------------------------------
# # Load BLIP-base model (lighter version)
# # -------------------------------
# processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
# model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")
# # -------------------------------
# # Generate caption function
# # -------------------------------
# # def generate_caption_tts(image):
# # caption = generate_caption(model, processor, image)
# # audio_file = text_to_audio_file(caption)
# # return caption, audio_file # return file path, not BytesIO
# # -------------------------------
# # Convert text to speech using pyttsx3 (the gTTS import above is not used by this helper)
# # -------------------------------
# import tempfile
# import pyttsx3
# def text_to_audio_file(text):
# # Create a temporary file
# tmp_file = tempfile.NamedTemporaryFile(suffix=".mp3", delete=False)
# tmp_path = tmp_file.name
# tmp_file.close()
# engine = pyttsx3.init()
# engine.save_to_file(text, tmp_path)
# engine.runAndWait()
# return tmp_path
# def generate_caption_from_image(model, processor, image):
# # image: PIL.Image
# inputs = processor(images=image, return_tensors="pt")
# out = model.generate(**inputs)
# caption = processor.decode(out[0], skip_special_tokens=True)
# return caption
# # -------------------------------
# # Gradio interface: Caption + Audio
# # -------------------------------
# def generate_caption_tts(image):
# caption = generate_caption_from_image(model, processor, image) # uses global model/processor
# # audio_file = text_to_audio_file(caption)
# return caption
# interface = gr.Interface(
# fn=generate_caption_tts,
# inputs=gr.Image(type="numpy"),
# outputs=[gr.Textbox(label="Generated Caption")],
# title="Image Captioning for Visually Impaired",
# description="Upload an image, get a caption and audio description."
# )
# interface.launch()
# # demo.launch(share=True)
import gradio as gr
from transformers import AutoProcessor, AutoModelForCausalLM
import torch
from PIL import Image
# Single source of truth for the checkpoint so the processor and the model
# are always loaded from the same repository.
# NOTE(review): confirm this repo id exists on the Hugging Face Hub and that
# AutoModelForCausalLM is the correct auto class for it — TODO verify.
MODEL_ID = "LLaVA/LLaVA-7B-llm-small"

# Load small LLaVA model
# The processor prepares image inputs for the model and decodes generated ids.
processor = AutoProcessor.from_pretrained(MODEL_ID)

# Weights are loaded in half precision; device placement is delegated to
# device_map="auto".
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    torch_dtype=torch.float16,
    device_map="auto",  # Automatically use GPU if available
)
def generate_caption(image):
    """Generate a text caption for an image using the module-level model.

    Args:
        image: Either an image object accepted by the processor (the Gradio
            input in this file is configured with type="pil") or a string
            path, which is opened and converted to RGB.

    Returns:
        The decoded first generated sequence with special tokens removed.

    Raises:
        gr.Error: If no image was supplied.
    """
    # Submitting the form without an upload yields None; fail with a
    # user-visible message instead of an opaque processor error.
    if image is None:
        raise gr.Error("Please upload an image before requesting a caption.")

    # Convert to PIL if needed
    if isinstance(image, str):
        # convert() returns a fully loaded copy, so the file handle can be
        # released as soon as the with-block exits.
        with Image.open(image) as source:
            image = source.convert("RGB")

    # Prepare inputs: move to the model's device and cast floating-point
    # tensors to the model's dtype (the model is loaded in float16, while the
    # processor emits float32 tensors).
    inputs = processor(images=image, return_tensors="pt").to(
        model.device, model.dtype
    )

    # Generate output (capped at 50 new tokens)
    outputs = model.generate(**inputs, max_new_tokens=50)

    # Decode result
    return processor.decode(outputs[0], skip_special_tokens=True)
# Gradio Interface
# Build the two UI components first, then wire them to generate_caption.
image_input = gr.Image(type="pil")
caption_output = gr.Textbox(label="Generated Caption")

interface = gr.Interface(
    title="LLaVA Image Captioning",
    fn=generate_caption,
    inputs=image_input,
    outputs=caption_output,
)

# Start serving the app.
interface.launch()
|