File size: 3,205 Bytes
880b908
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2f5c91e
880b908
 
 
 
 
2f5c91e
880b908
 
 
 
 
 
 
 
 
 
 
 
 
 
 
fee2e0a
880b908
 
 
fa5cf58
fee2e0a
034b2f2
880b908
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
0576f19
739fb9a
0576f19
739fb9a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2f5c91e
880b908
ec1090e
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
# # app.py
# import gradio as gr
# from transformers import BlipProcessor, BlipForConditionalGeneration
# from gtts import gTTS
# import io
# from PIL import Image

# # -------------------------------
# # Load BLIP-base model (lighter version)
# # -------------------------------
# processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
# model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")

# # -------------------------------
# # Generate caption function
# # -------------------------------
# # def generate_caption_tts(image):
# #     caption = generate_caption(model, processor, image)
# #     audio_file = text_to_audio_file(caption)
# #     return caption, audio_file  # return file path, not BytesIO


# # -------------------------------
# # Convert text to speech using pyttsx3 (gTTS is imported above but not used here)
# # -------------------------------
# import tempfile
# import pyttsx3

# def text_to_audio_file(text):
#     # Create a temporary file
#     tmp_file = tempfile.NamedTemporaryFile(suffix=".mp3", delete=False)
#     tmp_path = tmp_file.name
#     tmp_file.close()

#     engine = pyttsx3.init()
#     engine.save_to_file(text, tmp_path)
#     engine.runAndWait()

#     return tmp_path

# def generate_caption_from_image(model, processor, image):
#     # image: PIL.Image
#     inputs = processor(images=image, return_tensors="pt")
#     out = model.generate(**inputs)
#     caption = processor.decode(out[0], skip_special_tokens=True)
#     return caption
# # -------------------------------
# # Gradio interface: Caption + Audio
# # -------------------------------
# def generate_caption_tts(image):
#     caption = generate_caption_from_image(model, processor, image)  # uses global model/processor
#     # audio_file = text_to_audio_file(caption)
#     return caption 



# interface = gr.Interface(
#     fn=generate_caption_tts,
#     inputs=gr.Image(type="numpy"),
#     outputs=[gr.Textbox(label="Generated Caption")],
#     title="Image Captioning for Visually Impaired",
#     description="Upload an image, get a caption and audio description."
# )


# interface.launch()
# # demo.launch(share=True)



import gradio as gr
from transformers import AutoProcessor, AutoModelForCausalLM
import torch
from PIL import Image

# Load the LLaVA processor and weights once at import time so that every
# request served by the app reuses the same objects.
# NOTE(review): confirm this checkpoint id exists on the Hub and that
# AutoModelForCausalLM is the right loader for it -- LLaVA checkpoints in
# transformers are normally loaded via LlavaForConditionalGeneration.
MODEL_ID = "llava/LLaVA-7B-llm-small"

# Half precision is only dependable on GPU: several CPU kernels have no
# float16 implementation (or are very slow), so use float32 without CUDA.
MODEL_DTYPE = torch.float16 if torch.cuda.is_available() else torch.float32

processor = AutoProcessor.from_pretrained(MODEL_ID)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    torch_dtype=MODEL_DTYPE,
    device_map="auto",  # Automatically use GPU if available
)

def generate_caption(image) -> str:
    """Generate a text caption for an image with the module-level model.

    Args:
        image: A PIL image (what the ``gr.Image(type="pil")`` input of this
            app delivers), a filesystem path to an image file, or ``None``
            when the form was submitted without an image.

    Returns:
        The decoded model output with special tokens removed, or a short
        message asking for an image when ``image`` is ``None``.
    """
    # Gradio hands over None when nothing was uploaded; return early instead
    # of letting the processor fail on an unusable input.
    if image is None:
        return "No image provided. Please upload an image."

    if isinstance(image, str):
        # Strings are treated as file paths. convert() produces an independent
        # copy, so the file handle can be closed right away.
        with Image.open(image) as opened:
            image = opened.convert("RGB")
    elif isinstance(image, Image.Image) and image.mode != "RGB":
        # Normalize RGBA / grayscale / palette images the same way the path
        # branch does, so both entry routes feed the processor RGB data.
        image = image.convert("RGB")

    # Preprocess into PyTorch tensors and move them to the model's device.
    inputs = processor(images=image, return_tensors="pt").to(model.device)

    # Generation is capped at 50 newly produced tokens.
    outputs = model.generate(**inputs, max_new_tokens=50)

    # Only the first (and only) sequence in the batch is decoded.
    return processor.decode(outputs[0], skip_special_tokens=True)

# Gradio UI: one image input (delivered to the callback as a PIL image)
# and one text box that shows the caption returned by generate_caption.
image_input = gr.Image(type="pil")
caption_output = gr.Textbox(label="Generated Caption")

interface = gr.Interface(
    fn=generate_caption,
    inputs=image_input,
    outputs=caption_output,
    title="LLaVA Image Captioning",
)

# Start serving the interface.
interface.launch()