File size: 4,215 Bytes
880b908
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2f5c91e
880b908
 
 
 
 
2f5c91e
880b908
 
 
 
 
 
 
 
 
 
 
 
 
 
 
fee2e0a
880b908
 
 
fa5cf58
fee2e0a
034b2f2
880b908
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
739fb9a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
880b908
739fb9a
880b908
 
2f5c91e
 
880b908
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
c11c555
28cca05
880b908
ec1090e
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
# # app.py
# import gradio as gr
# from transformers import BlipProcessor, BlipForConditionalGeneration
# from gtts import gTTS
# import io
# from PIL import Image

# # -------------------------------
# # Load BLIP-base model (lighter version)
# # -------------------------------
# processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
# model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")

# # -------------------------------
# # Generate caption function
# # -------------------------------
# # def generate_caption_tts(image):
# #     caption = generate_caption(model, processor, image)
# #     audio_file = text_to_audio_file(caption)
# #     return caption, audio_file  # return file path, not BytesIO


# # -------------------------------
# # Convert text to speech using pyttsx3 (written to a temporary .mp3 file)
# # -------------------------------
# import tempfile
# import pyttsx3

# def text_to_audio_file(text):
#     # Create a temporary file
#     tmp_file = tempfile.NamedTemporaryFile(suffix=".mp3", delete=False)
#     tmp_path = tmp_file.name
#     tmp_file.close()

#     engine = pyttsx3.init()
#     engine.save_to_file(text, tmp_path)
#     engine.runAndWait()

#     return tmp_path

# def generate_caption_from_image(model, processor, image):
#     # image: PIL.Image
#     inputs = processor(images=image, return_tensors="pt")
#     out = model.generate(**inputs)
#     caption = processor.decode(out[0], skip_special_tokens=True)
#     return caption
# # -------------------------------
# # Gradio interface: Caption + Audio
# # -------------------------------
# def generate_caption_tts(image):
#     caption = generate_caption_from_image(model, processor, image)  # uses global model/processor
#     # audio_file = text_to_audio_file(caption)
#     return caption 



# interface = gr.Interface(
#     fn=generate_caption_tts,
#     inputs=gr.Image(type="numpy"),
#     outputs=[gr.Textbox(label="Generated Caption")],
#     title="Image Captioning for Visually Impaired",
#     description="Upload an image, get a caption and audio description."
# )


# interface.launch()
# # demo.launch(share=True)



import gradio as gr
from transformers import AutoProcessor, AutoModelForCausalLM
import torch
from PIL import Image

# Load the LLaVA processor and model once, at import time, so that every
# captioning request reuses the same objects.
# NOTE(review): confirm that this repo id exists on the Hugging Face Hub and
# that AutoModelForCausalLM is the appropriate auto class for it.
MODEL_ID = "LLaVA/LLaVA-7B-llm-small"

processor = AutoProcessor.from_pretrained(MODEL_ID)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    torch_dtype=torch.float16,  # half-precision weights
    device_map="auto",  # Automatically use GPU if available
)

def generate_caption(image):
    """Generate a text caption for an image using the module-level model.

    Args:
        image: The image to caption. A ``str`` is treated as a file path and
            is opened and converted to RGB; any other non-``None`` value is
            handed to the processor unchanged. ``None`` means no image was
            supplied (e.g. an image input that was left empty).

    Returns:
        The decoded caption string, or a short notice when ``image`` is
        ``None``.
    """
    # Nothing to caption: answer with a readable notice instead of letting
    # the processor fail on a missing image.
    if image is None:
        return "No image provided. Please upload an image."

    # Convert to PIL if needed; the context manager closes the file handle
    # once the RGB copy has been made.
    if isinstance(image, str):
        with Image.open(image) as opened:
            image = opened.convert("RGB")

    # Prepare inputs on the model's device. Passing the model's dtype as well
    # casts the floating-point tensors so they match weights that were loaded
    # in reduced precision (e.g. float16).
    inputs = processor(images=image, return_tensors="pt").to(
        model.device, model.dtype
    )

    # Generate output, capped at 50 new tokens.
    outputs = model.generate(**inputs, max_new_tokens=50)

    # Decode the first (only) sequence, dropping special tokens.
    return processor.decode(outputs[0], skip_special_tokens=True)

# Gradio Interface: a single image input (configured with type="pil") wired
# to generate_caption, with the result shown in a labelled text box.
image_input = gr.Image(type="pil")
caption_output = gr.Textbox(label="Generated Caption")

interface = gr.Interface(
    title="LLaVA Image Captioning",
    fn=generate_caption,
    inputs=image_input,
    outputs=caption_output,
)

# Start the web UI as soon as the script runs.
interface.launch()