gopalagra committed · verified
Commit 9ec241a · 1 Parent(s): 4d8f5fa

Update app.py

Files changed (1):
  app.py  +59 -59
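The code removed in this commit wires a BLIP captioning model into a Gradio interface. For reference, here is a minimal, self-contained sketch of that pattern; it reuses the model id and interface options that appear in the removed lines, but the generate_caption/demo names and the type="pil" image input are illustrative assumptions, not part of this repository.

import gradio as gr
from transformers import BlipProcessor, BlipForConditionalGeneration

# Load the captioning model named in the removed code.
processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-large")
model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-large")

def generate_caption(image):
    # Preprocess the image, generate token ids, and decode them into a caption string.
    inputs = processor(images=image, return_tensors="pt")
    out = model.generate(**inputs)
    return processor.decode(out[0], skip_special_tokens=True)

demo = gr.Interface(
    fn=generate_caption,
    inputs=gr.Image(type="pil"),
    outputs=gr.Textbox(label="Generated Caption"),
    title="Image Captioning for Visually Impaired",
)

if __name__ == "__main__":
    demo.launch()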
app.py CHANGED
@@ -1,69 +1,69 @@
-import gradio as gr
-from transformers import BlipProcessor, BlipForConditionalGeneration
-from gtts import gTTS
-import io
-from PIL import Image

-# -------------------------------
-# Load BLIP-base model (lighter version)
-# -------------------------------
-processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-large")
-model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-large")

-# -------------------------------
-# Generate caption function
-# -------------------------------
 # def generate_caption_tts(image):
-# caption = generate_caption(model, processor, image)
-# audio_file = text_to_audio_file(caption)
-# return caption, audio_file # return file path, not BytesIO


-# -------------------------------
-# Convert text to speech using gTTS
-# -------------------------------
-import tempfile
-import pyttsx3
-
-def text_to_audio_file(text):
-    # Create a temporary file
-    tmp_file = tempfile.NamedTemporaryFile(suffix=".mp3", delete=False)
-    tmp_path = tmp_file.name
-    tmp_file.close()
-
-    engine = pyttsx3.init()
-    engine.save_to_file(text, tmp_path)
-    engine.runAndWait()
-
-    return tmp_path
-
-def generate_caption_from_image(model, processor, image):
-    # image: PIL.Image
-    inputs = processor(images=image, return_tensors="pt")
-    out = model.generate(**inputs)
-    caption = processor.decode(out[0], skip_special_tokens=True)
-    return caption
-# -------------------------------
-# Gradio interface: Caption + Audio
-# -------------------------------
-def generate_caption_tts(image):
-    caption = generate_caption_from_image(model, processor, image) # uses global model/processor
-    # audio_file = text_to_audio_file(caption)
-    return caption
-
-
-
-interface = gr.Interface(
-    fn=generate_caption_tts,
-    inputs=gr.Image(type="numpy"),
-    outputs=[gr.Textbox(label="Generated Caption")],
-    title="Image Captioning for Visually Impaired",
-    description="Upload an image, get a caption and audio description."
-)


-interface.launch()
-# demo.launch(share=True)

+# import gradio as gr
+# from transformers import BlipProcessor, BlipForConditionalGeneration
+# from gtts import gTTS
+# import io
+# from PIL import Image
+
+# # -------------------------------
+# # Load BLIP-base model (lighter version)
+# # -------------------------------
+# processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-large")
+# model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-large")

+# # -------------------------------
+# # Generate caption function
+# # -------------------------------
+# # def generate_caption_tts(image):
+# # caption = generate_caption(model, processor, image)
+# # audio_file = text_to_audio_file(caption)
+# # return caption, audio_file # return file path, not BytesIO

+
+# # -------------------------------
+# # Convert text to speech using gTTS
+# # -------------------------------
+# import tempfile
+# import pyttsx3
+
+# def text_to_audio_file(text):
+# # Create a temporary file
+# tmp_file = tempfile.NamedTemporaryFile(suffix=".mp3", delete=False)
+# tmp_path = tmp_file.name
+# tmp_file.close()
+
+# engine = pyttsx3.init()
+# engine.save_to_file(text, tmp_path)
+# engine.runAndWait()
+
+# return tmp_path
+
+# def generate_caption_from_image(model, processor, image):
+# # image: PIL.Image
+# inputs = processor(images=image, return_tensors="pt")
+# out = model.generate(**inputs)
+# caption = processor.decode(out[0], skip_special_tokens=True)
+# return caption
+# # -------------------------------
+# # Gradio interface: Caption + Audio
+# # -------------------------------
 # def generate_caption_tts(image):
+# caption = generate_caption_from_image(model, processor, image) # uses global model/processor
+# # audio_file = text_to_audio_file(caption)
+# return caption


+
+# interface = gr.Interface(
+# fn=generate_caption_tts,
+# inputs=gr.Image(type="numpy"),
+# outputs=[gr.Textbox(label="Generated Caption")],
+# title="Image Captioning for Visually Impaired",
+# description="Upload an image, get a caption and audio description."
+# )


+# interface.launch()
+# # demo.launch(share=True)

 import gradio as gr
 from transformers import (