gopalagra committed on
Commit
034b2f2
·
verified ·
1 Parent(s): c379826

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +46 -23
app.py CHANGED
@@ -1,34 +1,57 @@
 
1
  import gradio as gr
2
- from transformers import pipeline
3
- import requests
4
- from io import BytesIO
5
  from PIL import Image
6
- import pyttsx3 # Text-to-speech (optional)
7
 
8
- # --- Load hosted model ---
9
- captioner = pipeline("image-to-text", model="Salesforce/blip-image-captioning-base")
 
 
 
10
 
11
- # --- Caption & TTS function ---
12
- def generate_caption_tts(image):
13
- # If user uploads a URL
14
- if isinstance(image, str):
15
- image = Image.open(BytesIO(requests.get(image).content))
16
- caption = captioner(image)[0]['generated_text']
 
 
 
 
17
 
18
- # TTS (optional)
19
- tts = pyttsx3.init()
20
- tts.say(caption)
21
- tts.runAndWait()
22
 
23
  return caption
24
 
25
- # --- Gradio interface ---
26
- iface = gr.Interface(
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
27
  fn=generate_caption_tts,
28
- inputs=gr.Image(type="pil"),
29
- outputs="text",
30
- title="Image Captioning for Visually Impaired",
31
- description="Upload any image and get a descriptive caption."
32
  )
33
 
34
- iface.launch()
 
1
# app.py
import io
import tempfile

import gradio as gr
from gtts import gTTS
from PIL import Image
from transformers import BlipProcessor, BlipForConditionalGeneration
 
7
 
8
# ---------------------------------------------------
# Load the BLIP base captioning model once at import
# time so every request reuses the same weights.
# ---------------------------------------------------
_MODEL_ID = "Salesforce/blip-image-captioning-base"
processor = BlipProcessor.from_pretrained(_MODEL_ID)
model = BlipForConditionalGeneration.from_pretrained(_MODEL_ID)
13
 
14
# -------------------------------
# Generate caption function
# -------------------------------
def generate_caption_fn(image):
    """Return a BLIP-generated caption for *image*.

    Args:
        image: A ``PIL.Image.Image`` or a numpy array (the format
            ``gr.Image(type="numpy")`` delivers). ``None`` when the
            user submits without uploading anything.

    Returns:
        str: The decoded caption text.

    Raises:
        ValueError: If no image was provided.
    """
    # gr.Image passes None when the user hits submit with no upload;
    # fail with a readable message instead of a TypeError deep inside PIL.
    if image is None:
        raise ValueError("Please upload an image first.")

    # Convert an uploaded numpy array to a PIL image.
    if not isinstance(image, Image.Image):
        image = Image.fromarray(image)

    # BLIP preprocessing expects 3-channel RGB input; normalize the mode
    # so grayscale or RGBA uploads don't break (matches the reference
    # BLIP usage, which calls .convert("RGB") before the processor).
    image = image.convert("RGB")

    # BLIP preprocessing -> pixel tensors.
    inputs = processor(images=image, return_tensors="pt")

    # Generate token ids, then decode them into the caption string.
    out = model.generate(**inputs)
    caption = processor.decode(out[0], skip_special_tokens=True)

    return caption
30
 
31
# -------------------------------
# Convert text to speech using gTTS
# -------------------------------
def text_to_speech(caption):
    """Synthesize *caption* to an MP3 file and return its path.

    gr.Audio serves audio from a filename ("filepath"/numpy are the
    supported output types), so the gTTS output is written to a
    temporary .mp3 on disk rather than an in-memory BytesIO, which
    has no filename and cannot be served by the component.

    Args:
        caption: Text to speak (English).

    Returns:
        str: Path to the generated .mp3 file.
    """
    tts = gTTS(text=caption, lang='en')
    # delete=False: Gradio reads the file after this function returns,
    # so it must outlive the `with` block.
    with tempfile.NamedTemporaryFile(suffix=".mp3", delete=False) as fp:
        tts.write_to_fp(fp)
        return fp.name
40
+
41
+ # -------------------------------
42
+ # Gradio interface: Caption + Audio
43
+ # -------------------------------
44
+ def generate_caption_tts(image):
45
+ caption = generate_caption_fn(image)
46
+ audio = text_to_speech(caption)
47
+ return caption, audio
48
+
49
# -------------------------------
# Gradio interface: Caption + Audio
# -------------------------------
interface = gr.Interface(
    fn=generate_caption_tts,
    inputs=gr.Image(type="numpy"),
    outputs=[
        gr.Textbox(label="Generated Caption"),
        # type="filepath": current gr.Audio accepts only "numpy" or
        # "filepath" — the old type="file" is deprecated/removed.
        gr.Audio(type="filepath", label="TTS Audio"),
    ],
    title="Blind Assistant: Image Captioning",
    description="Upload an image and get a descriptive caption + speech.",
)

interface.launch()