Spaces:

gopalagra
/

blind-image-captioning

Sleeping

gopalagra committed on Sep 3

Commit

fee2e0a

verified ·

1 Parent(s): c11c555

Update app.py

Files changed (1) hide show

app.py CHANGED Viewed

@@ -14,37 +14,39 @@ model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-capt
 # -------------------------------
 # Generate caption function
 # -------------------------------
-def generate_caption_fn(image):
-    # Convert uploaded image to PIL
-    if not isinstance(image, Image.Image):
-        image = Image.fromarray(image)
-    # BLIP preprocessing
-    inputs = processor(images=image, return_tensors="pt")
-    # Generate caption
-    out = model.generate(**inputs)
-    caption = processor.decode(out[0], skip_special_tokens=True)
-    return caption
 # -------------------------------
 # Convert text to speech using gTTS
 # -------------------------------
-def text_to_speech(caption):
-    tts = gTTS(text=caption, lang='en')
-    mp3_fp = io.BytesIO()
-    tts.write_to_fp(mp3_fp)
-    mp3_fp.seek(0)
-    return mp3_fp
 # -------------------------------
 # Gradio interface: Caption + Audio
 # -------------------------------
 def generate_caption_tts(image):
-    caption = generate_caption_fn(image)
-    audio = text_to_speech(caption)
-    return caption, audio
 interface = gr.Interface(
     fn=generate_caption_tts,

 # -------------------------------
 # Generate caption function
 # -------------------------------
+# def generate_caption_tts(image):
+#     caption = generate_caption(model, processor, image)
+#     audio_file = text_to_audio_file(caption)
+#     return caption, audio_file  # return file path, not BytesIO
 # -------------------------------
 # Convert text to speech using gTTS
 # -------------------------------
+import tempfile
+import pyttsx3
+def text_to_audio_file(text):
+    # Create a temporary file
+    tmp_file = tempfile.NamedTemporaryFile(suffix=".mp3", delete=False)
+    tmp_path = tmp_file.name
+    tmp_file.close()
+    engine = pyttsx3.init()
+    engine.save_to_file(text, tmp_path)
+    engine.runAndWait()
+    return tmp_path
 # -------------------------------
 # Gradio interface: Caption + Audio
 # -------------------------------
 def generate_caption_tts(image):
+    caption = generate_caption(model, processor, image)
+    audio_file = text_to_audio_file(caption)
+    return caption, audio_file  # return file path, not BytesIO
 interface = gr.Interface(
     fn=generate_caption_tts,