Image-Captioning

Running

Walid-Ahmed committed on Jul 14, 2024

Commit

5eb3adc

verified ·

1 Parent(s): 4040430

Upload 3 files

Files changed (3) hide show

app.py ADDED Viewed

+import tempfile
+
+import gradio as gr
+import numpy as np
+# NOTE(review): simpleaudio is not used anywhere in this diff and is not listed
+# in requirements.txt - confirm it is installed or needed.
+import simpleaudio as sa
+import torch
+from PIL import Image
+from scipy.io import wavfile
+from transformers import pipeline
+# Specify the device (CPU or GPU)
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+# Load the image-to-text pipeline
+caption_image = pipeline("image-to-text", model="Salesforce/blip-image-captioning-large", device=device)
+# Load the image-to-text pipeline with the vit-gpt2 model
+#caption_pipeline = pipeline("image-to-text", model="nlpconnect/vit-gpt2-image-captioning", device=device)
+# Load the text-to-speech pipeline
+narrator = pipeline("text-to-speech", model="kakao-enterprise/vits-ljs", device=device)
+def process_image(image):
+    # Generate the caption
+    caption = caption_image(image)[0]['generated_text']
+    # Generate speech from the caption
+    speech = narrator(caption)
+    # Convert the audio to PCM format
+    audio_data = np.array(speech["audio"][0] * 32767, dtype=np.int16)
+    # Save the audio to a WAV file
+    audio_path = "caption.wav"
+    wavfile.write(audio_path, rate=speech["sampling_rate"], data=audio_data)
+    return caption, audio_path
+# Create Gradio interface
+iface = gr.Interface(
+    fn=process_image,
+    inputs=gr.Image(type="pil"),
+    outputs=[gr.Textbox(label="Generated Caption"), gr.Audio(label="Generated Audio", type="filepath")]
+)
+# Launch the interface
+iface.launch()

packages.txt ADDED Viewed

	@@ -0,0 +1 @@


1	+ espeak

requirements.txt ADDED Viewed

+transformers
+gradio
+timm
+scipy
+phonemizer