jonloporto committed on
Commit
9f14f1c
·
verified ·
1 Parent(s): ed8dbf8

Upload 3 files

Browse files
Files changed (3) hide show
  1. README.md +28 -12
  2. app.py +69 -0
  3. requirements.txt +7 -0
README.md CHANGED
@@ -1,12 +1,28 @@
1
- ---
2
- title: ImageToSpeechTest
3
- emoji: 🦀
4
- colorFrom: blue
5
- colorTo: gray
6
- sdk: gradio
7
- sdk_version: 6.2.0
8
- app_file: app.py
9
- pinned: false
10
- ---
11
-
12
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: Image to Voice
3
+ emoji: 🎤
4
+ colorFrom: blue
5
+ colorTo: purple
6
+ sdk: gradio
7
+ sdk_version: 4.0.0
8
+ app_file: app.py
9
+ pinned: false
10
+ ---
11
+
12
+ # Image to Voice Converter
13
+
14
+ This Space generates a descriptive caption for an uploaded image using Hugging Face's image-to-text pipeline, then converts that caption to speech using Supertonic TTS.
15
+
16
+ ## How it works
17
+
18
+ 1. Upload an image
19
+ 2. The model generates a caption describing the image
20
+ 3. The text is converted to speech using a text-to-speech model
21
+ 4. Listen to the generated audio!
22
+
23
+ ## Technologies Used
24
+
25
+ - **Hugging Face Transformers**: For image-to-text conversion
26
+ - **Supertonic TTS**: For text-to-speech synthesis
27
+ - **Gradio**: For the web interface
28
+
app.py ADDED
@@ -0,0 +1,69 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
# -*- coding: utf-8 -*-
"""ImageToVoice Hugging Face Space.

Generates a caption for an uploaded image with Hugging Face's
image-to-text pipeline, then speaks the caption with Supertonic TTS.
"""

import io

import gradio as gr
from PIL import Image
from supertonic import TTS
from transformers import pipeline

# Models are loaded once at import time so every request reuses them.
# NOTE(review): no checkpoint is pinned for the captioning pipeline, so the
# transformers default model is used — consider pinning one for reproducibility.
image_to_text = pipeline("image-to-text")
tts = TTS(auto_download=True)
style = tts.get_voice_style(voice_name="M5")
19
+
20
def image_to_voice(image):
    """Generate a caption for *image* and synthesize it as speech.

    Parameters
    ----------
    image : PIL.Image.Image or None
        Image supplied by the Gradio ``Image`` component (``type="pil"``).

    Returns
    -------
    tuple
        ``((sample_rate, waveform), caption)`` on success, or
        ``(None, message)`` when the input is missing or an error occurs.
    """
    if image is None:
        return None, "Please upload an image."

    try:
        # Caption the image. The pipeline returns a list of dicts; guard
        # against an empty result or an empty caption so the user gets a
        # clear message instead of a cryptic IndexError/TTS failure.
        result = image_to_text(image)
        if not result or not result[0].get("generated_text", "").strip():
            return None, "No text could be generated for this image."
        generated_text = result[0]["generated_text"]

        # Synthesize the caption with Supertonic TTS.
        wav, duration = tts.synthesize(generated_text, voice_style=style)

        # Gradio's Audio component expects a (sample_rate, ndarray) tuple.
        # Prefer the model's own sample rate when it exposes one; fall back
        # to 22050 Hz, the rate Supertonic typically uses — TODO confirm.
        sample_rate = getattr(tts, "sample_rate", 22050)
        return (sample_rate, wav), generated_text
    except Exception as e:  # UI boundary: surface the error instead of crashing
        return None, f"Error: {str(e)}"
40
+
41
+
42
# -----------------------------------------------------------------------------
# Gradio interface
# -----------------------------------------------------------------------------
# NOTE(review): the original included gr.Examples(examples=[], ...); an empty
# examples list renders nothing useful and raises a ValueError in current
# Gradio releases, so it has been removed. Re-add it with real example image
# paths if examples are wanted.
with gr.Blocks(title="Image to Voice") as demo:
    gr.Markdown("# Image to Voice Converter")
    gr.Markdown("Upload an image to convert it to text, then hear it as speech!")

    with gr.Row():
        with gr.Column():
            image_input = gr.Image(type="pil", label="Upload Image")
            generate_btn = gr.Button("Generate Speech", variant="primary")

        with gr.Column():
            audio_output = gr.Audio(label="Generated Speech", type="numpy")
            text_output = gr.Textbox(label="Extracted Text", lines=5)

    # Wire the button to the conversion function defined above.
    generate_btn.click(
        fn=image_to_voice,
        inputs=image_input,
        outputs=[audio_output, text_output],
    )

if __name__ == "__main__":
    demo.launch()
69
+
requirements.txt ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ gradio>=4.0.0
2
+ transformers>=4.30.0
3
+ supertonic
4
+ pillow>=9.0.0
5
+ torch
6
+ torchaudio
7
+