jonloporto committed on
Commit
3c5d69c
·
verified ·
1 Parent(s): f1a84e4

Upload 3 files

Browse files
Files changed (3) hide show
  1. README.md +28 -12
  2. app.py +84 -0
  3. requirements.txt +6 -0
README.md CHANGED
@@ -1,12 +1,28 @@
1
- ---
2
- title: ImageToVoiceForClass
3
- emoji: 🐠
4
- colorFrom: yellow
5
- colorTo: green
6
- sdk: gradio
7
- sdk_version: 6.2.0
8
- app_file: app.py
9
- pinned: false
10
- ---
11
-
12
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: Image to Voice
3
+ emoji: 🎤
4
+ colorFrom: blue
5
+ colorTo: purple
6
+ sdk: gradio
7
+ sdk_version: 4.0.0
8
+ app_file: app.py
9
+ pinned: false
10
+ ---
11
+
12
+ # Image to Voice Converter
13
+
14
+ Convert images to text descriptions and then to speech audio!
15
+
16
+ ## How it works
17
+
18
+ 1. Upload an image
19
+ 2. The AI analyzes the image and generates a text description
20
+ 3. The text is converted to speech using a text-to-speech model
21
+ 4. Download the audio file
22
+
23
+ ## Technologies Used
24
+
25
+ - **Hugging Face Transformers**: For image-to-text conversion
26
+ - **Supertonic TTS**: For text-to-speech synthesis
27
+ - **Gradio**: For the web interface
28
+
app.py ADDED
@@ -0,0 +1,84 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+ """
3
+ Image to Voice - Hugging Face Spaces
4
+ Converts images to text and then to speech
5
+ """
6
+
7
+ import gradio as gr
8
+ from supertonic import TTS
9
+ from transformers import pipeline
10
+
11
# Initialize the image-to-text (captioning) pipeline once at import time.
# The checkpoint is named explicitly instead of relying on the task default:
# with no model argument transformers warns and picks its own default, which
# may differ between library versions.
# NOTE(review): "ydshieh/vit-gpt2-coco-en" is believed to be the default
# checkpoint for this task — confirm against the installed transformers
# version before relying on it.
image_to_text = pipeline("image-to-text", model="ydshieh/vit-gpt2-coco-en")

# Holder for the TTS engine; stays None until get_tts() creates it on first use.
tts = None
16
+
17
def get_tts():
    """Return the shared TTS instance, creating it on the first call.

    The instance is stored in the module-level ``tts`` variable so that
    later calls reuse it instead of constructing a new one.
    """
    global tts
    if tts is not None:
        # Already constructed by an earlier call; reuse it.
        return tts
    tts = TTS(auto_download=True)
    return tts
23
+
24
def image_to_voice(image):
    """
    Convert an image to a text description and then to speech.

    Args:
        image: The uploaded image as delivered by Gradio, or None when
            nothing has been uploaded.

    Returns:
        tuple: (audio_file_path, text_description). When no image is given
        or any step fails, the audio path is None and the text slot carries
        the message to show to the user.
    """
    # Local import keeps this function self-contained without touching the
    # file-level import block.
    import tempfile

    if image is None:
        return None, "Please upload an image."

    try:
        # Convert image to text
        result = image_to_text(image)
        text = result[0]['generated_text']

        # Convert text to speech
        tts_model = get_tts()
        style = tts_model.get_voice_style(voice_name="M5")
        wav, _duration = tts_model.synthesize(text, voice_style=style)

        # Write each result to its own uniquely named file. A fixed
        # "output.wav" would be shared by every request, so overlapping
        # requests could overwrite each other's audio before it is served.
        # delete=False keeps the file on disk after the handle is closed so
        # it can be written to and then read back by path.
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
            output_path = tmp.name
        tts_model.save_audio(wav, output_path)

        return output_path, text

    except Exception as e:
        # UI boundary: report the failure in the text box rather than
        # raising out of the event handler.
        return None, f"Error: {str(e)}"
55
+
56
# Assemble the Gradio UI: upload and trigger on the left, results on the right.
with gr.Blocks(title="Image to Voice") as demo:
    gr.Markdown("# 🖼️ Image to Voice Converter")
    gr.Markdown("Upload an image and get an audio description of it!")

    with gr.Row():
        # Left column: image upload and the button that starts the conversion.
        with gr.Column():
            image_input = gr.Image(type="pil", label="Upload Image")
            generate_btn = gr.Button("Generate Audio", variant="primary")

        # Right column: synthesized audio and the generated caption.
        with gr.Column():
            audio_output = gr.Audio(label="Generated Audio", type="filepath")
            text_output = gr.Textbox(label="Image Description", lines=5)

    # image_to_voice returns (audio_path, text), matching the output order.
    generate_btn.click(
        image_to_voice,
        image_input,
        [audio_output, text_output],
    )

    # Placeholder examples section; the list is intentionally empty for now.
    gr.Examples(
        examples=[],
        inputs=image_input,
        label="Example Images (add your own examples)",
    )

# Start the app only when this file is run as a script.
if __name__ == "__main__":
    demo.launch()
84
+
requirements.txt ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ transformers
2
+ supertonic
3
+ gradio
4
+ torch
5
+ torchaudio
6
+