jonloporto committed on
Commit
456f8ff
·
verified ·
1 Parent(s): 1d08bb9

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +285 -285
app.py CHANGED
@@ -1,285 +1,285 @@
# -*- coding: utf-8 -*-
"""ImageToVoice Hugging Face Space

Converts images to text using Hugging Face's image-to-text pipeline,
then converts the text to speech using Supertonic TTS.
"""
# NOTE(review): pre-change revision of app.py from this diff; the commit that
# follows only recolors three CSS rules (h1, paragraph, label) to #000000.

import gradio as gr
from supertonic import TTS
from transformers import pipeline
from PIL import Image
import numpy as np
import traceback

# Initialize models (load once at startup)
# Each handle stays None if its load below fails; image_to_voice() checks for
# that and reports init_error to the user instead of crashing.
image_to_text = None
tts = None
init_error = None

# Available voice styles for supertonic
AVAILABLE_VOICES = ["M1", "M2", "M3", "M4", "M5", "F1", "F2", "F3", "F4"]

try:
    print("Initializing image-to-text pipeline...")
    # No model name given: transformers picks its default image-to-text model.
    image_to_text = pipeline("image-to-text")
    print("Image-to-text pipeline initialized successfully")
except Exception as e:
    init_error = f"Failed to initialize image-to-text: {str(e)}"
    print(init_error)
    traceback.print_exc()

try:
    print("Initializing TTS...")
    tts = TTS(auto_download=True)
    print("TTS initialized successfully")
except Exception as e:
    # Accumulate on top of a possible earlier pipeline failure.
    if init_error:
        init_error += f"\nFailed to initialize TTS: {str(e)}"
    else:
        init_error = f"Failed to initialize TTS: {str(e)}"
    print(init_error)
    traceback.print_exc()


def image_to_voice(image, voice_name):
    """Convert image to text, then text to speech.

    Returns a ((sample_rate, waveform), text) pair on success, or
    (None, error_message) on failure — matching the (Audio, Textbox) outputs.
    """
    if image is None:
        return None, "Please upload an image."

    if image_to_text is None or tts is None:
        error_msg = "Error: Models failed to initialize. "
        if init_error:
            error_msg += f"\n\nDetails: {init_error}"
        else:
            error_msg += "Please check the logs for more information."
        return None, error_msg

    # Validate and get voice style
    if voice_name not in AVAILABLE_VOICES:
        voice_name = "M5"  # Default fallback
        print(f"Invalid voice name, using default: M5")

    try:
        print(f"Getting voice style: {voice_name}")
        style = tts.get_voice_style(voice_name=voice_name)
        print(f"Voice style '{voice_name}' loaded successfully")
    except Exception as e:
        error_msg = f"Error: Failed to load voice style '{voice_name}': {str(e)}"
        print(error_msg)
        return None, error_msg

    try:
        print(f"Processing image: type={type(image)}, mode={image.mode if hasattr(image, 'mode') else 'N/A'}")

        # Convert PIL Image to format expected by pipeline
        if isinstance(image, Image.Image):
            # PIL Image should work directly, but ensure it's RGB
            if image.mode != 'RGB':
                image = image.convert('RGB')
                print(f"Converted image to RGB mode")

        # Convert image to text
        print("Running image-to-text pipeline...")
        result = image_to_text(image)
        print(f"Image-to-text result: {result}")

        if not result or len(result) == 0:
            return None, "Error: Could not extract text from image. The pipeline returned an empty result."

        # assumes result[0] is a dict with a 'generated_text' key, as the
        # transformers image-to-text pipeline returns — any other shape falls
        # through to the except below.
        generated_text = result[0].get('generated_text', '')
        if not generated_text:
            return None, "Error: No text was extracted from the image. The generated text is empty."

        print(f"Extracted text: {generated_text}")

        # Convert text to speech
        print(f"Synthesizing speech with voice '{voice_name}'...")
        wav, duration = tts.synthesize(generated_text, voice_style=style)
        print(f"Speech synthesized: duration={duration}, wav type={type(wav)}, wav shape={wav.shape if hasattr(wav, 'shape') else 'N/A'}")

        # Ensure wav is a numpy array
        if not isinstance(wav, np.ndarray):
            wav = np.array(wav)
            print(f"Converted wav to numpy array: shape={wav.shape}, dtype={wav.dtype}")

        # Ensure audio is 1D (mono) format
        if wav.ndim > 1:
            wav = wav.squeeze()
            if wav.ndim > 1:
                # If still multi-dimensional, take first channel
                # (the shorter axis is treated as the channel axis)
                wav = wav[0] if wav.shape[0] < wav.shape[-1] else wav[:, 0]
            print(f"Squeezed wav to 1D: shape={wav.shape}")

        # Normalize audio to [-1, 1] range if needed
        if wav.dtype == np.int16:
            wav = wav.astype(np.float32) / 32768.0
        elif wav.dtype == np.int32:
            wav = wav.astype(np.float32) / 2147483648.0
        elif wav.dtype != np.float32:
            # If already in a reasonable range, just convert to float32
            if np.abs(wav).max() > 1.0:
                wav = wav.astype(np.float32) / np.abs(wav).max()
            else:
                wav = wav.astype(np.float32)

        print(f"Final audio: shape={wav.shape}, dtype={wav.dtype}, min={wav.min()}, max={wav.max()}")

        # Calculate sample rate from duration and audio length
        # sample_rate = samples / duration_in_seconds
        if duration > 0:
            calculated_sample_rate = int(len(wav) / duration)
            print(f"Calculated sample rate: {calculated_sample_rate} Hz (from {len(wav)} samples / {duration}s)")
            sample_rate = calculated_sample_rate
        else:
            # Fallback: Try common TTS sample rates
            # Many TTS systems use 24000 Hz or 16000 Hz
            # If audio sounds slow, try higher sample rate; if fast, try lower
            sample_rate = 24000  # Common TTS sample rate
            print(f"Using default sample rate: {sample_rate} Hz (duration was 0 or invalid)")

        return (sample_rate, wav), generated_text

    except Exception as e:
        error_msg = f"Error processing image: {str(e)}"
        full_error = f"Error: {str(e)}\n\nTraceback:\n{traceback.format_exc()}"
        print(full_error)  # Print full traceback for debugging
        return None, error_msg


# Create Gradio interface with playful styling
custom_css = """
/* Playful background gradient */
.gradio-container {
    background: linear-gradient(135deg, #667eea 0%, #764ba2 25%, #f093fb 50%, #4facfe 75%, #00f2fe 100%);
    background-size: 400% 400%;
    animation: gradientShift 15s ease infinite;
    min-height: 100vh;
    padding: 20px;
}

@keyframes gradientShift {
    0% { background-position: 0% 50%; }
    50% { background-position: 100% 50%; }
    100% { background-position: 0% 50%; }
}

/* Fun title styling */
h1 {
    color: #FFD700 !important;
    font-family: 'Comic Sans MS', 'Chalkboard SE', 'Marker Felt', cursive !important;
    text-shadow: 3px 3px 0px #FF6B9D, 6px 6px 0px #4ECDC4, 9px 9px 0px #45B7D1 !important;
    font-size: 3em !important;
    text-align: center !important;
    margin-bottom: 20px !important;
    animation: bounce 2s infinite;
}

@keyframes bounce {
    0%, 100% { transform: translateY(0); }
    50% { transform: translateY(-10px); }
}

/* Playful paragraph text */
p, .markdown-text {
    color: #FFFFFF !important;
    font-family: 'Comic Sans MS', 'Chalkboard SE', sans-serif !important;
    font-size: 1.2em !important;
    text-shadow: 2px 2px 4px rgba(0,0,0,0.3) !important;
}

/* Card/panel styling */
.panel, .block, .gradio-block {
    background: rgba(255, 255, 255, 0.95) !important;
    border-radius: 20px !important;
    padding: 20px !important;
    box-shadow: 0 10px 30px rgba(0,0,0,0.3) !important;
    border: 3px solid #FFD700 !important;
}

/* Label styling */
label {
    color: #764ba2 !important;
    font-family: 'Comic Sans MS', 'Chalkboard SE', sans-serif !important;
    font-weight: bold !important;
    font-size: 1.1em !important;
}

/* Button styling */
button.primary {
    background: linear-gradient(45deg, #FF6B9D, #4ECDC4) !important;
    color: white !important;
    font-family: 'Comic Sans MS', 'Chalkboard SE', sans-serif !important;
    font-size: 1.3em !important;
    font-weight: bold !important;
    border-radius: 25px !important;
    padding: 15px 30px !important;
    border: 3px solid #FFD700 !important;
    box-shadow: 0 5px 15px rgba(0,0,0,0.3) !important;
    transition: all 0.3s ease !important;
}

button.primary:hover {
    transform: scale(1.1) !important;
    box-shadow: 0 8px 20px rgba(0,0,0,0.4) !important;
}

/* Input fields */
input, textarea, select {
    border-radius: 15px !important;
    border: 2px solid #4ECDC4 !important;
    font-family: 'Comic Sans MS', 'Chalkboard SE', sans-serif !important;
}

/* Dropdown styling */
select {
    background: linear-gradient(45deg, #f093fb, #4facfe) !important;
    color: white !important;
    font-weight: bold !important;
}

/* Textbox styling */
textarea {
    background: linear-gradient(135deg, #667eea 0%, #764ba2 100%) !important;
    color: white !important;
    font-weight: bold !important;
}
"""

# UI: image + voice picker on the left, synthesized audio + caption on the right.
with gr.Blocks(title="Image to Voice", theme=gr.themes.Soft(), css=custom_css) as demo:
    gr.Markdown(
        """
        # 🎨✨ Image to Voice Converter ✨🎨
        ### Upload an image to convert it to text, then hear it as speech! 🎀🎡
        """
    )

    with gr.Row():
        with gr.Column():
            image_input = gr.Image(type="pil", label="πŸ“Έ Upload Image")
            voice_dropdown = gr.Dropdown(
                choices=AVAILABLE_VOICES,
                value="M5",
                label="🎭 Voice Style",
                info="Select a voice style for text-to-speech πŸŽͺ"
            )
            generate_btn = gr.Button("πŸš€ Generate Speech πŸš€", variant="primary")

        with gr.Column():
            audio_output = gr.Audio(label="🎡 Generated Speech", type="numpy")
            text_output = gr.Textbox(label="πŸ“ Extracted Text", lines=5)

    generate_btn.click(
        fn=image_to_voice,
        inputs=[image_input, voice_dropdown],
        outputs=[audio_output, text_output]
    )

    # Empty examples gallery — fill `examples` with image paths to enable it.
    gr.Examples(
        examples=[],
        inputs=image_input
    )

if __name__ == "__main__":
    demo.launch()
# -*- coding: utf-8 -*-
"""ImageToVoice Hugging Face Space

Captions an uploaded image with Hugging Face's image-to-text pipeline and
speaks the caption aloud via Supertonic TTS.
"""

import traceback

import gradio as gr
import numpy as np
from PIL import Image
from supertonic import TTS
from transformers import pipeline

# Module-level model handles. Each remains None when its startup load fails;
# image_to_voice() checks for that and surfaces init_error to the user.
image_to_text = None
tts = None
init_error = None

# Voice styles accepted by Supertonic's get_voice_style().
AVAILABLE_VOICES = ["M1", "M2", "M3", "M4", "M5", "F1", "F2", "F3", "F4"]


def _record_init_failure(message):
    """Append one startup-failure message to init_error, echo the accumulated
    text to the logs, and dump the active traceback."""
    global init_error
    init_error = message if init_error is None else init_error + "\n" + message
    print(init_error)
    traceback.print_exc()


try:
    print("Initializing image-to-text pipeline...")
    image_to_text = pipeline("image-to-text")
    print("Image-to-text pipeline initialized successfully")
except Exception as exc:
    _record_init_failure(f"Failed to initialize image-to-text: {str(exc)}")

try:
    print("Initializing TTS...")
    tts = TTS(auto_download=True)
    print("TTS initialized successfully")
except Exception as exc:
    _record_init_failure(f"Failed to initialize TTS: {str(exc)}")
def _as_mono_float32(wav):
    """Coerce a synthesized waveform to a 1-D float32 array in [-1, 1].

    Accepts lists or ndarrays. Integer PCM (int16/int32) is rescaled by its
    full-scale value; other float dtypes are peak-normalized only when they
    exceed [-1, 1]. Multi-channel input is reduced to its first channel.
    """
    if not isinstance(wav, np.ndarray):
        wav = np.array(wav)

    # Collapse to mono: drop singleton axes, then take the first channel,
    # treating the shorter axis as the channel axis.
    if wav.ndim > 1:
        wav = wav.squeeze()
        if wav.ndim > 1:
            wav = wav[0] if wav.shape[0] < wav.shape[-1] else wav[:, 0]

    if wav.dtype == np.int16:
        wav = wav.astype(np.float32) / 32768.0
    elif wav.dtype == np.int32:
        wav = wav.astype(np.float32) / 2147483648.0
    elif wav.dtype != np.float32:
        peak = np.abs(wav).max() if wav.size else 0.0
        if peak > 1.0:
            # Divide first, cast last: float32_array / float64_scalar promotes
            # back to float64 under NumPy 2 (NEP 50), so the cast must be final.
            wav = (wav / peak).astype(np.float32)
        else:
            wav = wav.astype(np.float32)
    return wav


def _sample_rate_for(num_samples, duration):
    """Derive the playback sample rate from sample count and reported duration,
    falling back to 24 kHz (a common TTS rate) when duration is unusable."""
    if duration > 0:
        rate = int(num_samples / duration)
        print(f"Calculated sample rate: {rate} Hz (from {num_samples} samples / {duration}s)")
        return rate
    rate = 24000
    print(f"Using default sample rate: {rate} Hz (duration was 0 or invalid)")
    return rate


def image_to_voice(image, voice_name):
    """Convert image to text, then text to speech.

    Args:
        image: PIL image from the Gradio input, or None when nothing uploaded.
        voice_name: entry of AVAILABLE_VOICES; unknown values fall back to "M5".

    Returns:
        ((sample_rate, waveform), caption_text) on success, otherwise
        (None, error_message) — matching the (Audio, Textbox) outputs.
    """
    if image is None:
        return None, "Please upload an image."

    # Startup failures are reported lazily so the UI loads even when a model
    # could not be initialized.
    if image_to_text is None or tts is None:
        error_msg = "Error: Models failed to initialize. "
        if init_error:
            error_msg += f"\n\nDetails: {init_error}"
        else:
            error_msg += "Please check the logs for more information."
        return None, error_msg

    if voice_name not in AVAILABLE_VOICES:
        voice_name = "M5"  # Default fallback
        print("Invalid voice name, using default: M5")

    try:
        style = tts.get_voice_style(voice_name=voice_name)
        print(f"Voice style '{voice_name}' loaded successfully")
    except Exception as e:
        error_msg = f"Error: Failed to load voice style '{voice_name}': {str(e)}"
        print(error_msg)
        return None, error_msg

    try:
        # The captioning pipeline expects RGB input.
        if isinstance(image, Image.Image) and image.mode != 'RGB':
            image = image.convert('RGB')

        print("Running image-to-text pipeline...")
        result = image_to_text(image)
        if not result:
            return None, "Error: Could not extract text from image. The pipeline returned an empty result."

        # assumes result[0] is a dict with 'generated_text', as the
        # transformers image-to-text pipeline returns — verify if the
        # default model ever changes.
        generated_text = result[0].get('generated_text', '')
        if not generated_text:
            return None, "Error: No text was extracted from the image. The generated text is empty."
        print(f"Extracted text: {generated_text}")

        print(f"Synthesizing speech with voice '{voice_name}'...")
        wav, duration = tts.synthesize(generated_text, voice_style=style)
        wav = _as_mono_float32(wav)
        sample_rate = _sample_rate_for(len(wav), duration)
        return (sample_rate, wav), generated_text

    except Exception as e:
        error_msg = f"Error processing image: {str(e)}"
        # Full traceback to the logs; terse message to the UI.
        print(f"Error: {str(e)}\n\nTraceback:\n{traceback.format_exc()}")
        return None, error_msg
# Create Gradio interface with playful styling
# custom_css skins the whole Blocks app: an animated gradient page background,
# a bouncing Comic Sans headline, rounded gold-bordered panels, and gradient
# buttons/inputs. It is injected via gr.Blocks(css=custom_css) below; the
# `!important` flags override Gradio's theme defaults.
custom_css = """
/* Playful background gradient */
.gradio-container {
    background: linear-gradient(135deg, #667eea 0%, #764ba2 25%, #f093fb 50%, #4facfe 75%, #00f2fe 100%);
    background-size: 400% 400%;
    animation: gradientShift 15s ease infinite;
    min-height: 100vh;
    padding: 20px;
}

@keyframes gradientShift {
    0% { background-position: 0% 50%; }
    50% { background-position: 100% 50%; }
    100% { background-position: 0% 50%; }
}

/* Fun title styling */
h1 {
    color: #000000 !important;
    font-family: 'Comic Sans MS', 'Chalkboard SE', 'Marker Felt', cursive !important;
    text-shadow: 3px 3px 0px #FF6B9D, 6px 6px 0px #4ECDC4, 9px 9px 0px #45B7D1 !important;
    font-size: 3em !important;
    text-align: center !important;
    margin-bottom: 20px !important;
    animation: bounce 2s infinite;
}

@keyframes bounce {
    0%, 100% { transform: translateY(0); }
    50% { transform: translateY(-10px); }
}

/* Playful paragraph text */
p, .markdown-text {
    color: #000000 !important;
    font-family: 'Comic Sans MS', 'Chalkboard SE', sans-serif !important;
    font-size: 1.2em !important;
    text-shadow: 2px 2px 4px rgba(0,0,0,0.3) !important;
}

/* Card/panel styling */
.panel, .block, .gradio-block {
    background: rgba(255, 255, 255, 0.95) !important;
    border-radius: 20px !important;
    padding: 20px !important;
    box-shadow: 0 10px 30px rgba(0,0,0,0.3) !important;
    border: 3px solid #FFD700 !important;
}

/* Label styling */
label {
    color: #000000 !important;
    font-family: 'Comic Sans MS', 'Chalkboard SE', sans-serif !important;
    font-weight: bold !important;
    font-size: 1.1em !important;
}

/* Button styling */
button.primary {
    background: linear-gradient(45deg, #FF6B9D, #4ECDC4) !important;
    color: white !important;
    font-family: 'Comic Sans MS', 'Chalkboard SE', sans-serif !important;
    font-size: 1.3em !important;
    font-weight: bold !important;
    border-radius: 25px !important;
    padding: 15px 30px !important;
    border: 3px solid #FFD700 !important;
    box-shadow: 0 5px 15px rgba(0,0,0,0.3) !important;
    transition: all 0.3s ease !important;
}

button.primary:hover {
    transform: scale(1.1) !important;
    box-shadow: 0 8px 20px rgba(0,0,0,0.4) !important;
}

/* Input fields */
input, textarea, select {
    border-radius: 15px !important;
    border: 2px solid #4ECDC4 !important;
    font-family: 'Comic Sans MS', 'Chalkboard SE', sans-serif !important;
}

/* Dropdown styling */
select {
    background: linear-gradient(45deg, #f093fb, #4facfe) !important;
    color: white !important;
    font-weight: bold !important;
}

/* Textbox styling */
textarea {
    background: linear-gradient(135deg, #667eea 0%, #764ba2 100%) !important;
    color: white !important;
    font-weight: bold !important;
}
"""
# Build the Gradio UI: image + voice picker on the left, synthesized audio and
# the extracted caption on the right. The user-facing strings previously held
# mojibake (UTF-8 emoji mis-decoded, e.g. "πŸ“Έ"); restored to the intended
# characters. `demo` must keep this name — the Spaces runtime looks it up.
with gr.Blocks(title="Image to Voice", theme=gr.themes.Soft(), css=custom_css) as demo:
    gr.Markdown(
        """
        # 🎨✨ Image to Voice Converter ✨🎨
        ### Upload an image to convert it to text, then hear it as speech! 🎤🎵
        """
    )

    with gr.Row():
        with gr.Column():
            image_input = gr.Image(type="pil", label="📸 Upload Image")
            voice_dropdown = gr.Dropdown(
                choices=AVAILABLE_VOICES,
                value="M5",
                label="🎭 Voice Style",
                info="Select a voice style for text-to-speech 🎪"
            )
            generate_btn = gr.Button("🚀 Generate Speech 🚀", variant="primary")

        with gr.Column():
            audio_output = gr.Audio(label="🎵 Generated Speech", type="numpy")
            text_output = gr.Textbox(label="📝 Extracted Text", lines=5)

    # Wire the button to the pipeline: (image, voice) -> (audio, caption).
    generate_btn.click(
        fn=image_to_voice,
        inputs=[image_input, voice_dropdown],
        outputs=[audio_output, text_output]
    )

    # Empty examples gallery — fill `examples` with image paths to enable it.
    gr.Examples(
        examples=[],
        inputs=image_input
    )

if __name__ == "__main__":
    demo.launch()