jonloporto committed on
Commit
1d08bb9
·
verified ·
1 Parent(s): f7f7d8f

Upload app.py

Browse files
Files changed (1) hide show
  1. app.py +285 -0
app.py ADDED
@@ -0,0 +1,285 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+ """ImageToVoice Hugging Face Space
3
+
4
+ Converts images to text using Hugging Face's image-to-text pipeline,
5
+ then converts the text to speech using Supertonic TTS.
6
+ """
7
+
8
+ import gradio as gr
9
+ from supertonic import TTS
10
+ from transformers import pipeline
11
+ from PIL import Image
12
+ import numpy as np
13
+ import traceback
14
+
15
# Models are loaded once at import time. Failures are recorded in
# init_error instead of raising, so the UI can start and surface a
# readable message to the user rather than crashing the Space.
image_to_text = None
tts = None
init_error = None

# Voice style identifiers accepted by the TTS engine.
AVAILABLE_VOICES = ["M1", "M2", "M3", "M4", "M5", "F1", "F2", "F3", "F4"]

try:
    print("Initializing image-to-text pipeline...")
    image_to_text = pipeline("image-to-text")
    print("Image-to-text pipeline initialized successfully")
except Exception as e:
    init_error = f"Failed to initialize image-to-text: {str(e)}"
    print(init_error)
    traceback.print_exc()

try:
    print("Initializing TTS...")
    tts = TTS(auto_download=True)
    print("TTS initialized successfully")
except Exception as e:
    # Append to any earlier pipeline error so both failures are reported.
    message = f"Failed to initialize TTS: {str(e)}"
    init_error = f"{init_error}\n{message}" if init_error else message
    print(init_error)
    traceback.print_exc()
+
44
+
45
+ def image_to_voice(image, voice_name):
46
+ """Convert image to text, then text to speech."""
47
+ if image is None:
48
+ return None, "Please upload an image."
49
+
50
+ if image_to_text is None or tts is None:
51
+ error_msg = "Error: Models failed to initialize. "
52
+ if init_error:
53
+ error_msg += f"\n\nDetails: {init_error}"
54
+ else:
55
+ error_msg += "Please check the logs for more information."
56
+ return None, error_msg
57
+
58
+ # Validate and get voice style
59
+ if voice_name not in AVAILABLE_VOICES:
60
+ voice_name = "M5" # Default fallback
61
+ print(f"Invalid voice name, using default: M5")
62
+
63
+ try:
64
+ print(f"Getting voice style: {voice_name}")
65
+ style = tts.get_voice_style(voice_name=voice_name)
66
+ print(f"Voice style '{voice_name}' loaded successfully")
67
+ except Exception as e:
68
+ error_msg = f"Error: Failed to load voice style '{voice_name}': {str(e)}"
69
+ print(error_msg)
70
+ return None, error_msg
71
+
72
+ try:
73
+ print(f"Processing image: type={type(image)}, mode={image.mode if hasattr(image, 'mode') else 'N/A'}")
74
+
75
+ # Convert PIL Image to format expected by pipeline
76
+ if isinstance(image, Image.Image):
77
+ # PIL Image should work directly, but ensure it's RGB
78
+ if image.mode != 'RGB':
79
+ image = image.convert('RGB')
80
+ print(f"Converted image to RGB mode")
81
+
82
+ # Convert image to text
83
+ print("Running image-to-text pipeline...")
84
+ result = image_to_text(image)
85
+ print(f"Image-to-text result: {result}")
86
+
87
+ if not result or len(result) == 0:
88
+ return None, "Error: Could not extract text from image. The pipeline returned an empty result."
89
+
90
+ generated_text = result[0].get('generated_text', '')
91
+ if not generated_text:
92
+ return None, "Error: No text was extracted from the image. The generated text is empty."
93
+
94
+ print(f"Extracted text: {generated_text}")
95
+
96
+ # Convert text to speech
97
+ print(f"Synthesizing speech with voice '{voice_name}'...")
98
+ wav, duration = tts.synthesize(generated_text, voice_style=style)
99
+ print(f"Speech synthesized: duration={duration}, wav type={type(wav)}, wav shape={wav.shape if hasattr(wav, 'shape') else 'N/A'}")
100
+
101
+ # Ensure wav is a numpy array
102
+ if not isinstance(wav, np.ndarray):
103
+ wav = np.array(wav)
104
+ print(f"Converted wav to numpy array: shape={wav.shape}, dtype={wav.dtype}")
105
+
106
+ # Ensure audio is 1D (mono) format
107
+ if wav.ndim > 1:
108
+ wav = wav.squeeze()
109
+ if wav.ndim > 1:
110
+ # If still multi-dimensional, take first channel
111
+ wav = wav[0] if wav.shape[0] < wav.shape[-1] else wav[:, 0]
112
+ print(f"Squeezed wav to 1D: shape={wav.shape}")
113
+
114
+ # Normalize audio to [-1, 1] range if needed
115
+ if wav.dtype == np.int16:
116
+ wav = wav.astype(np.float32) / 32768.0
117
+ elif wav.dtype == np.int32:
118
+ wav = wav.astype(np.float32) / 2147483648.0
119
+ elif wav.dtype != np.float32:
120
+ # If already in a reasonable range, just convert to float32
121
+ if np.abs(wav).max() > 1.0:
122
+ wav = wav.astype(np.float32) / np.abs(wav).max()
123
+ else:
124
+ wav = wav.astype(np.float32)
125
+
126
+ print(f"Final audio: shape={wav.shape}, dtype={wav.dtype}, min={wav.min()}, max={wav.max()}")
127
+
128
+ # Calculate sample rate from duration and audio length
129
+ # sample_rate = samples / duration_in_seconds
130
+ if duration > 0:
131
+ calculated_sample_rate = int(len(wav) / duration)
132
+ print(f"Calculated sample rate: {calculated_sample_rate} Hz (from {len(wav)} samples / {duration}s)")
133
+ sample_rate = calculated_sample_rate
134
+ else:
135
+ # Fallback: Try common TTS sample rates
136
+ # Many TTS systems use 24000 Hz or 16000 Hz
137
+ # If audio sounds slow, try higher sample rate; if fast, try lower
138
+ sample_rate = 24000 # Common TTS sample rate
139
+ print(f"Using default sample rate: {sample_rate} Hz (duration was 0 or invalid)")
140
+
141
+ return (sample_rate, wav), generated_text
142
+
143
+ except Exception as e:
144
+ error_msg = f"Error processing image: {str(e)}"
145
+ full_error = f"Error: {str(e)}\n\nTraceback:\n{traceback.format_exc()}"
146
+ print(full_error) # Print full traceback for debugging
147
+ return None, error_msg
148
+
149
+
150
# Custom CSS injected into gr.Blocks for a playful look: animated gradient
# background, bouncing title, Comic Sans typography, rounded cards/buttons.
# Passed verbatim to the browser — this string is runtime content.
custom_css = """
/* Playful background gradient */
.gradio-container {
    background: linear-gradient(135deg, #667eea 0%, #764ba2 25%, #f093fb 50%, #4facfe 75%, #00f2fe 100%);
    background-size: 400% 400%;
    animation: gradientShift 15s ease infinite;
    min-height: 100vh;
    padding: 20px;
}

@keyframes gradientShift {
    0% { background-position: 0% 50%; }
    50% { background-position: 100% 50%; }
    100% { background-position: 0% 50%; }
}

/* Fun title styling */
h1 {
    color: #FFD700 !important;
    font-family: 'Comic Sans MS', 'Chalkboard SE', 'Marker Felt', cursive !important;
    text-shadow: 3px 3px 0px #FF6B9D, 6px 6px 0px #4ECDC4, 9px 9px 0px #45B7D1 !important;
    font-size: 3em !important;
    text-align: center !important;
    margin-bottom: 20px !important;
    animation: bounce 2s infinite;
}

@keyframes bounce {
    0%, 100% { transform: translateY(0); }
    50% { transform: translateY(-10px); }
}

/* Playful paragraph text */
p, .markdown-text {
    color: #FFFFFF !important;
    font-family: 'Comic Sans MS', 'Chalkboard SE', sans-serif !important;
    font-size: 1.2em !important;
    text-shadow: 2px 2px 4px rgba(0,0,0,0.3) !important;
}

/* Card/panel styling */
.panel, .block, .gradio-block {
    background: rgba(255, 255, 255, 0.95) !important;
    border-radius: 20px !important;
    padding: 20px !important;
    box-shadow: 0 10px 30px rgba(0,0,0,0.3) !important;
    border: 3px solid #FFD700 !important;
}

/* Label styling */
label {
    color: #764ba2 !important;
    font-family: 'Comic Sans MS', 'Chalkboard SE', sans-serif !important;
    font-weight: bold !important;
    font-size: 1.1em !important;
}

/* Button styling */
button.primary {
    background: linear-gradient(45deg, #FF6B9D, #4ECDC4) !important;
    color: white !important;
    font-family: 'Comic Sans MS', 'Chalkboard SE', sans-serif !important;
    font-size: 1.3em !important;
    font-weight: bold !important;
    border-radius: 25px !important;
    padding: 15px 30px !important;
    border: 3px solid #FFD700 !important;
    box-shadow: 0 5px 15px rgba(0,0,0,0.3) !important;
    transition: all 0.3s ease !important;
}

button.primary:hover {
    transform: scale(1.1) !important;
    box-shadow: 0 8px 20px rgba(0,0,0,0.4) !important;
}

/* Input fields */
input, textarea, select {
    border-radius: 15px !important;
    border: 2px solid #4ECDC4 !important;
    font-family: 'Comic Sans MS', 'Chalkboard SE', sans-serif !important;
}

/* Dropdown styling */
select {
    background: linear-gradient(45deg, #f093fb, #4facfe) !important;
    color: white !important;
    font-weight: bold !important;
}

/* Textbox styling */
textarea {
    background: linear-gradient(135deg, #667eea 0%, #764ba2 100%) !important;
    color: white !important;
    font-weight: bold !important;
}
"""
248
+
249
# Gradio UI: inputs (image + voice choice) in the left column, outputs
# (synthesized audio + extracted caption) in the right column.
with gr.Blocks(title="Image to Voice", theme=gr.themes.Soft(), css=custom_css) as demo:
    gr.Markdown(
        """
# 🎨✨ Image to Voice Converter ✨🎨
### Upload an image to convert it to text, then hear it as speech! 🎀🎡
"""
    )

    with gr.Row():
        with gr.Column():
            # type="pil" hands image_to_voice a PIL.Image directly.
            image_input = gr.Image(type="pil", label="πŸ“Έ Upload Image")
            voice_dropdown = gr.Dropdown(
                choices=AVAILABLE_VOICES,
                value="M5",
                label="🎭 Voice Style",
                info="Select a voice style for text-to-speech πŸŽͺ"
            )
            generate_btn = gr.Button("πŸš€ Generate Speech πŸš€", variant="primary")

        with gr.Column():
            # type="numpy" matches the (sample_rate, ndarray) tuple that
            # image_to_voice returns on success.
            audio_output = gr.Audio(label="🎡 Generated Speech", type="numpy")
            text_output = gr.Textbox(label="πŸ“ Extracted Text", lines=5)

    generate_btn.click(
        fn=image_to_voice,
        inputs=[image_input, voice_dropdown],
        outputs=[audio_output, text_output]
    )

    # NOTE(review): examples list is empty — either populate it with sample
    # images or drop this component; confirm it renders harmlessly as-is.
    gr.Examples(
        examples=[],
        inputs=image_input
    )

if __name__ == "__main__":
    demo.launch()
+