File size: 10,211 Bytes
456f8ff
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
# -*- coding: utf-8 -*-
"""ImageToVoice Hugging Face Space

Converts images to text using Hugging Face's image-to-text pipeline,
then converts the text to speech using Supertonic TTS.
"""

import gradio as gr
from supertonic import TTS
from transformers import pipeline
from PIL import Image
import numpy as np
import traceback

# Initialize models (load once at startup).
# Failures are caught and recorded in init_error instead of raised, so the
# Gradio app can still launch and report the problem from image_to_voice().
image_to_text = None  # transformers image-to-text pipeline (set in try below)
tts = None  # Supertonic TTS engine (set in try below)
init_error = None  # human-readable description of any initialization failure

# Available voice styles for supertonic
AVAILABLE_VOICES = ["M1", "M2", "M3", "M4", "M5", "F1", "F2", "F3", "F4"]

try:
    print("Initializing image-to-text pipeline...")
    # No model name supplied: transformers selects its default image-to-text model.
    image_to_text = pipeline("image-to-text")
    print("Image-to-text pipeline initialized successfully")
except Exception as e:
    init_error = f"Failed to initialize image-to-text: {str(e)}"
    print(init_error)
    traceback.print_exc()

try:
    print("Initializing TTS...")
    tts = TTS(auto_download=True)
    print("TTS initialized successfully")
except Exception as e:
    # Append to any earlier error so both failures are reported together.
    if init_error:
        init_error += f"\nFailed to initialize TTS: {str(e)}"
    else:
        init_error = f"Failed to initialize TTS: {str(e)}"
    print(init_error)
    traceback.print_exc()


def _prepare_audio(wav):
    """Coerce raw TTS output into a mono float32 numpy array in [-1, 1].

    Accepts whatever ``tts.synthesize`` returns (list or ndarray, possibly
    multi-channel or integer PCM) and returns a 1-D float32 waveform
    suitable for a Gradio ``type="numpy"`` audio output.
    """
    # Ensure we are working with a numpy array.
    if not isinstance(wav, np.ndarray):
        wav = np.array(wav)

    # Collapse to mono: drop singleton axes, then pick one channel if the
    # array is still 2-D (the channel axis is assumed to be the smaller one).
    if wav.ndim > 1:
        wav = wav.squeeze()
        if wav.ndim > 1:
            wav = wav[0] if wav.shape[0] < wav.shape[-1] else wav[:, 0]

    # Convert to float32 in [-1, 1]: integer PCM is scaled by its full range,
    # float input is peak-normalized when it exceeds 1.0.
    if wav.dtype == np.int16:
        wav = wav.astype(np.float32) / 32768.0
    elif wav.dtype == np.int32:
        wav = wav.astype(np.float32) / 2147483648.0
    else:
        wav = wav.astype(np.float32)
        # Bug fix: the original skipped normalization whenever the dtype was
        # already float32, which could leave out-of-range samples to clip.
        peak = float(np.abs(wav).max()) if wav.size else 0.0
        if peak > 1.0:
            wav = wav / peak
    return wav


def image_to_voice(image, voice_name):
    """Convert an image to a caption, then synthesize the caption as speech.

    Parameters:
        image: PIL.Image.Image or None — the uploaded image.
        voice_name: str — one of AVAILABLE_VOICES; invalid values fall back
            to "M5".

    Returns:
        ((sample_rate, wav), caption_text) on success, where wav is a 1-D
        float32 numpy array, or (None, error_message) on any failure.
    """
    if image is None:
        return None, "Please upload an image."

    # Both models must have initialized at module load time.
    if image_to_text is None or tts is None:
        error_msg = "Error: Models failed to initialize. "
        if init_error:
            error_msg += f"\n\nDetails: {init_error}"
        else:
            error_msg += "Please check the logs for more information."
        return None, error_msg

    # Validate and get voice style
    if voice_name not in AVAILABLE_VOICES:
        voice_name = "M5"  # Default fallback
        print("Invalid voice name, using default: M5")

    try:
        print(f"Getting voice style: {voice_name}")
        style = tts.get_voice_style(voice_name=voice_name)
        print(f"Voice style '{voice_name}' loaded successfully")
    except Exception as e:
        error_msg = f"Error: Failed to load voice style '{voice_name}': {str(e)}"
        print(error_msg)
        return None, error_msg

    try:
        print(f"Processing image: type={type(image)}, mode={image.mode if hasattr(image, 'mode') else 'N/A'}")

        # The captioning pipeline expects RGB input.
        if isinstance(image, Image.Image) and image.mode != 'RGB':
            image = image.convert('RGB')
            print("Converted image to RGB mode")

        # Convert image to text.
        print("Running image-to-text pipeline...")
        result = image_to_text(image)
        print(f"Image-to-text result: {result}")

        if not result:
            return None, "Error: Could not extract text from image. The pipeline returned an empty result."

        generated_text = result[0].get('generated_text', '')
        if not generated_text:
            return None, "Error: No text was extracted from the image. The generated text is empty."

        print(f"Extracted text: {generated_text}")

        # Convert text to speech.
        print(f"Synthesizing speech with voice '{voice_name}'...")
        wav, duration = tts.synthesize(generated_text, voice_style=style)
        wav = _prepare_audio(wav)
        print(f"Final audio: shape={wav.shape}, dtype={wav.dtype}")

        # Derive the sample rate from the reported duration (samples / seconds);
        # fall back to a common TTS rate when the duration is missing/invalid.
        if duration > 0:
            sample_rate = int(len(wav) / duration)
            print(f"Calculated sample rate: {sample_rate} Hz (from {len(wav)} samples / {duration}s)")
        else:
            sample_rate = 24000  # Common TTS sample rate
            print(f"Using default sample rate: {sample_rate} Hz (duration was 0 or invalid)")

        return (sample_rate, wav), generated_text

    except Exception as e:
        error_msg = f"Error processing image: {str(e)}"
        # Log the full traceback for debugging; return only the concise message.
        print(f"Error: {str(e)}\n\nTraceback:\n{traceback.format_exc()}")
        return None, error_msg


# Create Gradio interface with playful styling.
# custom_css is injected into the Blocks app below via css=custom_css.
# NOTE: the /* ... */ comments inside are CSS comments that live in the
# runtime string itself, not Python comments.
custom_css = """
    /* Playful background gradient */
    .gradio-container {
        background: linear-gradient(135deg, #667eea 0%, #764ba2 25%, #f093fb 50%, #4facfe 75%, #00f2fe 100%);
        background-size: 400% 400%;
        animation: gradientShift 15s ease infinite;
        min-height: 100vh;
        padding: 20px;
    }
    
    @keyframes gradientShift {
        0% { background-position: 0% 50%; }
        50% { background-position: 100% 50%; }
        100% { background-position: 0% 50%; }
    }
    
    /* Fun title styling */
    h1 {
        color: #000000 !important;
        font-family: 'Comic Sans MS', 'Chalkboard SE', 'Marker Felt', cursive !important;
        text-shadow: 3px 3px 0px #FF6B9D, 6px 6px 0px #4ECDC4, 9px 9px 0px #45B7D1 !important;
        font-size: 3em !important;
        text-align: center !important;
        margin-bottom: 20px !important;
        animation: bounce 2s infinite;
    }
    
    @keyframes bounce {
        0%, 100% { transform: translateY(0); }
        50% { transform: translateY(-10px); }
    }
    
    /* Playful paragraph text */
    p, .markdown-text {
        color: #000000 !important;
        font-family: 'Comic Sans MS', 'Chalkboard SE', sans-serif !important;
        font-size: 1.2em !important;
        text-shadow: 2px 2px 4px rgba(0,0,0,0.3) !important;
    }
    
    /* Card/panel styling */
    .panel, .block, .gradio-block {
        background: rgba(255, 255, 255, 0.95) !important;
        border-radius: 20px !important;
        padding: 20px !important;
        box-shadow: 0 10px 30px rgba(0,0,0,0.3) !important;
        border: 3px solid #FFD700 !important;
    }
    
    /* Label styling */
    label {
        color: #000000 !important;
        font-family: 'Comic Sans MS', 'Chalkboard SE', sans-serif !important;
        font-weight: bold !important;
        font-size: 1.1em !important;
    }
    
    /* Button styling */
    button.primary {
        background: linear-gradient(45deg, #FF6B9D, #4ECDC4) !important;
        color: white !important;
        font-family: 'Comic Sans MS', 'Chalkboard SE', sans-serif !important;
        font-size: 1.3em !important;
        font-weight: bold !important;
        border-radius: 25px !important;
        padding: 15px 30px !important;
        border: 3px solid #FFD700 !important;
        box-shadow: 0 5px 15px rgba(0,0,0,0.3) !important;
        transition: all 0.3s ease !important;
    }
    
    button.primary:hover {
        transform: scale(1.1) !important;
        box-shadow: 0 8px 20px rgba(0,0,0,0.4) !important;
    }
    
    /* Input fields */
    input, textarea, select {
        border-radius: 15px !important;
        border: 2px solid #4ECDC4 !important;
        font-family: 'Comic Sans MS', 'Chalkboard SE', sans-serif !important;
    }
    
    /* Dropdown styling */
    select {
        background: linear-gradient(45deg, #f093fb, #4facfe) !important;
        color: white !important;
        font-weight: bold !important;
    }
    
    /* Textbox styling */
    textarea {
        background: linear-gradient(135deg, #667eea 0%, #764ba2 100%) !important;
        color: white !important;
        font-weight: bold !important;
    }
"""

# Build the Gradio UI: inputs (image upload, voice picker, run button) in the
# left column, outputs (audio player, extracted caption) in the right column,
# wired to image_to_voice() by the button's click handler.
with gr.Blocks(title="Image to Voice", theme=gr.themes.Soft(), css=custom_css) as demo:
    gr.Markdown(
        """
        # 🎨✨ Image to Voice Converter ✨🎨
        ### Upload an image to convert it to text, then hear it as speech! 🎀🎡
        """
    )
    
    with gr.Row():
        with gr.Column():
            # Inputs: PIL image (matches the pipeline's expected input type)
            # and the TTS voice style, defaulting to "M5".
            image_input = gr.Image(type="pil", label="πŸ“Έ Upload Image")
            voice_dropdown = gr.Dropdown(
                choices=AVAILABLE_VOICES,
                value="M5",
                label="🎭 Voice Style",
                info="Select a voice style for text-to-speech πŸŽͺ"
            )
            generate_btn = gr.Button("πŸš€ Generate Speech πŸš€", variant="primary")
        
        with gr.Column():
            # Outputs: (sample_rate, ndarray) audio tuple and the caption text.
            audio_output = gr.Audio(label="🎡 Generated Speech", type="numpy")
            text_output = gr.Textbox(label="πŸ“ Extracted Text", lines=5)
    
    generate_btn.click(
        fn=image_to_voice,
        inputs=[image_input, voice_dropdown],
        outputs=[audio_output, text_output]
    )
    
    # NOTE(review): the examples list is empty — this renders no examples;
    # consider adding sample image paths or removing the component.
    gr.Examples(
        examples=[],
        inputs=image_input
    )

if __name__ == "__main__":
    demo.launch()