jonloporto committed on
Commit
b78885b
·
verified ·
1 Parent(s): f84f0b0

Upload app.py

Browse files
Files changed (1) hide show
  1. app.py +183 -0
app.py ADDED
@@ -0,0 +1,183 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+ """ImageToVoice Hugging Face Space
3
+
4
+ Converts images to text using Hugging Face's image-to-text pipeline,
5
+ then converts the text to speech using Supertonic TTS.
6
+ """
7
+
8
+ import gradio as gr
9
+ from supertonic import TTS
10
+ from transformers import pipeline
11
+ from PIL import Image
12
+ import numpy as np
13
+ import traceback
14
+
15
# Voice presets accepted by the app; used to populate the voice dropdown and
# to validate the voice name passed to image_to_voice().
AVAILABLE_VOICES = ["M1", "M2", "M3", "M4", "M5", "F1", "F2", "F3", "F4"]

# Model handles are created once, at import time. A handle that is still None
# afterwards means its initialization failed; init_error then carries the
# accumulated failure message(s), one per line.
image_to_text = None
tts = None
init_error = None

try:
    print("Initializing image-to-text pipeline...")
    image_to_text = pipeline("image-to-text")
    print("Image-to-text pipeline initialized successfully")
except Exception as exc:
    # Keep the app importable so the UI can report the failure to the user.
    init_error = f"Failed to initialize image-to-text: {exc!s}"
    print(init_error)
    traceback.print_exc()

try:
    print("Initializing TTS...")
    tts = TTS(auto_download=True)
    print("TTS initialized successfully")
except Exception as exc:
    tts_failure = f"Failed to initialize TTS: {exc!s}"
    # Append to an earlier captioning failure instead of overwriting it.
    init_error = f"{init_error}\n{tts_failure}" if init_error else tts_failure
    print(init_error)
    traceback.print_exc()
43
+
44
+
45
# Voice used when the caller passes a name that is not in AVAILABLE_VOICES.
_DEFAULT_VOICE = "M5"
# Last-resort playback rate (Hz) when neither the engine nor the reported
# duration yields a usable sample rate.
_FALLBACK_SAMPLE_RATE = 24000


def _prepare_waveform(wav):
    """Return *wav* as a mono float32 NumPy array with samples in [-1, 1]."""
    samples = np.asarray(wav)

    # Drop singleton axes first, e.g. a (1, num_samples) batch of one clip.
    if samples.ndim > 1:
        samples = samples.squeeze()
    while samples.ndim > 1:
        # NOTE(review): assumes the shorter outer axis is the channel/batch
        # axis and keeps only its first entry -- confirm against the engine.
        if samples.shape[0] < samples.shape[-1]:
            samples = samples[0]
        else:
            samples = samples[..., 0]
    # squeeze() can collapse a single-sample clip to 0-d; keep it indexable.
    samples = np.atleast_1d(samples)

    # Fixed-point PCM is rescaled by its full-scale value.
    if samples.dtype == np.int16:
        return samples.astype(np.float32) / 32768.0
    if samples.dtype == np.int32:
        return samples.astype(np.float32) / 2147483648.0

    # Everything else (including float32) is peak-normalized only when it
    # actually exceeds the [-1, 1] range. Dividing by a Python float keeps
    # the result float32 under every NumPy promotion rule set.
    samples = samples.astype(np.float32)
    peak = float(np.abs(samples).max()) if samples.size else 0.0
    if peak > 1.0:
        samples = samples / peak
    return samples


def _resolve_sample_rate(num_samples, duration, engine):
    """Choose the playback sample rate (Hz) for a synthesized clip."""
    # NOTE(review): assumes the TTS engine may expose its native rate as a
    # numeric ``sample_rate`` attribute -- confirm. When it does not, fall
    # back to inferring the rate from the reported duration.
    native_rate = getattr(engine, "sample_rate", None)
    if (
        isinstance(native_rate, (int, float, np.integer, np.floating))
        and not isinstance(native_rate, bool)
        and native_rate > 0
    ):
        print(f"Using engine sample rate: {int(native_rate)} Hz")
        return int(native_rate)

    # duration may be a scalar or a one-element array; reduce it to a float.
    try:
        seconds = float(np.asarray(duration, dtype=np.float64).reshape(-1)[0])
    except (TypeError, ValueError, IndexError):
        seconds = 0.0

    if seconds > 0:
        inferred_rate = int(num_samples / seconds)
        if inferred_rate > 0:
            print(
                f"Calculated sample rate: {inferred_rate} Hz "
                f"(from {num_samples} samples / {seconds}s)"
            )
            return inferred_rate

    print(
        f"Using default sample rate: {_FALLBACK_SAMPLE_RATE} Hz "
        "(duration was 0 or invalid)"
    )
    return _FALLBACK_SAMPLE_RATE


def image_to_voice(image, voice_name):
    """Caption an image and synthesize the caption as speech.

    Args:
        image: The uploaded image (a PIL image when called from the UI), or
            None when nothing was uploaded.
        voice_name: Requested voice preset. Names outside AVAILABLE_VOICES
            are replaced by the default voice.

    Returns:
        A ``(audio, message)`` pair. On success ``audio`` is
        ``(sample_rate, samples)`` with ``samples`` a mono float32 array in
        [-1, 1] and ``message`` is the generated caption. On any failure
        ``audio`` is None and ``message`` describes the problem; this
        function does not raise for processing errors.
    """
    if image is None:
        return None, "Please upload an image."

    if image_to_text is None or tts is None:
        error_msg = "Error: Models failed to initialize. "
        if init_error:
            error_msg += f"\n\nDetails: {init_error}"
        else:
            error_msg += "Please check the logs for more information."
        return None, error_msg

    if voice_name not in AVAILABLE_VOICES:
        print(f"Invalid voice name, using default: {_DEFAULT_VOICE}")
        voice_name = _DEFAULT_VOICE

    try:
        print(f"Getting voice style: {voice_name}")
        style = tts.get_voice_style(voice_name=voice_name)
        print(f"Voice style '{voice_name}' loaded successfully")
    except Exception as exc:
        error_msg = f"Error: Failed to load voice style '{voice_name}': {exc}"
        print(error_msg)
        return None, error_msg

    # UI boundary: any failure below is logged with its traceback and
    # reported to the user as text instead of crashing the request.
    try:
        print(
            f"Processing image: type={type(image)}, "
            f"mode={getattr(image, 'mode', 'N/A')}"
        )

        # The captioning pipeline is fed RGB input; convert other PIL modes.
        if isinstance(image, Image.Image) and image.mode != "RGB":
            image = image.convert("RGB")
            print("Converted image to RGB mode")

        print("Running image-to-text pipeline...")
        result = image_to_text(image)
        print(f"Image-to-text result: {result}")

        if not result:
            return None, (
                "Error: Could not extract text from image. "
                "The pipeline returned an empty result."
            )

        generated_text = result[0].get("generated_text", "")
        if not generated_text:
            return None, (
                "Error: No text was extracted from the image. "
                "The generated text is empty."
            )
        print(f"Extracted text: {generated_text}")

        print(f"Synthesizing speech with voice '{voice_name}'...")
        wav, duration = tts.synthesize(generated_text, voice_style=style)
        print(
            f"Speech synthesized: duration={duration}, wav type={type(wav)}, "
            f"wav shape={getattr(wav, 'shape', 'N/A')}"
        )

        samples = _prepare_waveform(wav)
        if samples.size == 0:
            # Report this explicitly rather than failing later inside a
            # min()/max() reduction on an empty array.
            return None, "Error: The TTS engine returned empty audio."

        print(
            f"Final audio: shape={samples.shape}, dtype={samples.dtype}, "
            f"min={samples.min()}, max={samples.max()}"
        )

        sample_rate = _resolve_sample_rate(len(samples), duration, tts)
        return (sample_rate, samples), generated_text

    except Exception as exc:
        error_msg = f"Error processing image: {exc}"
        # Full traceback goes to the logs; the user only sees the summary.
        print(f"Error: {exc}\n\nTraceback:\n{traceback.format_exc()}")
        return None, error_msg
148
+
149
+
150
# Assemble the Gradio UI: the first column holds the inputs and the trigger
# button, the second column shows the synthesized audio and the caption.
with gr.Blocks(title="Image to Voice") as demo:
    gr.Markdown("# Image to Voice Converter")
    gr.Markdown("Upload an image to convert it to text, then hear it as speech!")

    with gr.Row():
        with gr.Column():
            # type="pil" makes Gradio hand image_to_voice a PIL image.
            image_input = gr.Image(label="Upload Image", type="pil")
            voice_dropdown = gr.Dropdown(
                label="Voice Style",
                info="Select a voice style for text-to-speech",
                choices=AVAILABLE_VOICES,
                value="M5",
            )
            generate_btn = gr.Button("Generate Speech", variant="primary")

        with gr.Column():
            # type="numpy" matches the (sample_rate, samples) tuple returned
            # by image_to_voice.
            audio_output = gr.Audio(type="numpy", label="Generated Speech")
            text_output = gr.Textbox(lines=5, label="Extracted Text")

    # Wire the button: (image, voice) in, (audio, caption-or-error) out.
    generate_btn.click(
        fn=image_to_voice,
        inputs=[image_input, voice_dropdown],
        outputs=[audio_output, text_output],
    )

    # No bundled example images yet; the list is intentionally empty.
    gr.Examples(examples=[], inputs=image_input)

# Launch the app only when this file is run as a script.
if __name__ == "__main__":
    demo.launch()
183
+