jonloporto committed on
Commit
f7f7d8f
·
verified ·
1 Parent(s): b78885b

Delete app.py

Browse files
Files changed (1) hide show
  1. app.py +0 -183
app.py DELETED
@@ -1,183 +0,0 @@
1
- # -*- coding: utf-8 -*-
2
- """ImageToVoice Hugging Face Space
3
-
4
- Converts images to text using Hugging Face's image-to-text pipeline,
5
- then converts the text to speech using Supertonic TTS.
6
- """
7
-
8
- import gradio as gr
9
- from supertonic import TTS
10
- from transformers import pipeline
11
- from PIL import Image
12
- import numpy as np
13
- import traceback
14
-
15
# Voice style identifiers offered in the UI and accepted by image_to_voice.
AVAILABLE_VOICES = ["M1", "M2", "M3", "M4", "M5", "F1", "F2", "F3", "F4"]

# Shared model handles, created once at import time. Each stays None when its
# initialization fails; init_error then holds the accumulated failure text so
# the request handler can report it instead of crashing.
image_to_text = None
tts = None
init_error = None

# --- image captioning model -------------------------------------------------
try:
    print("Initializing image-to-text pipeline...")
    image_to_text = pipeline("image-to-text")
    print("Image-to-text pipeline initialized successfully")
except Exception as caption_exc:
    init_error = f"Failed to initialize image-to-text: {str(caption_exc)}"
    print(init_error)
    traceback.print_exc()

# --- speech synthesizer -----------------------------------------------------
try:
    print("Initializing TTS...")
    tts = TTS(auto_download=True)
    print("TTS initialized successfully")
except Exception as tts_exc:
    tts_failure = f"Failed to initialize TTS: {str(tts_exc)}"
    # Append to an earlier captioning failure rather than overwriting it.
    init_error = f"{init_error}\n{tts_failure}" if init_error else tts_failure
    print(init_error)
    traceback.print_exc()
43
-
44
-
45
# Sample rate used when it cannot be derived from the synthesizer's output.
_DEFAULT_SAMPLE_RATE = 24000


def _flatten_to_mono(wav):
    """Reduce an audio array of any rank to a 1-D (mono) array."""
    # atleast_1d guards against squeeze() collapsing e.g. shape (1, 1) to 0-d.
    wav = np.atleast_1d(wav.squeeze())
    while wav.ndim > 1:
        # Treat the shorter of the first/last axes as the channel axis and
        # keep only its first entry; repeat until one dimension is left.
        wav = wav[0] if wav.shape[0] < wav.shape[-1] else wav[..., 0]
    return wav


def _normalize_to_float32(wav):
    """Return *wav* as float32 samples scaled into the [-1, 1] range."""
    if wav.dtype == np.int16:
        return wav.astype(np.float32) / 32768.0
    if wav.dtype == np.int32:
        return wav.astype(np.float32) / 2147483648.0

    samples = wav.astype(np.float32)
    # Use a Python float for the peak: dividing by a NumPy scalar of a wider
    # dtype would promote the result away from float32.
    peak = float(np.abs(samples).max()) if samples.size else 0.0
    if peak > 1.0:
        samples = samples / peak
    return samples


def _duration_seconds(duration):
    """Coerce *duration* to a positive, finite float; return None otherwise."""
    try:
        values = np.asarray(duration, dtype=np.float64).reshape(-1)
    except (TypeError, ValueError):
        return None
    if values.size == 0:
        return None
    # NOTE(review): if the synthesizer returns one duration per batch item,
    # the first entry is the one for our single input text - confirm.
    seconds = float(values[0])
    if not np.isfinite(seconds) or seconds <= 0:
        return None
    return seconds


def image_to_voice(image, voice_name):
    """Caption an image and speak the caption.

    Args:
        image: The uploaded image (a PIL image when called from the UI), or
            None when nothing was uploaded.
        voice_name: Requested voice style; values not in AVAILABLE_VOICES
            fall back to "M5".

    Returns:
        A pair ``(audio, text)``. On success ``audio`` is
        ``(sample_rate, samples)`` with ``samples`` a 1-D float32 array and
        ``text`` is the generated caption. On any failure ``audio`` is None
        and ``text`` is a human-readable error message; no exception is
        propagated to the caller.
    """
    if image is None:
        return None, "Please upload an image."

    if image_to_text is None or tts is None:
        error_msg = "Error: Models failed to initialize. "
        if init_error:
            error_msg += f"\n\nDetails: {init_error}"
        else:
            error_msg += "Please check the logs for more information."
        return None, error_msg

    # Validate and get voice style
    if voice_name not in AVAILABLE_VOICES:
        voice_name = "M5"  # Default fallback
        print("Invalid voice name, using default: M5")

    try:
        print(f"Getting voice style: {voice_name}")
        style = tts.get_voice_style(voice_name=voice_name)
        print(f"Voice style '{voice_name}' loaded successfully")
    except Exception as e:
        error_msg = f"Error: Failed to load voice style '{voice_name}': {str(e)}"
        print(error_msg)
        return None, error_msg

    try:
        print(f"Processing image: type={type(image)}, mode={getattr(image, 'mode', 'N/A')}")

        # The captioning pipeline is fed RGB; convert other PIL modes first.
        if isinstance(image, Image.Image) and image.mode != 'RGB':
            image = image.convert('RGB')
            print("Converted image to RGB mode")

        # Convert image to text
        print("Running image-to-text pipeline...")
        result = image_to_text(image)
        print(f"Image-to-text result: {result}")

        if not result:
            return None, "Error: Could not extract text from image. The pipeline returned an empty result."

        generated_text = result[0].get('generated_text', '')
        if not generated_text:
            return None, "Error: No text was extracted from the image. The generated text is empty."

        print(f"Extracted text: {generated_text}")

        # Convert text to speech
        print(f"Synthesizing speech with voice '{voice_name}'...")
        wav, duration = tts.synthesize(generated_text, voice_style=style)
        print(f"Speech synthesized: duration={duration}, wav type={type(wav)}, wav shape={getattr(wav, 'shape', 'N/A')}")

        # Ensure wav is a numpy array
        if not isinstance(wav, np.ndarray):
            wav = np.array(wav)
            print(f"Converted wav to numpy array: shape={wav.shape}, dtype={wav.dtype}")

        # Ensure audio is 1D (mono) format
        if wav.ndim != 1:
            wav = _flatten_to_mono(wav)
            print(f"Squeezed wav to 1D: shape={wav.shape}")

        # Report empty audio explicitly instead of failing inside a reduction.
        if wav.size == 0:
            return None, "Error: Speech synthesis returned empty audio."

        # Normalize audio to float32 in the [-1, 1] range
        wav = _normalize_to_float32(wav)
        print(f"Final audio: shape={wav.shape}, dtype={wav.dtype}, min={wav.min()}, max={wav.max()}")

        # Derive the sample rate from duration and audio length:
        # sample_rate = samples / duration_in_seconds
        seconds = _duration_seconds(duration)
        sample_rate = int(len(wav) / seconds) if seconds is not None else 0
        if sample_rate > 0:
            print(f"Calculated sample rate: {sample_rate} Hz (from {len(wav)} samples / {seconds}s)")
        else:
            # Duration was missing, non-positive, or so large the rate
            # truncated to zero; fall back to a fixed rate.
            sample_rate = _DEFAULT_SAMPLE_RATE
            print(f"Using default sample rate: {sample_rate} Hz (duration was 0 or invalid)")

        return (sample_rate, wav), generated_text

    except Exception as e:
        # Boundary handler for the UI callback: log the full traceback, but
        # return only a short message to the user.
        error_msg = f"Error processing image: {str(e)}"
        full_error = f"Error: {str(e)}\n\nTraceback:\n{traceback.format_exc()}"
        print(full_error)  # Print full traceback for debugging
        return None, error_msg
148
-
149
-
150
# Build the Gradio UI: inputs in the first column, results in the second.
# Components appear in the order they are created inside each layout context,
# so the statement order below is significant.
with gr.Blocks(title="Image to Voice") as demo:
    gr.Markdown("# Image to Voice Converter")
    gr.Markdown("Upload an image to convert it to text, then hear it as speech!")

    with gr.Row():
        with gr.Column():
            image_input = gr.Image(label="Upload Image", type="pil")
            voice_dropdown = gr.Dropdown(
                label="Voice Style",
                info="Select a voice style for text-to-speech",
                choices=AVAILABLE_VOICES,
                value="M5",
            )
            generate_btn = gr.Button("Generate Speech", variant="primary")

        with gr.Column():
            audio_output = gr.Audio(type="numpy", label="Generated Speech")
            text_output = gr.Textbox(lines=5, label="Extracted Text")

    # Wire the button to the handler: (image, voice) in, (audio, caption) out.
    generate_btn.click(
        outputs=[audio_output, text_output],
        inputs=[image_input, voice_dropdown],
        fn=image_to_voice,
    )

    # Examples section bound to the image input; the list is currently empty.
    gr.Examples(
        inputs=image_input,
        examples=[],
    )

# Start the app only when this file is executed as a script.
if __name__ == "__main__":
    demo.launch()
183
-