jonloporto committed on
Commit
f84f0b0
·
verified ·
1 Parent(s): 97efc2b

Delete app.py

Browse files
Files changed (1) hide show
  1. app.py +0 -174
app.py DELETED
@@ -1,174 +0,0 @@
1
- # -*- coding: utf-8 -*-
2
- """ImageToVoice Hugging Face Space
3
-
4
- Converts images to text using Hugging Face's image-to-text pipeline,
5
- then converts the text to speech using Supertonic TTS.
6
- """
7
-
8
- import gradio as gr
9
- from supertonic import TTS
10
- from transformers import pipeline
11
- from PIL import Image
12
- import numpy as np
13
- import traceback
14
-
15
# Initialize models (load once at startup).  Each stage records its failure in
# `init_error` instead of raising, so the Gradio UI can still start and surface
# the diagnostics to the user instead of crashing the Space.
image_to_text = None  # HF image-to-text (captioning) pipeline, or None on failure
tts = None            # Supertonic TTS engine, or None on failure
style = None          # voice style passed to tts.synthesize, or None on failure
init_error = None     # newline-joined, human-readable init failure messages


def _record_init_error(message):
    """Append *message* to the global ``init_error`` and log it with a traceback.

    Must be called from inside an ``except`` block so ``traceback.print_exc()``
    has an active exception to print.
    """
    global init_error
    init_error = message if init_error is None else f"{init_error}\n{message}"
    print(init_error)
    traceback.print_exc()


try:
    print("Initializing image-to-text pipeline...")
    image_to_text = pipeline("image-to-text")
    print("Image-to-text pipeline initialized successfully")
except Exception as e:
    _record_init_error(f"Failed to initialize image-to-text: {str(e)}")

try:
    print("Initializing TTS...")
    tts = TTS(auto_download=True)
    print("TTS initialized successfully")
except Exception as e:
    _record_init_error(f"Failed to initialize TTS: {str(e)}")

# The voice style can only be fetched once the TTS engine itself came up.
if tts is not None:
    try:
        print("Getting voice style...")
        style = tts.get_voice_style(voice_name="M5")
        print("Voice style loaded successfully")
    except Exception as e:
        _record_init_error(f"Failed to get voice style: {str(e)}")
54
-
55
-
56
def _prepare_audio(wav, duration):
    """Normalize raw TTS output into a ``(sample_rate, mono_float32_array)`` pair.

    Gradio's numpy audio output expects a 1-D float array in [-1, 1] plus an
    integer sample rate.  The rate is derived from the reported clip duration;
    when the duration is unusable we fall back to 24 kHz, a common TTS rate.
    """
    if not isinstance(wav, np.ndarray):
        wav = np.array(wav)
        print(f"Converted wav to numpy array: shape={wav.shape}, dtype={wav.dtype}")

    # Collapse to mono: squeeze singleton axes first; if still 2-D, pick the
    # first channel, guessing channel-first vs channel-last from axis sizes.
    if wav.ndim > 1:
        wav = wav.squeeze()
        if wav.ndim > 1:
            wav = wav[0] if wav.shape[0] < wav.shape[-1] else wav[:, 0]
        print(f"Squeezed wav to 1D: shape={wav.shape}")

    # Convert to float32 in [-1, 1].  Integer PCM is scaled by its full range.
    if wav.dtype == np.int16:
        wav = wav.astype(np.float32) / 32768.0
    elif wav.dtype == np.int32:
        wav = wav.astype(np.float32) / 2147483648.0
    elif wav.dtype != np.float32:
        # Guard the peak computation: np.abs(wav).max() raises on a zero-size
        # array.  Compute the peak once instead of twice.
        peak = float(np.abs(wav).max()) if wav.size else 0.0
        if peak > 1.0:
            wav = wav.astype(np.float32) / peak
        else:
            wav = wav.astype(np.float32)

    print(f"Final audio: shape={wav.shape}, dtype={wav.dtype}")

    if duration > 0:
        # sample_rate = samples / duration_in_seconds
        sample_rate = int(len(wav) / duration)
        print(f"Calculated sample rate: {sample_rate} Hz (from {len(wav)} samples / {duration}s)")
    else:
        sample_rate = 24000  # common TTS default when duration is missing/zero
        print(f"Using default sample rate: {sample_rate} Hz (duration was 0 or invalid)")
    return sample_rate, wav


def image_to_voice(image):
    """Convert image to text, then text to speech.

    Args:
        image: PIL image from the Gradio input, or ``None`` when nothing was
            uploaded.

    Returns:
        ``((sample_rate, waveform), caption_text)`` on success, otherwise
        ``(None, message)`` describing what went wrong.  Never raises — all
        errors are reported through the second tuple element so the UI can
        show them.
    """
    if image is None:
        return None, "Please upload an image."

    # Fail fast with the recorded startup diagnostics if any model is missing.
    if image_to_text is None or tts is None or style is None:
        error_msg = "Error: Models failed to initialize. "
        if init_error:
            error_msg += f"\n\nDetails: {init_error}"
        else:
            error_msg += "Please check the logs for more information."
        return None, error_msg

    try:
        print(f"Processing image: type={type(image)}, mode={image.mode if hasattr(image, 'mode') else 'N/A'}")

        # The captioning pipeline expects RGB input; convert other modes.
        if isinstance(image, Image.Image) and image.mode != 'RGB':
            image = image.convert('RGB')
            print(f"Converted image to RGB mode")

        # Stage 1: caption the image.
        print("Running image-to-text pipeline...")
        result = image_to_text(image)
        print(f"Image-to-text result: {result}")

        if not result:
            return None, "Error: Could not extract text from image. The pipeline returned an empty result."

        generated_text = result[0].get('generated_text', '')
        if not generated_text:
            return None, "Error: No text was extracted from the image. The generated text is empty."

        print(f"Extracted text: {generated_text}")

        # Stage 2: synthesize the caption as speech.
        print("Synthesizing speech...")
        wav, duration = tts.synthesize(generated_text, voice_style=style)
        print(f"Speech synthesized: duration={duration}, wav type={type(wav)}")

        sample_rate, wav = _prepare_audio(wav, duration)
        return (sample_rate, wav), generated_text

    except Exception as e:
        # Show a short message in the UI; keep the full traceback in the logs.
        error_msg = f"Error processing image: {str(e)}"
        full_error = f"Error: {str(e)}\n\nTraceback:\n{traceback.format_exc()}"
        print(full_error)  # Print full traceback for debugging
        return None, error_msg
145
-
146
-
147
# Create Gradio interface: image upload on the left, generated audio and the
# extracted caption on the right.  `demo` is the top-level Blocks app that
# Hugging Face Spaces (or the __main__ guard below) launches.
with gr.Blocks(title="Image to Voice") as demo:
    gr.Markdown("# Image to Voice Converter")
    gr.Markdown("Upload an image to convert it to text, then hear it as speech!")

    with gr.Row():
        with gr.Column():
            # Input side: PIL image (matches image_to_voice's expected type)
            # plus the trigger button.
            image_input = gr.Image(type="pil", label="Upload Image")
            generate_btn = gr.Button("Generate Speech", variant="primary")

        with gr.Column():
            # Output side: type="numpy" means image_to_voice returns a
            # (sample_rate, ndarray) tuple for the audio component.
            audio_output = gr.Audio(label="Generated Speech", type="numpy")
            text_output = gr.Textbox(label="Extracted Text", lines=5)

    # Wire the button to the pipeline: one image in, (audio, caption) out.
    generate_btn.click(
        fn=image_to_voice,
        inputs=image_input,
        outputs=[audio_output, text_output]
    )

    # Placeholder examples gallery — currently empty; populate `examples`
    # with sample image paths to show clickable demos.
    gr.Examples(
        examples=[],
        inputs=image_input
    )

if __name__ == "__main__":
    demo.launch()
174
-