Danielah17 committed on
Commit
8ae87a8
·
verified ·
1 Parent(s): caf8f9b

Upload app.py

Browse files
Files changed (1) hide show
  1. app.py +266 -0
app.py ADDED
@@ -0,0 +1,266 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ from supertonic import TTS
3
+ from transformers import pipeline
4
+ import tempfile
5
+ import os
6
+
7
# Both models are built once at import time and shared by every request, so
# importing this module is slow and has side effects.

# Initialize the image-to-text pipeline.
# NOTE(review): no model id is passed, so the checkpoint is whatever
# transformers selects as the default for the "image-to-text" task — consider
# pinning an explicit model so results stay reproducible across upgrades.
image_to_text = pipeline("image-to-text")

# Initialize the TTS model.
# NOTE(review): auto_download=True presumably fetches the voice/model assets
# on first use — confirm against the supertonic documentation.
tts = TTS(auto_download=True)
12
+
13
# Available voice styles (common Supertonic voices).
# Each entry is (label shown in the dropdown, voice id handed to the TTS engine).
# The first entry is the one the UI preselects.
VOICE_OPTIONS = [
    ("M5 - Male Voice (Default)", "M5"),
    ("M1 - Male Voice 1", "M1"),
    ("M2 - Male Voice 2", "M2"),
    ("M3 - Male Voice 3", "M3"),
    ("M4 - Male Voice 4", "M4"),
    ("F1 - Female Voice 1", "F1"),
    ("F2 - Female Voice 2", "F2"),
    ("F3 - Female Voice 3", "F3"),
    ("F4 - Female Voice 4", "F4"),
    ("F5 - Female Voice 5", "F5"),
]
26
+
27
def _resolve_voice_name(voice_selection):
    """Translate a dropdown label into the voice id expected by the TTS engine."""
    if not voice_selection:
        # Dropdown was cleared: fall back to the first (default) entry.
        return VOICE_OPTIONS[0][1]

    known = dict(VOICE_OPTIONS).get(voice_selection)
    if known is not None:
        return known

    # Unexpected label: keep whatever precedes " - ". partition() returns the
    # whole string when the separator is absent, so a bare id passes through.
    return voice_selection.partition(" - ")[0]


def image_to_voice(image, voice_selection):
    """
    Convert an image to text, then text to speech.

    Args:
        image: Input image (PIL Image or numpy array), or None when nothing
            has been uploaded yet.
        voice_selection: Selected voice style from dropdown
            (e.g., "M5 - Male Voice (Default)"). A bare voice id or an empty
            selection is accepted as well.

    Returns:
        A ``(audio_path, text)`` tuple. On success ``audio_path`` is the path
        of a freshly written ``.wav`` file and ``text`` is the generated text.
        On any failure ``audio_path`` is None and ``text`` carries a
        user-facing message; the function never raises, because its result is
        rendered directly in the UI.
    """
    if image is None:
        return None, "Please upload an image to get started."

    try:
        voice_name = _resolve_voice_name(voice_selection)

        # Convert image to text
        result = image_to_text(image)
        generated_text = result[0]["generated_text"] if result else ""
        if not generated_text or not generated_text.strip():
            # Give a readable message instead of an IndexError / empty synthesis.
            return None, "❌ Error: No text could be extracted from this image."

        # Get the selected voice style
        style = tts.get_voice_style(voice_name=voice_name)

        # Convert text to speech (the reported duration is not needed here)
        wav, _duration = tts.synthesize(generated_text, voice_style=style)

        # Reserve a temp path and close the handle *before* writing: leaving it
        # open leaks one descriptor per request and blocks the write on Windows.
        # delete=False keeps the file so the caller can serve it.
        with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as temp_file:
            audio_path = temp_file.name
        try:
            tts.save_audio(wav, audio_path)
        except Exception:
            # Do not leave an empty orphan file behind when saving fails.
            os.remove(audio_path)
            raise

        return audio_path, generated_text
    except Exception as e:
        # UI boundary: surface the failure in the text box rather than crash.
        return None, f"❌ Error: {e}"
70
+
71
# Custom CSS for professional styling.
# The class names below are attached to components through `elem_classes`
# or used inside the raw HTML snippets of the layout.
custom_css = """
.gradio-container {
    font-family: 'Inter', 'Segoe UI', system-ui, sans-serif !important;
}
.header {
    text-align: center;
    padding: 2rem 1rem;
    background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
    border-radius: 12px;
    margin-bottom: 2rem;
    color: white;
    box-shadow: 0 4px 6px rgba(0, 0, 0, 0.1);
}
.header h1 {
    margin: 0;
    font-size: 2.5rem;
    font-weight: 700;
    letter-spacing: -0.02em;
}
.header p {
    margin: 0.5rem 0 0 0;
    opacity: 0.95;
    font-size: 1.1rem;
}
.feature-box {
    background: #f8f9fa;
    border-radius: 10px;
    padding: 1.5rem;
    margin: 1rem 0;
    border-left: 4px solid #667eea;
}
.feature-box h3 {
    margin-top: 0;
    color: #333;
    font-size: 1.1rem;
}
.main-content {
    max-width: 1200px;
    margin: 0 auto;
}
.upload-section {
    background: white;
    border-radius: 12px;
    padding: 2rem;
    box-shadow: 0 2px 8px rgba(0, 0, 0, 0.08);
    margin-bottom: 1.5rem;
}
.output-section {
    background: white;
    border-radius: 12px;
    padding: 2rem;
    box-shadow: 0 2px 8px rgba(0, 0, 0, 0.08);
}
.generate-btn {
    width: 100%;
    padding: 1rem !important;
    font-size: 1.1rem !important;
    font-weight: 600 !important;
    background: linear-gradient(135deg, #667eea 0%, #764ba2 100%) !important;
    border: none !important;
    border-radius: 8px !important;
    transition: transform 0.2s, box-shadow 0.2s !important;
}
.generate-btn:hover {
    transform: translateY(-2px);
    box-shadow: 0 6px 12px rgba(102, 126, 234, 0.4) !important;
}
.footer {
    text-align: center;
    padding: 2rem 1rem;
    margin-top: 3rem;
    color: #666;
    font-size: 0.9rem;
}
.section-title {
    margin-top: 1rem;
    margin-bottom: 1rem;
    color: #333;
    font-weight: 600;
}
select, .gr-dropdown {
    border-radius: 8px !important;
    border: 2px solid #e0e0e0 !important;
    padding: 0.75rem !important;
    font-size: 1rem !important;
    transition: border-color 0.2s !important;
}
select:focus, .gr-dropdown:focus {
    border-color: #667eea !important;
    outline: none !important;
}
"""
164
+
165
# Create Gradio interface.
# Layout: header banner, a row of three "how it works" cards, then a two-column
# workflow (inputs on the left, results on the right) and a footer.
with gr.Blocks(title="Image to Voice Converter", theme=gr.themes.Soft(), css=custom_css) as demo:

    # Header Section
    gr.HTML("""
    <div class="header">
        <h1>🎙️ Image to Voice Converter</h1>
        <p>Transform images into speech with AI-powered technology</p>
    </div>
    """)

    # Main Content Container
    with gr.Column(elem_classes="main-content"):

        # Instructions Section
        with gr.Row():
            with gr.Column(scale=1):
                gr.HTML("""
                <div class="feature-box">
                    <h3>📷 Step 1: Upload Image</h3>
                    <p>Upload any image containing text. Our AI will extract it automatically.</p>
                </div>
                """)
            with gr.Column(scale=1):
                gr.HTML("""
                <div class="feature-box">
                    <h3>🤖 Step 2: AI Processing</h3>
                    <p>Advanced vision-language models analyze and extract text from your image.</p>
                </div>
                """)
            with gr.Column(scale=1):
                gr.HTML("""
                <div class="feature-box">
                    <h3>🔊 Step 3: Audio Generation</h3>
                    <p>Text is converted to natural-sounding speech using Supertonic TTS.</p>
                </div>
                """)

        # Main Workflow Section
        with gr.Row():
            # Left Column - Input
            with gr.Column(scale=1, elem_classes="upload-section"):
                gr.Markdown("### 📤 Upload Your Image", elem_classes="section-title")
                image_input = gr.Image(
                    label="",
                    type="pil",
                    height=350,
                    show_label=False
                )

                gr.Markdown("### 🎚️ Voice Settings", elem_classes="section-title")
                voice_dropdown = gr.Dropdown(
                    # Only the human-readable labels are shown; the handler maps
                    # the chosen label back to a voice id.
                    choices=[label for label, _ in VOICE_OPTIONS],
                    label="Select Voice Style",
                    value="M5 - Male Voice (Default)",
                    info="Choose a voice style for the generated audio"
                )

                generate_btn = gr.Button(
                    "✨ Generate Audio",
                    variant="primary",
                    elem_classes="generate-btn",
                    size="lg"
                )

            # Right Column - Output
            with gr.Column(scale=1, elem_classes="output-section"):
                gr.Markdown("### 📝 Extracted Text", elem_classes="section-title")
                text_output = gr.Textbox(
                    label="",
                    lines=6,
                    show_label=False,
                    placeholder="The extracted text will appear here...",
                    interactive=False
                )

                gr.Markdown("### 🔊 Generated Audio", elem_classes="section-title")
                audio_output = gr.Audio(
                    label="",
                    type="filepath",
                    show_label=False
                )

    # Connection.
    # The handler returns (audio path, text), so the outputs are listed in that
    # same order. The listener is registered inside the Blocks context.
    generate_btn.click(
        fn=image_to_voice,
        inputs=[image_input, voice_dropdown],
        outputs=[audio_output, text_output],
        show_progress="full"
    )

    # Footer
    gr.HTML("""
    <div class="footer">
        <p>Powered by <strong>Hugging Face Transformers</strong> & <strong>Supertonic TTS</strong> |
        Built with ❤️ using Gradio</p>
    </div>
    """)
263
+
264
# Start the Gradio server only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    demo.launch()
266
+