vedaco committed on
Commit
3194e3d
·
verified ·
1 Parent(s): 29be232

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +905 -296
app.py CHANGED
@@ -1,354 +1,963 @@
1
- import gradio as gr
2
  import numpy as np
3
- import asyncio
4
- import edge_tts
5
- import tempfile
6
- import os
7
- from scipy.io import wavfile
8
  from scipy import signal
9
- import io
 
 
10
 
11
  # ============================================
12
- # VEDES TTS - Text-to-Speech System
 
13
  # ============================================
14
 
15
- print("=" * 50)
16
- print("πŸŽ™οΈ Initializing Vedes TTS...")
17
- print("=" * 50)
 
 
 
18
 
19
- # Available voices
20
- VOICES = {
21
- "Emma (US Female)": "en-US-EmmaNeural",
22
- "Jenny (US Female)": "en-US-JennyNeural",
23
- "Aria (US Female)": "en-US-AriaNeural",
24
- "Guy (US Male)": "en-US-GuyNeural",
25
- "Eric (US Male)": "en-US-EricNeural",
26
- "Ryan (UK Male)": "en-GB-RyanNeural",
27
- "Sonia (UK Female)": "en-GB-SoniaNeural",
28
- "Natasha (AU Female)": "en-AU-NatashaNeural",
29
- "William (AU Male)": "en-AU-WilliamNeural",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
30
  }
31
 
32
- DEFAULT_VOICE = "en-US-EmmaNeural"
33
- SAMPLE_RATE = 24000
34
 
 
 
 
35
 
36
async def synthesize_async(text, voice, rate, pitch):
    """Render *text* to a temporary MP3 file with edge-tts and return its path.

    Args:
        text: Text to speak.
        voice: edge-tts voice identifier (e.g. ``"en-US-EmmaNeural"``).
        rate: Speaking-rate offset in percent; sent to edge-tts as a signed
            string such as ``"+10%"`` or ``"-20%"``.
        pitch: Pitch offset in Hz; sent to edge-tts as a signed string such
            as ``"+5Hz"``.

    Returns:
        Path of the generated ``.mp3`` file. The file is created with
        ``delete=False``, so the caller is responsible for removing it.

    Raises:
        Any exception raised while saving the audio is re-raised after the
        partially written temp file has been removed.
    """
    import contextlib
    import os

    communicate = edge_tts.Communicate(
        text=text,
        voice=voice,
        # ``:+d`` always emits the sign, including "+0", as edge-tts expects.
        rate=f"{int(rate):+d}%",
        pitch=f"{int(pitch):+d}Hz",
    )

    # Only the name is needed here; the handle is closed before edge-tts
    # writes to the path.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp_file:
        tmp_path = tmp_file.name

    try:
        await communicate.save(tmp_path)
    except BaseException:
        # Do not leave an orphaned temp file behind on failure/cancellation.
        with contextlib.suppress(OSError):
            os.remove(tmp_path)
        raise

    return tmp_path
57
-
58
-
59
def synthesize_speech(text, voice_name, speaking_rate, pitch_shift):
    """Synthesize *text* to speech and return the path of the audio file.

    Args:
        text: Input text; surrounding whitespace is stripped and the result
            is truncated to 5000 characters.
        voice_name: Display name of the voice (a key of ``VOICES``); unknown
            names fall back to ``DEFAULT_VOICE``.
        speaking_rate: Speed multiplier where 1.0 is normal speed (the UI
            slider sends 0.5 to 2.0). Converted to a percent offset.
        pitch_shift: Pitch slider value (the UI sends -2.0 to 2.0); scaled by
            10 to obtain the Hz offset passed to the synthesizer.

    Returns:
        Path to the generated audio file, or ``None`` when the text is empty
        or synthesis fails.
    """
    if not text or not text.strip():
        return None

    text = text.strip()[:5000]  # Limit text length
    voice = VOICES.get(voice_name, DEFAULT_VOICE)

    # round(), not int(): (0.8 - 1.0) * 100 is -19.999999999999996 in floating
    # point, which int() would truncate to -19 instead of the intended -20.
    rate = round((speaking_rate - 1.0) * 100)
    pitch = round(pitch_shift * 10)

    # A private loop is used because this function runs in a worker thread
    # that has no event loop of its own.
    loop = asyncio.new_event_loop()
    try:
        asyncio.set_event_loop(loop)
        return loop.run_until_complete(
            synthesize_async(text, voice, rate, pitch)
        )
    except Exception as e:
        print(f"Synthesis error: {e}")
        return None
    finally:
        # Always release the loop, even when synthesis raised, and do not
        # leave a closed loop installed as the thread's current loop.
        asyncio.set_event_loop(None)
        loop.close()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
100
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
101
 
102
def text_analysis(text):
    """Return a Markdown summary of basic statistics for *text*.

    The summary lists the character count (of the text as given), the number
    of whitespace-separated words, the number of non-empty segments between
    ``.``, ``!`` and ``?``, and a duration estimate based on 150 words per
    minute.

    Args:
        text: The text to analyze.

    Returns:
        A Markdown string, or ``""`` when the text is empty or contains only
        whitespace (there is nothing to speak in that case).
    """
    if not text or not text.strip():
        return ""

    word_count = len(text.split())

    # Treat '!' and '?' like '.', then count the non-blank pieces.
    normalized = text.replace('!', '.').replace('?', '.')
    sentence_count = sum(1 for part in normalized.split('.') if part.strip())

    char_count = len(text)

    # Estimate duration (average 150 words per minute)
    est_duration = word_count / 150 * 60

    return f"""
📊 **Text Analysis:**
- Characters: {char_count}
- Words: {word_count}
- Sentences: {sentence_count}
- Estimated Duration: {est_duration:.1f} seconds
"""
125
 
126
 
127
  # ============================================
128
- # GRADIO INTERFACE
129
  # ============================================
130
 
131
- # Custom CSS
132
- custom_css = """
133
- .gradio-container {
134
- max-width: 900px !important;
135
- }
136
- .title-text {
137
- text-align: center;
138
- background: linear-gradient(90deg, #667eea 0%, #764ba2 100%);
139
- -webkit-background-clip: text;
140
- -webkit-text-fill-color: transparent;
141
- font-size: 2.5rem;
142
- font-weight: bold;
143
- }
144
- .subtitle-text {
145
- text-align: center;
146
- color: #666;
147
- }
148
- """
149
-
150
- with gr.Blocks(
151
- title="Vedes TTS",
152
- css=custom_css,
153
- theme=gr.themes.Soft(
154
- primary_hue="purple",
155
- secondary_hue="blue",
156
- )
157
- ) as demo:
158
-
159
- # Header
160
- gr.HTML("""
161
- <div style="text-align: center; padding: 20px;">
162
- <h1 class="title-text">πŸŽ™οΈ Vedes TTS</h1>
163
- <p class="subtitle-text">High-Quality Text-to-Speech Synthesis</p>
164
- </div>
165
- """)
166
 
167
- with gr.Tabs():
168
- # Main TTS Tab
169
- with gr.TabItem("πŸ”Š Text to Speech"):
170
- with gr.Row():
171
- with gr.Column(scale=2):
172
- text_input = gr.Textbox(
173
- label="πŸ“ Enter Text",
174
- placeholder="Type or paste your text here...\n\nExample: Hello! Welcome to Vedes, a high-quality text-to-speech system. I can read any text you provide with natural-sounding speech.",
175
- lines=6,
176
- max_lines=15
177
- )
178
-
179
- text_stats = gr.Markdown("")
180
-
181
- with gr.Row():
182
- voice_select = gr.Dropdown(
183
- choices=list(VOICES.keys()),
184
- value="Emma (US Female)",
185
- label="πŸ—£οΈ Select Voice",
186
- interactive=True
187
- )
188
-
189
- with gr.Row():
190
- speaking_rate = gr.Slider(
191
- minimum=0.5,
192
- maximum=2.0,
193
- value=1.0,
194
- step=0.1,
195
- label="⏱️ Speaking Rate",
196
- info="0.5x = Slow, 1.0x = Normal, 2.0x = Fast"
197
- )
198
-
199
- pitch_shift = gr.Slider(
200
- minimum=-2.0,
201
- maximum=2.0,
202
- value=0.0,
203
- step=0.1,
204
- label="🎡 Pitch Adjustment",
205
- info="Adjust voice pitch"
206
- )
207
-
208
- synthesize_btn = gr.Button(
209
- "πŸ”Š Generate Speech",
210
- variant="primary",
211
- size="lg"
212
- )
213
-
214
- with gr.Column(scale=1):
215
- audio_output = gr.Audio(
216
- label="🎧 Generated Speech",
217
- type="filepath"
218
- )
219
-
220
- gr.Markdown("""
221
- ### πŸ’‘ Tips:
222
- - Use punctuation for natural pauses
223
- - Add commas for short pauses
224
- - Add periods for longer pauses
225
- - Use "!" and "?" for expression
226
- """)
227
-
228
- # Examples Tab
229
- with gr.TabItem("πŸ“š Examples"):
230
- gr.Markdown("### Click any example to try it:")
231
-
232
- examples = [
233
- ["Hello! Welcome to Vedes text-to-speech. I hope you're having a wonderful day!"],
234
- ["The quick brown fox jumps over the lazy dog. This sentence contains every letter of the alphabet."],
235
- ["In a world where technology advances rapidly, artificial intelligence continues to reshape how we live and work."],
236
- ["Once upon a time, in a land far away, there lived a wise old wizard who knew the secrets of the universe."],
237
- ["Breaking news: Scientists have discovered a new species of butterfly in the Amazon rainforest."],
238
- ["To be, or not to be, that is the question. Whether 'tis nobler in the mind to suffer the slings and arrows of outrageous fortune."],
239
- ["Good morning! Today's weather forecast predicts sunny skies with a high of 75 degrees Fahrenheit."],
240
- ["Thank you for using Vedes TTS. We appreciate your interest in our text-to-speech technology!"],
241
- ]
242
-
243
- gr.Examples(
244
- examples=examples,
245
- inputs=text_input,
246
- label=""
247
- )
248
 
249
- # Voices Tab
250
- with gr.TabItem("🎭 Voice Gallery"):
251
- gr.Markdown("""
252
- ### Available Voices:
253
-
254
- | Voice | Gender | Accent | Best For |
255
- |-------|--------|--------|----------|
256
- | Emma | Female | US English | General, Friendly |
257
- | Jenny | Female | US English | Professional, Clear |
258
- | Aria | Female | US English | Conversational |
259
- | Guy | Male | US English | Narration, Calm |
260
- | Eric | Male | US English | News, Formal |
261
- | Ryan | Male | UK English | British content |
262
- | Sonia | Female | UK English | British content |
263
- | Natasha | Female | AU English | Australian content |
264
- | William | Male | AU English | Australian content |
265
-
266
- ---
267
-
268
- ### 🎯 Voice Selection Tips:
269
 
270
- - **For storytelling:** Try Emma or Guy
271
- - **For news/formal:** Try Jenny or Eric
272
- - **For casual content:** Try Aria
273
- - **For British accent:** Try Ryan or Sonia
274
- - **For Australian accent:** Try Natasha or William
275
- """)
276
-
277
- # About Tab
278
- with gr.TabItem("ℹ️ About"):
279
- gr.Markdown("""
280
- ## πŸŽ™οΈ About Vedes TTS
 
 
 
281
 
282
- **Vedes** is a text-to-speech application that converts written text into natural-sounding speech.
 
 
 
 
 
283
 
284
- ### ✨ Features:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
285
 
286
- - πŸ—£οΈ **9 High-Quality Voices** - Male and female voices with different accents
287
- - 🌍 **Multiple Accents** - US, UK, and Australian English
288
- - ⏱️ **Adjustable Speed** - From 0.5x to 2.0x speaking rate
289
- - 🎡 **Pitch Control** - Fine-tune the voice pitch
290
- - πŸ“± **Easy to Use** - Simple, intuitive interface
291
- - ⚑ **Fast Generation** - Quick audio synthesis
292
 
293
- ### πŸ”§ How It Works:
 
 
294
 
295
- 1. **Enter Text** - Type or paste your text
296
- 2. **Select Voice** - Choose from 9 available voices
297
- 3. **Adjust Settings** - Modify speed and pitch if needed
298
- 4. **Generate** - Click the button to create speech
299
- 5. **Listen & Download** - Play or save the audio
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
300
 
301
- ### πŸ“– Best Practices:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
302
 
303
- - Use proper punctuation for natural speech rhythm
304
- - Break long texts into paragraphs
305
- - Use commas for short pauses, periods for longer ones
306
- - Add question marks and exclamation points for expression
307
 
308
- ---
 
309
 
310
- ### πŸ› οΈ Technical Details:
 
 
 
 
 
 
 
 
 
 
 
311
 
312
- - **Engine:** Neural TTS
313
- - **Audio Format:** MP3
314
- - **Sample Rate:** 24kHz
315
- - **Max Text Length:** 5000 characters
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
316
 
317
- ---
 
 
 
 
 
 
 
 
318
 
319
- *Built with ❀️ using Python and Gradio*
320
- """)
321
-
322
- # Footer
323
- gr.HTML("""
324
- <div style="text-align: center; padding: 20px; color: #888;">
325
- <p>Vedes TTS Β© 2024 | Powered by Neural Speech Synthesis</p>
326
- </div>
327
- """)
328
 
329
- # Event Handlers
330
- text_input.change(
331
- fn=text_analysis,
 
 
 
 
 
 
 
 
 
 
332
  inputs=text_input,
333
- outputs=text_stats
334
  )
335
 
336
- synthesize_btn.click(
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
337
  fn=synthesize_speech,
338
- inputs=[text_input, voice_select, speaking_rate, pitch_shift],
339
  outputs=audio_output
340
  )
341
 
342
  text_input.submit(
343
  fn=synthesize_speech,
344
- inputs=[text_input, voice_select, speaking_rate, pitch_shift],
345
  outputs=audio_output
346
  )
347
 
348
 
349
- # Launch
350
- print("βœ… Vedes TTS Ready!")
351
- print("=" * 50)
352
-
353
  if __name__ == "__main__":
354
  demo.launch()
 
 
1
  import numpy as np
2
+ import gradio as gr
 
 
 
 
3
  from scipy import signal
4
+ from scipy.io import wavfile
5
+ import tempfile
6
+ import re
7
 
8
  # ============================================
9
+ # VEDES TTS - 100% FROM SCRATCH
10
+ # No APIs, No Pre-trained Models
11
  # ============================================
12
 
13
+ SAMPLE_RATE = 22050
14
+
15
+ # ============================================
16
+ # PHONEME DATABASE WITH ACCURATE FORMANTS
17
+ # Based on linguistic research data
18
+ # ============================================
19
 
20
+ PHONEMES = {
21
+ # Vowels: (F1, F2, F3, F4, duration_ms, is_voiced)
22
+ # Formant values based on Peterson & Barney (1952) research
23
+
24
+ # Front vowels
25
+ 'IY': {'f1': 270, 'f2': 2290, 'f3': 3010, 'f4': 3300, 'dur': 80, 'voiced': True}, # beat
26
+ 'IH': {'f1': 390, 'f2': 1990, 'f3': 2550, 'f4': 3300, 'dur': 60, 'voiced': True}, # bit
27
+ 'EH': {'f1': 530, 'f2': 1840, 'f3': 2480, 'f4': 3300, 'dur': 70, 'voiced': True}, # bet
28
+ 'AE': {'f1': 660, 'f2': 1720, 'f3': 2410, 'f4': 3300, 'dur': 90, 'voiced': True}, # bat
29
+
30
+ # Back vowels
31
+ 'AA': {'f1': 730, 'f2': 1090, 'f3': 2440, 'f4': 3300, 'dur': 100, 'voiced': True}, # father
32
+ 'AO': {'f1': 570, 'f2': 840, 'f3': 2410, 'f4': 3300, 'dur': 100, 'voiced': True}, # bought
33
+ 'UH': {'f1': 440, 'f2': 1020, 'f3': 2240, 'f4': 3300, 'dur': 70, 'voiced': True}, # book
34
+ 'UW': {'f1': 300, 'f2': 870, 'f3': 2240, 'f4': 3300, 'dur': 90, 'voiced': True}, # boot
35
+
36
+ # Central vowels
37
+ 'AH': {'f1': 520, 'f2': 1190, 'f3': 2390, 'f4': 3300, 'dur': 60, 'voiced': True}, # but
38
+ 'ER': {'f1': 490, 'f2': 1350, 'f3': 1690, 'f4': 3300, 'dur': 90, 'voiced': True}, # bird
39
+ 'AX': {'f1': 500, 'f2': 1500, 'f3': 2500, 'f4': 3300, 'dur': 40, 'voiced': True}, # about (schwa)
40
+
41
+ # Diphthongs
42
+ 'EY': {'f1': 450, 'f2': 2000, 'f3': 2600, 'f4': 3300, 'dur': 120, 'voiced': True}, # bait
43
+ 'AY': {'f1': 650, 'f2': 1200, 'f3': 2500, 'f4': 3300, 'dur': 130, 'voiced': True}, # bite
44
+ 'OY': {'f1': 500, 'f2': 900, 'f3': 2500, 'f4': 3300, 'dur': 140, 'voiced': True}, # boy
45
+ 'AW': {'f1': 650, 'f2': 1100, 'f3': 2500, 'f4': 3300, 'dur': 130, 'voiced': True}, # bout
46
+ 'OW': {'f1': 450, 'f2': 850, 'f3': 2500, 'f4': 3300, 'dur': 120, 'voiced': True}, # boat
47
+
48
+ # Stops (plosives)
49
+ 'P': {'f1': 300, 'f2': 1000, 'f3': 2500, 'f4': 3300, 'dur': 80, 'voiced': False, 'stop': True, 'burst_freq': 800},
50
+ 'B': {'f1': 300, 'f2': 1000, 'f3': 2500, 'f4': 3300, 'dur': 60, 'voiced': True, 'stop': True, 'burst_freq': 800},
51
+ 'T': {'f1': 300, 'f2': 1800, 'f3': 2500, 'f4': 3300, 'dur': 70, 'voiced': False, 'stop': True, 'burst_freq': 3000},
52
+ 'D': {'f1': 300, 'f2': 1800, 'f3': 2500, 'f4': 3300, 'dur': 50, 'voiced': True, 'stop': True, 'burst_freq': 3000},
53
+ 'K': {'f1': 300, 'f2': 2000, 'f3': 2500, 'f4': 3300, 'dur': 80, 'voiced': False, 'stop': True, 'burst_freq': 1500},
54
+ 'G': {'f1': 300, 'f2': 2000, 'f3': 2500, 'f4': 3300, 'dur': 50, 'voiced': True, 'stop': True, 'burst_freq': 1500},
55
+
56
+ # Fricatives
57
+ 'F': {'f1': 300, 'f2': 1100, 'f3': 2500, 'f4': 3300, 'dur': 90, 'voiced': False, 'fricative': True, 'fric_freq': 7000},
58
+ 'V': {'f1': 300, 'f2': 1100, 'f3': 2500, 'f4': 3300, 'dur': 60, 'voiced': True, 'fricative': True, 'fric_freq': 7000},
59
+ 'TH': {'f1': 300, 'f2': 1400, 'f3': 2500, 'f4': 3300, 'dur': 90, 'voiced': False, 'fricative': True, 'fric_freq': 5000},
60
+ 'DH': {'f1': 300, 'f2': 1400, 'f3': 2500, 'f4': 3300, 'dur': 50, 'voiced': True, 'fricative': True, 'fric_freq': 5000},
61
+ 'S': {'f1': 300, 'f2': 1800, 'f3': 2500, 'f4': 3300, 'dur': 100, 'voiced': False, 'fricative': True, 'fric_freq': 6000},
62
+ 'Z': {'f1': 300, 'f2': 1800, 'f3': 2500, 'f4': 3300, 'dur': 70, 'voiced': True, 'fricative': True, 'fric_freq': 6000},
63
+ 'SH': {'f1': 300, 'f2': 1900, 'f3': 2500, 'f4': 3300, 'dur': 100, 'voiced': False, 'fricative': True, 'fric_freq': 3500},
64
+ 'ZH': {'f1': 300, 'f2': 1900, 'f3': 2500, 'f4': 3300, 'dur': 70, 'voiced': True, 'fricative': True, 'fric_freq': 3500},
65
+ 'HH': {'f1': 500, 'f2': 1500, 'f3': 2500, 'f4': 3300, 'dur': 60, 'voiced': False, 'fricative': True, 'fric_freq': 1500},
66
+
67
+ # Affricates
68
+ 'CH': {'f1': 300, 'f2': 1900, 'f3': 2500, 'f4': 3300, 'dur': 110, 'voiced': False, 'affricate': True},
69
+ 'JH': {'f1': 300, 'f2': 1900, 'f3': 2500, 'f4': 3300, 'dur': 80, 'voiced': True, 'affricate': True},
70
+
71
+ # Nasals
72
+ 'M': {'f1': 280, 'f2': 900, 'f3': 2200, 'f4': 3300, 'dur': 70, 'voiced': True, 'nasal': True},
73
+ 'N': {'f1': 280, 'f2': 1700, 'f3': 2600, 'f4': 3300, 'dur': 60, 'voiced': True, 'nasal': True},
74
+ 'NG': {'f1': 280, 'f2': 2300, 'f3': 2750, 'f4': 3300, 'dur': 70, 'voiced': True, 'nasal': True},
75
+
76
+ # Liquids
77
+ 'L': {'f1': 350, 'f2': 1100, 'f3': 2700, 'f4': 3300, 'dur': 60, 'voiced': True, 'liquid': True},
78
+ 'R': {'f1': 420, 'f2': 1300, 'f3': 1600, 'f4': 3300, 'dur': 60, 'voiced': True, 'liquid': True},
79
+
80
+ # Glides
81
+ 'W': {'f1': 300, 'f2': 700, 'f3': 2200, 'f4': 3300, 'dur': 50, 'voiced': True, 'glide': True},
82
+ 'Y': {'f1': 280, 'f2': 2200, 'f3': 2960, 'f4': 3300, 'dur': 50, 'voiced': True, 'glide': True},
83
+
84
+ # Silence
85
+ 'SIL': {'f1': 0, 'f2': 0, 'f3': 0, 'f4': 0, 'dur': 80, 'voiced': False, 'silence': True},
86
+ 'PAU': {'f1': 0, 'f2': 0, 'f3': 0, 'f4': 0, 'dur': 150, 'voiced': False, 'silence': True},
87
  }
88
 
 
 
89
 
90
+ # ============================================
91
+ # PRONUNCIATION DICTIONARY
92
+ # ============================================
93
 
94
+ DICTIONARY = {
95
+ # Common words
96
+ 'a': ['AX'], 'the': ['DH', 'AX'], 'an': ['AE', 'N'],
97
+ 'i': ['AY'], 'you': ['Y', 'UW'], 'he': ['HH', 'IY'],
98
+ 'she': ['SH', 'IY'], 'it': ['IH', 'T'], 'we': ['W', 'IY'],
99
+ 'they': ['DH', 'EY'], 'me': ['M', 'IY'], 'him': ['HH', 'IH', 'M'],
100
+ 'her': ['HH', 'ER'], 'us': ['AH', 'S'], 'them': ['DH', 'EH', 'M'],
 
 
 
 
 
 
101
 
102
+ # Be verbs
103
+ 'is': ['IH', 'Z'], 'are': ['AA', 'R'], 'was': ['W', 'AA', 'Z'],
104
+ 'were': ['W', 'ER'], 'be': ['B', 'IY'], 'been': ['B', 'IH', 'N'],
105
+ 'being': ['B', 'IY', 'IH', 'NG'], 'am': ['AE', 'M'],
106
 
107
+ # Have verbs
108
+ 'have': ['HH', 'AE', 'V'], 'has': ['HH', 'AE', 'Z'],
109
+ 'had': ['HH', 'AE', 'D'], 'having': ['HH', 'AE', 'V', 'IH', 'NG'],
110
 
111
+ # Do verbs
112
+ 'do': ['D', 'UW'], 'does': ['D', 'AH', 'Z'], 'did': ['D', 'IH', 'D'],
113
+ 'doing': ['D', 'UW', 'IH', 'NG'], 'done': ['D', 'AH', 'N'],
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
114
 
115
+ # Modal verbs
116
+ 'will': ['W', 'IH', 'L'], 'would': ['W', 'UH', 'D'],
117
+ 'can': ['K', 'AE', 'N'], 'could': ['K', 'UH', 'D'],
118
+ 'should': ['SH', 'UH', 'D'], 'may': ['M', 'EY'],
119
+ 'might': ['M', 'AY', 'T'], 'must': ['M', 'AH', 'S', 'T'],
120
 
121
+ # Common verbs
122
+ 'go': ['G', 'OW'], 'goes': ['G', 'OW', 'Z'], 'going': ['G', 'OW', 'IH', 'NG'],
123
+ 'went': ['W', 'EH', 'N', 'T'], 'gone': ['G', 'AO', 'N'],
124
+ 'come': ['K', 'AH', 'M'], 'comes': ['K', 'AH', 'M', 'Z'],
125
+ 'coming': ['K', 'AH', 'M', 'IH', 'NG'], 'came': ['K', 'EY', 'M'],
126
+ 'get': ['G', 'EH', 'T'], 'gets': ['G', 'EH', 'T', 'S'],
127
+ 'getting': ['G', 'EH', 'T', 'IH', 'NG'], 'got': ['G', 'AA', 'T'],
128
+ 'make': ['M', 'EY', 'K'], 'makes': ['M', 'EY', 'K', 'S'],
129
+ 'making': ['M', 'EY', 'K', 'IH', 'NG'], 'made': ['M', 'EY', 'D'],
130
+ 'say': ['S', 'EY'], 'says': ['S', 'EH', 'Z'], 'said': ['S', 'EH', 'D'],
131
+ 'saying': ['S', 'EY', 'IH', 'NG'],
132
+ 'know': ['N', 'OW'], 'knows': ['N', 'OW', 'Z'], 'knew': ['N', 'UW'],
133
+ 'known': ['N', 'OW', 'N'], 'knowing': ['N', 'OW', 'IH', 'NG'],
134
+ 'think': ['TH', 'IH', 'NG', 'K'], 'thinks': ['TH', 'IH', 'NG', 'K', 'S'],
135
+ 'thought': ['TH', 'AO', 'T'], 'thinking': ['TH', 'IH', 'NG', 'K', 'IH', 'NG'],
136
+ 'take': ['T', 'EY', 'K'], 'takes': ['T', 'EY', 'K', 'S'],
137
+ 'took': ['T', 'UH', 'K'], 'taken': ['T', 'EY', 'K', 'AX', 'N'],
138
+ 'see': ['S', 'IY'], 'sees': ['S', 'IY', 'Z'], 'saw': ['S', 'AO'],
139
+ 'seen': ['S', 'IY', 'N'], 'seeing': ['S', 'IY', 'IH', 'NG'],
140
+ 'want': ['W', 'AA', 'N', 'T'], 'wants': ['W', 'AA', 'N', 'T', 'S'],
141
+ 'wanted': ['W', 'AA', 'N', 'T', 'IH', 'D'],
142
+ 'give': ['G', 'IH', 'V'], 'gives': ['G', 'IH', 'V', 'Z'],
143
+ 'gave': ['G', 'EY', 'V'], 'given': ['G', 'IH', 'V', 'AX', 'N'],
144
+ 'use': ['Y', 'UW', 'Z'], 'uses': ['Y', 'UW', 'Z', 'IH', 'Z'],
145
+ 'used': ['Y', 'UW', 'Z', 'D'], 'using': ['Y', 'UW', 'Z', 'IH', 'NG'],
146
+ 'find': ['F', 'AY', 'N', 'D'], 'found': ['F', 'AW', 'N', 'D'],
147
+ 'tell': ['T', 'EH', 'L'], 'told': ['T', 'OW', 'L', 'D'],
148
+ 'ask': ['AE', 'S', 'K'], 'asked': ['AE', 'S', 'K', 'T'],
149
+ 'work': ['W', 'ER', 'K'], 'works': ['W', 'ER', 'K', 'S'],
150
+ 'worked': ['W', 'ER', 'K', 'T'], 'working': ['W', 'ER', 'K', 'IH', 'NG'],
151
+ 'try': ['T', 'R', 'AY'], 'tried': ['T', 'R', 'AY', 'D'],
152
+ 'call': ['K', 'AO', 'L'], 'called': ['K', 'AO', 'L', 'D'],
153
+ 'need': ['N', 'IY', 'D'], 'needed': ['N', 'IY', 'D', 'IH', 'D'],
154
+ 'feel': ['F', 'IY', 'L'], 'feels': ['F', 'IY', 'L', 'Z'],
155
+ 'become': ['B', 'IH', 'K', 'AH', 'M'],
156
+ 'leave': ['L', 'IY', 'V'], 'left': ['L', 'EH', 'F', 'T'],
157
+ 'put': ['P', 'UH', 'T'], 'keep': ['K', 'IY', 'P'],
158
+ 'let': ['L', 'EH', 'T'], 'begin': ['B', 'IH', 'G', 'IH', 'N'],
159
+ 'seem': ['S', 'IY', 'M'], 'help': ['HH', 'EH', 'L', 'P'],
160
+ 'show': ['SH', 'OW'], 'hear': ['HH', 'IY', 'R'],
161
+ 'play': ['P', 'L', 'EY'], 'run': ['R', 'AH', 'N'],
162
+ 'move': ['M', 'UW', 'V'], 'live': ['L', 'IH', 'V'],
163
+ 'believe': ['B', 'IH', 'L', 'IY', 'V'],
164
 
165
+ # Question words
166
+ 'what': ['W', 'AH', 'T'], 'where': ['W', 'EH', 'R'],
167
+ 'when': ['W', 'EH', 'N'], 'why': ['W', 'AY'],
168
+ 'how': ['HH', 'AW'], 'who': ['HH', 'UW'],
169
+ 'which': ['W', 'IH', 'CH'],
170
 
171
+ # Conjunctions
172
+ 'and': ['AE', 'N', 'D'], 'or': ['AO', 'R'],
173
+ 'but': ['B', 'AH', 'T'], 'if': ['IH', 'F'],
174
+ 'then': ['DH', 'EH', 'N'], 'because': ['B', 'IH', 'K', 'AO', 'Z'],
175
+ 'so': ['S', 'OW'], 'than': ['DH', 'AE', 'N'],
176
 
177
+ # Prepositions
178
+ 'of': ['AH', 'V'], 'to': ['T', 'UW'], 'in': ['IH', 'N'],
179
+ 'for': ['F', 'AO', 'R'], 'on': ['AA', 'N'], 'with': ['W', 'IH', 'TH'],
180
+ 'at': ['AE', 'T'], 'by': ['B', 'AY'], 'from': ['F', 'R', 'AH', 'M'],
181
+ 'up': ['AH', 'P'], 'about': ['AX', 'B', 'AW', 'T'],
182
+ 'into': ['IH', 'N', 'T', 'UW'], 'over': ['OW', 'V', 'ER'],
183
+ 'after': ['AE', 'F', 'T', 'ER'], 'out': ['AW', 'T'],
184
+ 'down': ['D', 'AW', 'N'], 'off': ['AO', 'F'],
185
+ 'under': ['AH', 'N', 'D', 'ER'], 'again': ['AX', 'G', 'EH', 'N'],
186
+ 'there': ['DH', 'EH', 'R'], 'here': ['HH', 'IY', 'R'],
187
 
188
+ # Articles/Determiners
189
+ 'this': ['DH', 'IH', 'S'], 'that': ['DH', 'AE', 'T'],
190
+ 'these': ['DH', 'IY', 'Z'], 'those': ['DH', 'OW', 'Z'],
191
+ 'my': ['M', 'AY'], 'your': ['Y', 'AO', 'R'],
192
+ 'his': ['HH', 'IH', 'Z'], 'its': ['IH', 'T', 'S'],
193
+ 'our': ['AW', 'ER'], 'their': ['DH', 'EH', 'R'],
194
+ 'some': ['S', 'AH', 'M'], 'any': ['EH', 'N', 'IY'],
195
+ 'no': ['N', 'OW'], 'all': ['AO', 'L'],
196
+ 'each': ['IY', 'CH'], 'every': ['EH', 'V', 'R', 'IY'],
197
+ 'both': ['B', 'OW', 'TH'], 'few': ['F', 'Y', 'UW'],
198
+ 'more': ['M', 'AO', 'R'], 'most': ['M', 'OW', 'S', 'T'],
199
+ 'other': ['AH', 'DH', 'ER'], 'such': ['S', 'AH', 'CH'],
200
+
201
+ # Adjectives
202
+ 'good': ['G', 'UH', 'D'], 'new': ['N', 'UW'],
203
+ 'first': ['F', 'ER', 'S', 'T'], 'last': ['L', 'AE', 'S', 'T'],
204
+ 'long': ['L', 'AO', 'NG'], 'great': ['G', 'R', 'EY', 'T'],
205
+ 'little': ['L', 'IH', 'T', 'AX', 'L'], 'own': ['OW', 'N'],
206
+ 'old': ['OW', 'L', 'D'], 'right': ['R', 'AY', 'T'],
207
+ 'big': ['B', 'IH', 'G'], 'high': ['HH', 'AY'],
208
+ 'different': ['D', 'IH', 'F', 'ER', 'AX', 'N', 'T'],
209
+ 'small': ['S', 'M', 'AO', 'L'], 'large': ['L', 'AA', 'R', 'JH'],
210
+ 'next': ['N', 'EH', 'K', 'S', 'T'], 'early': ['ER', 'L', 'IY'],
211
+ 'young': ['Y', 'AH', 'NG'], 'important': ['IH', 'M', 'P', 'AO', 'R', 'T', 'AX', 'N', 'T'],
212
+ 'public': ['P', 'AH', 'B', 'L', 'IH', 'K'],
213
+ 'bad': ['B', 'AE', 'D'], 'same': ['S', 'EY', 'M'],
214
+
215
+ # Adverbs
216
+ 'now': ['N', 'AW'], 'just': ['JH', 'AH', 'S', 'T'],
217
+ 'only': ['OW', 'N', 'L', 'IY'], 'very': ['V', 'EH', 'R', 'IY'],
218
+ 'also': ['AO', 'L', 'S', 'OW'], 'well': ['W', 'EH', 'L'],
219
+ 'back': ['B', 'AE', 'K'], 'even': ['IY', 'V', 'AX', 'N'],
220
+ 'still': ['S', 'T', 'IH', 'L'], 'too': ['T', 'UW'],
221
+ 'here': ['HH', 'IY', 'R'], 'much': ['M', 'AH', 'CH'],
222
+ 'really': ['R', 'IY', 'L', 'IY'], 'always': ['AO', 'L', 'W', 'EY', 'Z'],
223
+ 'never': ['N', 'EH', 'V', 'ER'], 'today': ['T', 'AX', 'D', 'EY'],
224
+
225
+ # Nouns
226
+ 'time': ['T', 'AY', 'M'], 'year': ['Y', 'IY', 'R'],
227
+ 'people': ['P', 'IY', 'P', 'AX', 'L'], 'way': ['W', 'EY'],
228
+ 'day': ['D', 'EY'], 'man': ['M', 'AE', 'N'],
229
+ 'thing': ['TH', 'IH', 'NG'], 'woman': ['W', 'UH', 'M', 'AX', 'N'],
230
+ 'life': ['L', 'AY', 'F'], 'child': ['CH', 'AY', 'L', 'D'],
231
+ 'world': ['W', 'ER', 'L', 'D'], 'school': ['S', 'K', 'UW', 'L'],
232
+ 'state': ['S', 'T', 'EY', 'T'], 'family': ['F', 'AE', 'M', 'AX', 'L', 'IY'],
233
+ 'student': ['S', 'T', 'UW', 'D', 'AX', 'N', 'T'],
234
+ 'group': ['G', 'R', 'UW', 'P'], 'country': ['K', 'AH', 'N', 'T', 'R', 'IY'],
235
+ 'problem': ['P', 'R', 'AA', 'B', 'L', 'AX', 'M'],
236
+ 'hand': ['HH', 'AE', 'N', 'D'], 'part': ['P', 'AA', 'R', 'T'],
237
+ 'place': ['P', 'L', 'EY', 'S'], 'case': ['K', 'EY', 'S'],
238
+ 'week': ['W', 'IY', 'K'], 'company': ['K', 'AH', 'M', 'P', 'AX', 'N', 'IY'],
239
+ 'system': ['S', 'IH', 'S', 'T', 'AX', 'M'],
240
+ 'program': ['P', 'R', 'OW', 'G', 'R', 'AE', 'M'],
241
+ 'question': ['K', 'W', 'EH', 'S', 'CH', 'AX', 'N'],
242
+ 'government': ['G', 'AH', 'V', 'ER', 'N', 'M', 'AX', 'N', 'T'],
243
+ 'number': ['N', 'AH', 'M', 'B', 'ER'],
244
+ 'night': ['N', 'AY', 'T'], 'point': ['P', 'OY', 'N', 'T'],
245
+ 'home': ['HH', 'OW', 'M'], 'water': ['W', 'AO', 'T', 'ER'],
246
+ 'room': ['R', 'UW', 'M'], 'mother': ['M', 'AH', 'DH', 'ER'],
247
+ 'area': ['EH', 'R', 'IY', 'AX'], 'money': ['M', 'AH', 'N', 'IY'],
248
+ 'story': ['S', 'T', 'AO', 'R', 'IY'], 'fact': ['F', 'AE', 'K', 'T'],
249
+ 'month': ['M', 'AH', 'N', 'TH'], 'lot': ['L', 'AA', 'T'],
250
+ 'study': ['S', 'T', 'AH', 'D', 'IY'], 'book': ['B', 'UH', 'K'],
251
+ 'eye': ['AY'], 'job': ['JH', 'AA', 'B'],
252
+ 'word': ['W', 'ER', 'D'], 'business': ['B', 'IH', 'Z', 'N', 'IH', 'S'],
253
+ 'issue': ['IH', 'SH', 'UW'], 'side': ['S', 'AY', 'D'],
254
+ 'kind': ['K', 'AY', 'N', 'D'], 'head': ['HH', 'EH', 'D'],
255
+ 'house': ['HH', 'AW', 'S'], 'friend': ['F', 'R', 'EH', 'N', 'D'],
256
+ 'father': ['F', 'AA', 'DH', 'ER'], 'power': ['P', 'AW', 'ER'],
257
+ 'hour': ['AW', 'ER'], 'game': ['G', 'EY', 'M'],
258
+ 'line': ['L', 'AY', 'N'], 'end': ['EH', 'N', 'D'],
259
+ 'member': ['M', 'EH', 'M', 'B', 'ER'], 'law': ['L', 'AO'],
260
+ 'car': ['K', 'AA', 'R'], 'city': ['S', 'IH', 'T', 'IY'],
261
+ 'name': ['N', 'EY', 'M'], 'team': ['T', 'IY', 'M'],
262
+ 'minute': ['M', 'IH', 'N', 'IH', 'T'], 'idea': ['AY', 'D', 'IY', 'AX'],
263
+ 'body': ['B', 'AA', 'D', 'IY'], 'information': ['IH', 'N', 'F', 'ER', 'M', 'EY', 'SH', 'AX', 'N'],
264
+ 'face': ['F', 'EY', 'S'], 'others': ['AH', 'DH', 'ER', 'Z'],
265
+ 'level': ['L', 'EH', 'V', 'AX', 'L'], 'office': ['AO', 'F', 'IH', 'S'],
266
+ 'door': ['D', 'AO', 'R'], 'health': ['HH', 'EH', 'L', 'TH'],
267
+ 'person': ['P', 'ER', 'S', 'AX', 'N'], 'art': ['AA', 'R', 'T'],
268
+ 'war': ['W', 'AO', 'R'], 'history': ['HH', 'IH', 'S', 'T', 'ER', 'IY'],
269
+ 'party': ['P', 'AA', 'R', 'T', 'IY'], 'result': ['R', 'IH', 'Z', 'AH', 'L', 'T'],
270
+ 'change': ['CH', 'EY', 'N', 'JH'], 'morning': ['M', 'AO', 'R', 'N', 'IH', 'NG'],
271
+ 'reason': ['R', 'IY', 'Z', 'AX', 'N'], 'research': ['R', 'IY', 'S', 'ER', 'CH'],
272
+ 'girl': ['G', 'ER', 'L'], 'guy': ['G', 'AY'],
273
+ 'food': ['F', 'UW', 'D'], 'moment': ['M', 'OW', 'M', 'AX', 'N', 'T'],
274
+ 'teacher': ['T', 'IY', 'CH', 'ER'], 'force': ['F', 'AO', 'R', 'S'],
275
+ 'education': ['EH', 'JH', 'AX', 'K', 'EY', 'SH', 'AX', 'N'],
276
+
277
+ # Numbers
278
+ 'one': ['W', 'AH', 'N'], 'two': ['T', 'UW'],
279
+ 'three': ['TH', 'R', 'IY'], 'four': ['F', 'AO', 'R'],
280
+ 'five': ['F', 'AY', 'V'], 'six': ['S', 'IH', 'K', 'S'],
281
+ 'seven': ['S', 'EH', 'V', 'AX', 'N'], 'eight': ['EY', 'T'],
282
+ 'nine': ['N', 'AY', 'N'], 'ten': ['T', 'EH', 'N'],
283
+ 'zero': ['Z', 'IY', 'R', 'OW'],
284
+
285
+ # Greetings
286
+ 'hello': ['HH', 'AX', 'L', 'OW'], 'hi': ['HH', 'AY'],
287
+ 'hey': ['HH', 'EY'], 'welcome': ['W', 'EH', 'L', 'K', 'AX', 'M'],
288
+ 'goodbye': ['G', 'UH', 'D', 'B', 'AY'], 'bye': ['B', 'AY'],
289
+ 'thanks': ['TH', 'AE', 'NG', 'K', 'S'], 'thank': ['TH', 'AE', 'NG', 'K'],
290
+ 'please': ['P', 'L', 'IY', 'Z'], 'sorry': ['S', 'AA', 'R', 'IY'],
291
+ 'yes': ['Y', 'EH', 'S'], 'yeah': ['Y', 'AE'],
292
+ 'no': ['N', 'OW'], 'not': ['N', 'AA', 'T'],
293
+ 'ok': ['OW', 'K', 'EY'], 'okay': ['OW', 'K', 'EY'],
294
+
295
+ # TTS related
296
+ 'text': ['T', 'EH', 'K', 'S', 'T'],
297
+ 'speech': ['S', 'P', 'IY', 'CH'],
298
+ 'voice': ['V', 'OY', 'S'],
299
+ 'sound': ['S', 'AW', 'N', 'D'],
300
+ 'audio': ['AO', 'D', 'IY', 'OW'],
301
+ 'vedes': ['V', 'IY', 'D', 'EH', 'S'],
302
+ 'synthesis': ['S', 'IH', 'N', 'TH', 'AX', 'S', 'IH', 'S'],
303
+ 'synthesize': ['S', 'IH', 'N', 'TH', 'AX', 'S', 'AY', 'Z'],
304
+ 'generate': ['JH', 'EH', 'N', 'ER', 'EY', 'T'],
305
+ 'computer': ['K', 'AX', 'M', 'P', 'Y', 'UW', 'T', 'ER'],
306
+ 'technology': ['T', 'EH', 'K', 'N', 'AA', 'L', 'AX', 'JH', 'IY'],
307
+ }
308
 
309
+ # Letter patterns for unknown words
310
+ PATTERNS = [
311
+ ('tion', ['SH', 'AX', 'N']),
312
+ ('sion', ['ZH', 'AX', 'N']),
313
+ ('ight', ['AY', 'T']),
314
+ ('ough', ['AO']),
315
+ ('ould', ['UH', 'D']),
316
+ ('ious', ['IY', 'AX', 'S']),
317
+ ('eous', ['IY', 'AX', 'S']),
318
+ ('ness', ['N', 'AX', 'S']),
319
+ ('ment', ['M', 'AX', 'N', 'T']),
320
+ ('able', ['AX', 'B', 'AX', 'L']),
321
+ ('ible', ['AX', 'B', 'AX', 'L']),
322
+ ('ally', ['AX', 'L', 'IY']),
323
+ ('ful', ['F', 'AX', 'L']),
324
+ ('less', ['L', 'AX', 'S']),
325
+ ('ing', ['IH', 'NG']),
326
+ ('ck', ['K']),
327
+ ('th', ['TH']),
328
+ ('sh', ['SH']),
329
+ ('ch', ['CH']),
330
+ ('wh', ['W']),
331
+ ('ph', ['F']),
332
+ ('gh', []),
333
+ ('ng', ['NG']),
334
+ ('qu', ['K', 'W']),
335
+ ('ee', ['IY']),
336
+ ('ea', ['IY']),
337
+ ('oo', ['UW']),
338
+ ('ou', ['AW']),
339
+ ('ow', ['OW']),
340
+ ('ai', ['EY']),
341
+ ('ay', ['EY']),
342
+ ('ey', ['IY']),
343
+ ('oy', ['OY']),
344
+ ('oi', ['OY']),
345
+ ('au', ['AO']),
346
+ ('aw', ['AO']),
347
+ ('ie', ['IY']),
348
+ ('ue', ['UW']),
349
+ ('ew', ['UW']),
350
+ ('er', ['ER']),
351
+ ('ir', ['ER']),
352
+ ('ur', ['ER']),
353
+ ('or', ['AO', 'R']),
354
+ ('ar', ['AA', 'R']),
355
+ ]
356
 
357
+ LETTER_PHONEMES = {
358
+ 'a': 'AE', 'b': 'B', 'c': 'K', 'd': 'D', 'e': 'EH',
359
+ 'f': 'F', 'g': 'G', 'h': 'HH', 'i': 'IH', 'j': 'JH',
360
+ 'k': 'K', 'l': 'L', 'm': 'M', 'n': 'N', 'o': 'AA',
361
+ 'p': 'P', 'q': 'K', 'r': 'R', 's': 'S', 't': 'T',
362
+ 'u': 'AH', 'v': 'V', 'w': 'W', 'x': 'K', 'y': 'Y', 'z': 'Z'
363
+ }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
364
 
365
 
366
  # ============================================
367
+ # TEXT TO PHONEME CONVERTER
368
  # ============================================
369
 
370
class TextToPhoneme:
    """Rule-based converter from English text to a list of phoneme symbols.

    Each word is looked up in ``DICTIONARY`` first. Words that are not found
    are spelled out using the multi-letter ``PATTERNS`` (longest pattern
    first) and, for anything left over, the single-letter
    ``LETTER_PHONEMES`` table.
    """

    # Punctuation tokens that are rendered as a 'PAU' (pause) phoneme.
    _PUNCTUATION = frozenset('.,!?')
    # Compiled once at class creation instead of on every convert() call.
    _DISALLOWED_RE = re.compile(r'[^\w\s.,!?\'-]')
    _TOKEN_RE = re.compile(r"[\w']+|[.,!?]")

    def __init__(self):
        self.dictionary = DICTIONARY
        # Longest patterns first so 'tion' wins over 'ti' + 'on' style splits.
        self.patterns = sorted(PATTERNS, key=lambda x: -len(x[0]))
        self.letters = LETTER_PHONEMES

    def convert(self, text):
        """Convert text to phoneme sequence.

        Args:
            text: Input text; it is lower-cased and stripped of characters
                other than word characters, whitespace, ``. , ! ? ' -``.

        Returns:
            A list of phoneme symbols. Punctuation becomes ``'PAU'``. A
            ``'SIL'`` is appended after every word that is not the final
            token (this includes a word directly followed by punctuation).
            Empty or ``None`` input yields an empty list.
        """
        if not text:
            return []

        cleaned = self._DISALLOWED_RE.sub('', text.lower().strip())
        tokens = self._TOKEN_RE.findall(cleaned)
        last_index = len(tokens) - 1
        phonemes = []

        for index, token in enumerate(tokens):
            if token in self._PUNCTUATION:
                phonemes.append('PAU')
                continue

            # Quoted words such as 'hello' must still hit the dictionary, so
            # drop surrounding apostrophes (inner ones, as in "don't", stay).
            word = token.strip("'")
            if word in self.dictionary:
                phonemes.extend(self.dictionary[word])
            else:
                phonemes.extend(self._convert_unknown(word))

            # Add short pause after the word unless it is the last token
            if index < last_index:
                phonemes.append('SIL')

        return phonemes

    def _convert_unknown(self, word):
        """Convert unknown word using patterns.

        Scans *word* left to right. At each position the first matching
        entry of ``self.patterns`` is consumed; if none matches, the single
        character is mapped through ``self.letters``. Characters with no
        mapping (digits, apostrophes, ...) are skipped silently.

        Returns:
            The list of phoneme symbols for *word* (possibly empty).
        """
        word = word.lower()
        phonemes = []
        pos = 0

        while pos < len(word):
            for pattern, phons in self.patterns:
                # ``pattern and`` guards against an empty pattern, which
                # would match everywhere without advancing ``pos``.
                if pattern and word.startswith(pattern, pos):
                    phonemes.extend(phons)
                    pos += len(pattern)
                    break
            else:
                symbol = self.letters.get(word[pos])
                if symbol is not None:
                    phonemes.append(symbol)
                pos += 1

        return phonemes
421
+
422
+
423
+ # ============================================
424
+ # KLATT FORMANT SYNTHESIZER
425
+ # ============================================
426
+
427
class KlattSynthesizer:
    """Klatt-style formant synthesizer - 100% from scratch.

    Renders a phoneme sequence to a mono waveform. Each phoneme is built from
    a glottal pulse train and/or white noise, shaped according to its class
    (stop, fricative, affricate, nasal, or vowel/approximant), enveloped, and
    the resulting segments are crossfaded together and peak-normalised.
    """

    def __init__(self, sample_rate=22050):
        self.sample_rate = sample_rate
        # Base fundamental frequency in Hz; scaled by the `pitch` multiplier.
        self.base_f0 = 120

    def synthesize(self, phonemes, rate=1.0, pitch=1.0):
        """Synthesize audio from phonemes.

        Args:
            phonemes: Sequence of phoneme symbols. Symbols that are not keys
                of the module-level PHONEMES table are skipped.
            rate: Speaking-rate multiplier (> 0); durations are divided by it.
            pitch: Multiplier (> 0) applied to the base fundamental frequency.

        Returns:
            float32 array, peak-normalised. Half a second of silence is
            returned when there is nothing to render.

        Raises:
            ValueError: If `rate` or `pitch` is not strictly positive.
        """
        if not phonemes:
            return np.zeros(int(self.sample_rate * 0.5), dtype=np.float32)

        # A zero rate would divide by zero below and a non-positive pitch
        # gives a meaningless fundamental; fail with a clear message instead.
        if rate <= 0 or pitch <= 0:
            raise ValueError(
                f"rate and pitch must be positive (got rate={rate!r}, pitch={pitch!r})"
            )

        f0 = self.base_f0 * pitch
        last_index = len(phonemes) - 1
        audio_segments = []

        for i, phoneme in enumerate(phonemes):
            if phoneme not in PHONEMES:
                continue

            params = PHONEMES[phoneme]
            # Clamp so extreme rates cannot yield vanishing or very long segments.
            duration_ms = max(20, min(params['dur'] / rate, 300))

            # Get neighboring phonemes for coarticulation
            prev_phon = phonemes[i - 1] if i > 0 else None
            next_phon = phonemes[i + 1] if i < last_index else None

            audio_segments.append(
                self._synthesize_phoneme(
                    phoneme, params, f0, duration_ms, prev_phon, next_phon
                )
            )

        if not audio_segments:
            return np.zeros(int(self.sample_rate * 0.5), dtype=np.float32)

        # Concatenate with overlap
        audio = self._concatenate(audio_segments)

        # Final processing: edge fades, then peak normalisation. The epsilon
        # keeps an all-silent result from dividing by zero.
        audio = self._apply_final_envelope(audio)
        audio = audio / (np.max(np.abs(audio)) + 1e-8)

        return audio.astype(np.float32)

    def _synthesize_phoneme(self, phoneme, params, f0, duration_ms, prev_phon, next_phon):
        """Synthesize single phoneme.

        Dispatches on the boolean flags in `params` ('silence', 'stop',
        'fricative', 'affricate', 'nasal'); anything else is treated as a
        vowel/approximant and rendered by formant-filtering the source.
        """
        # Never fewer than 10 samples, whatever the duration works out to.
        n_samples = max(int(self.sample_rate * duration_ms / 1000), 10)

        if params.get('silence'):
            return np.zeros(n_samples, dtype=np.float32)

        t = np.arange(n_samples) / self.sample_rate

        # Generate source signal. The consonant handlers below accept it for
        # a uniform signature but build their own excitation.
        if params['voiced']:
            source = self._generate_glottal_source(t, f0)
        else:
            source = self._generate_noise(n_samples)

        # Handle different phoneme types
        if params.get('stop'):
            audio = self._synthesize_stop(source, params, n_samples, t)
        elif params.get('fricative'):
            audio = self._synthesize_fricative(source, params, n_samples, t, f0)
        elif params.get('affricate'):
            audio = self._synthesize_affricate(source, params, n_samples, t, f0)
        elif params.get('nasal'):
            audio = self._synthesize_nasal(source, params, n_samples, t, f0)
        else:
            # Vowels and approximants
            audio = self._apply_formants(source, params)

        # Apply envelope
        audio = self._apply_envelope(audio, phoneme, params)

        # Coarticulation
        audio = self._apply_coarticulation(audio, phoneme, prev_phon, next_phon)

        return audio

    def _generate_glottal_source(self, t, f0):
        """Generate a periodic glottal pulse train sampled at times `t`.

        Each period has a raised-cosine opening phase (first 40 %), a
        quarter-cosine closing phase (40-60 %) and a closed phase (the rest).
        Per-sample amplitude shimmer and a small amount of aspiration noise
        are added, so the output is random unless NumPy's RNG is seeded.

        Raises:
            ValueError: If `f0` is not strictly positive.
        """
        if f0 <= 0:
            raise ValueError(f"f0 must be positive (got {f0!r})")

        period = 1.0 / f0
        phase = (t % period) / period

        # Closed phase (0.6 to 1.0) stays at the zero initialisation.
        glottal = np.zeros_like(t)

        # Opening phase (0 to 0.4)
        opening = phase < 0.4
        glottal[opening] = 0.5 * (1 - np.cos(np.pi * phase[opening] / 0.4))

        # Closing phase (0.4 to 0.6)
        closing = (phase >= 0.4) & (phase < 0.6)
        glottal[closing] = np.cos(np.pi * (phase[closing] - 0.4) / 0.4)

        # Shimmer (amplitude perturbation)
        glottal = glottal * (1 + 0.03 * np.random.randn(len(t)))

        # Add aspiration noise
        glottal = glottal + np.random.randn(len(t)) * 0.02

        return glottal

    def _generate_noise(self, n_samples):
        """Generate white noise"""
        return np.random.randn(n_samples)

    def _apply_formants(self, source, params):
        """Filter `source` through the F1-F4 resonators and sum the outputs.

        The resonators run in parallel (each sees the unfiltered source), with
        fixed bandwidths of 80/100/120/150 Hz. Formants at or outside
        (0, Nyquist) are skipped.
        """
        formants = [
            (params['f1'], 80),   # F1 with bandwidth
            (params['f2'], 100),  # F2
            (params['f3'], 120),  # F3
            (params['f4'], 150),  # F4
        ]

        result = np.zeros_like(source)

        for freq, bw in formants:
            if freq <= 0 or freq >= self.sample_rate / 2:
                continue

            result += self._resonator(source, freq, bw)

        return result

    def _resonator(self, signal, freq, bandwidth):
        """Second-order resonator (formant filter).

        Implements y[n] = b0*x[n] - a1*y[n-1] - a2*y[n-2] with the first two
        output samples held at zero. Returns the input unchanged when `freq`
        is not inside (0, Nyquist).
        """
        if freq <= 0 or freq >= self.sample_rate / 2:
            return signal

        # The `signal` parameter shadows the scipy.signal module in this
        # scope, so import the one function needed directly.
        from scipy.signal import lfilter

        # Convert to digital filter coefficients
        r = np.exp(-np.pi * bandwidth / self.sample_rate)
        theta = 2 * np.pi * freq / self.sample_rate

        a1 = -2 * r * np.cos(theta)
        a2 = r * r
        b0 = 1 - r

        excitation = np.array(signal, dtype=float)
        if excitation.size <= 2:
            return np.zeros_like(excitation)

        # Zeroing the first two inputs makes the compiled filter reproduce a
        # recursion that starts at index 2 with y[0] = y[1] = 0.
        excitation[:2] = 0.0
        return lfilter([b0], [1.0, a1, a2], excitation)

    def _synthesize_stop(self, source, params, n_samples, t):
        """Synthesize stop consonant.

        First half: closure (silent, or a quiet 100 Hz voice bar for voiced
        stops). Second half: a noise burst, low-passed at
        params['burst_freq'] (default 1500 Hz) when filtering is possible.
        `source` is accepted for signature parity and not used.
        """
        audio = np.zeros(n_samples)

        # Closure phase (silence) followed by the burst phase
        closure_len = n_samples // 2
        burst_len = n_samples - closure_len

        # Generate burst
        burst_freq = params.get('burst_freq', 1500)
        burst = np.random.randn(burst_len) * 0.5

        # Filter burst
        nyquist = self.sample_rate / 2
        if burst_freq < nyquist:
            try:
                b, a = signal.butter(2, burst_freq / nyquist, 'low')
                burst = signal.filtfilt(b, a, burst)
            except ValueError:
                # Invalid cutoff or burst too short for filtfilt's padding:
                # keep the unfiltered burst (best effort).
                pass

        audio[closure_len:] = burst

        # Add voice bar for voiced stops
        if params['voiced']:
            voice_bar = self._generate_glottal_source(t[:closure_len], 100) * 0.3
            audio[:closure_len] = voice_bar

        return audio

    def _synthesize_fricative(self, source, params, n_samples, t, f0):
        """Synthesize fricative consonant.

        White noise is high-passed at 2 kHz when params['fric_freq'] (default
        4000) exceeds 3000 Hz, otherwise band-passed +/-1 kHz around it.
        Voiced fricatives add a formant-filtered glottal component.
        `source` is accepted for signature parity and not used.
        """
        # Generate frication noise
        noise = np.random.randn(n_samples)

        # Filter based on frication frequency
        fric_freq = params.get('fric_freq', 4000)
        nyquist = self.sample_rate / 2

        try:
            if fric_freq > 3000:
                # High-pass for /s/, /f/
                b, a = signal.butter(4, 2000 / nyquist, 'high')
            else:
                # Band-pass for /sh/
                low = max(100, fric_freq - 1000)
                high = min(fric_freq + 1000, nyquist - 100)
                b, a = signal.butter(2, [low / nyquist, high / nyquist], 'band')
            noise = signal.filtfilt(b, a, noise)
        except ValueError:
            # Invalid band edges or too few samples: fall back to raw noise.
            pass

        audio = noise * 0.4

        # Add voicing for voiced fricatives
        if params['voiced']:
            voiced = self._generate_glottal_source(t, f0)
            voiced = self._apply_formants(voiced, params) * 0.3
            audio = audio + voiced

        return audio

    def _synthesize_affricate(self, source, params, n_samples, t, f0):
        """Synthesize affricate (stop + fricative).

        The first third is silent closure; the remainder is noise high-passed
        at 2.5 kHz when filtering is possible. `source`, `params`, `t` and
        `f0` are accepted for signature parity and not used.
        """
        stop_len = n_samples // 3
        fric_len = n_samples - stop_len

        # Stop portion stays at the zero initialisation.
        audio = np.zeros(n_samples)

        # Fricative portion
        fric = np.random.randn(fric_len) * 0.4
        try:
            b, a = signal.butter(2, 2500 / (self.sample_rate / 2), 'high')
            fric = signal.filtfilt(b, a, fric)
        except ValueError:
            # Invalid cutoff or too few samples: keep the unfiltered noise.
            pass

        audio[stop_len:] = fric

        return audio

    def _synthesize_nasal(self, source, params, n_samples, t, f0):
        """Synthesize nasal consonant.

        A fresh glottal source is formant-filtered, then (when filtering is
        possible) reinforced with a 400 Hz low-passed copy of that source,
        and the sum is attenuated to 70 %. `source` and `n_samples` are
        accepted for signature parity and not used.
        """
        # Generate voiced source
        voiced = self._generate_glottal_source(t, f0)

        # Apply nasal formants
        audio = self._apply_formants(voiced, params)

        # Add low-frequency nasal resonance
        try:
            b, a = signal.butter(2, 400 / (self.sample_rate / 2), 'low')
            nasal = signal.filtfilt(b, a, voiced) * 0.5
            audio = audio + nasal
        except ValueError:
            # Invalid cutoff or too few samples: skip the extra resonance.
            pass

        # Overall attenuation
        audio = audio * 0.7

        return audio

    def _apply_envelope(self, audio, phoneme, params):
        """Apply amplitude envelope.

        Linear attack and release ramps between 0.01 and 1. Ramp lengths are
        n/10 and n/4 for stops, n/5 each for fricatives, n/6 each otherwise.
        Inputs shorter than 4 samples are returned untouched. `phoneme` is
        not used.
        """
        n = len(audio)
        if n < 4:
            return audio

        if params.get('stop'):
            # Sharp attack for stops
            attack_div, release_div = 10, 4
        elif params.get('fricative'):
            attack_div, release_div = 5, 5
        else:
            # Smooth envelope for vowels
            attack_div, release_div = 6, 6

        attack = max(1, n // attack_div)
        release = max(1, n // release_div)

        envelope = np.ones(n)
        envelope[:attack] = np.linspace(0.01, 1, attack)
        envelope[-release:] = np.linspace(1, 0.01, release)

        return audio * envelope

    def _apply_coarticulation(self, audio, current, prev_phon, next_phon):
        """Apply coarticulation effects.

        Scales the leading and/or trailing edge (up to 50 samples, at most a
        quarter of the segment) between 0.7 and 1.0 when the neighbour on
        that side exists and is not 'SIL'/'PAU'. Modifies `audio` in place
        and returns it; segments shorter than 20 samples are left alone.
        `current` is not used.
        """
        n = len(audio)
        if n < 20:
            return audio

        # Simple transition smoothing
        transition_len = min(n // 4, 50)

        # Fade in from previous phoneme
        if prev_phon and prev_phon not in ['SIL', 'PAU']:
            audio[:transition_len] *= np.linspace(0.7, 1.0, transition_len)

        # Fade out to next phoneme
        if next_phon and next_phon not in ['SIL', 'PAU']:
            audio[-transition_len:] *= np.linspace(1.0, 0.7, transition_len)

        return audio

    def _concatenate(self, segments):
        """Concatenate segments with crossfade.

        Consecutive segments overlap by 32 samples and are blended with
        complementary linear fades. An empty list yields 1000 zero samples and
        a single segment is returned as-is (not copied).

        NOTE(review): the write cursor backs up by a fixed 32 samples after
        every segment, so this presumably expects segments of at least 64
        samples - confirm before feeding it shorter ones.
        """
        if len(segments) == 0:
            return np.zeros(1000)

        if len(segments) == 1:
            return segments[0]

        # Overlap-add with crossfade
        overlap = 32

        total_len = sum(len(s) for s in segments) - overlap * (len(segments) - 1)
        total_len = max(total_len, 1)

        audio = np.zeros(total_len)
        pos = 0

        for i, seg in enumerate(segments):
            if len(seg) == 0:
                continue

            end = min(pos + len(seg), total_len)
            seg_len = end - pos

            if seg_len <= 0:
                break

            if i > 0 and pos >= overlap:
                # Crossfade: fade out what is already written, fade in the
                # head of the new segment, then sum.
                fade_len = min(overlap, seg_len)
                fade_in = np.linspace(0, 1, fade_len)
                fade_out = np.linspace(1, 0, fade_len)

                audio[pos:pos + fade_len] *= fade_out
                seg_copy = seg[:seg_len].copy()
                seg_copy[:fade_len] *= fade_in
                audio[pos:end] += seg_copy
            else:
                audio[pos:end] = seg[:seg_len]

            # Back up so the next segment overlaps the tail of this one.
            pos = max(0, end - overlap)

        return audio

    def _apply_final_envelope(self, audio):
        """Apply final envelope to entire audio.

        Linear fade-in and fade-out of n/30 samples (capped at 300), applied
        in place. Audio shorter than 100 samples is returned unchanged.
        """
        n = len(audio)
        if n < 100:
            return audio

        fade_len = min(n // 30, 300)
        audio[:fade_len] *= np.linspace(0, 1, fade_len)
        audio[-fade_len:] *= np.linspace(1, 0, fade_len)

        return audio
798
+
799
+
800
+ # ============================================
801
+ # MAIN TTS CLASS
802
+ # ============================================
803
+
804
class VedesTTS:
    """Vedes TTS - 100% From Scratch

    Thin facade that chains the text-to-phoneme front end with the formant
    synthesizer back end.
    """

    def __init__(self, sample_rate=22050):
        self.sample_rate = sample_rate
        self.text_to_phoneme = TextToPhoneme()
        self.synthesizer = KlattSynthesizer(sample_rate)

    def synthesize(self, text, rate=1.0, pitch=1.0):
        """Convert text to speech.

        Returns one second of float32 silence when the text is empty or
        whitespace-only, or when it produces no phonemes; otherwise returns
        whatever the synthesizer produces for the phoneme sequence.
        """
        has_text = bool(text) and bool(text.strip())
        phoneme_seq = self.text_to_phoneme.convert(text) if has_text else None

        if phoneme_seq:
            return self.synthesizer.synthesize(phoneme_seq, rate, pitch)

        # Nothing to say: hand back a fixed-length silent buffer.
        return np.zeros(self.sample_rate, dtype=np.float32)
827
+
828
+
829
+ # ============================================
830
+ # INITIALIZE
831
+ # ============================================
832
+
833
# Start-up banner. The emoji below were previously mis-encoded (UTF-8 bytes
# decoded through the wrong codepage) and printed as garbage.
print("=" * 50)
print("🎙️ VEDES TTS - 100% From Scratch")
print("No APIs, No Pre-trained Models")
print("=" * 50)

# Single engine instance built at import time; synthesize_speech() uses it.
tts = VedesTTS(SAMPLE_RATE)

print("✅ Initialized successfully!")
print("=" * 50)
842
+
843
+
844
+ # ============================================
845
+ # GRADIO INTERFACE
846
+ # ============================================
847
+
848
def synthesize_speech(text, speaking_rate, pitch_shift):
    """Gradio synthesis function.

    Args:
        text: Text to speak; only the first 500 characters (after stripping)
            are synthesized.
        speaking_rate: Rate multiplier passed to the engine.
        pitch_shift: Pitch offset in semitones.

    Returns:
        A (sample_rate, int16 array) tuple, or None when the text is blank,
        the result is shorter than 100 samples, or synthesis fails.
    """
    if not (text and text.strip()):
        return None

    clipped = text.strip()[:500]

    try:
        # One semitone is a factor of 2 ** (1 / 12) in frequency.
        pitch_ratio = 2 ** (pitch_shift / 12)

        waveform = tts.synthesize(clipped, rate=speaking_rate, pitch=pitch_ratio)

        if len(waveform) < 100:
            return None

        # Scale the clipped [-1, 1] float signal to 16-bit PCM.
        pcm = (np.clip(waveform, -1, 1) * 32767).astype(np.int16)

        return (SAMPLE_RATE, pcm)

    except Exception as e:
        # UI boundary: report the problem and show no audio rather than
        # crash the request.
        print(f"Error: {e}")
        return None
874
+
875
+
876
+ # Create Gradio interface
877
# NOTE: the emoji and arrow characters in the labels and Markdown below were
# previously mis-encoded (UTF-8 bytes decoded through the wrong codepage) and
# rendered as garbage in the UI; they are restored here. Layout and event
# wiring are unchanged.
with gr.Blocks(
    title="Vedes TTS",
    theme=gr.themes.Soft(primary_hue="indigo")
) as demo:

    gr.Markdown("""
    # 🎙️ Vedes TTS - From Scratch
    ### 100% Custom Built - No APIs, No Pre-trained Models

    This TTS uses **Klatt formant synthesis** - the same technique used in early
    speech synthesizers. It converts text to phonemes, then generates audio using
    digital resonators that simulate the human vocal tract.
    """)

    with gr.Row():
        # Left column: text entry and synthesis controls.
        with gr.Column(scale=2):
            text_input = gr.Textbox(
                label="📝 Enter Text",
                placeholder="Type something... (e.g., Hello, how are you today?)",
                lines=4
            )

            with gr.Row():
                rate_slider = gr.Slider(
                    minimum=0.5, maximum=2.0, value=1.0, step=0.1,
                    label="⏱️ Speaking Rate"
                )
                pitch_slider = gr.Slider(
                    minimum=-6, maximum=6, value=0, step=1,
                    label="🎵 Pitch (semitones)"
                )

            synth_btn = gr.Button("🔊 Synthesize", variant="primary", size="lg")

        # Right column: audio player. type="numpy" matches the
        # (sample_rate, array) tuple returned by synthesize_speech().
        with gr.Column(scale=1):
            audio_output = gr.Audio(label="🎧 Output", type="numpy")

    gr.Examples(
        examples=[
            ["Hello, welcome to Vedes."],
            ["How are you today?"],
            ["This is a test."],
            ["The quick brown fox."],
            ["Good morning!"],
            ["Thank you very much."],
            ["I am fine."],
            ["What is your name?"],
            ["Nice to meet you."],
            ["Have a good day."],
        ],
        inputs=text_input,
        label="📚 Examples"
    )

    gr.Markdown("""
    ---
    ### 🔧 How It Works

    1. **Text → Phonemes**: Converts words to speech sounds using a dictionary
    2. **Glottal Source**: Generates vocal cord vibrations mathematically
    3. **Formant Filters**: Shapes sound using resonators (F1, F2, F3, F4)
    4. **Coarticulation**: Smooths transitions between sounds

    ### ⚠️ Limitations

    This is **educational/demonstration** quality - not production TTS.
    Real TTS systems use neural networks trained on thousands of hours of speech.

    ---
    *Built from scratch with NumPy and SciPy - No external TTS APIs!*
    """)

    # Both the button and pressing Enter in the textbox trigger synthesis.
    synth_btn.click(
        fn=synthesize_speech,
        inputs=[text_input, rate_slider, pitch_slider],
        outputs=audio_output
    )

    text_input.submit(
        fn=synthesize_speech,
        inputs=[text_input, rate_slider, pitch_slider],
        outputs=audio_output
    )
960
 
961
 
 
 
 
 
962
# Start the Gradio server only when this file is executed as a script, not
# when it is imported as a module.
if __name__ == "__main__":
    demo.launch()