vedaco committed on
Commit
ee1f09c
Β·
verified Β·
1 Parent(s): fec6ba4

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +288 -736
app.py CHANGED
@@ -1,802 +1,354 @@
1
- import numpy as np
2
  import gradio as gr
3
- from scipy import signal
 
 
 
 
4
  from scipy.io import wavfile
 
5
  import io
6
- import re
7
 
8
  # ============================================
9
- # VEDES TTS - Formant-Based Speech Synthesizer
10
  # ============================================
11
 
12
- class VedesConfig:
13
- """Configuration"""
14
- sample_rate = 22050
15
-
16
-
17
- config = VedesConfig()
18
-
19
- # ============================================
20
- # PHONEME DEFINITIONS
21
- # ============================================
22
-
23
- # Phoneme to formant mapping (F1, F2, F3, duration_ms, is_voiced)
24
- PHONEMES = {
25
- # Vowels (voiced)
26
- 'AA': (710, 1100, 2540, 120, True), # father
27
- 'AE': (660, 1720, 2410, 120, True), # cat
28
- 'AH': (520, 1190, 2390, 100, True), # but
29
- 'AO': (570, 840, 2410, 120, True), # dog
30
- 'AW': (630, 1200, 2550, 150, True), # how
31
- 'AY': (710, 1100, 2540, 150, True), # my
32
- 'EH': (530, 1840, 2480, 100, True), # bed
33
- 'ER': (490, 1350, 1690, 120, True), # bird
34
- 'EY': (450, 2100, 2680, 140, True), # say
35
- 'IH': (400, 1920, 2560, 80, True), # bit
36
- 'IY': (270, 2290, 3010, 120, True), # see
37
- 'OW': (450, 850, 2500, 140, True), # go
38
- 'OY': (490, 1350, 2480, 160, True), # boy
39
- 'UH': (440, 1020, 2240, 100, True), # book
40
- 'UW': (300, 870, 2240, 120, True), # too
41
-
42
- # Consonants - Stops
43
- 'B': (200, 1100, 2150, 60, True),
44
- 'D': (200, 1600, 2600, 50, True),
45
- 'G': (200, 1990, 2850, 50, True),
46
- 'P': (200, 800, 2000, 80, False),
47
- 'T': (200, 1600, 2600, 70, False),
48
- 'K': (200, 1990, 2850, 80, False),
49
-
50
- # Consonants - Fricatives
51
- 'F': (175, 900, 2400, 100, False),
52
- 'V': (175, 1100, 2400, 80, True),
53
- 'TH': (200, 1400, 2200, 80, False),
54
- 'DH': (200, 1600, 2400, 60, True),
55
- 'S': (200, 1800, 4000, 100, False),
56
- 'Z': (200, 1600, 3500, 80, True),
57
- 'SH': (200, 1800, 2600, 100, False),
58
- 'ZH': (200, 1800, 2600, 80, True),
59
- 'HH': (280, 1200, 2400, 80, False),
60
-
61
- # Consonants - Nasals
62
- 'M': (280, 900, 2200, 80, True),
63
- 'N': (280, 1700, 2600, 70, True),
64
- 'NG': (280, 2300, 2750, 80, True),
65
-
66
- # Consonants - Liquids
67
- 'L': (350, 1100, 2700, 70, True),
68
- 'R': (420, 1300, 1600, 70, True),
69
-
70
- # Consonants - Glides
71
- 'W': (300, 870, 2240, 60, True),
72
- 'Y': (280, 2250, 3000, 50, True),
73
-
74
- # Special
75
- 'CH': (200, 1800, 2600, 100, False),
76
- 'JH': (200, 1800, 2600, 80, True),
77
-
78
- # Silence
79
- 'SIL': (0, 0, 0, 100, False),
80
- 'PAU': (0, 0, 0, 150, False),
81
- }
82
-
83
- # Letter to phoneme mapping (simplified)
84
- LETTER_TO_PHONEME = {
85
- 'a': ['AE'],
86
- 'b': ['B'],
87
- 'c': ['K'],
88
- 'd': ['D'],
89
- 'e': ['EH'],
90
- 'f': ['F'],
91
- 'g': ['G'],
92
- 'h': ['HH'],
93
- 'i': ['IH'],
94
- 'j': ['JH'],
95
- 'k': ['K'],
96
- 'l': ['L'],
97
- 'm': ['M'],
98
- 'n': ['N'],
99
- 'o': ['AA'],
100
- 'p': ['P'],
101
- 'q': ['K', 'W'],
102
- 'r': ['R'],
103
- 's': ['S'],
104
- 't': ['T'],
105
- 'u': ['AH'],
106
- 'v': ['V'],
107
- 'w': ['W'],
108
- 'x': ['K', 'S'],
109
- 'y': ['Y'],
110
- 'z': ['Z'],
111
- ' ': ['SIL'],
112
- '.': ['PAU'],
113
- ',': ['PAU'],
114
- '!': ['PAU'],
115
- '?': ['PAU'],
116
- '-': ['SIL'],
117
- "'": [],
118
- }
119
 
120
- # Common word pronunciations
121
- WORD_PRONUNCIATIONS = {
122
- 'the': ['DH', 'AH'],
123
- 'a': ['AH'],
124
- 'an': ['AE', 'N'],
125
- 'is': ['IH', 'Z'],
126
- 'are': ['AA', 'R'],
127
- 'was': ['W', 'AA', 'Z'],
128
- 'were': ['W', 'ER'],
129
- 'be': ['B', 'IY'],
130
- 'been': ['B', 'IH', 'N'],
131
- 'have': ['HH', 'AE', 'V'],
132
- 'has': ['HH', 'AE', 'Z'],
133
- 'had': ['HH', 'AE', 'D'],
134
- 'do': ['D', 'UW'],
135
- 'does': ['D', 'AH', 'Z'],
136
- 'did': ['D', 'IH', 'D'],
137
- 'will': ['W', 'IH', 'L'],
138
- 'would': ['W', 'UH', 'D'],
139
- 'could': ['K', 'UH', 'D'],
140
- 'should': ['SH', 'UH', 'D'],
141
- 'can': ['K', 'AE', 'N'],
142
- 'may': ['M', 'EY'],
143
- 'might': ['M', 'AY', 'T'],
144
- 'must': ['M', 'AH', 'S', 'T'],
145
- 'i': ['AY'],
146
- 'you': ['Y', 'UW'],
147
- 'he': ['HH', 'IY'],
148
- 'she': ['SH', 'IY'],
149
- 'it': ['IH', 'T'],
150
- 'we': ['W', 'IY'],
151
- 'they': ['DH', 'EY'],
152
- 'this': ['DH', 'IH', 'S'],
153
- 'that': ['DH', 'AE', 'T'],
154
- 'what': ['W', 'AH', 'T'],
155
- 'which': ['W', 'IH', 'CH'],
156
- 'who': ['HH', 'UW'],
157
- 'how': ['HH', 'AW'],
158
- 'when': ['W', 'EH', 'N'],
159
- 'where': ['W', 'EH', 'R'],
160
- 'why': ['W', 'AY'],
161
- 'all': ['AO', 'L'],
162
- 'each': ['IY', 'CH'],
163
- 'every': ['EH', 'V', 'R', 'IY'],
164
- 'both': ['B', 'OW', 'TH'],
165
- 'few': ['F', 'Y', 'UW'],
166
- 'more': ['M', 'AO', 'R'],
167
- 'most': ['M', 'OW', 'S', 'T'],
168
- 'other': ['AH', 'DH', 'ER'],
169
- 'some': ['S', 'AH', 'M'],
170
- 'such': ['S', 'AH', 'CH'],
171
- 'no': ['N', 'OW'],
172
- 'not': ['N', 'AA', 'T'],
173
- 'only': ['OW', 'N', 'L', 'IY'],
174
- 'same': ['S', 'EY', 'M'],
175
- 'so': ['S', 'OW'],
176
- 'than': ['DH', 'AE', 'N'],
177
- 'too': ['T', 'UW'],
178
- 'very': ['V', 'EH', 'R', 'IY'],
179
- 'just': ['JH', 'AH', 'S', 'T'],
180
- 'hello': ['HH', 'EH', 'L', 'OW'],
181
- 'hi': ['HH', 'AY'],
182
- 'welcome': ['W', 'EH', 'L', 'K', 'AH', 'M'],
183
- 'to': ['T', 'UW'],
184
- 'world': ['W', 'ER', 'L', 'D'],
185
- 'speech': ['S', 'P', 'IY', 'CH'],
186
- 'text': ['T', 'EH', 'K', 'S', 'T'],
187
- 'voice': ['V', 'OY', 'S'],
188
- 'sound': ['S', 'AW', 'N', 'D'],
189
- 'good': ['G', 'UH', 'D'],
190
- 'great': ['G', 'R', 'EY', 'T'],
191
- 'nice': ['N', 'AY', 'S'],
192
- 'thank': ['TH', 'AE', 'NG', 'K'],
193
- 'thanks': ['TH', 'AE', 'NG', 'K', 'S'],
194
- 'please': ['P', 'L', 'IY', 'Z'],
195
- 'yes': ['Y', 'EH', 'S'],
196
- 'yeah': ['Y', 'AE'],
197
- 'ok': ['OW', 'K', 'EY'],
198
- 'okay': ['OW', 'K', 'EY'],
199
- 'and': ['AE', 'N', 'D'],
200
- 'or': ['AO', 'R'],
201
- 'but': ['B', 'AH', 'T'],
202
- 'if': ['IH', 'F'],
203
- 'then': ['DH', 'EH', 'N'],
204
- 'because': ['B', 'IH', 'K', 'AO', 'Z'],
205
- 'as': ['AE', 'Z'],
206
- 'until': ['AH', 'N', 'T', 'IH', 'L'],
207
- 'while': ['W', 'AY', 'L'],
208
- 'of': ['AH', 'V'],
209
- 'at': ['AE', 'T'],
210
- 'by': ['B', 'AY'],
211
- 'for': ['F', 'AO', 'R'],
212
- 'with': ['W', 'IH', 'TH'],
213
- 'about': ['AH', 'B', 'AW', 'T'],
214
- 'into': ['IH', 'N', 'T', 'UW'],
215
- 'through': ['TH', 'R', 'UW'],
216
- 'during': ['D', 'UH', 'R', 'IH', 'NG'],
217
- 'before': ['B', 'IH', 'F', 'AO', 'R'],
218
- 'after': ['AE', 'F', 'T', 'ER'],
219
- 'above': ['AH', 'B', 'AH', 'V'],
220
- 'below': ['B', 'IH', 'L', 'OW'],
221
- 'from': ['F', 'R', 'AH', 'M'],
222
- 'up': ['AH', 'P'],
223
- 'down': ['D', 'AW', 'N'],
224
- 'in': ['IH', 'N'],
225
- 'out': ['AW', 'T'],
226
- 'on': ['AA', 'N'],
227
- 'off': ['AO', 'F'],
228
- 'over': ['OW', 'V', 'ER'],
229
- 'under': ['AH', 'N', 'D', 'ER'],
230
- 'again': ['AH', 'G', 'EH', 'N'],
231
- 'there': ['DH', 'EH', 'R'],
232
- 'here': ['HH', 'IY', 'R'],
233
- 'today': ['T', 'AH', 'D', 'EY'],
234
- 'now': ['N', 'AW'],
235
- 'my': ['M', 'AY'],
236
- 'your': ['Y', 'AO', 'R'],
237
- 'his': ['HH', 'IH', 'Z'],
238
- 'her': ['HH', 'ER'],
239
- 'our': ['AW', 'ER'],
240
- 'their': ['DH', 'EH', 'R'],
241
- 'test': ['T', 'EH', 'S', 'T'],
242
- 'testing': ['T', 'EH', 'S', 'T', 'IH', 'NG'],
243
- 'one': ['W', 'AH', 'N'],
244
- 'two': ['T', 'UW'],
245
- 'three': ['TH', 'R', 'IY'],
246
- 'four': ['F', 'AO', 'R'],
247
- 'five': ['F', 'AY', 'V'],
248
- 'name': ['N', 'EY', 'M'],
249
- 'vedes': ['V', 'IY', 'D', 'EH', 'S'],
250
- 'synthesis': ['S', 'IH', 'N', 'TH', 'AH', 'S', 'IH', 'S'],
251
- 'system': ['S', 'IH', 'S', 'T', 'AH', 'M'],
252
  }
253
 
254
- # Common letter patterns
255
- PATTERNS = [
256
- (r'tion', ['SH', 'AH', 'N']),
257
- (r'sion', ['ZH', 'AH', 'N']),
258
- (r'ough', ['AH', 'F']),
259
- (r'ight', ['AY', 'T']),
260
- (r'ould', ['UH', 'D']),
261
- (r'tion', ['SH', 'AH', 'N']),
262
- (r'th', ['TH']),
263
- (r'ch', ['CH']),
264
- (r'sh', ['SH']),
265
- (r'ph', ['F']),
266
- (r'wh', ['W']),
267
- (r'ck', ['K']),
268
- (r'ng', ['NG']),
269
- (r'qu', ['K', 'W']),
270
- (r'ee', ['IY']),
271
- (r'ea', ['IY']),
272
- (r'oo', ['UW']),
273
- (r'ou', ['AW']),
274
- (r'ow', ['OW']),
275
- (r'ai', ['EY']),
276
- (r'ay', ['EY']),
277
- (r'oy', ['OY']),
278
- (r'oi', ['OY']),
279
- (r'au', ['AO']),
280
- (r'aw', ['AO']),
281
- (r'ie', ['IY']),
282
- (r'ei', ['EY']),
283
- (r'ue', ['UW']),
284
- (r'ew', ['UW']),
285
- ]
286
-
287
 
288
- # ============================================
289
- # TEXT TO PHONEME CONVERTER
290
- # ============================================
291
 
292
- class TextToPhoneme:
293
- """Convert text to phoneme sequence"""
294
 
295
- def __init__(self):
296
- self.word_dict = WORD_PRONUNCIATIONS
297
- self.letter_map = LETTER_TO_PHONEME
298
- self.patterns = PATTERNS
299
 
300
- def convert(self, text):
301
- """Convert text to phoneme list"""
302
- text = text.lower().strip()
303
- words = re.findall(r"[\w']+|[.,!?;:\-]|\s+", text)
304
-
305
- phonemes = []
306
-
307
- for word in words:
308
- word = word.strip()
309
- if not word:
310
- continue
311
-
312
- if word in self.word_dict:
313
- phonemes.extend(self.word_dict[word])
314
- elif word.isspace():
315
- phonemes.append('SIL')
316
- elif word in '.,!?;:':
317
- phonemes.append('PAU')
318
- else:
319
- # Convert letter by letter with pattern matching
320
- phonemes.extend(self._convert_word(word))
321
-
322
- return phonemes
323
 
324
- def _convert_word(self, word):
325
- """Convert a single word to phonemes"""
326
- phonemes = []
327
- i = 0
328
- word = word.lower()
329
-
330
- while i < len(word):
331
- matched = False
332
-
333
- # Try pattern matching (longer patterns first)
334
- for pattern, phon_list in sorted(self.patterns, key=lambda x: -len(x[0])):
335
- if word[i:].startswith(pattern):
336
- phonemes.extend(phon_list)
337
- i += len(pattern)
338
- matched = True
339
- break
340
-
341
- if not matched:
342
- # Single letter conversion
343
- char = word[i]
344
- if char in self.letter_map:
345
- phonemes.extend(self.letter_map[char])
346
- i += 1
347
-
348
- return phonemes
349
 
350
 
351
- # ============================================
352
- # FORMANT SYNTHESIZER
353
- # ============================================
354
-
355
- class FormantSynthesizer:
356
- """Klatt-style formant synthesizer"""
357
 
358
- def __init__(self, sample_rate=22050):
359
- self.sample_rate = sample_rate
360
- self.base_f0 = 120 # Base fundamental frequency
 
 
361
 
362
- def synthesize(self, phonemes, speaking_rate=1.0, pitch_shift=0):
363
- """Synthesize audio from phoneme sequence"""
364
- if not phonemes:
365
- return np.zeros(1000, dtype=np.float32)
366
-
367
- # Adjust pitch
368
- f0 = self.base_f0 * (2 ** (pitch_shift / 12))
369
-
370
- audio_segments = []
371
-
372
- for i, phoneme in enumerate(phonemes):
373
- if phoneme not in PHONEMES:
374
- continue
375
-
376
- f1, f2, f3, duration_ms, is_voiced = PHONEMES[phoneme]
377
-
378
- # Adjust duration for speaking rate
379
- duration_ms = int(duration_ms / speaking_rate)
380
- duration_ms = max(30, min(duration_ms, 300))
381
-
382
- # Generate phoneme audio
383
- segment = self._generate_phoneme(
384
- f0, f1, f2, f3, duration_ms, is_voiced, phoneme
385
- )
386
-
387
- audio_segments.append(segment)
388
-
389
- if not audio_segments:
390
- return np.zeros(1000, dtype=np.float32)
391
-
392
- # Concatenate with smoothing
393
- audio = self._concatenate_smooth(audio_segments)
394
-
395
- # Apply overall envelope and normalization
396
- audio = self._apply_envelope(audio)
397
- audio = audio / (np.max(np.abs(audio)) + 1e-8)
398
-
399
- return audio.astype(np.float32)
400
 
401
- def _generate_phoneme(self, f0, f1, f2, f3, duration_ms, is_voiced, phoneme):
402
- """Generate audio for a single phoneme"""
403
- n_samples = int(self.sample_rate * duration_ms / 1000)
404
- t = np.linspace(0, duration_ms / 1000, n_samples)
405
-
406
- if phoneme in ['SIL', 'PAU']:
407
- return np.zeros(n_samples, dtype=np.float32)
408
-
409
- if is_voiced:
410
- # Generate glottal pulse train
411
- source = self._generate_voice_source(t, f0)
412
- else:
413
- # Generate noise for unvoiced
414
- source = np.random.randn(n_samples) * 0.3
415
-
416
- # Apply formant filtering
417
- if f1 > 0:
418
- audio = self._apply_formants(source, [f1, f2, f3])
419
- else:
420
- audio = source
421
-
422
- # Apply consonant characteristics
423
- audio = self._apply_consonant_shape(audio, phoneme)
424
-
425
- # Apply envelope
426
- audio = self._apply_phoneme_envelope(audio, phoneme)
427
-
428
- return audio.astype(np.float32)
429
 
430
- def _generate_voice_source(self, t, f0):
431
- """Generate glottal source with harmonics"""
432
- source = np.zeros_like(t)
433
-
434
- # Add harmonics with decreasing amplitude
435
- for harmonic in range(1, 12):
436
- freq = f0 * harmonic
437
- if freq > self.sample_rate / 2:
438
- break
439
- amp = 1.0 / (harmonic ** 1.2)
440
- # Add slight vibrato
441
- vibrato = 1 + 0.01 * np.sin(2 * np.pi * 5 * t)
442
- source += amp * np.sin(2 * np.pi * freq * vibrato * t)
443
-
444
- # Add some noise for naturalness
445
- source += np.random.randn(len(t)) * 0.02
446
-
447
- return source
448
 
449
- def _apply_formants(self, source, formants):
450
- """Apply formant filtering using resonators"""
451
- audio = source.copy()
452
-
453
- for i, f in enumerate(formants):
454
- if f <= 0 or f >= self.sample_rate / 2:
455
- continue
456
-
457
- # Bandwidth increases with formant number
458
- bandwidth = 60 + i * 40
459
-
460
- # Design bandpass filter
461
- try:
462
- low = max(20, f - bandwidth)
463
- high = min(self.sample_rate / 2 - 100, f + bandwidth)
464
-
465
- if low >= high:
466
- continue
467
-
468
- b, a = signal.butter(
469
- 2,
470
- [low / (self.sample_rate / 2), high / (self.sample_rate / 2)],
471
- btype='band'
472
- )
473
-
474
- filtered = signal.filtfilt(b, a, source)
475
-
476
- # Weight formants (F1 strongest)
477
- weight = 1.0 / (i + 1)
478
- audio = audio + filtered * weight
479
-
480
- except Exception:
481
- pass
482
-
483
- return audio
484
 
485
- def _apply_consonant_shape(self, audio, phoneme):
486
- """Apply consonant-specific characteristics"""
487
- n = len(audio)
488
-
489
- # Plosives: silence then burst
490
- if phoneme in ['P', 'T', 'K', 'B', 'D', 'G']:
491
- silence_len = n // 3
492
- audio[:silence_len] = 0
493
- burst = np.random.randn(n // 6) * 0.5
494
- audio[silence_len:silence_len + len(burst)] += burst
495
-
496
- # Fricatives: add more noise
497
- elif phoneme in ['F', 'S', 'SH', 'TH', 'HH']:
498
- noise = np.random.randn(n) * 0.3
499
-
500
- # High-pass for 's' and 'sh'
501
- if phoneme in ['S', 'SH']:
502
- try:
503
- b, a = signal.butter(2, 3000 / (self.sample_rate / 2), btype='high')
504
- noise = signal.filtfilt(b, a, noise)
505
- except:
506
- pass
507
-
508
- audio = audio * 0.3 + noise * 0.7
509
-
510
- # Nasals: add low frequency resonance
511
- elif phoneme in ['M', 'N', 'NG']:
512
- try:
513
- b, a = signal.butter(2, 500 / (self.sample_rate / 2), btype='low')
514
- low_comp = signal.filtfilt(b, a, audio)
515
- audio = audio * 0.5 + low_comp * 0.5
516
- except:
517
- pass
518
-
519
- return audio
520
-
521
- def _apply_phoneme_envelope(self, audio, phoneme):
522
- """Apply amplitude envelope to phoneme"""
523
- n = len(audio)
524
- if n < 4:
525
- return audio
526
-
527
- envelope = np.ones(n)
528
-
529
- # Attack and release times depend on phoneme type
530
- if phoneme in ['P', 'T', 'K', 'B', 'D', 'G']:
531
- # Plosives: sharp attack
532
- attack = max(1, n // 8)
533
- release = max(1, n // 4)
534
- elif phoneme in ['F', 'S', 'SH', 'V', 'Z', 'ZH', 'TH', 'DH']:
535
- # Fricatives: gradual
536
- attack = max(1, n // 4)
537
- release = max(1, n // 4)
538
- else:
539
- # Vowels and sonorants
540
- attack = max(1, n // 5)
541
- release = max(1, n // 5)
542
-
543
- envelope[:attack] = np.linspace(0, 1, attack)
544
- envelope[-release:] = np.linspace(1, 0, release)
545
-
546
- return audio * envelope
547
 
548
- def _concatenate_smooth(self, segments):
549
- """Concatenate segments with crossfade"""
550
- if len(segments) == 0:
551
- return np.zeros(1000, dtype=np.float32)
552
-
553
- if len(segments) == 1:
554
- return segments[0]
555
-
556
- # Calculate total length with overlap
557
- overlap = 64
558
- total_length = sum(len(s) for s in segments) - overlap * (len(segments) - 1)
559
- total_length = max(total_length, 1)
560
-
561
- audio = np.zeros(total_length, dtype=np.float32)
562
-
563
- pos = 0
564
- for i, segment in enumerate(segments):
565
- if len(segment) == 0:
566
- continue
567
-
568
- end_pos = min(pos + len(segment), total_length)
569
- seg_len = end_pos - pos
570
-
571
- if seg_len <= 0:
572
- break
573
-
574
- # Crossfade with previous segment
575
- if i > 0 and pos > 0:
576
- fade_len = min(overlap, seg_len, pos)
577
- if fade_len > 0:
578
- fade_in = np.linspace(0, 1, fade_len)
579
- fade_out = np.linspace(1, 0, fade_len)
580
-
581
- audio[pos:pos + fade_len] *= fade_out
582
- segment_copy = segment[:seg_len].copy()
583
- segment_copy[:fade_len] *= fade_in
584
- audio[pos:end_pos] += segment_copy
585
- else:
586
- audio[pos:end_pos] = segment[:seg_len]
587
- else:
588
- audio[pos:end_pos] = segment[:seg_len]
589
-
590
- pos = end_pos - overlap
591
- pos = max(0, pos)
592
-
593
- return audio
594
 
595
- def _apply_envelope(self, audio):
596
- """Apply overall envelope"""
597
- n = len(audio)
598
- if n < 100:
599
- return audio
600
-
601
- fade_len = min(n // 20, 500)
602
- audio[:fade_len] *= np.linspace(0, 1, fade_len)
603
- audio[-fade_len:] *= np.linspace(1, 0, fade_len)
604
-
605
- return audio
606
-
607
 
608
- # ============================================
609
- # VEDES TTS MAIN CLASS
610
- # ============================================
611
 
612
- class VedesTTS:
613
- """Main TTS class"""
 
 
614
 
615
- def __init__(self, sample_rate=22050):
616
- self.sample_rate = sample_rate
617
- self.text_to_phoneme = TextToPhoneme()
618
- self.synthesizer = FormantSynthesizer(sample_rate)
619
 
620
- def synthesize(self, text, speaking_rate=1.0, pitch_shift=0):
621
- """Convert text to speech"""
622
- # Text to phonemes
623
- phonemes = self.text_to_phoneme.convert(text)
624
-
625
- if not phonemes:
626
- return np.zeros(self.sample_rate, dtype=np.float32)
627
-
628
- # Phonemes to audio
629
- audio = self.synthesizer.synthesize(phonemes, speaking_rate, pitch_shift)
630
-
631
- return audio
632
-
633
-
634
- # ============================================
635
- # INITIALIZE
636
- # ============================================
637
-
638
- print("=" * 50)
639
- print("πŸŽ™οΈ Initializing Vedes TTS...")
640
- print("=" * 50)
641
-
642
- tts = VedesTTS(config.sample_rate)
643
-
644
- print("βœ… Vedes TTS initialized successfully!")
645
- print("=" * 50)
646
-
647
-
648
- # ============================================
649
- # SYNTHESIS FUNCTION
650
- # ============================================
651
-
652
- def synthesize_speech(text, speaking_rate=1.0, pitch_shift=0, voice_type="neutral"):
653
- """Main synthesis function for Gradio"""
654
- if not text or len(text.strip()) == 0:
655
- return None
656
 
657
- text = text.strip()[:1000] # Limit length
658
-
659
- try:
660
- # Adjust base pitch for voice type
661
- pitch_adjust = pitch_shift
662
- if voice_type == "high":
663
- pitch_adjust += 5
664
- elif voice_type == "low":
665
- pitch_adjust -= 5
666
-
667
- # Synthesize
668
- audio = tts.synthesize(text, speaking_rate, pitch_adjust)
669
-
670
- if len(audio) < 100:
671
- return None
672
-
673
- # Convert to int16
674
- audio = np.clip(audio, -1, 1)
675
- audio_int16 = (audio * 32767).astype(np.int16)
676
-
677
- return (config.sample_rate, audio_int16)
678
 
679
- except Exception as e:
680
- print(f"Synthesis error: {e}")
681
- return None
 
 
 
 
682
 
683
 
684
  # ============================================
685
  # GRADIO INTERFACE
686
  # ============================================
687
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
688
  with gr.Blocks(
689
  title="Vedes TTS",
690
- theme=gr.themes.Soft(primary_hue="blue", secondary_hue="purple")
 
 
 
 
691
  ) as demo:
692
 
693
- gr.Markdown(
694
- """
695
- # πŸŽ™οΈ Vedes TTS - Text-to-Speech Synthesis
696
- ### A formant-based speech synthesizer built from scratch
697
-
698
- Type any text below and hear it spoken!
699
- """
700
- )
701
 
702
- with gr.Row():
703
- with gr.Column(scale=2):
704
- text_input = gr.Textbox(
705
- label="πŸ“ Enter Text",
706
- placeholder="Type something to synthesize... (e.g., 'Hello, welcome to Vedes!')",
707
- lines=4,
708
- max_lines=10
709
- )
710
-
711
  with gr.Row():
712
- speaking_rate = gr.Slider(
713
- minimum=0.5,
714
- maximum=2.0,
715
- value=1.0,
716
- step=0.1,
717
- label="🎚️ Speaking Rate",
718
- info="Slower ← β†’ Faster"
719
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
720
 
721
- pitch_shift = gr.Slider(
722
- minimum=-10,
723
- maximum=10,
724
- value=0,
725
- step=1,
726
- label="🎡 Pitch Shift",
727
- info="Lower ← β†’ Higher"
728
- )
 
 
 
 
 
 
 
 
 
729
 
730
- voice_type = gr.Radio(
731
- choices=["neutral", "high", "low"],
732
- value="neutral",
733
- label="πŸ—£οΈ Voice Type"
734
- )
 
 
 
 
 
735
 
736
- synthesize_btn = gr.Button(
737
- "πŸ”Š Synthesize Speech",
738
- variant="primary",
739
- size="lg"
740
  )
741
 
742
- with gr.Column(scale=1):
743
- audio_output = gr.Audio(
744
- label="🎧 Generated Speech",
745
- type="numpy"
746
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
747
 
748
- gr.Examples(
749
- examples=[
750
- ["Hello, welcome to Vedes text to speech!"],
751
- ["The quick brown fox jumps over the lazy dog."],
752
- ["How are you doing today?"],
753
- ["This is a test of the speech synthesis system."],
754
- ["Good morning! Nice to meet you."],
755
- ["One, two, three, four, five."],
756
- ["Please say hello to my friend."],
757
- ["What is your name?"],
758
- ],
759
- inputs=text_input,
760
- label="πŸ“š Try These Examples"
761
- )
762
 
763
- gr.Markdown(
764
- """
765
- ---
766
- ### ℹ️ About Vedes TTS
767
-
768
- **How it works:**
769
- 1. **Text Processing** - Converts text to phonemes using pronunciation rules
770
- 2. **Formant Synthesis** - Generates speech using formant frequencies (F1, F2, F3)
771
- 3. **Source-Filter Model** - Combines glottal source with vocal tract filtering
772
-
773
- **Features:**
774
- - πŸ”€ Letter-to-phoneme conversion with common word dictionary
775
- - 🎡 Adjustable pitch and speaking rate
776
- - πŸ—£οΈ Multiple voice types (neutral, high, low pitch)
777
- - ⚑ Real-time synthesis - no neural network required!
778
-
779
- **Supported:** English text with basic punctuation
780
-
781
- ---
782
- *Built with Python, NumPy, SciPy, and Gradio* ❀️
783
- """
784
  )
785
 
786
- # Event handlers
787
  synthesize_btn.click(
788
  fn=synthesize_speech,
789
- inputs=[text_input, speaking_rate, pitch_shift, voice_type],
790
  outputs=audio_output
791
  )
792
 
793
  text_input.submit(
794
  fn=synthesize_speech,
795
- inputs=[text_input, speaking_rate, pitch_shift, voice_type],
796
  outputs=audio_output
797
  )
798
 
799
 
800
  # Launch
 
 
 
801
  if __name__ == "__main__":
802
  demo.launch()
 
 
1
  import gradio as gr
2
+ import numpy as np
3
+ import asyncio
4
+ import edge_tts
5
+ import tempfile
6
+ import os
7
  from scipy.io import wavfile
8
+ from scipy import signal
9
  import io
 
10
 
11
  # ============================================
12
+ # VEDES TTS - Text-to-Speech System
13
  # ============================================
14
 
15
+ print("=" * 50)
16
+ print("πŸŽ™οΈ Initializing Vedes TTS...")
17
+ print("=" * 50)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
18
 
19
+ # Available voices
20
+ VOICES = {
21
+ "Emma (US Female)": "en-US-EmmaNeural",
22
+ "Jenny (US Female)": "en-US-JennyNeural",
23
+ "Aria (US Female)": "en-US-AriaNeural",
24
+ "Guy (US Male)": "en-US-GuyNeural",
25
+ "Eric (US Male)": "en-US-EricNeural",
26
+ "Ryan (UK Male)": "en-GB-RyanNeural",
27
+ "Sonia (UK Female)": "en-GB-SoniaNeural",
28
+ "Natasha (AU Female)": "en-AU-NatashaNeural",
29
+ "William (AU Male)": "en-AU-WilliamNeural",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
30
  }
31
 
32
+ DEFAULT_VOICE = "en-US-EmmaNeural"
33
+ SAMPLE_RATE = 24000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
34
 
 
 
 
35
 
36
async def synthesize_async(text, voice, rate, pitch):
    """Run edge-tts and return the path of a temporary MP3 file.

    Args:
        text: Text to speak.
        voice: Edge neural voice identifier (e.g. "en-US-EmmaNeural").
        rate: Speed delta as a whole percentage (10 -> "+10%").
        pitch: Pitch delta in whole Hz (-5 -> "-5Hz").

    Returns:
        Filesystem path of the generated MP3. The file is created with
        delete=False so it survives this call (Gradio serves it from disk);
        the caller/OS is responsible for eventual cleanup.
    """
    # edge-tts wants explicitly signed strings ("+10%", "-5Hz");
    # the "+d" format spec produces the sign for zero/positive values too.
    rate_str = f"{int(rate):+d}%"
    pitch_str = f"{int(pitch):+d}Hz"

    communicate = edge_tts.Communicate(
        text=text,
        voice=voice,
        rate=rate_str,
        pitch=pitch_str,
    )

    # Reserve a unique path on disk, then let edge-tts write the MP3 into it.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as handle:
        out_path = handle.name

    await communicate.save(out_path)

    return out_path
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
57
 
58
 
59
def synthesize_speech(text, voice_name, speaking_rate, pitch_shift):
    """
    Main synthesis function.

    Args:
        text: Input text to synthesize (trimmed, capped at 5000 chars).
        voice_name: Display name of the voice; unknown names fall back
            to DEFAULT_VOICE.
        speaking_rate: Speed multiplier (0.5-2.0 from the UI slider),
            converted to a percentage delta for edge-tts (1.0 -> 0%).
        pitch_shift: Pitch slider value (-2.0..+2.0 from the UI),
            scaled by 10 into whole Hz for edge-tts.

    Returns:
        Path to the generated audio file, or None on empty input or failure.
    """
    if not text or not text.strip():
        return None

    text = text.strip()[:5000]  # Limit text length

    # Resolve display name to an edge-tts voice ID.
    voice = VOICES.get(voice_name, DEFAULT_VOICE)

    # Convert speaking rate to a percentage delta: 1.0x -> 0%, 1.5x -> +50%.
    rate = int((speaking_rate - 1.0) * 100)

    # Convert pitch slider units to Hz: +2.0 -> +20Hz.
    pitch = int(pitch_shift * 10)

    try:
        # asyncio.run creates, runs and ALWAYS closes a fresh event loop.
        # The previous new_event_loop/run_until_complete/close sequence
        # leaked the loop (and left it installed as the thread's current
        # loop) whenever synthesize_async raised before loop.close().
        return asyncio.run(synthesize_async(text, voice, rate, pitch))

    except Exception as e:
        print(f"Synthesis error: {e}")
        return None
 
 
 
 
 
 
 
 
 
100
 
 
 
 
101
 
102
def text_analysis(text):
    """Return a small Markdown report with character, word and sentence
    counts plus a rough duration estimate; empty string for empty input."""
    if not text:
        return ""

    char_count = len(text)
    word_count = len(text.split())

    # Treat '!', '?' and '.' uniformly as sentence terminators, then count
    # the non-blank pieces between them.
    normalized = text.replace('!', '.').replace('?', '.')
    sentence_count = sum(1 for piece in normalized.split('.') if piece.strip())

    # Estimate duration assuming an average reading speed of 150 words/minute.
    est_duration = word_count / 150 * 60

    return f"""
πŸ“Š **Text Analysis:**
- Characters: {char_count}
- Words: {word_count}
- Sentences: {sentence_count}
- Estimated Duration: {est_duration:.1f} seconds
"""
125
 
126
 
127
  # ============================================
128
  # GRADIO INTERFACE
129
  # ============================================
130
 
131
# Custom CSS: cap the page width and style the gradient title / subtitle.
custom_css = """
.gradio-container {
    max-width: 900px !important;
}
.title-text {
    text-align: center;
    background: linear-gradient(90deg, #667eea 0%, #764ba2 100%);
    -webkit-background-clip: text;
    -webkit-text-fill-color: transparent;
    font-size: 2.5rem;
    font-weight: bold;
}
.subtitle-text {
    text-align: center;
    color: #666;
}
"""

with gr.Blocks(
    title="Vedes TTS",
    css=custom_css,
    theme=gr.themes.Soft(
        primary_hue="purple",
        secondary_hue="blue",
    )
) as demo:

    # Header
    gr.HTML("""
    <div style="text-align: center; padding: 20px;">
        <h1 class="title-text">πŸŽ™οΈ Vedes TTS</h1>
        <p class="subtitle-text">High-Quality Text-to-Speech Synthesis</p>
    </div>
    """)

    with gr.Tabs():
        # Main TTS Tab: input, voice/rate/pitch controls, output player.
        with gr.TabItem("πŸ”Š Text to Speech"):
            with gr.Row():
                with gr.Column(scale=2):
                    text_input = gr.Textbox(
                        label="πŸ“ Enter Text",
                        placeholder="Type or paste your text here...\n\nExample: Hello! Welcome to Vedes, a high-quality text-to-speech system. I can read any text you provide with natural-sounding speech.",
                        lines=6,
                        max_lines=15
                    )

                    # Live statistics panel, refreshed by text_input.change below.
                    text_stats = gr.Markdown("")

                    with gr.Row():
                        voice_select = gr.Dropdown(
                            choices=list(VOICES.keys()),
                            value="Emma (US Female)",
                            label="πŸ—£οΈ Select Voice",
                            interactive=True
                        )

                    with gr.Row():
                        speaking_rate = gr.Slider(
                            minimum=0.5,
                            maximum=2.0,
                            value=1.0,
                            step=0.1,
                            label="⏱️ Speaking Rate",
                            info="0.5x = Slow, 1.0x = Normal, 2.0x = Fast"
                        )

                        pitch_shift = gr.Slider(
                            minimum=-2.0,
                            maximum=2.0,
                            value=0.0,
                            step=0.1,
                            label="🎡 Pitch Adjustment",
                            info="Adjust voice pitch"
                        )

                    synthesize_btn = gr.Button(
                        "πŸ”Š Generate Speech",
                        variant="primary",
                        size="lg"
                    )

                with gr.Column(scale=1):
                    # filepath type: synthesize_speech returns a path on disk.
                    audio_output = gr.Audio(
                        label="🎧 Generated Speech",
                        type="filepath"
                    )

                    gr.Markdown("""
                    ### πŸ’‘ Tips:
                    - Use punctuation for natural pauses
                    - Add commas for short pauses
                    - Add periods for longer pauses
                    - Use "!" and "?" for expression
                    """)

        # Examples Tab: clickable sample sentences that fill text_input.
        with gr.TabItem("πŸ“š Examples"):
            gr.Markdown("### Click any example to try it:")

            examples = [
                ["Hello! Welcome to Vedes text-to-speech. I hope you're having a wonderful day!"],
                ["The quick brown fox jumps over the lazy dog. This sentence contains every letter of the alphabet."],
                ["In a world where technology advances rapidly, artificial intelligence continues to reshape how we live and work."],
                ["Once upon a time, in a land far away, there lived a wise old wizard who knew the secrets of the universe."],
                ["Breaking news: Scientists have discovered a new species of butterfly in the Amazon rainforest."],
                ["To be, or not to be, that is the question. Whether 'tis nobler in the mind to suffer the slings and arrows of outrageous fortune."],
                ["Good morning! Today's weather forecast predicts sunny skies with a high of 75 degrees Fahrenheit."],
                ["Thank you for using Vedes TTS. We appreciate your interest in our text-to-speech technology!"],
            ]

            gr.Examples(
                examples=examples,
                inputs=text_input,
                label=""
            )

        # Voices Tab: static reference table of the available voices.
        with gr.TabItem("🎭 Voice Gallery"):
            gr.Markdown("""
            ### Available Voices:

            | Voice | Gender | Accent | Best For |
            |-------|--------|--------|----------|
            | Emma | Female | US English | General, Friendly |
            | Jenny | Female | US English | Professional, Clear |
            | Aria | Female | US English | Conversational |
            | Guy | Male | US English | Narration, Calm |
            | Eric | Male | US English | News, Formal |
            | Ryan | Male | UK English | British content |
            | Sonia | Female | UK English | British content |
            | Natasha | Female | AU English | Australian content |
            | William | Male | AU English | Australian content |

            ---

            ### 🎯 Voice Selection Tips:

            - **For storytelling:** Try Emma or Guy
            - **For news/formal:** Try Jenny or Eric
            - **For casual content:** Try Aria
            - **For British accent:** Try Ryan or Sonia
            - **For Australian accent:** Try Natasha or William
            """)

        # About Tab: static documentation for end users.
        with gr.TabItem("ℹ️ About"):
            gr.Markdown("""
            ## πŸŽ™οΈ About Vedes TTS

            **Vedes** is a text-to-speech application that converts written text into natural-sounding speech.

            ### ✨ Features:

            - πŸ—£οΈ **9 High-Quality Voices** - Male and female voices with different accents
            - 🌍 **Multiple Accents** - US, UK, and Australian English
            - ⏱️ **Adjustable Speed** - From 0.5x to 2.0x speaking rate
            - 🎡 **Pitch Control** - Fine-tune the voice pitch
            - πŸ“± **Easy to Use** - Simple, intuitive interface
            - ⚑ **Fast Generation** - Quick audio synthesis

            ### πŸ”§ How It Works:

            1. **Enter Text** - Type or paste your text
            2. **Select Voice** - Choose from 9 available voices
            3. **Adjust Settings** - Modify speed and pitch if needed
            4. **Generate** - Click the button to create speech
            5. **Listen & Download** - Play or save the audio

            ### πŸ“– Best Practices:

            - Use proper punctuation for natural speech rhythm
            - Break long texts into paragraphs
            - Use commas for short pauses, periods for longer ones
            - Add question marks and exclamation points for expression

            ---

            ### πŸ› οΈ Technical Details:

            - **Engine:** Neural TTS
            - **Audio Format:** MP3
            - **Sample Rate:** 24kHz
            - **Max Text Length:** 5000 characters

            ---

            *Built with ❀️ using Python and Gradio*
            """)

    # Footer
    gr.HTML("""
    <div style="text-align: center; padding: 20px; color: #888;">
        <p>Vedes TTS Β© 2024 | Powered by Neural Speech Synthesis</p>
    </div>
    """)

    # Event Handlers
    # Live stats refresh on every edit of the text box.
    text_input.change(
        fn=text_analysis,
        inputs=text_input,
        outputs=text_stats
    )

    synthesize_btn.click(
        fn=synthesize_speech,
        inputs=[text_input, voice_select, speaking_rate, pitch_shift],
        outputs=audio_output
    )

    # Pressing Enter in the textbox triggers the same synthesis path.
    text_input.submit(
        fn=synthesize_speech,
        inputs=[text_input, voice_select, speaking_rate, pitch_shift],
        outputs=audio_output
    )


# Launch
print("βœ… Vedes TTS Ready!")
print("=" * 50)

if __name__ == "__main__":
    demo.launch()