Update app.py
Browse files
app.py
CHANGED
|
@@ -8,7 +8,7 @@ import json
|
|
| 8 |
import os
|
| 9 |
|
| 10 |
# ============================================
|
| 11 |
-
# VEDES TTS - WITH VOICE TRAINING
|
| 12 |
# 100% From Scratch - No APIs
|
| 13 |
# ============================================
|
| 14 |
|
|
@@ -22,9 +22,9 @@ VOICE_PROFILES = {
|
|
| 22 |
"Emma (Female)": {
|
| 23 |
"name": "Emma",
|
| 24 |
"gender": "female",
|
| 25 |
-
"f0": 210,
|
| 26 |
"f0_variation": 30,
|
| 27 |
-
"formant_shift": 1.15,
|
| 28 |
"breathiness": 0.04,
|
| 29 |
"speed": 1.0,
|
| 30 |
"brightness": 1.1,
|
|
@@ -33,9 +33,9 @@ VOICE_PROFILES = {
|
|
| 33 |
"James (Male)": {
|
| 34 |
"name": "James",
|
| 35 |
"gender": "male",
|
| 36 |
-
"f0": 110,
|
| 37 |
"f0_variation": 20,
|
| 38 |
-
"formant_shift": 0.9,
|
| 39 |
"breathiness": 0.02,
|
| 40 |
"speed": 0.95,
|
| 41 |
"brightness": 0.95,
|
|
@@ -44,7 +44,7 @@ VOICE_PROFILES = {
|
|
| 44 |
"Sophie (Child)": {
|
| 45 |
"name": "Sophie",
|
| 46 |
"gender": "child",
|
| 47 |
-
"f0": 280,
|
| 48 |
"f0_variation": 40,
|
| 49 |
"formant_shift": 1.25,
|
| 50 |
"breathiness": 0.03,
|
|
@@ -55,7 +55,7 @@ VOICE_PROFILES = {
|
|
| 55 |
"David (Deep Male)": {
|
| 56 |
"name": "David",
|
| 57 |
"gender": "male",
|
| 58 |
-
"f0": 85,
|
| 59 |
"f0_variation": 15,
|
| 60 |
"formant_shift": 0.82,
|
| 61 |
"breathiness": 0.02,
|
|
@@ -109,7 +109,7 @@ VOICE_PROFILES = {
|
|
| 109 |
},
|
| 110 |
}
|
| 111 |
|
| 112 |
-
# Custom voices storage
|
| 113 |
custom_voices = {}
|
| 114 |
|
| 115 |
# ============================================
|
|
@@ -366,7 +366,7 @@ LETTERS = {
|
|
| 366 |
|
| 367 |
|
| 368 |
# ============================================
|
| 369 |
-
# VOICE ANALYZER
|
| 370 |
# ============================================
|
| 371 |
|
| 372 |
class VoiceAnalyzer:
|
|
@@ -377,19 +377,18 @@ class VoiceAnalyzer:
|
|
| 377 |
|
| 378 |
def analyze(self, audio):
|
| 379 |
"""Extract voice features from audio sample"""
|
| 380 |
-
if len(audio) < self.sr * 0.
|
| 381 |
return None
|
| 382 |
|
| 383 |
-
# Normalize
|
| 384 |
audio = audio.astype(np.float32)
|
| 385 |
-
|
|
|
|
|
|
|
| 386 |
|
| 387 |
-
# Extract features
|
| 388 |
f0 = self._estimate_pitch(audio)
|
| 389 |
formants = self._estimate_formants(audio)
|
| 390 |
breathiness = self._estimate_breathiness(audio)
|
| 391 |
|
| 392 |
-
# Create voice profile
|
| 393 |
profile = {
|
| 394 |
"name": "Custom Voice",
|
| 395 |
"gender": "custom",
|
|
@@ -399,40 +398,45 @@ class VoiceAnalyzer:
|
|
| 399 |
"breathiness": breathiness,
|
| 400 |
"speed": 1.0,
|
| 401 |
"brightness": formants.get('brightness', 1.0),
|
| 402 |
-
"description": "
|
| 403 |
}
|
| 404 |
|
| 405 |
return profile
|
| 406 |
|
| 407 |
def _estimate_pitch(self, audio):
|
| 408 |
-
"""Estimate fundamental frequency
|
| 409 |
-
|
| 410 |
-
frame_size = int(self.sr * 0.03) # 30ms frames
|
| 411 |
-
|
| 412 |
pitches = []
|
|
|
|
| 413 |
for i in range(0, len(audio) - frame_size, frame_size):
|
| 414 |
frame = audio[i:i + frame_size]
|
| 415 |
|
|
|
|
|
|
|
|
|
|
| 416 |
# Autocorrelation
|
| 417 |
corr = np.correlate(frame, frame, mode='full')
|
| 418 |
corr = corr[len(corr)//2:]
|
| 419 |
|
| 420 |
-
# Find
|
| 421 |
d = np.diff(corr)
|
| 422 |
-
|
| 423 |
|
| 424 |
-
if len(
|
| 425 |
-
start =
|
| 426 |
-
|
| 427 |
|
| 428 |
-
if
|
| 429 |
-
|
| 430 |
-
|
| 431 |
-
|
|
|
|
|
|
|
|
|
|
| 432 |
|
| 433 |
if pitches:
|
| 434 |
-
return np.median(pitches)
|
| 435 |
-
return 130
|
| 436 |
|
| 437 |
def _estimate_f0_variation(self, audio, base_f0):
|
| 438 |
"""Estimate pitch variation"""
|
|
@@ -441,41 +445,42 @@ class VoiceAnalyzer:
|
|
| 441 |
|
| 442 |
for i in range(0, len(audio) - frame_size, frame_size):
|
| 443 |
frame = audio[i:i + frame_size]
|
|
|
|
|
|
|
| 444 |
corr = np.correlate(frame, frame, mode='full')
|
| 445 |
corr = corr[len(corr)//2:]
|
| 446 |
|
| 447 |
d = np.diff(corr)
|
| 448 |
-
|
| 449 |
|
| 450 |
-
if len(
|
| 451 |
-
start =
|
| 452 |
-
|
| 453 |
-
|
| 454 |
-
|
| 455 |
-
|
| 456 |
-
|
|
|
|
|
|
|
|
|
|
| 457 |
|
| 458 |
if len(pitches) > 2:
|
| 459 |
-
return min(np.std(pitches), 50)
|
| 460 |
-
return 20
|
| 461 |
|
| 462 |
def _estimate_formants(self, audio):
|
| 463 |
"""Estimate formant characteristics"""
|
| 464 |
-
# Simple spectral analysis
|
| 465 |
frame_size = 2048
|
| 466 |
|
| 467 |
if len(audio) < frame_size:
|
| 468 |
return {'shift': 1.0, 'brightness': 1.0}
|
| 469 |
|
| 470 |
-
# Get spectrum
|
| 471 |
spectrum = np.abs(np.fft.rfft(audio[:frame_size] * np.hanning(frame_size)))
|
| 472 |
freqs = np.fft.rfftfreq(frame_size, 1/self.sr)
|
| 473 |
|
| 474 |
-
|
| 475 |
-
centroid = np.sum(freqs * spectrum) /
|
| 476 |
|
| 477 |
-
# Estimate formant shift based on centroid
|
| 478 |
-
# Average male ~1200Hz, female ~1400Hz
|
| 479 |
if centroid > 1600:
|
| 480 |
shift = 1.2
|
| 481 |
brightness = 1.15
|
|
@@ -495,7 +500,7 @@ class VoiceAnalyzer:
|
|
| 495 |
return {'shift': shift, 'brightness': brightness}
|
| 496 |
|
| 497 |
def _estimate_breathiness(self, audio):
|
| 498 |
-
"""Estimate breathiness
|
| 499 |
frame_size = 2048
|
| 500 |
|
| 501 |
if len(audio) < frame_size:
|
|
@@ -504,12 +509,14 @@ class VoiceAnalyzer:
|
|
| 504 |
spectrum = np.abs(np.fft.rfft(audio[:frame_size]))
|
| 505 |
freqs = np.fft.rfftfreq(frame_size, 1/self.sr)
|
| 506 |
|
| 507 |
-
|
| 508 |
-
|
| 509 |
-
high_energy = np.sum(spectrum[(freqs > 2000) & (freqs < 5000)])
|
| 510 |
|
| 511 |
-
|
| 512 |
-
|
|
|
|
|
|
|
|
|
|
| 513 |
|
| 514 |
return breathiness
|
| 515 |
|
|
@@ -569,7 +576,7 @@ class TextToPhoneme:
|
|
| 569 |
|
| 570 |
|
| 571 |
# ============================================
|
| 572 |
-
# VOICE
|
| 573 |
# ============================================
|
| 574 |
|
| 575 |
class VoiceSynthesizer:
|
|
@@ -583,7 +590,6 @@ class VoiceSynthesizer:
|
|
| 583 |
|
| 584 |
voice = voice_profile or self.default_voice
|
| 585 |
|
| 586 |
-
# Get voice parameters
|
| 587 |
f0 = voice.get('f0', 130) * pitch
|
| 588 |
f0_var = voice.get('f0_variation', 20)
|
| 589 |
formant_shift = voice.get('formant_shift', 1.0)
|
|
@@ -597,7 +603,6 @@ class VoiceSynthesizer:
|
|
| 597 |
prev_phon = phonemes[i - 1] if i > 0 else None
|
| 598 |
next_phon = phonemes[i + 1] if i < len(phonemes) - 1 else None
|
| 599 |
|
| 600 |
-
# Add pitch variation
|
| 601 |
phrase_pos = i / max(len(phonemes), 1)
|
| 602 |
f0_current = f0 + f0_var * np.sin(phrase_pos * np.pi) * 0.5
|
| 603 |
|
|
@@ -620,39 +625,28 @@ class VoiceSynthesizer:
|
|
| 620 |
|
| 621 |
if phon in VOWELS:
|
| 622 |
return self._synth_vowel(phon, f0, speed, formant_shift,
|
| 623 |
-
breathiness, brightness
|
| 624 |
|
| 625 |
if phon in CONSONANTS:
|
| 626 |
return self._synth_consonant(phon, f0, speed, formant_shift, breathiness)
|
| 627 |
|
| 628 |
return np.zeros(100, dtype=np.float32)
|
| 629 |
|
| 630 |
-
def _synth_vowel(self, phon, f0, speed, formant_shift, breathiness,
|
| 631 |
-
brightness, prev_phon, next_phon):
|
| 632 |
params = VOWELS[phon]
|
| 633 |
f1, f2, f3, dur_ms, amp, voiced = params
|
| 634 |
|
| 635 |
-
# Apply formant shift
|
| 636 |
f1 = f1 * formant_shift
|
| 637 |
-
f2 = f2 * formant_shift
|
| 638 |
-
f3 = f3 * formant_shift
|
| 639 |
-
|
| 640 |
-
# Apply brightness
|
| 641 |
-
f2 = f2 * brightness
|
| 642 |
-
f3 = f3 * brightness
|
| 643 |
|
| 644 |
dur_ms = dur_ms / speed
|
| 645 |
n = int(self.sr * dur_ms / 1000)
|
| 646 |
n = max(n, 100)
|
| 647 |
t = np.arange(n) / self.sr
|
| 648 |
|
| 649 |
-
# Generate glottal source with voice characteristics
|
| 650 |
source = self._glottal_source(t, f0, breathiness)
|
| 651 |
-
|
| 652 |
-
# Apply formants
|
| 653 |
audio = self._apply_formants(source, f1, f2, f3)
|
| 654 |
-
|
| 655 |
-
# Apply envelope
|
| 656 |
envelope = self._vowel_envelope(n)
|
| 657 |
audio = audio * envelope * amp
|
| 658 |
|
|
@@ -663,17 +657,17 @@ class VoiceSynthesizer:
|
|
| 663 |
ctype = params['type']
|
| 664 |
|
| 665 |
if ctype == 'stop':
|
| 666 |
-
return self._synth_stop(
|
| 667 |
elif ctype == 'fric':
|
| 668 |
-
return self._synth_fricative(
|
| 669 |
elif ctype == 'affric':
|
| 670 |
-
return self._synth_affricate(
|
| 671 |
elif ctype == 'nasal':
|
| 672 |
-
return self._synth_nasal(
|
| 673 |
elif ctype == 'liquid':
|
| 674 |
-
return self._synth_liquid(
|
| 675 |
elif ctype == 'glide':
|
| 676 |
-
return self._synth_glide(
|
| 677 |
|
| 678 |
return np.zeros(100, dtype=np.float32)
|
| 679 |
|
|
@@ -688,10 +682,7 @@ class VoiceSynthesizer:
|
|
| 688 |
mask2 = (phase >= 0.4) & (phase < 0.6)
|
| 689 |
glottal[mask2] = np.cos(np.pi * (phase[mask2] - 0.4) / 0.4)
|
| 690 |
|
| 691 |
-
# Add breathiness
|
| 692 |
glottal += np.random.randn(len(t)) * breathiness
|
| 693 |
-
|
| 694 |
-
# Add shimmer
|
| 695 |
shimmer = 1 + 0.02 * np.sin(2 * np.pi * 5 * t)
|
| 696 |
glottal *= shimmer
|
| 697 |
|
|
@@ -743,7 +734,7 @@ class VoiceSynthesizer:
|
|
| 743 |
|
| 744 |
return env
|
| 745 |
|
| 746 |
-
def _synth_stop(self,
|
| 747 |
closure_ms = params['closure'] / speed
|
| 748 |
burst_ms = params['burst'] / speed
|
| 749 |
|
|
@@ -775,7 +766,7 @@ class VoiceSynthesizer:
|
|
| 775 |
|
| 776 |
return audio
|
| 777 |
|
| 778 |
-
def _synth_fricative(self,
|
| 779 |
dur_ms = params['dur'] / speed
|
| 780 |
n = int(self.sr * dur_ms / 1000)
|
| 781 |
|
|
@@ -802,7 +793,7 @@ class VoiceSynthesizer:
|
|
| 802 |
|
| 803 |
return audio.astype(np.float32)
|
| 804 |
|
| 805 |
-
def _synth_affricate(self,
|
| 806 |
closure_ms = params['closure'] / speed
|
| 807 |
fric_ms = params['fric'] / speed
|
| 808 |
|
|
@@ -837,7 +828,7 @@ class VoiceSynthesizer:
|
|
| 837 |
|
| 838 |
return audio
|
| 839 |
|
| 840 |
-
def _synth_nasal(self,
|
| 841 |
dur_ms = params['dur'] / speed
|
| 842 |
n = int(self.sr * dur_ms / 1000)
|
| 843 |
t = np.arange(n) / self.sr
|
|
@@ -863,7 +854,7 @@ class VoiceSynthesizer:
|
|
| 863 |
|
| 864 |
return audio.astype(np.float32)
|
| 865 |
|
| 866 |
-
def _synth_liquid(self,
|
| 867 |
dur_ms = params['dur'] / speed
|
| 868 |
n = int(self.sr * dur_ms / 1000)
|
| 869 |
t = np.arange(n) / self.sr
|
|
@@ -879,7 +870,7 @@ class VoiceSynthesizer:
|
|
| 879 |
|
| 880 |
return audio.astype(np.float32)
|
| 881 |
|
| 882 |
-
def _synth_glide(self,
|
| 883 |
dur_ms = params['dur'] / speed
|
| 884 |
n = int(self.sr * dur_ms / 1000)
|
| 885 |
t = np.arange(n) / self.sr
|
|
@@ -919,7 +910,7 @@ class VoiceSynthesizer:
|
|
| 919 |
if seg_len <= 0:
|
| 920 |
break
|
| 921 |
|
| 922 |
-
seg_to_add = seg[:seg_len]
|
| 923 |
|
| 924 |
if i > 0 and pos > overlap:
|
| 925 |
fade_len = min(overlap, seg_len)
|
|
@@ -927,7 +918,6 @@ class VoiceSynthesizer:
|
|
| 927 |
fade_out = np.linspace(1, 0, fade_len) ** 0.5
|
| 928 |
|
| 929 |
audio[pos:pos + fade_len] *= fade_out
|
| 930 |
-
seg_to_add = seg_to_add.copy()
|
| 931 |
seg_to_add[:fade_len] *= fade_in
|
| 932 |
|
| 933 |
audio[pos:end_pos] += seg_to_add
|
|
@@ -964,39 +954,45 @@ class VedesTTS:
|
|
| 964 |
self.voice_analyzer = VoiceAnalyzer(sample_rate)
|
| 965 |
self.current_voice = VOICE_PROFILES["Emma (Female)"]
|
| 966 |
|
| 967 |
-
def
|
| 968 |
if voice_name in VOICE_PROFILES:
|
| 969 |
-
|
| 970 |
elif voice_name in custom_voices:
|
| 971 |
-
|
|
|
|
| 972 |
|
| 973 |
def speak(self, text, rate=1.0, pitch=1.0, voice_name=None):
|
| 974 |
if not text or not text.strip():
|
| 975 |
return np.zeros(self.sr, dtype=np.float32)
|
| 976 |
|
| 977 |
-
if voice_name
|
| 978 |
-
self.set_voice(voice_name)
|
| 979 |
-
|
| 980 |
phonemes = self.text_to_phoneme.convert(text)
|
| 981 |
|
| 982 |
if not phonemes:
|
| 983 |
return np.zeros(self.sr, dtype=np.float32)
|
| 984 |
|
| 985 |
-
audio = self.synthesizer.synthesize(phonemes,
|
| 986 |
|
| 987 |
return audio
|
| 988 |
|
| 989 |
def train_voice(self, audio_data, voice_name="My Voice"):
|
| 990 |
"""Train a new voice from audio sample"""
|
|
|
|
|
|
|
| 991 |
if audio_data is None:
|
| 992 |
return None
|
| 993 |
|
| 994 |
-
# Handle
|
| 995 |
if isinstance(audio_data, tuple):
|
| 996 |
sr, audio = audio_data
|
| 997 |
audio = audio.astype(np.float32)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 998 |
if sr != self.sr:
|
| 999 |
-
# Resample
|
| 1000 |
duration = len(audio) / sr
|
| 1001 |
new_length = int(duration * self.sr)
|
| 1002 |
audio = signal.resample(audio, new_length)
|
|
@@ -1004,13 +1000,16 @@ class VedesTTS:
|
|
| 1004 |
audio = audio_data.astype(np.float32)
|
| 1005 |
|
| 1006 |
# Normalize
|
| 1007 |
-
|
|
|
|
|
|
|
| 1008 |
|
| 1009 |
-
# Analyze
|
| 1010 |
profile = self.voice_analyzer.analyze(audio)
|
| 1011 |
|
| 1012 |
if profile:
|
| 1013 |
profile['name'] = voice_name
|
|
|
|
| 1014 |
custom_voices[voice_name] = profile
|
| 1015 |
return profile
|
| 1016 |
|
|
@@ -1033,10 +1032,38 @@ print("=" * 50)
|
|
| 1033 |
|
| 1034 |
|
| 1035 |
# ============================================
|
| 1036 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1037 |
# ============================================
|
| 1038 |
|
| 1039 |
def synthesize(text, voice_name, rate, pitch):
|
|
|
|
| 1040 |
if not text or not text.strip():
|
| 1041 |
return None
|
| 1042 |
|
|
@@ -1044,17 +1071,7 @@ def synthesize(text, voice_name, rate, pitch):
|
|
| 1044 |
|
| 1045 |
try:
|
| 1046 |
pitch_mult = 2 ** (pitch / 12)
|
| 1047 |
-
|
| 1048 |
-
# Check custom voices first
|
| 1049 |
-
if voice_name in custom_voices:
|
| 1050 |
-
voice = custom_voices[voice_name]
|
| 1051 |
-
elif voice_name in VOICE_PROFILES:
|
| 1052 |
-
voice = VOICE_PROFILES[voice_name]
|
| 1053 |
-
else:
|
| 1054 |
-
voice = VOICE_PROFILES["Emma (Female)"]
|
| 1055 |
-
|
| 1056 |
-
tts.current_voice = voice
|
| 1057 |
-
audio = tts.speak(text, rate=rate, pitch=pitch_mult)
|
| 1058 |
|
| 1059 |
if len(audio) < 100:
|
| 1060 |
return None
|
|
@@ -1065,67 +1082,61 @@ def synthesize(text, voice_name, rate, pitch):
|
|
| 1065 |
return (SAMPLE_RATE, audio_int16)
|
| 1066 |
|
| 1067 |
except Exception as e:
|
| 1068 |
-
print(f"
|
| 1069 |
return None
|
| 1070 |
|
| 1071 |
|
| 1072 |
def train_voice(audio, voice_name):
|
|
|
|
|
|
|
|
|
|
| 1073 |
if audio is None:
|
| 1074 |
-
return "❌
|
| 1075 |
|
| 1076 |
if not voice_name or not voice_name.strip():
|
| 1077 |
-
voice_name = "
|
| 1078 |
|
| 1079 |
voice_name = voice_name.strip()[:30]
|
| 1080 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1081 |
try:
|
| 1082 |
profile = tts.train_voice(audio, voice_name)
|
| 1083 |
|
| 1084 |
if profile:
|
| 1085 |
-
|
| 1086 |
-
✅ Voice "{voice_name}" created
|
| 1087 |
|
| 1088 |
-
**
|
| 1089 |
- Pitch (F0): {profile['f0']:.1f} Hz
|
| 1090 |
- Pitch Variation: {profile['f0_variation']:.1f} Hz
|
| 1091 |
- Formant Shift: {profile['formant_shift']:.2f}
|
| 1092 |
- Breathiness: {profile['breathiness']:.3f}
|
| 1093 |
- Brightness: {profile['brightness']:.2f}
|
|
|
|
|
|
|
| 1094 |
"""
|
| 1095 |
-
return
|
| 1096 |
else:
|
| 1097 |
-
return "❌ Could not analyze voice. Try a longer sample.",
|
| 1098 |
|
| 1099 |
except Exception as e:
|
| 1100 |
-
return f"❌ Error: {str(e)}",
|
| 1101 |
-
|
| 1102 |
-
|
| 1103 |
-
def get_all_voices():
|
| 1104 |
-
voices = list(VOICE_PROFILES.keys()) + list(custom_voices.keys())
|
| 1105 |
-
return voices
|
| 1106 |
-
|
| 1107 |
-
|
| 1108 |
-
def get_voice_info(voice_name):
|
| 1109 |
-
if voice_name in VOICE_PROFILES:
|
| 1110 |
-
v = VOICE_PROFILES[voice_name]
|
| 1111 |
-
elif voice_name in custom_voices:
|
| 1112 |
-
v = custom_voices[voice_name]
|
| 1113 |
-
else:
|
| 1114 |
-
return "Select a voice"
|
| 1115 |
-
|
| 1116 |
-
return f"""
|
| 1117 |
-
**{v.get('name', voice_name)}**
|
| 1118 |
-
- Type: {v.get('gender', 'unknown').title()}
|
| 1119 |
-
- Pitch: {v.get('f0', 130):.0f} Hz
|
| 1120 |
-
- {v.get('description', '')}
|
| 1121 |
-
"""
|
| 1122 |
|
| 1123 |
|
| 1124 |
def create_custom_voice(name, pitch, formant, breathiness, speed, brightness):
|
|
|
|
|
|
|
|
|
|
| 1125 |
if not name or not name.strip():
|
| 1126 |
-
return "❌ Please enter a voice name",
|
|
|
|
|
|
|
| 1127 |
|
| 1128 |
-
name
|
|
|
|
| 1129 |
|
| 1130 |
profile = {
|
| 1131 |
"name": name,
|
|
@@ -1136,15 +1147,23 @@ def create_custom_voice(name, pitch, formant, breathiness, speed, brightness):
|
|
| 1136 |
"breathiness": breathiness / 100,
|
| 1137 |
"speed": speed,
|
| 1138 |
"brightness": brightness,
|
| 1139 |
-
"description": f"Custom voice (
|
| 1140 |
}
|
| 1141 |
|
| 1142 |
custom_voices[name] = profile
|
| 1143 |
|
| 1144 |
-
return f"✅ Voice
|
|
|
|
| 1145 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1146 |
|
| 1147 |
-
# Build interface
|
| 1148 |
with gr.Blocks(title="Vedes TTS", theme=gr.themes.Soft()) as demo:
|
| 1149 |
|
| 1150 |
gr.Markdown("""
|
|
@@ -1159,15 +1178,18 @@ with gr.Blocks(title="Vedes TTS", theme=gr.themes.Soft()) as demo:
|
|
| 1159 |
with gr.Column(scale=2):
|
| 1160 |
text_input = gr.Textbox(
|
| 1161 |
label="📝 Text to Speak",
|
| 1162 |
-
placeholder="Type
|
| 1163 |
lines=3
|
| 1164 |
)
|
| 1165 |
|
| 1166 |
-
|
| 1167 |
-
|
| 1168 |
-
|
| 1169 |
-
|
| 1170 |
-
|
|
|
|
|
|
|
|
|
|
| 1171 |
|
| 1172 |
voice_info = gr.Markdown("Select a voice")
|
| 1173 |
|
|
@@ -1197,44 +1219,43 @@ with gr.Blocks(title="Vedes TTS", theme=gr.themes.Soft()) as demo:
|
|
| 1197 |
gr.Markdown("""
|
| 1198 |
### Train a New Voice from Audio
|
| 1199 |
|
| 1200 |
-
Record or upload
|
| 1201 |
|
| 1202 |
-
**Tips
|
| 1203 |
-
-
|
| 1204 |
-
- Speak clearly and naturally
|
| 1205 |
- Avoid background noise
|
|
|
|
| 1206 |
""")
|
| 1207 |
|
| 1208 |
with gr.Row():
|
| 1209 |
with gr.Column():
|
| 1210 |
audio_input = gr.Audio(
|
| 1211 |
-
label="🎤 Record or Upload
|
| 1212 |
sources=["microphone", "upload"],
|
| 1213 |
type="numpy"
|
| 1214 |
)
|
| 1215 |
|
| 1216 |
voice_name_input = gr.Textbox(
|
| 1217 |
label="Voice Name",
|
| 1218 |
-
placeholder="My Voice",
|
| 1219 |
-
value="
|
| 1220 |
)
|
| 1221 |
|
| 1222 |
train_btn = gr.Button("🧠 Train Voice", variant="primary")
|
| 1223 |
|
| 1224 |
with gr.Column():
|
| 1225 |
-
train_result = gr.Markdown("
|
| 1226 |
-
|
| 1227 |
-
|
| 1228 |
-
|
| 1229 |
-
)
|
|
|
|
|
|
|
|
|
|
| 1230 |
|
| 1231 |
# ===== CREATE VOICE TAB =====
|
| 1232 |
with gr.TabItem("⚙️ Create Voice"):
|
| 1233 |
-
gr.Markdown(""
|
| 1234 |
-
### Create Custom Voice Manually
|
| 1235 |
-
|
| 1236 |
-
Adjust the parameters to create your own voice:
|
| 1237 |
-
""")
|
| 1238 |
|
| 1239 |
with gr.Row():
|
| 1240 |
with gr.Column():
|
|
@@ -1246,19 +1267,18 @@ with gr.Blocks(title="Vedes TTS", theme=gr.themes.Soft()) as demo:
|
|
| 1246 |
custom_pitch = gr.Slider(
|
| 1247 |
60, 300, 150,
|
| 1248 |
label="Pitch (Hz)",
|
| 1249 |
-
info="
|
| 1250 |
)
|
| 1251 |
|
| 1252 |
custom_formant = gr.Slider(
|
| 1253 |
0.7, 1.4, 1.0, step=0.05,
|
| 1254 |
label="Formant Shift",
|
| 1255 |
-
info="<1.0
|
| 1256 |
)
|
| 1257 |
|
| 1258 |
custom_breathiness = gr.Slider(
|
| 1259 |
1, 10, 3,
|
| 1260 |
-
label="Breathiness"
|
| 1261 |
-
info="Higher = more breathy/airy voice"
|
| 1262 |
)
|
| 1263 |
|
| 1264 |
custom_speed = gr.Slider(
|
|
@@ -1268,72 +1288,79 @@ with gr.Blocks(title="Vedes TTS", theme=gr.themes.Soft()) as demo:
|
|
| 1268 |
|
| 1269 |
custom_brightness = gr.Slider(
|
| 1270 |
0.8, 1.3, 1.0, step=0.05,
|
| 1271 |
-
label="Brightness"
|
| 1272 |
-
info="Higher = brighter, more forward voice"
|
| 1273 |
)
|
| 1274 |
|
| 1275 |
create_btn = gr.Button("✨ Create Voice", variant="primary")
|
| 1276 |
|
| 1277 |
with gr.Column():
|
| 1278 |
create_result = gr.Markdown("")
|
| 1279 |
-
created_voice_select = gr.Dropdown(
|
| 1280 |
-
choices=get_all_voices(),
|
| 1281 |
-
label="Created Voices"
|
| 1282 |
-
)
|
| 1283 |
|
| 1284 |
gr.Markdown("""
|
| 1285 |
-
###
|
| 1286 |
|
| 1287 |
-
|
|
| 1288 |
-
|------
|
| 1289 |
-
|
|
| 1290 |
-
|
|
| 1291 |
-
|
|
| 1292 |
-
|
|
|
|
|
| 1293 |
""")
|
| 1294 |
|
| 1295 |
-
# ===== VOICES TAB =====
|
| 1296 |
with gr.TabItem("👥 All Voices"):
|
| 1297 |
-
gr.Markdown("###
|
| 1298 |
|
| 1299 |
-
|
| 1300 |
for name, v in VOICE_PROFILES.items():
|
| 1301 |
-
|
| 1302 |
**{name}**
|
| 1303 |
-
- Type: {v['gender'].title()}
|
| 1304 |
-
- Pitch: {v['f0']} Hz
|
| 1305 |
- {v['description']}
|
| 1306 |
|
| 1307 |
-
---
|
| 1308 |
"""
|
| 1309 |
-
gr.Markdown(
|
|
|
|
|
|
|
|
|
|
| 1310 |
|
| 1311 |
-
#
|
|
|
|
|
|
|
| 1312 |
voice_select.change(get_voice_info, voice_select, voice_info)
|
|
|
|
| 1313 |
speak_btn.click(synthesize, [text_input, voice_select, rate, pitch], audio_out)
|
| 1314 |
text_input.submit(synthesize, [text_input, voice_select, rate, pitch], audio_out)
|
| 1315 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1316 |
train_btn.click(
|
| 1317 |
-
|
| 1318 |
-
[audio_input, voice_name_input],
|
| 1319 |
-
[train_result,
|
| 1320 |
)
|
| 1321 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1322 |
create_btn.click(
|
| 1323 |
-
|
| 1324 |
[custom_name, custom_pitch, custom_formant, custom_breathiness,
|
| 1325 |
custom_speed, custom_brightness],
|
| 1326 |
-
[create_result,
|
| 1327 |
-
)
|
| 1328 |
-
|
| 1329 |
-
# Update voice selectors when new voices are created
|
| 1330 |
-
trained_voice_select.change(
|
| 1331 |
-
lambda x: x, trained_voice_select, voice_select
|
| 1332 |
-
)
|
| 1333 |
-
created_voice_select.change(
|
| 1334 |
-
lambda x: x, created_voice_select, voice_select
|
| 1335 |
)
|
| 1336 |
|
| 1337 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1338 |
if __name__ == "__main__":
|
| 1339 |
demo.launch()
|
|
|
|
| 8 |
import os
|
| 9 |
|
| 10 |
# ============================================
|
| 11 |
+
# VEDES TTS - WITH VOICE TRAINING (FIXED)
|
| 12 |
# 100% From Scratch - No APIs
|
| 13 |
# ============================================
|
| 14 |
|
|
|
|
| 22 |
"Emma (Female)": {
|
| 23 |
"name": "Emma",
|
| 24 |
"gender": "female",
|
| 25 |
+
"f0": 210,
|
| 26 |
"f0_variation": 30,
|
| 27 |
+
"formant_shift": 1.15,
|
| 28 |
"breathiness": 0.04,
|
| 29 |
"speed": 1.0,
|
| 30 |
"brightness": 1.1,
|
|
|
|
| 33 |
"James (Male)": {
|
| 34 |
"name": "James",
|
| 35 |
"gender": "male",
|
| 36 |
+
"f0": 110,
|
| 37 |
"f0_variation": 20,
|
| 38 |
+
"formant_shift": 0.9,
|
| 39 |
"breathiness": 0.02,
|
| 40 |
"speed": 0.95,
|
| 41 |
"brightness": 0.95,
|
|
|
|
| 44 |
"Sophie (Child)": {
|
| 45 |
"name": "Sophie",
|
| 46 |
"gender": "child",
|
| 47 |
+
"f0": 280,
|
| 48 |
"f0_variation": 40,
|
| 49 |
"formant_shift": 1.25,
|
| 50 |
"breathiness": 0.03,
|
|
|
|
| 55 |
"David (Deep Male)": {
|
| 56 |
"name": "David",
|
| 57 |
"gender": "male",
|
| 58 |
+
"f0": 85,
|
| 59 |
"f0_variation": 15,
|
| 60 |
"formant_shift": 0.82,
|
| 61 |
"breathiness": 0.02,
|
|
|
|
| 109 |
},
|
| 110 |
}
|
| 111 |
|
| 112 |
+
# Custom voices storage (global)
|
| 113 |
custom_voices = {}
|
| 114 |
|
| 115 |
# ============================================
|
|
|
|
| 366 |
|
| 367 |
|
| 368 |
# ============================================
|
| 369 |
+
# VOICE ANALYZER
|
| 370 |
# ============================================
|
| 371 |
|
| 372 |
class VoiceAnalyzer:
|
|
|
|
| 377 |
|
| 378 |
def analyze(self, audio):
|
| 379 |
"""Extract voice features from audio sample"""
|
| 380 |
+
if len(audio) < self.sr * 0.3:
|
| 381 |
return None
|
| 382 |
|
|
|
|
| 383 |
audio = audio.astype(np.float32)
|
| 384 |
+
max_val = np.max(np.abs(audio))
|
| 385 |
+
if max_val > 0:
|
| 386 |
+
audio = audio / max_val
|
| 387 |
|
|
|
|
| 388 |
f0 = self._estimate_pitch(audio)
|
| 389 |
formants = self._estimate_formants(audio)
|
| 390 |
breathiness = self._estimate_breathiness(audio)
|
| 391 |
|
|
|
|
| 392 |
profile = {
|
| 393 |
"name": "Custom Voice",
|
| 394 |
"gender": "custom",
|
|
|
|
| 398 |
"breathiness": breathiness,
|
| 399 |
"speed": 1.0,
|
| 400 |
"brightness": formants.get('brightness', 1.0),
|
| 401 |
+
"description": f"Custom voice (F0={f0:.0f}Hz)"
|
| 402 |
}
|
| 403 |
|
| 404 |
return profile
|
| 405 |
|
| 406 |
def _estimate_pitch(self, audio):
|
| 407 |
+
"""Estimate fundamental frequency using autocorrelation"""
|
| 408 |
+
frame_size = int(self.sr * 0.03)
|
|
|
|
|
|
|
| 409 |
pitches = []
|
| 410 |
+
|
| 411 |
for i in range(0, len(audio) - frame_size, frame_size):
|
| 412 |
frame = audio[i:i + frame_size]
|
| 413 |
|
| 414 |
+
# Remove DC
|
| 415 |
+
frame = frame - np.mean(frame)
|
| 416 |
+
|
| 417 |
# Autocorrelation
|
| 418 |
corr = np.correlate(frame, frame, mode='full')
|
| 419 |
corr = corr[len(corr)//2:]
|
| 420 |
|
| 421 |
+
# Find peaks
|
| 422 |
d = np.diff(corr)
|
| 423 |
+
start_indices = np.where(d > 0)[0]
|
| 424 |
|
| 425 |
+
if len(start_indices) > 0:
|
| 426 |
+
start = start_indices[0]
|
| 427 |
+
search_end = min(start + int(self.sr / 60), len(corr))
|
| 428 |
|
| 429 |
+
if search_end > start:
|
| 430 |
+
peak = start + np.argmax(corr[start:search_end])
|
| 431 |
+
|
| 432 |
+
if peak > 0:
|
| 433 |
+
f0 = self.sr / peak
|
| 434 |
+
if 60 < f0 < 400:
|
| 435 |
+
pitches.append(f0)
|
| 436 |
|
| 437 |
if pitches:
|
| 438 |
+
return float(np.median(pitches))
|
| 439 |
+
return 130.0
|
| 440 |
|
| 441 |
def _estimate_f0_variation(self, audio, base_f0):
|
| 442 |
"""Estimate pitch variation"""
|
|
|
|
| 445 |
|
| 446 |
for i in range(0, len(audio) - frame_size, frame_size):
|
| 447 |
frame = audio[i:i + frame_size]
|
| 448 |
+
frame = frame - np.mean(frame)
|
| 449 |
+
|
| 450 |
corr = np.correlate(frame, frame, mode='full')
|
| 451 |
corr = corr[len(corr)//2:]
|
| 452 |
|
| 453 |
d = np.diff(corr)
|
| 454 |
+
start_indices = np.where(d > 0)[0]
|
| 455 |
|
| 456 |
+
if len(start_indices) > 0:
|
| 457 |
+
start = start_indices[0]
|
| 458 |
+
search_end = min(start + int(self.sr / 60), len(corr))
|
| 459 |
+
|
| 460 |
+
if search_end > start:
|
| 461 |
+
peak = start + np.argmax(corr[start:search_end])
|
| 462 |
+
if peak > 0:
|
| 463 |
+
f0 = self.sr / peak
|
| 464 |
+
if 60 < f0 < 400:
|
| 465 |
+
pitches.append(f0)
|
| 466 |
|
| 467 |
if len(pitches) > 2:
|
| 468 |
+
return min(float(np.std(pitches)), 50.0)
|
| 469 |
+
return 20.0
|
| 470 |
|
| 471 |
def _estimate_formants(self, audio):
|
| 472 |
"""Estimate formant characteristics"""
|
|
|
|
| 473 |
frame_size = 2048
|
| 474 |
|
| 475 |
if len(audio) < frame_size:
|
| 476 |
return {'shift': 1.0, 'brightness': 1.0}
|
| 477 |
|
|
|
|
| 478 |
spectrum = np.abs(np.fft.rfft(audio[:frame_size] * np.hanning(frame_size)))
|
| 479 |
freqs = np.fft.rfftfreq(frame_size, 1/self.sr)
|
| 480 |
|
| 481 |
+
total_energy = np.sum(spectrum) + 1e-8
|
| 482 |
+
centroid = np.sum(freqs * spectrum) / total_energy
|
| 483 |
|
|
|
|
|
|
|
| 484 |
if centroid > 1600:
|
| 485 |
shift = 1.2
|
| 486 |
brightness = 1.15
|
|
|
|
| 500 |
return {'shift': shift, 'brightness': brightness}
|
| 501 |
|
| 502 |
def _estimate_breathiness(self, audio):
|
| 503 |
+
"""Estimate breathiness"""
|
| 504 |
frame_size = 2048
|
| 505 |
|
| 506 |
if len(audio) < frame_size:
|
|
|
|
| 509 |
spectrum = np.abs(np.fft.rfft(audio[:frame_size]))
|
| 510 |
freqs = np.fft.rfftfreq(frame_size, 1/self.sr)
|
| 511 |
|
| 512 |
+
low_mask = freqs < 1000
|
| 513 |
+
high_mask = (freqs > 2000) & (freqs < 5000)
|
|
|
|
| 514 |
|
| 515 |
+
low_energy = np.sum(spectrum[low_mask]) + 1e-8
|
| 516 |
+
high_energy = np.sum(spectrum[high_mask])
|
| 517 |
+
|
| 518 |
+
ratio = high_energy / low_energy
|
| 519 |
+
breathiness = float(np.clip(ratio * 0.1, 0.02, 0.1))
|
| 520 |
|
| 521 |
return breathiness
|
| 522 |
|
|
|
|
| 576 |
|
| 577 |
|
| 578 |
# ============================================
|
| 579 |
+
# VOICE SYNTHESIZER
|
| 580 |
# ============================================
|
| 581 |
|
| 582 |
class VoiceSynthesizer:
|
|
|
|
| 590 |
|
| 591 |
voice = voice_profile or self.default_voice
|
| 592 |
|
|
|
|
| 593 |
f0 = voice.get('f0', 130) * pitch
|
| 594 |
f0_var = voice.get('f0_variation', 20)
|
| 595 |
formant_shift = voice.get('formant_shift', 1.0)
|
|
|
|
| 603 |
prev_phon = phonemes[i - 1] if i > 0 else None
|
| 604 |
next_phon = phonemes[i + 1] if i < len(phonemes) - 1 else None
|
| 605 |
|
|
|
|
| 606 |
phrase_pos = i / max(len(phonemes), 1)
|
| 607 |
f0_current = f0 + f0_var * np.sin(phrase_pos * np.pi) * 0.5
|
| 608 |
|
|
|
|
| 625 |
|
| 626 |
if phon in VOWELS:
|
| 627 |
return self._synth_vowel(phon, f0, speed, formant_shift,
|
| 628 |
+
breathiness, brightness)
|
| 629 |
|
| 630 |
if phon in CONSONANTS:
|
| 631 |
return self._synth_consonant(phon, f0, speed, formant_shift, breathiness)
|
| 632 |
|
| 633 |
return np.zeros(100, dtype=np.float32)
|
| 634 |
|
| 635 |
+
def _synth_vowel(self, phon, f0, speed, formant_shift, breathiness, brightness):
|
|
|
|
| 636 |
params = VOWELS[phon]
|
| 637 |
f1, f2, f3, dur_ms, amp, voiced = params
|
| 638 |
|
|
|
|
| 639 |
f1 = f1 * formant_shift
|
| 640 |
+
f2 = f2 * formant_shift * brightness
|
| 641 |
+
f3 = f3 * formant_shift * brightness
|
|
|
|
|
|
|
|
|
|
|
|
|
| 642 |
|
| 643 |
dur_ms = dur_ms / speed
|
| 644 |
n = int(self.sr * dur_ms / 1000)
|
| 645 |
n = max(n, 100)
|
| 646 |
t = np.arange(n) / self.sr
|
| 647 |
|
|
|
|
| 648 |
source = self._glottal_source(t, f0, breathiness)
|
|
|
|
|
|
|
| 649 |
audio = self._apply_formants(source, f1, f2, f3)
|
|
|
|
|
|
|
| 650 |
envelope = self._vowel_envelope(n)
|
| 651 |
audio = audio * envelope * amp
|
| 652 |
|
|
|
|
| 657 |
ctype = params['type']
|
| 658 |
|
| 659 |
if ctype == 'stop':
|
| 660 |
+
return self._synth_stop(params, f0, speed, formant_shift)
|
| 661 |
elif ctype == 'fric':
|
| 662 |
+
return self._synth_fricative(params, f0, speed)
|
| 663 |
elif ctype == 'affric':
|
| 664 |
+
return self._synth_affricate(params, f0, speed)
|
| 665 |
elif ctype == 'nasal':
|
| 666 |
+
return self._synth_nasal(params, f0, speed, formant_shift, breathiness)
|
| 667 |
elif ctype == 'liquid':
|
| 668 |
+
return self._synth_liquid(params, f0, speed, formant_shift, breathiness)
|
| 669 |
elif ctype == 'glide':
|
| 670 |
+
return self._synth_glide(params, f0, speed, formant_shift, breathiness)
|
| 671 |
|
| 672 |
return np.zeros(100, dtype=np.float32)
|
| 673 |
|
|
|
|
| 682 |
mask2 = (phase >= 0.4) & (phase < 0.6)
|
| 683 |
glottal[mask2] = np.cos(np.pi * (phase[mask2] - 0.4) / 0.4)
|
| 684 |
|
|
|
|
| 685 |
glottal += np.random.randn(len(t)) * breathiness
|
|
|
|
|
|
|
| 686 |
shimmer = 1 + 0.02 * np.sin(2 * np.pi * 5 * t)
|
| 687 |
glottal *= shimmer
|
| 688 |
|
|
|
|
| 734 |
|
| 735 |
return env
|
| 736 |
|
| 737 |
+
def _synth_stop(self, params, f0, speed, formant_shift):
|
| 738 |
closure_ms = params['closure'] / speed
|
| 739 |
burst_ms = params['burst'] / speed
|
| 740 |
|
|
|
|
| 766 |
|
| 767 |
return audio
|
| 768 |
|
| 769 |
+
def _synth_fricative(self, params, f0, speed):
|
| 770 |
dur_ms = params['dur'] / speed
|
| 771 |
n = int(self.sr * dur_ms / 1000)
|
| 772 |
|
|
|
|
| 793 |
|
| 794 |
return audio.astype(np.float32)
|
| 795 |
|
| 796 |
+
def _synth_affricate(self, params, f0, speed):
|
| 797 |
closure_ms = params['closure'] / speed
|
| 798 |
fric_ms = params['fric'] / speed
|
| 799 |
|
|
|
|
| 828 |
|
| 829 |
return audio
|
| 830 |
|
| 831 |
+
def _synth_nasal(self, params, f0, speed, formant_shift, breathiness):
|
| 832 |
dur_ms = params['dur'] / speed
|
| 833 |
n = int(self.sr * dur_ms / 1000)
|
| 834 |
t = np.arange(n) / self.sr
|
|
|
|
| 854 |
|
| 855 |
return audio.astype(np.float32)
|
| 856 |
|
| 857 |
+
def _synth_liquid(self, params, f0, speed, formant_shift, breathiness):
|
| 858 |
dur_ms = params['dur'] / speed
|
| 859 |
n = int(self.sr * dur_ms / 1000)
|
| 860 |
t = np.arange(n) / self.sr
|
|
|
|
| 870 |
|
| 871 |
return audio.astype(np.float32)
|
| 872 |
|
| 873 |
+
def _synth_glide(self, params, f0, speed, formant_shift, breathiness):
|
| 874 |
dur_ms = params['dur'] / speed
|
| 875 |
n = int(self.sr * dur_ms / 1000)
|
| 876 |
t = np.arange(n) / self.sr
|
|
|
|
| 910 |
if seg_len <= 0:
|
| 911 |
break
|
| 912 |
|
| 913 |
+
seg_to_add = seg[:seg_len].copy()
|
| 914 |
|
| 915 |
if i > 0 and pos > overlap:
|
| 916 |
fade_len = min(overlap, seg_len)
|
|
|
|
| 918 |
fade_out = np.linspace(1, 0, fade_len) ** 0.5
|
| 919 |
|
| 920 |
audio[pos:pos + fade_len] *= fade_out
|
|
|
|
| 921 |
seg_to_add[:fade_len] *= fade_in
|
| 922 |
|
| 923 |
audio[pos:end_pos] += seg_to_add
|
|
|
|
| 954 |
self.voice_analyzer = VoiceAnalyzer(sample_rate)
|
| 955 |
self.current_voice = VOICE_PROFILES["Emma (Female)"]
|
| 956 |
|
| 957 |
+
def get_voice(self, voice_name):
    """Resolve a voice name to its profile dict.

    Built-in profiles are consulted before user-created ones; if the name
    is found in neither registry the currently active voice is returned.
    """
    for registry in (VOICE_PROFILES, custom_voices):
        if voice_name in registry:
            return registry[voice_name]
    return self.current_voice
|
| 963 |
|
| 964 |
def speak(self, text, rate=1.0, pitch=1.0, voice_name=None):
    """Convert text to an audio waveform using the selected voice.

    Args:
        text: Text to speak. Empty or whitespace-only text yields silence.
        rate: Passed through unchanged to the synthesizer.
        pitch: Passed through unchanged to the synthesizer.
        voice_name: Name looked up via ``get_voice``; when falsy the
            current voice is used instead.

    Returns:
        Whatever the synthesizer returns, or ``self.sr`` zero samples
        (float32) when there is nothing to synthesize.
    """
    def silence():
        # Fresh buffer on every call so callers never share an array.
        return np.zeros(self.sr, dtype=np.float32)

    if not (text and text.strip()):
        return silence()

    if voice_name:
        voice = self.get_voice(voice_name)
    else:
        voice = self.current_voice

    phonemes = self.text_to_phoneme.convert(text)
    if not phonemes:
        return silence()

    return self.synthesizer.synthesize(phonemes, voice, rate, pitch)
|
| 977 |
|
| 978 |
def train_voice(self, audio_data, voice_name="My Voice"):
|
| 979 |
"""Train a new voice from audio sample"""
|
| 980 |
+
global custom_voices
|
| 981 |
+
|
| 982 |
if audio_data is None:
|
| 983 |
return None
|
| 984 |
|
| 985 |
+
# Handle tuple format (sample_rate, audio)
|
| 986 |
if isinstance(audio_data, tuple):
|
| 987 |
sr, audio = audio_data
|
| 988 |
audio = audio.astype(np.float32)
|
| 989 |
+
|
| 990 |
+
# Handle stereo
|
| 991 |
+
if len(audio.shape) > 1:
|
| 992 |
+
audio = audio.mean(axis=1)
|
| 993 |
+
|
| 994 |
+
# Resample if needed
|
| 995 |
if sr != self.sr:
|
|
|
|
| 996 |
duration = len(audio) / sr
|
| 997 |
new_length = int(duration * self.sr)
|
| 998 |
audio = signal.resample(audio, new_length)
|
|
|
|
| 1000 |
audio = audio_data.astype(np.float32)
|
| 1001 |
|
| 1002 |
# Normalize
|
| 1003 |
+
max_val = np.max(np.abs(audio))
|
| 1004 |
+
if max_val > 0:
|
| 1005 |
+
audio = audio / max_val
|
| 1006 |
|
| 1007 |
+
# Analyze
|
| 1008 |
profile = self.voice_analyzer.analyze(audio)
|
| 1009 |
|
| 1010 |
if profile:
|
| 1011 |
profile['name'] = voice_name
|
| 1012 |
+
profile['description'] = f"Trained voice (F0={profile['f0']:.0f}Hz)"
|
| 1013 |
custom_voices[voice_name] = profile
|
| 1014 |
return profile
|
| 1015 |
|
|
|
|
| 1032 |
|
| 1033 |
|
| 1034 |
# ============================================
|
| 1035 |
+
# HELPER FUNCTIONS
|
| 1036 |
+
# ============================================
|
| 1037 |
+
|
| 1038 |
+
def get_all_voices():
    """Return the names of every selectable voice, built-in first, then custom."""
    return [*VOICE_PROFILES, *custom_voices]
|
| 1042 |
+
|
| 1043 |
+
|
| 1044 |
+
def get_voice_info(voice_name):
    """Build a short Markdown summary for the named voice.

    Looks the name up among the built-in profiles first, then the custom
    ones. Returns the prompt text "Select a voice" when the name is unknown.
    Missing profile fields fall back to defaults so partially filled
    profiles still render.
    """
    for registry in (VOICE_PROFILES, custom_voices):
        if voice_name in registry:
            profile = registry[voice_name]
            break
    else:
        return "Select a voice"

    display_name = profile.get('name', voice_name)
    kind = profile.get('gender', 'unknown').title()
    base_pitch = profile.get('f0', 130)
    blurb = profile.get('description', '')

    return f"""
**{display_name}**
- Type: {kind}
- Pitch: {base_pitch:.0f} Hz
- {blurb}
"""
|
| 1059 |
+
|
| 1060 |
+
|
| 1061 |
+
# ============================================
|
| 1062 |
+
# GRADIO FUNCTIONS
|
| 1063 |
# ============================================
|
| 1064 |
|
| 1065 |
def synthesize(text, voice_name, rate, pitch):
|
| 1066 |
+
"""Synthesize speech"""
|
| 1067 |
if not text or not text.strip():
|
| 1068 |
return None
|
| 1069 |
|
|
|
|
| 1071 |
|
| 1072 |
try:
|
| 1073 |
pitch_mult = 2 ** (pitch / 12)
|
| 1074 |
+
audio = tts.speak(text, rate=rate, pitch=pitch_mult, voice_name=voice_name)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1075 |
|
| 1076 |
if len(audio) < 100:
|
| 1077 |
return None
|
|
|
|
| 1082 |
return (SAMPLE_RATE, audio_int16)
|
| 1083 |
|
| 1084 |
except Exception as e:
|
| 1085 |
+
print(f"Synthesis error: {e}")
|
| 1086 |
return None
|
| 1087 |
|
| 1088 |
|
| 1089 |
def train_voice(audio, voice_name):
    """Train a new custom voice from an audio sample (Gradio handler).

    Args:
        audio: Value supplied by the audio input, or None when nothing was
            recorded or uploaded. Forwarded unchanged to ``tts.train_voice``.
        voice_name: Requested display name. When empty or blank, the first
            unused "Custom Voice N" name is generated.

    Returns:
        A ``(message, voices)`` tuple: a Markdown status message and the
        list returned by ``get_all_voices()`` after the attempt.
    """
    if audio is None:
        return "❌ Please record or upload audio first.", get_all_voices()

    if not voice_name or not voice_name.strip():
        # Counting existing voices alone can land on a name that is already
        # taken (e.g. the user previously named a voice "Custom Voice 2"),
        # which would silently overwrite it. Advance until the name is free.
        index = len(custom_voices) + 1
        voice_name = f"Custom Voice {index}"
        while voice_name in custom_voices:
            index += 1
            voice_name = f"Custom Voice {index}"

    voice_name = voice_name.strip()[:30]

    # Keep custom names distinct from the built-in profile names.
    if voice_name in VOICE_PROFILES:
        voice_name = f"{voice_name} (custom)"

    # UI boundary: any failure in training or in formatting the report is
    # turned into a status message instead of propagating to the interface.
    try:
        profile = tts.train_voice(audio, voice_name)

        if profile:
            result = f"""
✅ **Voice "{voice_name}" created!**

**Detected Parameters:**
- Pitch (F0): {profile['f0']:.1f} Hz
- Pitch Variation: {profile['f0_variation']:.1f} Hz
- Formant Shift: {profile['formant_shift']:.2f}
- Breathiness: {profile['breathiness']:.3f}
- Brightness: {profile['brightness']:.2f}

You can now select this voice in the Speak tab!
"""
            return result, get_all_voices()
        else:
            return "❌ Could not analyze voice. Try a longer/clearer sample.", get_all_voices()

    except Exception as e:
        return f"❌ Error: {str(e)}", get_all_voices()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1127 |
|
| 1128 |
|
| 1129 |
def create_custom_voice(name, pitch, formant, breathiness, speed, brightness):
|
| 1130 |
+
"""Create a custom voice from parameters"""
|
| 1131 |
+
global custom_voices
|
| 1132 |
+
|
| 1133 |
if not name or not name.strip():
|
| 1134 |
+
return "❌ Please enter a voice name.", get_all_voices()
|
| 1135 |
+
|
| 1136 |
+
name = name.strip()[:30]
|
| 1137 |
|
| 1138 |
+
if name in VOICE_PROFILES:
|
| 1139 |
+
name = f"{name} (custom)"
|
| 1140 |
|
| 1141 |
profile = {
|
| 1142 |
"name": name,
|
|
|
|
| 1147 |
"breathiness": breathiness / 100,
|
| 1148 |
"speed": speed,
|
| 1149 |
"brightness": brightness,
|
| 1150 |
+
"description": f"Custom voice (F0={pitch}Hz)"
|
| 1151 |
}
|
| 1152 |
|
| 1153 |
custom_voices[name] = profile
|
| 1154 |
|
| 1155 |
+
return f"✅ Voice **{name}** created! Select it in the Speak tab.", get_all_voices()
|
| 1156 |
+
|
| 1157 |
|
| 1158 |
+
def refresh_voices():
    """Produce a Gradio update that reloads the voice choices."""
    available = get_all_voices()
    return gr.update(choices=available)
|
| 1161 |
+
|
| 1162 |
+
|
| 1163 |
+
# ============================================
|
| 1164 |
+
# GRADIO INTERFACE
|
| 1165 |
+
# ============================================
|
| 1166 |
|
|
|
|
| 1167 |
with gr.Blocks(title="Vedes TTS", theme=gr.themes.Soft()) as demo:
|
| 1168 |
|
| 1169 |
gr.Markdown("""
|
|
|
|
| 1178 |
with gr.Column(scale=2):
|
| 1179 |
text_input = gr.Textbox(
|
| 1180 |
label="📝 Text to Speak",
|
| 1181 |
+
placeholder="Type something...",
|
| 1182 |
lines=3
|
| 1183 |
)
|
| 1184 |
|
| 1185 |
+
with gr.Row():
|
| 1186 |
+
voice_select = gr.Dropdown(
|
| 1187 |
+
choices=get_all_voices(),
|
| 1188 |
+
value="Emma (Female)",
|
| 1189 |
+
label="🗣️ Voice",
|
| 1190 |
+
interactive=True
|
| 1191 |
+
)
|
| 1192 |
+
refresh_btn = gr.Button("🔄", size="sm")
|
| 1193 |
|
| 1194 |
voice_info = gr.Markdown("Select a voice")
|
| 1195 |
|
|
|
|
| 1219 |
gr.Markdown("""
|
| 1220 |
### Train a New Voice from Audio
|
| 1221 |
|
| 1222 |
+
Record or upload 3-10 seconds of clear speech.
|
| 1223 |
|
| 1224 |
+
**Tips:**
|
| 1225 |
+
- Speak naturally and clearly
|
|
|
|
| 1226 |
- Avoid background noise
|
| 1227 |
+
- Read a few sentences
|
| 1228 |
""")
|
| 1229 |
|
| 1230 |
with gr.Row():
|
| 1231 |
with gr.Column():
|
| 1232 |
audio_input = gr.Audio(
|
| 1233 |
+
label="🎤 Record or Upload",
|
| 1234 |
sources=["microphone", "upload"],
|
| 1235 |
type="numpy"
|
| 1236 |
)
|
| 1237 |
|
| 1238 |
voice_name_input = gr.Textbox(
|
| 1239 |
label="Voice Name",
|
| 1240 |
+
placeholder="e.g., My Voice",
|
| 1241 |
+
value=""
|
| 1242 |
)
|
| 1243 |
|
| 1244 |
train_btn = gr.Button("🧠 Train Voice", variant="primary")
|
| 1245 |
|
| 1246 |
with gr.Column():
|
| 1247 |
+
train_result = gr.Markdown("Record audio and click Train")
|
| 1248 |
+
|
| 1249 |
+
gr.Markdown("""
|
| 1250 |
+
### What Gets Analyzed:
|
| 1251 |
+
- **Pitch (F0)**: How high/low the voice is
|
| 1252 |
+
- **Formants**: Voice quality/timbre
|
| 1253 |
+
- **Breathiness**: Air in the voice
|
| 1254 |
+
""")
|
| 1255 |
|
| 1256 |
# ===== CREATE VOICE TAB =====
|
| 1257 |
with gr.TabItem("⚙️ Create Voice"):
|
| 1258 |
+
gr.Markdown("### Create Custom Voice Manually")
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1259 |
|
| 1260 |
with gr.Row():
|
| 1261 |
with gr.Column():
|
|
|
|
| 1267 |
custom_pitch = gr.Slider(
|
| 1268 |
60, 300, 150,
|
| 1269 |
label="Pitch (Hz)",
|
| 1270 |
+
info="60-130=Male, 150-250=Female, 250+=Child"
|
| 1271 |
)
|
| 1272 |
|
| 1273 |
custom_formant = gr.Slider(
|
| 1274 |
0.7, 1.4, 1.0, step=0.05,
|
| 1275 |
label="Formant Shift",
|
| 1276 |
+
info="<1.0=Male, >1.0=Female/Child"
|
| 1277 |
)
|
| 1278 |
|
| 1279 |
custom_breathiness = gr.Slider(
|
| 1280 |
1, 10, 3,
|
| 1281 |
+
label="Breathiness (%)"
|
|
|
|
| 1282 |
)
|
| 1283 |
|
| 1284 |
custom_speed = gr.Slider(
|
|
|
|
| 1288 |
|
| 1289 |
custom_brightness = gr.Slider(
|
| 1290 |
0.8, 1.3, 1.0, step=0.05,
|
| 1291 |
+
label="Brightness"
|
|
|
|
| 1292 |
)
|
| 1293 |
|
| 1294 |
create_btn = gr.Button("✨ Create Voice", variant="primary")
|
| 1295 |
|
| 1296 |
with gr.Column():
|
| 1297 |
create_result = gr.Markdown("")
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1298 |
|
| 1299 |
gr.Markdown("""
|
| 1300 |
+
### Quick Presets:
|
| 1301 |
|
| 1302 |
+
| Type | Pitch | Formant |
|
| 1303 |
+
|------|-------|---------|
|
| 1304 |
+
| Deep Male | 85 | 0.85 |
|
| 1305 |
+
| Male | 120 | 0.92 |
|
| 1306 |
+
| Female | 200 | 1.12 |
|
| 1307 |
+
| High Female | 240 | 1.20 |
|
| 1308 |
+
| Child | 280 | 1.25 |
|
| 1309 |
""")
|
| 1310 |
|
| 1311 |
+
# ===== ALL VOICES TAB =====
|
| 1312 |
with gr.TabItem("👥 All Voices"):
|
| 1313 |
+
gr.Markdown("### Pre-built Voices")
|
| 1314 |
|
| 1315 |
+
voice_info_md = ""
|
| 1316 |
for name, v in VOICE_PROFILES.items():
|
| 1317 |
+
voice_info_md += f"""
|
| 1318 |
**{name}**
|
| 1319 |
+
- Type: {v['gender'].title()} | Pitch: {v['f0']} Hz
|
|
|
|
| 1320 |
- {v['description']}
|
| 1321 |
|
|
|
|
| 1322 |
"""
|
| 1323 |
+
gr.Markdown(voice_info_md)
|
| 1324 |
+
|
| 1325 |
+
gr.Markdown("### Custom Voices")
|
| 1326 |
+
custom_voices_display = gr.Markdown("*No custom voices yet*")
|
| 1327 |
|
| 1328 |
+
# ===== EVENT HANDLERS =====
|
| 1329 |
+
|
| 1330 |
+
# Speak tab
|
| 1331 |
voice_select.change(get_voice_info, voice_select, voice_info)
|
| 1332 |
+
refresh_btn.click(refresh_voices, outputs=voice_select)
|
| 1333 |
speak_btn.click(synthesize, [text_input, voice_select, rate, pitch], audio_out)
|
| 1334 |
text_input.submit(synthesize, [text_input, voice_select, rate, pitch], audio_out)
|
| 1335 |
|
| 1336 |
+
# Train tab - Fixed: update choices first, then set value separately
|
| 1337 |
+
def train_and_update(audio, name):
    """Run voice training and refresh the voice dropdown's choices.

    Returns the status message together with a Gradio update carrying the
    new list of voice names.
    """
    message, voice_names = train_voice(audio, name)
    dropdown_update = gr.update(choices=voice_names)
    return message, dropdown_update
|
| 1341 |
+
|
| 1342 |
train_btn.click(
|
| 1343 |
+
train_and_update,
|
| 1344 |
+
[audio_input, voice_name_input],
|
| 1345 |
+
[train_result, voice_select]
|
| 1346 |
)
|
| 1347 |
|
| 1348 |
+
# Create tab - Fixed similarly
|
| 1349 |
+
def create_and_update(name, pitch, formant, breathiness, speed, brightness):
    """Create a voice from slider values and refresh the voice dropdown's choices.

    Returns the status message together with a Gradio update carrying the
    new list of voice names.
    """
    message, voice_names = create_custom_voice(
        name, pitch, formant, breathiness, speed, brightness
    )
    dropdown_update = gr.update(choices=voice_names)
    return message, dropdown_update
|
| 1352 |
+
|
| 1353 |
create_btn.click(
|
| 1354 |
+
create_and_update,
|
| 1355 |
[custom_name, custom_pitch, custom_formant, custom_breathiness,
|
| 1356 |
custom_speed, custom_brightness],
|
| 1357 |
+
[create_result, voice_select]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1358 |
)
|
| 1359 |
|
| 1360 |
|
| 1361 |
+
# ============================================
|
| 1362 |
+
# LAUNCH
|
| 1363 |
+
# ============================================
|
| 1364 |
+
|
| 1365 |
if __name__ == "__main__":
|
| 1366 |
demo.launch()
|