beleata74 committed on
Commit
1e4ab0e
·
verified ·
1 Parent(s): 3767908

Upload server.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. server.py +222 -0
server.py ADDED
@@ -0,0 +1,222 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ BgTTS-38M Web Server — Gradio Interface
3
+ ========================================
4
+ Voice cloning TTS with Bulgarian + English support.
5
+ """
6
+
7
+ import sys
8
+ import os
9
+ import torch
10
+ import numpy as np
11
+ import tempfile
12
+ import time
13
+ import soundfile as sf
14
+
15
+ # Add parent dir to path for imports
16
+ sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
17
+
18
+ from config import (
19
+ AUDIO_OFFSET, NUM_AUDIO_TOKENS, END_OF_SPEECH_TOKEN_ID,
20
+ START_OF_SPEECH_TOKEN_ID, CODEC_SAMPLE_RATE, CODEC_FRAME_RATE,
21
+ )
22
+ from tokenizer import TTSTokenizer
23
+ from codec import CodecV6
24
+ from model import load_for_inference
25
+ from inference import generate, _split_text
26
+
27
# ── Global state ──────────────────────────────────────────────
# The three inference objects start out unset; load_model() fills them in.
MODEL = TOKENIZER = CODEC = None

# Use the GPU whenever torch reports CUDA as available, otherwise the CPU.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# Default checkpoint location: a file sitting next to this script.
CHECKPOINT_PATH = os.path.join(
    os.path.dirname(os.path.abspath(__file__)),
    "checkpoint_inference.pt",
)
33
+
34
+
35
def load_model():
    """Populate the module-level MODEL, TOKENIZER and CODEC.

    Reads CHECKPOINT_PATH and DEVICE at call time, so any override of those
    globals must happen before this function is invoked.
    """
    global MODEL, TOKENIZER, CODEC

    checkpoint, device = CHECKPOINT_PATH, DEVICE
    print(f"Loading model from {checkpoint} on {device}...")

    # Assign one at a time, in the same order as before, so a failure part-way
    # through leaves the earlier globals set exactly as it used to.
    MODEL = load_for_inference(checkpoint, device=device)
    TOKENIZER = TTSTokenizer()
    CODEC = CodecV6(device=device)

    print("Model loaded!")
43
+
44
+
45
def synthesize_speech(text, ref_audio, temperature, top_k, top_p, rep_penalty):
    """
    Generate speech from text using reference audio for voice cloning.

    Args:
        text: Text to synthesize. Empty or whitespace-only text is rejected.
        ref_audio: ``(sample_rate, samples)`` pair as produced by the
            ``type="numpy"`` audio input, or ``None`` when no clip was given.
        temperature: Sampling temperature, forwarded to ``generate``.
        top_k: Top-k cutoff, cast to ``int`` and forwarded to ``generate``.
        top_p: Nucleus cutoff, forwarded to ``generate``.
        rep_penalty: Repetition penalty, forwarded to ``generate``.

    Returns:
        Always a 2-tuple, one element per output component wired to the
        Generate button: ``((CODEC_SAMPLE_RATE, waveform), info)`` on success,
        or ``(None, message)`` when there is nothing to synthesize. Returning a
        bare ``None`` would not match the two declared outputs.
    """
    if not text or not text.strip():
        return None, "⚠️ Моля, въведете текст / Please enter some text."

    if ref_audio is None:
        return None, (
            "⚠️ Моля, качете или запишете референтен глас / "
            "Please upload or record a reference voice."
        )

    # Encode reference audio for speaker embedding
    sr_ref, audio_ref = ref_audio
    audio_ref = audio_ref.astype(np.float32)
    if audio_ref.size == 0:
        # .max()/.min() raise on a zero-size array, so bail out before them.
        return None, "⚠️ Референтният запис е празен / The reference audio is empty."

    # Samples outside [-1, 1] (e.g. integer PCM) are rescaled by their peak.
    if audio_ref.max() > 1.0 or audio_ref.min() < -1.0:
        audio_ref = audio_ref / max(abs(audio_ref.max()), abs(audio_ref.min()))

    waveform = torch.from_numpy(audio_ref)
    if waveform.dim() == 2:
        # Collapse dim 1 to get a 1-D signal — presumably the channel axis of
        # a (samples, channels) array; confirm against the audio component.
        waveform = waveform.mean(1)

    result = CODEC.encode_waveform(waveform, sr_ref)
    speaker_emb = result['global_embedding'].to(DEVICE)

    # Split text into chunks
    chunks = _split_text(text, TOKENIZER, max_len=250)

    t0 = time.time()
    all_codes = []
    for chunk in chunks:
        codes = generate(
            MODEL, TOKENIZER, chunk, speaker_emb,
            max_new_tokens=512,
            temperature=temperature,
            top_k=int(top_k),
            top_p=top_p,
            rep_penalty=rep_penalty,
            device=DEVICE,
        )
        # Skip chunks for which nothing was generated.
        if codes is not None and len(codes) > 0:
            all_codes.append(codes)

    gen_time = time.time() - t0

    if not all_codes:
        return None, "⚠️ Не беше генерирано аудио / No audio was generated."

    codes = torch.cat(all_codes)
    audio_dur = len(codes) / CODEC_FRAME_RATE
    # Real-time factor: generation seconds per second of produced audio.
    rtf = gen_time / audio_dur if audio_dur > 0 else float('inf')

    # Decode to waveform
    wav = CODEC.decode(codes, speaker_emb)
    # .numpy() only works on CPU tensors that do not require grad; detach and
    # move first so decoding on a CUDA device does not fail here.
    wav_np = wav.detach().cpu().numpy()

    info = f"✅ {len(codes)} tokens | {audio_dur:.1f}s audio | {gen_time:.1f}s gen | RTF: {rtf:.3f}"

    return (CODEC_SAMPLE_RATE, wav_np), info
104
+
105
+
106
def build_ui():
    """Assemble the Gradio Blocks app and return it (without launching it)."""
    import gradio as gr

    # Static configuration is prepared up front so the layout below stays readable.
    soft_theme = gr.themes.Soft(primary_hue="blue", secondary_hue="slate")
    page_css = """
.main-title { text-align: center; margin-bottom: 0.5em; }
.subtitle { text-align: center; color: #666; margin-bottom: 1.5em; }
"""
    sample_texts = [
        ["Българският език е изключително богат и мелодичен."],
        ["Artificial intelligence has reached a fascinating stage."],
        ["Когато говорим за истински multitasking, способността ми да превключвам плавно между български и English е от огромно значение."],
        ["Здравейте! Казвам се Ани и мога да говоря на български и английски."],
        ["The quick brown fox jumps over the lazy dog."],
    ]

    def _reset_fields():
        # One value per component listed in the clear button's outputs.
        return None, None, ""

    with gr.Blocks(
        title="BgTTS-38M — Bulgarian Text-to-Speech",
        theme=soft_theme,
        css=page_css,
    ) as app:
        gr.HTML('<h1 class="main-title">🎙️ BgTTS-38M</h1>')
        gr.HTML('<p class="subtitle">Bulgarian + English Text-to-Speech with Voice Cloning | 38M params | 153MB</p>')

        with gr.Row():
            # Left column: text, reference voice and action buttons.
            with gr.Column(scale=2):
                text_box = gr.Textbox(
                    label="Текст / Text",
                    placeholder="Въведете текст на български или английски...\nEnter text in Bulgarian or English...",
                    lines=5,
                    max_lines=15,
                )

                voice_sample = gr.Audio(
                    label="🎤 Reference Voice (за клониране на глас)",
                    type="numpy",
                    sources=["upload", "microphone"],
                )

                with gr.Row():
                    run_button = gr.Button("🔊 Генерирай / Generate", variant="primary", size="lg")
                    reset_button = gr.Button("🗑️ Изчисти", size="lg")

            # Right column: sampling settings and the generated result.
            with gr.Column(scale=1):
                with gr.Accordion("⚙️ Настройки / Settings", open=False):
                    temperature_slider = gr.Slider(
                        label="Temperature",
                        info="По-ниска = по-чисто, по-висока = по-разнообразно",
                        minimum=0.05,
                        maximum=1.5,
                        value=0.3,
                        step=0.05,
                    )
                    top_k_slider = gr.Slider(
                        label="Top-K",
                        minimum=1,
                        maximum=500,
                        value=250,
                        step=10,
                    )
                    top_p_slider = gr.Slider(
                        label="Top-P (Nucleus)",
                        minimum=0.1,
                        maximum=1.0,
                        value=0.95,
                        step=0.05,
                    )
                    rep_penalty_slider = gr.Slider(
                        label="Repetition Penalty",
                        minimum=1.0,
                        maximum=2.0,
                        value=1.1,
                        step=0.05,
                    )

                result_audio = gr.Audio(
                    label="🔊 Резултат / Output",
                    type="numpy",
                    interactive=False,
                )

                status_box = gr.Textbox(
                    label="ℹ️ Информация",
                    interactive=False,
                    lines=2,
                )

        # Clickable sample sentences that fill the text box.
        gr.Examples(
            examples=sample_texts,
            inputs=[text_box],
            label="📝 Примери / Examples",
        )

        # Event wiring.
        run_button.click(
            fn=synthesize_speech,
            inputs=[
                text_box,
                voice_sample,
                temperature_slider,
                top_k_slider,
                top_p_slider,
                rep_penalty_slider,
            ],
            outputs=[result_audio, status_box],
        )

        reset_button.click(
            fn=_reset_fields,
            outputs=[text_box, result_audio, status_box],
        )

    return app
201
+
202
+
203
if __name__ == "__main__":
    import argparse

    # Command-line options; defaults fall back to the module-level settings.
    parser = argparse.ArgumentParser()
    parser.add_argument("--checkpoint", default=CHECKPOINT_PATH)
    parser.add_argument("--host", default="0.0.0.0")
    parser.add_argument("--port", type=int, default=7860)
    parser.add_argument("--share", action="store_true")
    parser.add_argument("--device", default=DEVICE)
    args = parser.parse_args()

    # Rebind the module globals before loading, since load_model() reads them.
    CHECKPOINT_PATH, DEVICE = args.checkpoint, args.device

    load_model()
    app = build_ui()
    app.launch(server_name=args.host, server_port=args.port, share=args.share)