File size: 23,990 Bytes
7e624a7
 
 
 
 
66037ad
 
 
 
7e624a7
66037ad
 
 
 
 
7e624a7
 
 
 
66037ad
 
 
7e624a7
66037ad
 
 
 
7e624a7
66037ad
 
 
7e624a7
 
66037ad
9299a15
 
 
 
 
 
 
7e624a7
 
 
 
 
 
 
66037ad
 
7e624a7
fbb6865
7e624a7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9299a15
7e624a7
 
 
 
 
 
 
 
 
 
 
 
 
0b0f2b2
 
 
f705686
98c5286
0b0f2b2
 
 
 
66037ad
7e624a7
66037ad
7e624a7
66037ad
7e624a7
9299a15
 
0b0f2b2
 
 
7e624a7
9299a15
 
 
 
 
 
 
 
 
7e624a7
9299a15
 
 
66037ad
 
 
7e624a7
 
 
 
 
 
 
 
 
 
9299a15
7e624a7
 
66037ad
 
 
7e624a7
 
66037ad
 
 
7e624a7
 
66037ad
 
7e624a7
 
 
 
 
 
66037ad
7e624a7
 
 
66037ad
 
7e624a7
 
 
 
66037ad
 
7e624a7
 
66037ad
7e624a7
66037ad
0b0f2b2
7e624a7
 
66037ad
7e624a7
66037ad
7e624a7
 
 
 
 
 
 
 
 
 
 
 
66037ad
7e624a7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9299a15
7e624a7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
66037ad
 
9299a15
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
"""VOX ANI TTS — FastAPI + HTML UI
===================================
- HTML UI served from static/index.html
- All voices & synthesis logic preserved
- REST endpoints for Vox Player app
"""

import os
import sys
import json
import time
import torch
import numpy as np
import soundfile as sf
import tempfile

from fastapi import FastAPI, Query, HTTPException, UploadFile, File as FastFile, BackgroundTasks
from fastapi.responses import FileResponse, HTMLResponse
from fastapi.staticfiles import StaticFiles

sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))

from config import CODEC_SAMPLE_RATE, CODEC_FRAME_RATE
from tokenizer import TTSTokenizer
from codec import CodecV6
from model import load_for_inference
from inference import generate, _split_text
from audio_enhance import enhance_voice_for_cloning

# ── Config ────────────────────────────────────────────────────
# Checkpoint path handed to load_for_inference() by load_model().
CHECKPOINT_PATH = "checkpoint_inference.pt"
# JSON file that save_voices_to_repo() writes and load_voices() reads.
VOICES_FILE     = "voices.json"
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
DEVICE          = "cuda" if torch.cuda.is_available() else "cpu"

# ── Embedded Voices (Permanent Presets) ───────────────────────
STATIC_VOICES = {
    "NOVA": [1.1905542612075806, 0.911335289478302, 0.017048384994268417, 0.6219748854637146, -3.8700151443481445, 0.5901893377304077, 0.2003730833530426, 0.07304413616657257, 0.3560754358768463, -4.402383327484131, 0.13412430882453918, 0.7333290576934814, 0.6954804062843323, 0.03965197131037712, 0.4772234857082367, -2.9969065189361572, 0.14260149002075195, 0.6045278906822205, 0.43753159046173096, 0.27066364884376526, 0.05965322256088257, -7.528304576873779, 0.061316393315792084, 0.37170031666755676, 0.0899418294429779, -3.191102981567383, -0.10583972930908203, -0.34356924891471863, 0.6052097678184509, 0.8864829540252686, -0.12419029325246811, 0.18624518811702728, 0.5465328693389893, 0.10085536539554596, 0.361403226852417, 0.28294241428375244, 0.11407288908958435, 0.4020424485206604, 0.318211168050766, 0.18416491150856018, 1.2316043376922607, 0.05566386878490448, -3.0626754760742188, 0.39995479583740234, 0.1184023767709732, 0.5414358973503113, 0.24752962589263916, 0.3401140570640564, 0.03436635807156563, 0.06832876801490784, 0.005995089188218117, 0.9363076686859131, 0.05009560286998749, 0.10749686509370804, -3.1572816371917725, 0.014406569302082062, 0.033463407307863235, 0.8389100432395935, 0.38054540753364563, 0.12472259253263474, -0.13591259717941284, 0.06685292720794678, 0.20993970334529877, 0.05220950022339821, 0.285030335187912, 0.23420803248882294, 0.001779097132384777, -2.928344249725342, 0.420032799243927, 0.5976344347000122, 1.2419675588607788, -0.013005070388317108, -2.794372797012329, 0.6308440566062927, 0.37192124128341675, 0.26056531071662903, 0.8862340450286865, -0.010409781709313393, 0.19720959663391113, -3.4644970893859863, 0.5564914345741272, 0.30465129017829895, -2.8717682361602783, 0.6245219111442566, 0.1030757948756218, 0.05254669114947319, 0.6154380440711975, 0.3203871548175812, 0.5704132318496704, -0.001082802191376686, 0.11111843585968018, -2.4022271633148193, 0.05973700061440468, 0.32718172669410706, 0.46028679609298706, 0.6836906671524048, 
0.49810439348220825, 0.26284804940223694, 0.5748746991157532, 0.40610945224761963, 0.8076421618461609, 0.31935280561447144, 0.03156723827123642, 1.0723943710327148, 0.5207588076591492, 1.5836009979248047, 0.21744099259376526, 0.2677614390850067, 0.48335105180740356, 0.17183977365493774, -2.487086296081543, 0.22324232757091522, 0.1885831356048584, 0.4070374667644501, 0.006237425841391087, -3.7607340812683105, -0.1341061145067215, 0.3640291094779968, 0.3908931016921997, 0.4327312111854553, 0.3751571774482727, -0.14889493584632874, 0.4219122529029846, 0.5423245429992676, 0.18098433315753937, 0.041179634630680084, 0.09048353135585785, 0.1900213211774826],
    "NOVA2": [1.1983299255371094, 0.7553510069847107, -0.11643315851688385, 0.6848059892654419, -3.4123072624206543, 0.3823966383934021, 0.020973416045308113, -0.041541289538145065, 0.1298651099205017, -4.320456504821777, 0.1328410804271698, 0.7798321843147278, 0.9192888140678406, -0.011441987007856369, 0.5021658539772034, -3.01277232170105, 0.15069840848445892, 0.5135632753372192, 0.5072751641273499, 0.10088983178138733, 0.07536688446998596, -7.504648208618164, 0.1982572376728058, 0.2028168886899948, 0.1208561509847641, -3.351240873336792, 0.10814803093671799, -0.2574847936630249, 0.5949290990829468, 0.8897058963775635, -0.011263539083302021, 0.023030906915664673, 0.5989617705345154, 0.25227615237236023, 0.3036550283432007, 0.097237728536129, 0.3288447856903076, 0.4038790166378021, 0.28024664521217346, 0.1414487659931183, 1.276529312133789, 0.09527754038572311, -3.2896828651428223, 0.4307906925678253, 0.1465688943862915, 0.6483601331710815, 0.45327043533325195, 0.535084068775177, 0.004426241852343082, -0.023835983127355576, -0.09964805841445923, 0.9329249858856201, 0.03744696453213692, 0.018313033506274223, -3.1105291843414307, 0.03548780828714371, 0.13072998821735382, 1.0241966247558594, 0.42775759100914, 0.2272561490535736, -0.18610148131847382, 0.10477077960968018, 0.1976785957813263, 0.016407163813710213, 0.31298208236694336, 0.4097185432910919, 0.07735035568475723, -3.1821649074554443, 0.2845577895641327, 0.39520949125289917, 1.1905566453933716, 0.19482173025608063, -2.7022228240966797, 0.7844187021255493, 0.3867405951023102, 0.22514104843139648, 1.0072884559631348, 0.10878886282444, 0.15838348865509033, -3.617748498916626, 0.26376873254776, 0.3570598363876343, -2.396841049194336, 0.6372708082199097, 0.01997438631951809, 0.07147836685180664, 0.46764785051345825, 0.2363276183605194, 0.5287986993789673, 0.16327831149101257, 0.11173143982887268, -2.901160478591919, -0.0006287320284172893, 0.21265800297260284, 0.4581712782382965, 0.5663840770721436, 
0.46456241607666016, 0.3096385598182678, 0.5768164396286011, 0.5899262428283691, 0.9144637584686279, 0.1793370097875595, 0.09171684086322784, 0.9268653392791748, 0.6438857316970825, 1.475677728652954, 0.1277070939540863, 0.13146352767944336, 0.9435262680053711, 0.3426448702812195, -2.267172336578369, 0.06779059767723083, 0.162134051322937, 0.286209374666214, -0.05769478157162666, -3.8586134910583496, -0.05524313449859619, 0.34964698553085327, 0.39856162667274475, 0.4654121696949005, 0.3936040997505188, 0.027396317571401596, 0.39761143922805786, 0.4053165316581726, 0.08136938512325287, -0.011603720486164093, 0.027974925935268402, 0.17831583321094513],
    "YANY": [0.7595553994178772, 0.7045170068740845, 0.14025861024856567, 0.5667456984519958, -3.617363452911377, 0.31423935294151306, 0.19483143091201782, -0.021618135273456573, 0.47987812757492065, -4.3643341064453125, 0.1844087541103363, 0.7400225400924683, 0.6076151728630066, 0.17821498215198517, 0.6499994993209839, -3.3450357913970947, 0.33548033237457275, 0.48264598846435547, 0.6536094546318054, 0.0376361720263958, 0.09048639237880707, -7.516693592071533, 0.08222998678684235, 0.2344668209552765, 0.11646643280982971, -3.2252886295318604, 0.11130928248167038, -0.14717638492584229, 0.3747222423553467, 0.7822909355163574, 0.019589057192206383, 0.24496370553970337, 1.0580699443817139, 0.5673164129257202, 0.24417510628700256, 0.29432353377342224, 0.18497471511363983, 0.5119978785514832, 0.4962784945964813, 0.204768568277359, 1.2384358644485474, -0.062021948397159576, -3.1774840354919434, 0.4962097108364105, -0.13075096905231476, 0.2981692850589752, 0.4086250364780426, 0.3752974569797516, 0.07090616226196289, 0.14261071383953094, -0.14197185635566711, 0.8166291117668152, -0.0609249472618103, 0.18801508843898773, -3.2127737998962402, 0.43553850054740906, -0.07682569324970245, 0.7805266976356506, 0.34974756836891174, 0.33446505665779114, -0.19968514144420624, 0.18937693536281586, 0.4269423186779022, -0.045752011239528656, -0.019833002239465714, 0.260649174451828, 0.006719403900206089, -3.4137356281280518, 0.47937801480293274, 0.6114392876625061, 1.1895595788955688, 0.29007431864738464, -2.403169870376587, 0.44408389925956726, 0.43230104446411133, 0.2233371138572693, 0.8427040576934814, 0.0887276902794838, 0.11937491595745087, -3.386258363723755, 0.6230071187019348, 0.2838999032974243, -3.1078875064849854, 0.2723325490951538, 0.20863571763038635, 0.09951550513505936, 0.5134825110435486, 0.026908542960882187, 0.5447674989700317, 0.18483781814575195, -0.028836730867624283, -2.662815570831299, 0.23732498288154602, 0.3241783678531647, 0.6850618124008179, 
0.7286363840103149, 0.3241086006164551, 0.34012338519096375, 0.6306040287017822, 0.5372657179832458, 0.6698591709136963, 0.3421519100666046, 0.11022952944040298, 0.8070170283317566, 0.6347618699073792, 1.2677627801895142, 0.023278236389160156, 0.15844547748565674, 0.7308670282363892, 0.08875919133424759, -2.8425047397613525, 0.026972733438014984, 0.2932690978050232, 0.1280515342950821, 0.4489481449127197, -3.5902676582336426, -0.06417408585548401, 0.19549356400966644, 0.3790775239467621, 0.3419957160949707, 0.23203779757022858, 0.03513122349977493, 0.527247428894043, 0.5583801865577698, 0.22111022472381592, 0.09699676930904388, 0.17534780502319336, 0.1823458969593048],
    "ANITA": [0.5489174276590347, 0.8563072681427002, 0.015058575198054314, 0.5856767892837524, -3.474443793296814, 0.5685910433530807, 0.05540411360561848, -0.166514509357512, 0.32931193709373474, -4.220456838607788, 0.17830145359039307, 0.7940778732299805, 0.41199035942554474, 0.07260656729340553, 0.7391091883182526, -2.992477297782898, 0.33138880133628845, 0.7154046595096588, 0.6319634020328522, 0.11274447292089462, 0.13320110738277435, -7.617172002792358, 0.24857618659734726, 0.26255226135253906, 0.08399171382188797, -2.8611263036727905, 0.13354498147964478, -0.002969544380903244, 0.3499854579567909, 0.5311120748519897, -0.025399386882781982, 0.2828158661723137, 0.5750554352998734, 0.4820759743452072, 0.4567323178052902, 0.4035782665014267, 0.3425174504518509, 0.306240051984787, 0.5308757424354553, 0.3264385610818863, 1.0148829519748688, -0.07871465012431145, -3.2808687686920166, 0.5336374640464783, -0.065285908523947, 0.08356216922402382, 0.36565399169921875, 0.3154626786708832, 0.156748715788126, 0.36649923026561737, -0.22774440050125122, 0.6688017547130585, -0.050320989452302456, 0.17112083733081818, -3.0628098249435425, 0.23470847308635712, 0.21637441217899323, 0.8258635103702545, 0.5496575832366943, 0.3798123002052307, -0.18623936921358109, 0.17447946220636368, 0.4036127179861069, 0.15702290832996368, 0.31793907284736633, 0.33534564077854156, -0.0962473526597023, -3.4386789798736572, 0.3713282197713852, 0.6002452671527863, 1.0634905099868774, 0.15481910854578018, -2.9156216382980347, 0.5021517276763916, 0.5440895110368729, 0.4653082937002182, 0.6940016746520996, 0.14119910448789597, 0.4195473939180374, -3.6648422479629517, 0.6860649287700653, 0.2642555832862854, -3.0756865739822388, 0.33001116663217545, 0.1546030193567276, 0.11629177257418633, 0.6103253066539764, 0.02144426666200161, 0.42899811267852783, -0.006054788827896118, 0.22657296806573868, -2.8145543336868286, 0.15966206416487694, 0.47316767275333405, 0.6700464189052582, 1.0120139420032501, 
0.34442101418972015, 0.04423576220870018, 0.9130581915378571, 0.3285454958677292, 0.6877541542053223, 0.061741845682263374, 0.10550222545862198, 0.7509118616580963, 0.6574697494506836, 0.8685739040374756, 0.14616264775395393, 0.2814873680472374, 0.7580173015594482, 0.028720788657665253, -3.7125461101531982, 0.09411222487688065, 0.19545741379261017, 0.3242332637310028, 0.20917727798223495, -3.281902551651001, 0.07898347079753876, 0.3505653291940689, 0.5302634239196777, 0.24469570070505142, 0.3834524601697922, -0.12796197086572647, 0.4154924005270004, 0.43273375928401947, 0.35387393832206726, 0.15660029649734497, -0.021274873986840248, 0.23377800732851028]
}

def decode_key(encoded: str) -> str:
    """Return the text hidden in a reversed base64 string.

    The input is reversed, base64-decoded and then decoded as text.
    Any failure along the way yields an empty string instead of an
    exception.
    """
    import base64

    try:
        flipped = encoded[::-1]
        raw_bytes = base64.b64decode(flipped)
        return raw_bytes.decode()
    except Exception:
        # Best-effort: callers treat "" as "no usable key".
        return ""


# ── API Key ───────────────────────────────────────────────────
# NOTE(review): the fallback key below is only obfuscated (reversed base64),
# not encrypted; anyone with the source can recover it. Prefer supplying the
# key through the VOX_API_KEY environment variable.
ENCODED_API_KEY = "0IDMy81czV2YjF2XlRXY2lmcw9VauF2X49md"

# Precedence: non-empty VOX_API_KEY env var, then the embedded key, then None
# (None disables the check in require_key).
API_KEY = os.environ.get("VOX_API_KEY") or (
    decode_key(ENCODED_API_KEY) if ENCODED_API_KEY else None
)


# ── HuggingFace Hub persistence ───────────────────────────────
# Both values come from the environment and default to "". When either one is
# empty, save_voices_to_repo() and load_voices() skip the Hub and use only the
# local VOICES_FILE.
HF_TOKEN = os.environ.get("HF_TOKEN", "")
SPACE_ID  = os.environ.get("SPACE_ID", "")


def save_voices_to_repo(voices_data: dict):
    """Persist the cloned voices locally and, if configured, on the Hub.

    The mapping is always written to VOICES_FILE as UTF-8 JSON. When both
    HF_TOKEN and SPACE_ID are set, the file is additionally uploaded to the
    Space repository; upload failures are reported on stdout and otherwise
    ignored.
    """
    with open(VOICES_FILE, "w", encoding="utf-8") as out:
        json.dump(voices_data, out, ensure_ascii=False, indent=2)

    hub_configured = bool(HF_TOKEN) and bool(SPACE_ID)
    if hub_configured:
        try:
            from huggingface_hub import HfApi

            hub = HfApi(token=HF_TOKEN)
            hub.upload_file(
                path_or_fileobj=VOICES_FILE,
                path_in_repo=VOICES_FILE,
                repo_id=SPACE_ID,
                repo_type="space",
                commit_message="Update voices.json",
            )
        except Exception as err:
            # The local copy is already written; a failed upload is non-fatal.
            print(f"Warning: could not save to repo: {err}")


def load_voices() -> dict:
    """Return the cloned voices stored in VOICES_FILE, or {} if unavailable.

    When HF_TOKEN and SPACE_ID are both set, the file is first downloaded
    from the Space repository into the current directory (failures are only
    printed). The local file is then parsed as JSON; a missing or unreadable
    file results in an empty dict.
    """
    if HF_TOKEN and SPACE_ID:
        try:
            from huggingface_hub import hf_hub_download

            hf_hub_download(
                repo_id=SPACE_ID,
                repo_type="space",
                filename=VOICES_FILE,
                local_dir=".",
                token=HF_TOKEN,
            )
        except Exception as err:
            print(f"Could not pull {VOICES_FILE} from repo: {err}")

    if not os.path.exists(VOICES_FILE):
        return {}

    try:
        with open(VOICES_FILE, "r", encoding="utf-8") as handle:
            voices = json.load(handle)
        print(f"  Loaded {len(voices)} cloned voices from JSON")
        return voices
    except Exception as err:
        print(f"  Error reading {VOICES_FILE}: {err}")
        return {}


# ── Global state ──────────────────────────────────────────────
# Everything below is filled in by load_model(); until it has run, the
# objects are None and the dictionaries are empty.
MODEL               = None
TOKENIZER           = None
CODEC               = None
# Embedding used when no (or an unknown) voice is requested.
DEFAULT_SPEAKER_EMB = None
# Preset voice id -> embedding tensor on DEVICE.
VOICE_EMBEDDINGS    = {}
# Cloned voice id -> {"name": ..., "embedding": [...]} as stored in VOICES_FILE.
CLONED_VOICES       = {}

# Preset voice id -> reference WAV file; load_model() encodes each file that
# exists on disk and skips the others.
VOICE_WAV_MAP = {
    "ani-bg-female": "sample_female_bg1.wav",
    "ani-bg-male":   "sample_male2_bg1.wav",
    "ani-en-female": "sample_female_en1.wav",
    "ani-en-male":   "sample_male2_en1.wav",
}


def load_model():
    """Load the model, tokenizer, codec and every available voice.

    Populates the module-level MODEL, TOKENIZER, CODEC, VOICE_EMBEDDINGS,
    CLONED_VOICES and DEFAULT_SPEAKER_EMB. The default speaker is the
    "static-nova" preset, falling back to "ani-bg-female" when that preset
    is missing.
    """
    global MODEL, TOKENIZER, CODEC, DEFAULT_SPEAKER_EMB, VOICE_EMBEDDINGS, CLONED_VOICES
    print(f"Loading model on {DEVICE}...")
    MODEL     = load_for_inference(CHECKPOINT_PATH, device=DEVICE)
    TOKENIZER = TTSTokenizer()
    CODEC     = CodecV6(device=DEVICE)

    # 1. Load the built-in WAV presets (only those whose file exists).
    # NOTE(review): the upload endpoints .squeeze() the codec embedding before
    # storing it, while it is kept as returned here — confirm both shapes are
    # accepted downstream.
    for voice_id, wav_file in VOICE_WAV_MAP.items():
        if os.path.exists(wav_file):
            result = CODEC.encode(wav_file)
            VOICE_EMBEDDINGS[voice_id] = result["global_embedding"].to(DEVICE)
            print(f"  Loaded WAV preset: {voice_id}")

    # 2. Load the embedded static voices (the new ones).
    for v_name, emb_list in STATIC_VOICES.items():
        v_id = f"static-{v_name.lower()}"
        VOICE_EMBEDDINGS[v_id] = torch.tensor(emb_list, dtype=torch.float32).to(DEVICE)
        print(f"  Loaded static preset: {v_id}")

    # 3. Load the dynamically cloned voices from JSON.
    CLONED_VOICES = load_voices()

    # NOVA is the new default voice. The fallback must be an explicit
    # `is None` test: `tensor or other` asks for the truth value of a
    # multi-element tensor, which raises RuntimeError.
    DEFAULT_SPEAKER_EMB = VOICE_EMBEDDINGS.get("static-nova")
    if DEFAULT_SPEAKER_EMB is None:
        DEFAULT_SPEAKER_EMB = VOICE_EMBEDDINGS.get("ani-bg-female")
    print("Model ready!")


def get_speaker_emb(voice_id: str):
    """Resolve a voice id to a speaker embedding.

    Presets in VOICE_EMBEDDINGS win over cloned voices; a cloned voice's
    stored list is converted to a float32 tensor on DEVICE. Unknown ids
    yield DEFAULT_SPEAKER_EMB.
    """
    try:
        return VOICE_EMBEDDINGS[voice_id]
    except KeyError:
        pass

    if voice_id not in CLONED_VOICES:
        return DEFAULT_SPEAKER_EMB

    stored_values = CLONED_VOICES[voice_id]["embedding"]
    return torch.tensor(stored_values, dtype=torch.float32).to(DEVICE)


# Presumably the number of codec frames in 0.15 s (CODEC_FRAME_RATE * 0.15).
# NOTE(review): not referenced anywhere else in the visible part of this file;
# synthesize_text derives its pause from CODEC_SAMPLE_RATE instead — confirm
# whether this constant is still needed.
_SILENCE_FRAMES = int(CODEC_FRAME_RATE * 0.15)


def synthesize_text(text: str, speaker_emb=None) -> np.ndarray:
    """Synthesize `text` and return the waveform as a NumPy array.

    The text is split into chunks of at most 250 (tokenizer-defined) units,
    each chunk is generated and decoded separately, and the decoded pieces
    are joined with 0.15 s of silence between them. DEFAULT_SPEAKER_EMB is
    used when no embedding is given. If no chunk produces any codes, 1000
    zero samples are returned.
    """
    emb = DEFAULT_SPEAKER_EMB if speaker_emb is None else speaker_emb
    pieces = _split_text(text, TOKENIZER, max_len=250)

    segments = []
    for piece in pieces:
        codes = generate(
            MODEL, TOKENIZER, piece, emb,
            max_new_tokens=512, temperature=0.3,
            top_k=250, top_p=0.95, rep_penalty=1.3, device=DEVICE,
        )
        if codes is None or len(codes) == 0:
            continue
        segments.append(CODEC.decode(codes, emb).cpu().numpy())

    if not segments:
        return np.zeros(1000, dtype=np.float32)
    if len(pieces) <= 1:
        return np.concatenate(segments)

    # Multi-chunk text: a pause goes between segments, never after the last.
    pause = np.zeros(int(CODEC_SAMPLE_RATE * 0.15), dtype=np.float32)
    joined = []
    for position, segment in enumerate(segments):
        if position:
            joined.append(pause)
        joined.append(segment)
    return np.concatenate(joined)


# ── Auth helper ───────────────────────────────────────────────
def require_key(api_key: str):
    """Reject the request unless `api_key` matches the configured API_KEY.

    No check is performed when API_KEY is None.

    Raises:
        HTTPException: 403 when a key is configured and `api_key` differs.
    """
    import hmac

    if API_KEY is None:
        return
    # Compare in constant time so response timing does not leak how much of
    # the key matched. Bytes are used because compare_digest rejects
    # non-ASCII str arguments.
    matches = isinstance(api_key, str) and hmac.compare_digest(
        api_key.encode("utf-8", "surrogatepass"),
        API_KEY.encode("utf-8", "surrogatepass"),
    )
    if not matches:
        raise HTTPException(status_code=403, detail="Invalid API key")


# ── FastAPI app ───────────────────────────────────────────────
app = FastAPI(title="VOX ANI TTS")

# Serve the contents of the local "static" directory under the /static prefix.
app.mount("/static", StaticFiles(directory="static"), name="static")


@app.on_event("startup")
def startup():
    """Load the model when the application starts.

    A failure is reported but deliberately not re-raised, so the server
    still comes up (e.g. to serve the UI) without a working model.
    """
    try:
        load_model()
    except Exception as e:
        import traceback

        # The warning sign is written with escapes so the message cannot be
        # garbled by a wrongly decoded source file.
        print(f"\u26a0\ufe0f  Model not loaded: {e}")
        # Keep the traceback: the one-line message alone hides where loading failed.
        traceback.print_exc()


def remove_file(path: str):
    """Delete `path` if it exists; do nothing otherwise."""
    if not os.path.exists(path):
        return
    os.remove(path)


@app.get("/", response_class=HTMLResponse)
def serve_ui():
    """Return the contents of static/index.html as the root page."""
    with open("static/index.html", encoding="utf-8") as page:
        html = page.read()
    return html


@app.get("/voices")
def api_get_voices(api_key: str = Query(default="")):
    """List all voices: presets first, then cloned voices.

    Preset entries carry id, name (same as id) and type "preset"; cloned
    entries carry id, stored name, type "cloned" and the stored embedding.
    """
    require_key(api_key)

    voices = []
    for preset_id in VOICE_EMBEDDINGS:
        voices.append({"id": preset_id, "name": preset_id, "type": "preset"})
    for clone_id, info in CLONED_VOICES.items():
        voices.append({
            "id": clone_id,
            "name": info["name"],
            "type": "cloned",
            "embedding": info["embedding"],
        })
    return {"voices": voices}


@app.get("/synthesize")
def api_synthesize(
    text:       str = Query(...),
    api_key:    str = Query(default=""),
    voice:      str = Query(default="static-nova"),
    background_tasks: BackgroundTasks = BackgroundTasks(),
):
    """Synthesize `text` with the requested voice and return it as a WAV file.

    Parameters:
        text: text to speak.
        api_key: key checked by require_key.
        voice: preset or cloned voice id; unknown ids use the default voice.
        background_tasks: task queue used to delete the temporary WAV after
            the response has been sent. FastAPI supplies a per-request
            instance for parameters annotated with BackgroundTasks.

    Raises:
        HTTPException: 403 for a bad key, 503 when the model is not loaded.
    """
    require_key(api_key)
    # startup() tolerates a failed model load, so guard against that state
    # instead of failing later with an opaque 500.
    if MODEL is None or TOKENIZER is None or CODEC is None:
        raise HTTPException(status_code=503, detail="Model not loaded")

    speaker_emb = get_speaker_emb(voice)
    wav = synthesize_text(text, speaker_emb)

    # Only the unique path is needed: close the handle right away so it is
    # not leaked and so sf.write can reopen the file on every platform.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
        wav_path = tmp.name
    try:
        sf.write(wav_path, wav, CODEC_SAMPLE_RATE)
    except Exception:
        # No response will be sent, so the cleanup task would never run.
        remove_file(wav_path)
        raise
    background_tasks.add_task(remove_file, wav_path)
    return FileResponse(wav_path, media_type="audio/wav")


@app.get("/synthesize_with_embedding")
def api_synthesize_with_embedding(
    text:      str = Query(...),
    api_key:   str = Query(default=""),
    embedding: str = Query(...),
    background_tasks: BackgroundTasks = BackgroundTasks(),
):
    """Synthesize `text` with a caller-supplied speaker embedding.

    Parameters:
        text: text to speak.
        api_key: key checked by require_key.
        embedding: JSON array of numbers used as the speaker embedding.
        background_tasks: task queue used to delete the temporary WAV after
            the response has been sent. FastAPI supplies a per-request
            instance for parameters annotated with BackgroundTasks.

    Raises:
        HTTPException: 403 for a bad key, 400 for a malformed embedding,
            503 when the model is not loaded.
    """
    require_key(api_key)
    # startup() tolerates a failed model load, so guard against that state.
    if MODEL is None or TOKENIZER is None or CODEC is None:
        raise HTTPException(status_code=503, detail="Model not loaded")

    # The embedding is untrusted query input: report bad values as a client
    # error instead of letting them surface as a 500.
    try:
        emb_list = json.loads(embedding)
        if not isinstance(emb_list, list) or not emb_list:
            raise ValueError("embedding must be a non-empty JSON array")
        emb_tensor = torch.tensor(emb_list, dtype=torch.float32)
    except (ValueError, TypeError, RuntimeError) as err:
        raise HTTPException(
            status_code=400,
            detail="Invalid embedding: expected a JSON array of numbers",
        ) from err
    speaker_emb = emb_tensor.to(DEVICE)

    wav = synthesize_text(text, speaker_emb)

    # Only the unique path is needed: close the handle right away so it is
    # not leaked and so sf.write can reopen the file on every platform.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
        wav_path = tmp.name
    try:
        sf.write(wav_path, wav, CODEC_SAMPLE_RATE)
    except Exception:
        # No response will be sent, so the cleanup task would never run.
        remove_file(wav_path)
        raise
    background_tasks.add_task(remove_file, wav_path)
    return FileResponse(wav_path, media_type="audio/wav")


@app.post("/encode_voice")
async def api_encode_voice(
    api_key:          str   = Query(default=""),
    file:             UploadFile = FastFile(...),
    enhance:          bool  = Query(default=True),
    denoise_strength: float = Query(default=0.75),
    deess_db:         float = Query(default=6.0),
    warm_db:          float = Query(default=2.5),
):
    """Encode an uploaded audio file into a speaker embedding.

    The upload is written to a temporary file, down-mixed to mono by
    averaging channels, optionally passed through
    enhance_voice_for_cloning, and encoded with the codec. Nothing is
    stored; the embedding is returned as a list under "embedding".

    Raises:
        HTTPException: 403 for a bad key, 400 when the upload cannot be
            read as audio, 503 when the codec is not loaded.
    """
    require_key(api_key)
    # startup() tolerates a failed model load, so guard against that state.
    if CODEC is None:
        raise HTTPException(status_code=503, detail="Model not loaded")

    audio_bytes = await file.read()
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
        tmp.write(audio_bytes)
        tmp_path = tmp.name
    try:
        try:
            audio, sr = sf.read(tmp_path)
        except RuntimeError as err:
            # soundfile signals unreadable/unsupported data with a
            # RuntimeError (subclass); that is a client error, not a 500.
            raise HTTPException(
                status_code=400, detail="Could not read uploaded audio file"
            ) from err
        audio = audio.astype(np.float32)
        if audio.ndim > 1:
            # Multi-channel input: average the channels into one.
            audio = audio.mean(axis=1)
        if enhance:
            audio = enhance_voice_for_cloning(
                audio, sr,
                denoise_strength=denoise_strength,
                deess_reduction_db=deess_db,
                warm_boost_db=warm_db,
            )
            # The codec reads from disk, so write the enhanced audio back.
            sf.write(tmp_path, audio, sr)
        result = CODEC.encode(tmp_path)
    finally:
        if os.path.exists(tmp_path):
            os.remove(tmp_path)
    return {"embedding": result["global_embedding"].squeeze().cpu().tolist()}


@app.post("/clone_voice")
async def api_clone_voice(
    api_key:          str   = Query(default=""),
    name:             str   = Query(default=""),
    file:             UploadFile = FastFile(...),
    enhance:          bool  = Query(default=True),
    denoise_strength: float = Query(default=0.75),
    deess_db:         float = Query(default=6.0),
    warm_db:          float = Query(default=2.5),
):
    """Create a cloned voice from an uploaded audio file and persist it.

    The upload is processed exactly like /encode_voice; the resulting
    embedding is stored in CLONED_VOICES under a new "clone_<timestamp>" id
    and saved through save_voices_to_repo. Returns the new id and name.

    Raises:
        HTTPException: 403 for a bad key, 400 when 100 voices already exist
            or the upload cannot be read as audio, 503 when the codec is
            not loaded.
    """
    require_key(api_key)
    # startup() tolerates a failed model load, so guard against that state.
    if CODEC is None:
        raise HTTPException(status_code=503, detail="Model not loaded")
    if len(CLONED_VOICES) >= 100:
        raise HTTPException(status_code=400, detail="Max 100 cloned voices")

    audio_bytes = await file.read()
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
        tmp.write(audio_bytes)
        tmp_path = tmp.name
    try:
        try:
            audio, sr = sf.read(tmp_path)
        except RuntimeError as err:
            # soundfile signals unreadable/unsupported data with a
            # RuntimeError (subclass); that is a client error, not a 500.
            raise HTTPException(
                status_code=400, detail="Could not read uploaded audio file"
            ) from err
        audio = audio.astype(np.float32)
        if audio.ndim > 1:
            # Multi-channel input: average the channels into one.
            audio = audio.mean(axis=1)
        if enhance:
            audio = enhance_voice_for_cloning(
                audio, sr,
                denoise_strength=denoise_strength,
                deess_reduction_db=deess_db,
                warm_boost_db=warm_db,
            )
            # The codec reads from disk, so write the enhanced audio back.
            sf.write(tmp_path, audio, sr)
        result = CODEC.encode(tmp_path)
        embedding = result["global_embedding"].squeeze().cpu().tolist()
    finally:
        if os.path.exists(tmp_path):
            os.remove(tmp_path)

    # Other requests may have added voices while this one awaited the upload.
    if len(CLONED_VOICES) >= 100:
        raise HTTPException(status_code=400, detail="Max 100 cloned voices")

    # One timestamp for both the default name and the id so they always agree.
    stamp = int(time.time())
    voice_name = name.strip() or f"Cloned_{stamp}"
    voice_id = f"clone_{stamp}"
    # Second resolution is not unique: never overwrite an existing voice.
    attempt = 1
    while voice_id in CLONED_VOICES or voice_id in VOICE_EMBEDDINGS:
        attempt += 1
        voice_id = f"clone_{stamp}_{attempt}"

    CLONED_VOICES[voice_id] = {"name": voice_name, "embedding": embedding}
    save_voices_to_repo(CLONED_VOICES)
    return {"id": voice_id, "name": voice_name}


@app.delete("/voices/{voice_id}")
def api_delete_voice(voice_id: str, api_key: str = Query(default="")):
    """Delete a cloned voice and persist the change.

    Only entries in CLONED_VOICES can be deleted; any other id results in
    a 404. Returns the deleted id together with its stored name.
    """
    require_key(api_key)
    if voice_id in CLONED_VOICES:
        removed = CLONED_VOICES.pop(voice_id)
        removed_name = removed["name"]
        save_voices_to_repo(CLONED_VOICES)
        return {"deleted": voice_id, "name": removed_name}
    raise HTTPException(status_code=404, detail="Voice not found")


@app.get("/voices/{voice_id}/download")
def api_download_voice(voice_id: str, api_key: str = Query(default="")):
    """Return one voice (cloned or preset) as a downloadable JSON file.

    The payload is {voice_id: {"name": ..., "embedding": [...]}}. Cloned
    voices take precedence over presets with the same id. The temporary
    file backing the response is deleted after the response has been sent.

    Raises:
        HTTPException: 403 for a bad key, 404 for an unknown voice id.
    """
    import re

    require_key(api_key)
    if voice_id in CLONED_VOICES:
        v = CLONED_VOICES[voice_id]
        display_name = v["name"]
        data = {voice_id: {"name": v["name"], "embedding": v["embedding"]}}
    elif voice_id in VOICE_EMBEDDINGS:
        display_name = voice_id
        data = {voice_id: {"name": voice_id, "embedding": VOICE_EMBEDDINGS[voice_id].cpu().tolist()}}
    else:
        raise HTTPException(status_code=404, detail="Voice not found")

    # The name is user-supplied: keep only word characters, dots and dashes
    # (spaces still become "_") and cap the length for the download name.
    safe = re.sub(r"[^\w.-]", "_", str(display_name))[:64] or "voice"

    # The user-controlled name is kept out of the on-disk path entirely, so
    # path separators or ".." can never influence where the file is created.
    with tempfile.NamedTemporaryFile(
        suffix=".json", prefix="voice_",
        delete=False, mode="w", encoding="utf-8",
    ) as tmp:
        tmp_path = tmp.name
        try:
            json.dump(data, tmp, ensure_ascii=False, indent=2)
        except Exception:
            tmp.close()
            remove_file(tmp_path)
            raise

    # Delete the temporary file once the response has been streamed.
    cleanup = BackgroundTasks()
    cleanup.add_task(remove_file, tmp_path)
    return FileResponse(
        tmp_path,
        media_type="application/json",
        filename=f"voice_{safe}.json",
        background=cleanup,
    )


# Script entry point: serve `app` with uvicorn on all interfaces, port 7860.
if __name__ == "__main__":
    import uvicorn
    uvicorn.run(app, host="0.0.0.0", port=7860)