Spaces:

ssasio
/

VOX33

Sleeping

App Files Community

ssasio committed on Apr 15

Commit

0b0f2b2

verified ·

1 Parent(s): 4bb711e

Upload app.py

Browse files

Files changed (1) hide show

app.py +29 -7

app.py CHANGED Viewed

@@ -42,16 +42,35 @@ CODEC = None
 DEFAULT_SPEAKER_EMB = None
 def load_model():
-    global MODEL, TOKENIZER, CODEC, DEFAULT_SPEAKER_EMB
     print(f"Loading model on {DEVICE}...")
     MODEL = load_for_inference(CHECKPOINT_PATH, device=DEVICE)
     TOKENIZER = TTSTokenizer()
     CODEC = CodecV6(device=DEVICE)
-    # Load default speaker embedding from sample wav
-    result = CODEC.encode(SAMPLE_WAV_PATH)
-    DEFAULT_SPEAKER_EMB = result['global_embedding'].to(DEVICE)
     print("Model ready!")
@@ -89,8 +108,9 @@ app = FastAPI()
 @app.get("/synthesize")
 def api_synthesize(
-    text: str = Query(..., description="Text to synthesize"),
-    api_key: str = Query(..., description="API key"),
 ):
     if api_key != API_KEY:
         raise HTTPException(status_code=403, detail="Invalid API key")
@@ -99,8 +119,10 @@ def api_synthesize(
     if len(text) > 500:
         raise HTTPException(status_code=400, detail="Text too long (max 500 chars)")
     try:
-        wav = synthesize_text(text)
         tmp = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
         sf.write(tmp.name, wav, CODEC_SAMPLE_RATE)
         return FileResponse(tmp.name, media_type="audio/wav")

 DEFAULT_SPEAKER_EMB = None
+# Speaker embeddings per voice
+VOICE_EMBEDDINGS = {}
+VOICE_WAV_MAP = {
+    "ani-bg-female": "sample_female_bg1.wav",
+    "ani-bg-male":   "sample_male_bg1.wav",
+    "ani-bg-male2":  "sample_male2_bg1.wav",
+    "ani-en-female": "sample_female_en1.wav",
+    "ani-en-male":   "sample_male2_en1.wav",
+}
 def load_model():
+    global MODEL, TOKENIZER, CODEC, DEFAULT_SPEAKER_EMB, VOICE_EMBEDDINGS
     print(f"Loading model on {DEVICE}...")
     MODEL = load_for_inference(CHECKPOINT_PATH, device=DEVICE)
     TOKENIZER = TTSTokenizer()
     CODEC = CodecV6(device=DEVICE)
+    for voice_id, wav_file in VOICE_WAV_MAP.items():
+        if os.path.exists(wav_file):
+            result = CODEC.encode(wav_file)
+            VOICE_EMBEDDINGS[voice_id] = result['global_embedding'].to(DEVICE)
+            print(f"Loaded speaker: {voice_id}")
+    DEFAULT_SPEAKER_EMB = VOICE_EMBEDDINGS.get("ani-bg-female")
+    if DEFAULT_SPEAKER_EMB is None:
+        result = CODEC.encode(SAMPLE_WAV_PATH)
+        DEFAULT_SPEAKER_EMB = result['global_embedding'].to(DEVICE)
     print("Model ready!")
 @app.get("/synthesize")
 def api_synthesize(
+    text: str = Query(...),
+    api_key: str = Query(...),
+    voice: str = Query(default="ani-bg-female"),
 ):
     if api_key != API_KEY:
         raise HTTPException(status_code=403, detail="Invalid API key")
     if len(text) > 500:
         raise HTTPException(status_code=400, detail="Text too long (max 500 chars)")
+    speaker_emb = VOICE_EMBEDDINGS.get(voice, DEFAULT_SPEAKER_EMB)
     try:
+        wav = synthesize_text(text, speaker_emb)
         tmp = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
         sf.write(tmp.name, wav, CODEC_SAMPLE_RATE)
         return FileResponse(tmp.name, media_type="audio/wav")