Spaces:

softwarebusters
/

qiuhuaTTSv2-api

Runtime error

App Files Files Community

neboximate committed on Nov 14, 2025

Commit

eb7b307

verified ·

1 Parent(s): 7ada6ba

Create app.py

Browse files

Files changed (1) hide show

app.py +140 -0

app.py ADDED Viewed

	@@ -0,0 +1,140 @@

+import base64
+import io
+import os
+from typing import Optional
+import numpy as np
+import soundfile as sf
+import torch
+from fastapi import FastAPI
+from pydantic import BaseModel
+from huggingface_hub import hf_hub_download
+from safetensors.torch import load_file
+from TTS.tts.configs.xtts_config import XttsConfig
+from TTS.tts.models.xtts import Xtts, XttsArgs, XttsAudioConfig
+# Torch >= 2.6 safety: allow-list the XTTS config classes for torch's
+# restricted unpickler. Older torch builds do not export add_safe_globals, so
+# only that import failure is tolerated; any other error is left to surface
+# instead of being silently swallowed.
+try:
+    from torch.serialization import add_safe_globals
+except ImportError:
+    # Nothing to register on torch versions without the allow-list API.
+    pass
+else:
+    add_safe_globals([XttsConfig, XttsArgs, XttsAudioConfig])
+# ---------- CONFIG ----------
+# Repository the startup code passes to hf_hub_download().
+REPO_ID = "softwarebusters/qiuhuaTTSv2"  # HF model repo id
+# Fine-tuned weights; read with safetensors' load_file() and applied to the
+# model with load_state_dict(strict=False).
+CHECKPOINT_FILE = "checkpoint_7000_infer_fp16.safetensors"
+# JSON file loaded into XttsConfig via load_json().
+CONFIG_FILE = "config.json"
+# Path is used as given (relative to the working directory); startup raises
+# FileNotFoundError if it does not exist.
+SPEAKER_REFERENCE = "speaker_ref.wav"  # short wav you will upload
+# Sample rate passed to sf.write() and reported in TtsResponse.sample_rate.
+# NOTE(review): presumably matches the model's output rate — confirm against config.
+SR_OUT = 24000
+def pick_device() -> str:
+    """Return the preferred torch device name.
+
+    Preference order is "cuda", then "mps", then "cpu".
+
+    Returns:
+        One of the strings "cuda", "mps" or "cpu".
+    """
+    if torch.cuda.is_available():
+        return "cuda"
+    # torch.backends.mps is absent on older torch builds; treat a missing
+    # backend as "not available" rather than raising AttributeError.
+    mps_backend = getattr(torch.backends, "mps", None)
+    if mps_backend is not None and mps_backend.is_available():
+        return "mps"
+    return "cpu"
+device = pick_device()
+print(f"🚀 Using device: {device}")
+# ---------- LOAD MODEL AT STARTUP ----------
+print("📥 Downloading model files from Hugging Face…")
+ckpt_path = hf_hub_download(REPO_ID, CHECKPOINT_FILE)
+cfg_path = hf_hub_download(REPO_ID, CONFIG_FILE)
+# hf_hub_download() fetches one file per call, so the local snapshot folder
+# only holds files that were explicitly requested. load_checkpoint() below is
+# pointed at that folder for the base weights and the tokenizer vocab, so they
+# must be downloaded too; otherwise startup fails on the missing files.
+# NOTE(review): assumes REPO_ID hosts model.pth and vocab.json — confirm.
+# dvae.pth / mel_stats.json are not requested here; add them if
+# load_checkpoint() turns out to need them.
+base_model_path = hf_hub_download(REPO_ID, "model.pth")
+vocab_path = hf_hub_download(REPO_ID, "vocab.json")
+print("📄 Loading XTTS config…")
+config = XttsConfig()
+config.load_json(cfg_path)
+print("🧠 Initializing XTTS model…")
+model = Xtts.init_from_config(config)
+# Directory that actually contains the downloaded base checkpoint.
+base_dir = os.path.dirname(base_model_path)
+print("📦 Loading base XTTS weights…")
+model.load_checkpoint(
+    config=config,
+    checkpoint_dir=base_dir,
+    vocab_path=vocab_path,
+    use_deepspeed=False,
+)
+print(f"📦 Applying fine-tuned checkpoint: {ckpt_path}")
+state_dict = load_file(ckpt_path)
+# strict=False: keys absent from the fine-tuned file keep the base weights;
+# the counts are printed so a badly mismatched checkpoint is visible in logs.
+missing, unexpected = model.load_state_dict(state_dict, strict=False)
+print("   missing keys:", len(missing), "| unexpected:", len(unexpected))
+model.to(device)
+model.eval()
+print("✅ Model ready.")
+# ---------- SPEAKER LATENTS ----------
+# The conditioning latents for the reference clip are computed once at startup
+# and reused by every /tts request; a missing clip aborts startup.
+if os.path.exists(SPEAKER_REFERENCE):
+    print("🎙️ Computing speaker latents…")
+    with torch.inference_mode():
+        gpt_cond_latent, speaker_embedding = model.get_conditioning_latents(
+            audio_path=[SPEAKER_REFERENCE],
+        )
+    print("✅ Speaker latents ready.")
+else:
+    raise FileNotFoundError(
+        f"Speaker reference file not found: {SPEAKER_REFERENCE}. "
+        "Upload a short WAV file named 'speaker_ref.wav' to the Space."
+    )
+# ---------- FASTAPI APP ----------
+# Application object that the /health and /tts route decorators register on.
+app = FastAPI(title="XTTS v2 TTS API (Space)")
+class TtsRequest(BaseModel):
+    # Request body for POST /tts. Documented with comments rather than a
+    # docstring so the generated API schema description stays unchanged.
+    # Text to synthesize; whitespace-only text gets an empty audio response.
+    text: str
+    # Passed unchanged to model.inference(language=...).
+    language: str = "en"
+    # Passed unchanged to model.inference(temperature=...).
+    temperature: float = 0.7
+    # Passed unchanged to model.inference(speed=...).
+    speed: float = 1.0
+class TtsResponse(BaseModel):
+    # Response body for POST /tts.
+    # Base64 text of a complete WAV file; empty string when the request text
+    # was blank.
+    audio_base64: str
+    # Sample rate the WAV was written with (always SR_OUT).
+    sample_rate: int
+@app.get("/health")
+def health():
+    # Liveness probe: always answers with a fixed payload and never touches
+    # the model.
+    return dict(status="ok")
+@app.post("/tts", response_model=TtsResponse)
+def tts(req: TtsRequest):
+    # Synthesize req.text with the speaker latents computed at startup and
+    # return the audio as a base64-encoded WAV together with its sample rate.
+    # (Comments instead of a docstring keep the generated API docs unchanged.)
+    if req.text.strip():
+        synthesis_kwargs = dict(
+            text=req.text,
+            language=req.language,
+            gpt_cond_latent=gpt_cond_latent,
+            speaker_embedding=speaker_embedding,
+            temperature=req.temperature,
+            speed=req.speed,
+            enable_text_splitting=True,
+        )
+        with torch.inference_mode():
+            result = model.inference(**synthesis_kwargs)
+        samples = np.asarray(result["wav"], dtype=np.float32)
+        # Encode to WAV entirely in memory; nothing is written to disk.
+        wav_file = io.BytesIO()
+        sf.write(wav_file, samples, SR_OUT, format="WAV")
+        encoded = base64.b64encode(wav_file.getvalue()).decode("utf-8")
+    else:
+        # Blank or whitespace-only text: skip the model, return empty audio.
+        encoded = ""
+    return TtsResponse(audio_base64=encoded, sample_rate=SR_OUT)