# Provenance: Hugging Face Space by SeaWolf-AI, commit b3eab56 ("Update app.py", verified).
# app.py — Darwin-TTS v2 (recipe withheld + voice cloning)
"""
Darwin-TTS-1.7B-Cross v2 — HuggingFace Space
- Original / Darwin toggle (training recipe not disclosed)
- Voice cloning: user uploads a reference audio clip -> speech is generated in that voice
"""
import os, io, torch, numpy as np, soundfile as sf, base64
from pathlib import Path
from contextlib import asynccontextmanager
from fastapi import FastAPI, HTTPException
from fastapi.responses import HTMLResponse, Response
from safetensors import safe_open
# Module-level cache for the Darwin fine-tuned weights; populated by lifespan()
# at startup and cleared on shutdown so /synthesize can patch them into the model.
state = {"darwin_weights": None}
@asynccontextmanager
async def lifespan(app: FastAPI):
    """Download and cache the Darwin weight tensors for the app's lifetime.

    On startup the safetensors checkpoint is fetched from the Hub and every
    tensor is materialized into the module-level ``state`` dict; on shutdown
    the cache is dropped so the memory can be reclaimed.
    """
    from huggingface_hub import hf_hub_download

    print("📦 Loading Darwin weights...")
    checkpoint = hf_hub_download("FINAL-Bench/Darwin-TTS-1.7B-Cross", "model.safetensors")
    with safe_open(checkpoint, framework="pt") as handle:
        tensors = {key: handle.get_tensor(key) for key in handle.keys()}
    state["darwin_weights"] = tensors
    print(f" ✅ {len(tensors)} tensors cached")
    yield
    # Shutdown: release the cached tensors.
    state["darwin_weights"] = None
# lifespan() pre-downloads the Darwin weights before the first request is served.
app = FastAPI(title="Darwin-TTS-1.7B-Cross", lifespan=lifespan)
@app.get("/", response_class=HTMLResponse)
async def index():
    """Serve the single-page UI from ``index.html`` in the working directory."""
    # Explicit UTF-8: the page is read on whatever locale the container ships
    # with, and the project's text (see the Korean strings in this file) is
    # not guaranteed to decode under the platform default encoding.
    with open("index.html", "r", encoding="utf-8") as f:
        return f.read()
@app.post("/synthesize")
async def synthesize(request: dict):
    """Generate speech for ``request["text"]`` and return it as a WAV response.

    Request keys (all optional):
    - ``text``: text to speak (defaults to a Korean greeting).
    - ``use_darwin``: when truthy and the Darwin weights are cached, patch
      them over the base model's parameters before generating.
    - ``ref_audio``: base64-encoded reference audio for voice cloning; when
      absent a synthetic sine-wave reference is used instead.

    Returns ``audio/wav`` bytes with ``X-Duration`` / ``X-Model`` headers.
    Raises HTTPException(500) on any failure; the model is always released
    and the CUDA cache emptied, on both success and error paths.
    """
    text = request.get("text", "안녕하세요, 저는 다윈입니다.")
    use_darwin = request.get("use_darwin", True)
    ref_audio_b64 = request.get("ref_audio", None)
    model = None
    try:
        # NOTE(review): the model is loaded per-request; caching it across
        # requests would be faster but changes the memory profile — left as-is.
        from qwen_tts import Qwen3TTSModel
        model = Qwen3TTSModel.from_pretrained(
            "Qwen/Qwen3-TTS-12Hz-1.7B-Base",
            device_map="cuda:0", dtype=torch.bfloat16
        )
        if use_darwin and state["darwin_weights"]:
            cnt = 0
            # Enter no_grad once for the whole patch loop instead of once per tensor.
            with torch.no_grad():
                for n, p in model.model.named_parameters():
                    if n in state["darwin_weights"]:
                        p.copy_(state["darwin_weights"][n].to(p.device, p.dtype))
                        cnt += 1
        # Voice cloning: decode base64 audio into a reference WAV file.
        ref_path = "/tmp/darwin_ref.wav"
        if ref_audio_b64:
            with open(ref_path, "wb") as f:
                f.write(base64.b64decode(ref_audio_b64))
        else:
            # Fallback reference: 3 s of a quiet 200 Hz sine at 24 kHz.
            t = np.linspace(0, 3, 72000)
            sf.write(ref_path, (0.1 * np.sin(2 * np.pi * 200 * t)).astype(np.float32), 24000)
        wavs, sr = model.generate_voice_clone(
            text=text, ref_audio=ref_path, ref_text="ref", x_vector_only_mode=True
        )
        # Generated audio may be a torch tensor or a plain sequence.
        wav = wavs[0].cpu().numpy() if hasattr(wavs[0], "cpu") else np.array(wavs[0])
        buf = io.BytesIO()
        sf.write(buf, wav, sr, format="WAV")
        buf.seek(0)
        return Response(
            content=buf.read(),
            media_type="audio/wav",
            headers={"X-Duration": f"{len(wav)/sr:.1f}", "X-Model": "Darwin" if use_darwin else "Original"},
        )
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))
    finally:
        # Single cleanup path for both success and failure (the original
        # duplicated this in-line and could skip it between generation and return).
        if model is not None:
            del model
        torch.cuda.empty_cache()
@app.get("/health")
async def health():
    """Liveness probe: reports CUDA availability and whether the Darwin weights are cached."""
    darwin_ready = state["darwin_weights"] is not None
    return {
        "status": "ok",
        "cuda": torch.cuda.is_available(),
        "darwin_loaded": darwin_ready,
    }
if __name__ == "__main__":
    import uvicorn
    # 0.0.0.0:7860 is the address/port Hugging Face Spaces expects a server to bind.
    uvicorn.run(app, host="0.0.0.0", port=7860)