# Hugging Face Spaces page header (scrape artifact): "Spaces: Running on T4"
# app.py — Darwin-TTS v2 (training recipe kept private + Voice Cloning)
"""
Darwin-TTS-1.7B-Cross v2 — HuggingFace Space
- Original / Darwin weight toggle (training recipe kept private)
- Voice Cloning: user uploads reference audio → synthesis in that voice
"""
| import os, io, torch, numpy as np, soundfile as sf, base64 | |
| from pathlib import Path | |
| from contextlib import asynccontextmanager | |
| from fastapi import FastAPI, HTTPException | |
| from fastapi.responses import HTMLResponse, Response | |
| from safetensors import safe_open | |
# Process-wide cache for the fine-tuned ("Darwin") weight tensors,
# populated once at startup by `lifespan` and cleared at shutdown.
state = {"darwin_weights": None}


# Bug fix: `asynccontextmanager` was imported but never applied. FastAPI's
# `lifespan=` parameter requires an async context manager, not a bare
# coroutine function — without this decorator the app fails at startup.
@asynccontextmanager
async def lifespan(app: FastAPI):
    """Download the Darwin safetensors once and cache every tensor in memory
    for the app's lifetime; drop the cache on shutdown.

    Yields control to FastAPI between startup and shutdown.
    """
    from huggingface_hub import hf_hub_download
    print("π¦ Loading Darwin weights...")
    path = hf_hub_download("FINAL-Bench/Darwin-TTS-1.7B-Cross", "model.safetensors")
    weights = {}
    # Read every tensor out of the safetensors file into (CPU) memory.
    with safe_open(path, framework="pt") as s:
        for k in s.keys():
            weights[k] = s.get_tensor(k)
    state["darwin_weights"] = weights
    print(f" β {len(weights)} tensors cached")
    yield
    # Shutdown: release the cached tensors.
    state["darwin_weights"] = None
# The ASGI application; `lifespan` pre-loads the Darwin weights at startup.
app = FastAPI(title="Darwin-TTS-1.7B-Cross", lifespan=lifespan)


# Bug fix: `index` was defined but never registered as a route, and
# `HTMLResponse` was imported yet unused — the decorator was evidently
# lost in extraction. Without `response_class=HTMLResponse`, FastAPI would
# JSON-encode the returned string instead of serving it as HTML.
@app.get("/", response_class=HTMLResponse)
async def index():
    """Serve the single-page UI from index.html (in the working directory)."""
    # Explicit utf-8: the page contains Korean text; the platform's default
    # locale encoding is not guaranteed to decode it.
    with open("index.html", "r", encoding="utf-8") as f:
        return f.read()
@app.post("/synthesize")  # NOTE(review): route decorator was lost in extraction — confirm the path
async def synthesize(request: dict):
    """Synthesize speech for `request["text"]` and return it as a WAV response.

    Request keys:
      text       — text to speak (default below looks mojibake-mangled
                   Korean — TODO confirm intended encoding)
      use_darwin — when truthy (default True), patch the freshly loaded base
                   model with the cached Darwin tensors
      ref_audio  — optional base64-encoded audio used as the cloning reference

    Returns:
      `Response` with `audio/wav` body plus `X-Duration` / `X-Model` headers.

    Raises:
      HTTPException(500) wrapping any failure.
    """
    text = request.get("text", "μλ νμΈμ, μ λ λ€μμ λλ€.")
    use_darwin = request.get("use_darwin", True)
    ref_audio_b64 = request.get("ref_audio", None)
    model = None
    try:
        from qwen_tts import Qwen3TTSModel
        # A fresh model per request keeps requests isolated (at the cost of
        # reload time on every call).
        model = Qwen3TTSModel.from_pretrained(
            "Qwen/Qwen3-TTS-12Hz-1.7B-Base",
            device_map="cuda:0", dtype=torch.bfloat16
        )
        if use_darwin and state["darwin_weights"]:
            # Overwrite matching base parameters with the cached Darwin
            # tensors. `no_grad()` hoisted out of the loop (invariant);
            # `.get()` avoids the double dict lookup of `in` + `[]`.
            patched = 0
            with torch.no_grad():
                for pname, param in model.model.named_parameters():
                    tensor = state["darwin_weights"].get(pname)
                    if tensor is not None:
                        param.copy_(tensor.to(param.device, param.dtype))
                        patched += 1
        # Voice-cloning reference: decode the uploaded base64 audio to a wav
        # file, or fall back to a synthetic 3 s / 200 Hz sine at 24 kHz.
        ref_path = "/tmp/darwin_ref.wav"
        if ref_audio_b64:
            with open(ref_path, "wb") as f:
                f.write(base64.b64decode(ref_audio_b64))
        else:
            sf.write(ref_path, (0.1 * np.sin(2 * np.pi * 200 * np.linspace(0, 3, 72000))).astype(np.float32), 24000)
        wavs, sr = model.generate_voice_clone(
            text=text, ref_audio=ref_path, ref_text="ref", x_vector_only_mode=True
        )
        wav = wavs[0].cpu().numpy() if hasattr(wavs[0], "cpu") else np.array(wavs[0])
        buf = io.BytesIO()
        sf.write(buf, wav, sr, format="WAV")
        buf.seek(0)
        return Response(
            content=buf.read(),
            media_type="audio/wav",
            headers={"X-Duration": f"{len(wav)/sr:.1f}", "X-Model": "Darwin" if use_darwin else "Original"},
        )
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))
    finally:
        # Bug fix: cleanup was duplicated on the success and error paths and
        # would be skipped if Response construction itself raised; `finally`
        # frees GPU memory on every path exactly once.
        if model is not None:
            del model
        torch.cuda.empty_cache()
@app.get("/health")  # NOTE(review): route decorator was lost in extraction — confirm the path
async def health():
    """Liveness probe: report CUDA availability and whether the Darwin
    weight cache has been populated by `lifespan`."""
    return {"status": "ok", "cuda": torch.cuda.is_available(), "darwin_loaded": state["darwin_weights"] is not None}
# Local/dev entry point; Hugging Face Spaces expects the app on port 7860.
if __name__ == "__main__":
    import uvicorn
    uvicorn.run(app, host="0.0.0.0", port=7860)