Commit 2ce54a8
1 Parent(s): 56fcb88

Integrate Supertonic 2 for professional TTS

1 file changed: app/ora_server.py (+57 -1)
--- a/app/ora_server.py
+++ b/app/ora_server.py
@@ -4,10 +4,13 @@ from peft import PeftModel
 from fastapi import FastAPI, HTTPException
 from fastapi.middleware.cors import CORSMiddleware
 from fastapi.staticfiles import StaticFiles
-from fastapi.responses import FileResponse
+from fastapi.responses import FileResponse, Response
 from pydantic import BaseModel
 import uvicorn
 import os
+import io
+import numpy as np
+from scipy.io import wavfile
 
 # Settings
 BASE_MODEL = "unsloth/Llama-3.2-1B-Instruct"
@@ -107,6 +110,59 @@ async def chat_endpoint(req: ChatRequest):
 
     return {"response": response_text}
 
+# TTS endpoint using Supertonic 2
+tts_model = None
+tts_processor = None
+
+@app.on_event("startup")
+async def load_tts():
+    global tts_model, tts_processor
+    try:
+        print("Loading Supertonic 2 TTS...")
+        from transformers import AutoProcessor, AutoModel
+        tts_processor = AutoProcessor.from_pretrained("Supertone/supertonic-2")
+        tts_model = AutoModel.from_pretrained("Supertone/supertonic-2")
+        if device == "cuda":
+            tts_model = tts_model.to("cuda")
+        print("TTS Model loaded successfully!")
+    except Exception as e:
+        print(f"Could not load TTS model: {e}")
+        print("Voice will not be available.")
+
+class TTSRequest(BaseModel):
+    text: str
+
+@app.post("/api/tts")
+async def text_to_speech(req: TTSRequest):
+    global tts_model, tts_processor
+
+    if tts_model is None or tts_processor is None:
+        raise HTTPException(status_code=503, detail="TTS model not loaded")
+
+    try:
+        inputs = tts_processor(text=req.text, return_tensors="pt")
+        if device == "cuda":
+            inputs = {k: v.to("cuda") for k, v in inputs.items()}
+
+        with torch.no_grad():
+            audio = tts_model.generate(**inputs)
+
+        # Convert to numpy and create WAV
+        audio_np = audio.cpu().numpy().squeeze()
+
+        # Normalize audio
+        audio_np = np.int16(audio_np / np.max(np.abs(audio_np)) * 32767)
+
+        # Create WAV file in memory
+        wav_io = io.BytesIO()
+        wavfile.write(wav_io, 22050, audio_np)
+        wav_io.seek(0)
+
+        return Response(content=wav_io.read(), media_type="audio/wav")
+
+    except Exception as e:
+        raise HTTPException(status_code=500, detail=f"TTS generation failed: {str(e)}")
+
 # Mount Static Frontend (Must be last)
 # Expects 'frontend/out' to exist (built via 'next build')
 if os.path.exists("frontend/out"):
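
Trying the new endpoint: once the server is running, the /api/tts route can be exercised with a small client. A minimal sketch, assuming the server is reachable at http://localhost:8000 (host, port, and the output filename are illustrative, not part of the commit):

import requests

# POST text matching the TTSRequest schema; the endpoint returns raw WAV
# bytes with media_type="audio/wav".
resp = requests.post(
    "http://localhost:8000/api/tts",            # assumed host/port
    json={"text": "Hello from Supertonic 2!"},  # TTSRequest has a single 'text' field
    timeout=60,
)
resp.raise_for_status()
with open("tts_output.wav", "wb") as f:         # illustrative filename
    f.write(resp.content)

A 503 here means the startup loader failed and the model was never loaded; the commit logs "Could not load TTS model" in that case.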
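
On the WAV conversion: the endpoint peak-normalizes the float waveform (dividing by np.max(np.abs(...))), scales it to 16-bit PCM, and writes an in-memory WAV at a hardcoded 22050 Hz. The division assumes the model never returns all-zero audio. A standalone sketch of the same conversion with that edge case guarded (the to_wav_bytes helper and the silence guard are illustrative additions, not part of the commit; 22050 Hz is carried over from the commit and may not match the model's native sample rate):

import io
import numpy as np
from scipy.io import wavfile

def to_wav_bytes(audio: np.ndarray, sample_rate: int = 22050) -> bytes:
    # Peak-normalize to [-1, 1]; skip for silent audio to avoid dividing by zero.
    peak = np.max(np.abs(audio))
    if peak > 0:
        audio = audio / peak
    pcm = np.int16(audio * 32767)         # scale to the 16-bit PCM range
    buf = io.BytesIO()
    wavfile.write(buf, sample_rate, pcm)  # writes a complete WAV header + data
    return buf.getvalue()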
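
One forward-compatibility note: @app.on_event("startup") still works but is deprecated in recent FastAPI releases in favor of a lifespan handler. An equivalent sketch (not part of the commit):

from contextlib import asynccontextmanager
from fastapi import FastAPI

@asynccontextmanager
async def lifespan(app: FastAPI):
    # Load tts_model / tts_processor here, as load_tts() does at startup.
    yield
    # Optional cleanup runs here on shutdown.

app = FastAPI(lifespan=lifespan)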