Update app.py
Browse files
app.py
CHANGED
|
@@ -32,73 +32,21 @@ app.add_middleware(
|
|
| 32 |
|
| 33 |
# Configuration
|
| 34 |
OCI_UPLOAD_API_URL = os.getenv("OCI_UPLOAD_API_URL", "http://localhost:7860")
|
| 35 |
-
DEFAULT_MODEL = "tts_models/multilingual/multi-dataset/xtts_v2"
|
| 36 |
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
|
| 37 |
|
| 38 |
print(f"β
Using device: {DEVICE}")
|
| 39 |
|
| 40 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
| 41 |
tts = None
|
| 42 |
model_loaded = False
|
| 43 |
current_model = ""
|
| 44 |
voice_cloning_supported = False
|
| 45 |
-
|
| 46 |
-
|
| 47 |
-
# Set environment variable to automatically accept terms
|
| 48 |
-
os.environ["COQUI_TOS_AGREED"] = "1"
|
| 49 |
-
|
| 50 |
-
print("π Starting TTS model loading process...")
|
| 51 |
-
|
| 52 |
-
# Import TTS
|
| 53 |
-
from TTS.api import TTS
|
| 54 |
-
|
| 55 |
-
# Automatically respond to the TOS prompt
|
| 56 |
-
import sys
|
| 57 |
-
from io import StringIO
|
| 58 |
-
|
| 59 |
-
# Capture the input prompt and automatically respond 'y'
|
| 60 |
-
old_stdin = sys.stdin
|
| 61 |
-
sys.stdin = StringIO('y\n')
|
| 62 |
-
|
| 63 |
-
try:
|
| 64 |
-
print("π Loading XTTS model...")
|
| 65 |
-
|
| 66 |
-
# Clear any potentially corrupted model files
|
| 67 |
-
model_path = os.path.expanduser("~/.local/share/tts/tts_models--multilingual--multi-dataset--xtts_v2")
|
| 68 |
-
if os.path.exists(model_path):
|
| 69 |
-
print(f"π Clearing potentially corrupted model cache: {model_path}")
|
| 70 |
-
import shutil
|
| 71 |
-
shutil.rmtree(model_path, ignore_errors=True)
|
| 72 |
-
|
| 73 |
-
# Try to load XTTS model with explicit download
|
| 74 |
-
tts = TTS(DEFAULT_MODEL).to(DEVICE)
|
| 75 |
-
model_loaded = True
|
| 76 |
-
current_model = DEFAULT_MODEL
|
| 77 |
-
voice_cloning_supported = True
|
| 78 |
-
print("β
XTTS model loaded successfully with voice cloning support")
|
| 79 |
-
|
| 80 |
-
except Exception as e:
|
| 81 |
-
print(f"β XTTS model failed: {e}")
|
| 82 |
-
|
| 83 |
-
# Try fallback model
|
| 84 |
-
try:
|
| 85 |
-
print("π Trying fallback model...")
|
| 86 |
-
tts = TTS("tts_models/en/ljspeech/tacotron2-DDC").to(DEVICE)
|
| 87 |
-
model_loaded = True
|
| 88 |
-
current_model = "tts_models/en/ljspeech/tacotron2-DDC"
|
| 89 |
-
voice_cloning_supported = False
|
| 90 |
-
print("β
Fallback TTS model loaded successfully (English only, no voice cloning)")
|
| 91 |
-
except Exception as fallback_error:
|
| 92 |
-
print(f"β Fallback model also failed: {fallback_error}")
|
| 93 |
-
tts = None
|
| 94 |
-
|
| 95 |
-
finally:
|
| 96 |
-
# Restore stdin
|
| 97 |
-
sys.stdin = old_stdin
|
| 98 |
-
|
| 99 |
-
except Exception as e:
|
| 100 |
-
print(f"β Failed to initialize TTS: {e}")
|
| 101 |
-
tts = None
|
| 102 |
|
| 103 |
# Pydantic models
|
| 104 |
class TTSRequest(BaseModel):
|
|
@@ -125,14 +73,13 @@ def upload_to_oci(file_path: str, filename: str, project_id: str, file_type="voi
|
|
| 125 |
if not OCI_UPLOAD_API_URL:
|
| 126 |
return None, "OCI upload API URL not configured"
|
| 127 |
|
| 128 |
-
# Use voiceover subfolder
|
| 129 |
url = f"{OCI_UPLOAD_API_URL}/api/upload"
|
| 130 |
|
| 131 |
with open(file_path, "rb") as f:
|
| 132 |
files = {"file": (filename, f, "audio/wav")}
|
| 133 |
data = {
|
| 134 |
"project_id": project_id,
|
| 135 |
-
"subfolder": "voiceover"
|
| 136 |
}
|
| 137 |
|
| 138 |
response = requests.post(url, files=files, data=data, timeout=30)
|
|
@@ -158,7 +105,7 @@ def upload_to_oci_with_retry(file_path: str, filename: str, project_id: str, fil
|
|
| 158 |
|
| 159 |
if error:
|
| 160 |
if attempt < max_retries - 1:
|
| 161 |
-
wait_time = 2 ** attempt
|
| 162 |
print(f"β³ Upload failed, retrying in {wait_time}s: {error}")
|
| 163 |
time.sleep(wait_time)
|
| 164 |
continue
|
|
@@ -196,18 +143,15 @@ def clone_voice(voice_name: str, audio_files: List[str], description: str = ""):
|
|
| 196 |
try:
|
| 197 |
print(f"ποΈ Cloning voice: {voice_name}")
|
| 198 |
|
| 199 |
-
# Create voice directory
|
| 200 |
voice_dir = f"/tmp/voices/{voice_name}"
|
| 201 |
os.makedirs(voice_dir, exist_ok=True)
|
| 202 |
|
| 203 |
-
# Copy audio files to voice directory
|
| 204 |
for i, audio_file in enumerate(audio_files):
|
| 205 |
dest_path = f"{voice_dir}/sample_{i+1}.wav"
|
| 206 |
shutil.copy2(audio_file, dest_path)
|
| 207 |
print(f" Copied sample {i+1} to: {dest_path}")
|
| 208 |
|
| 209 |
print(f"β
Voice cloning setup completed for {voice_name}")
|
| 210 |
-
|
| 211 |
return True, f"Voice {voice_name} is ready for use"
|
| 212 |
|
| 213 |
except Exception as e:
|
|
@@ -217,18 +161,78 @@ def supports_voice_cloning():
|
|
| 217 |
"""Check if the current model supports voice cloning"""
|
| 218 |
return "xtts" in current_model.lower()
|
| 219 |
|
| 220 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 221 |
@app.post("/api/tts")
|
| 222 |
async def generate_tts(request: TTSRequest):
|
| 223 |
-
"""Generate TTS for a single text"""
|
| 224 |
try:
|
| 225 |
-
|
| 226 |
-
|
| 227 |
-
|
| 228 |
-
|
| 229 |
-
|
| 230 |
-
|
| 231 |
-
|
|
|
|
|
|
|
| 232 |
|
| 233 |
print(f"π₯ TTS request for project: {request.project_id}")
|
| 234 |
print(f" Text length: {len(request.text)} characters")
|
|
@@ -260,7 +264,6 @@ async def generate_tts(request: TTSRequest):
|
|
| 260 |
|
| 261 |
# Generate TTS based on model capabilities
|
| 262 |
if supports_voice_cloning():
|
| 263 |
-
# XTTS model with voice cloning support
|
| 264 |
tts.tts_to_file(
|
| 265 |
text=request.text,
|
| 266 |
speaker_wav=speaker_wav,
|
|
@@ -268,7 +271,6 @@ async def generate_tts(request: TTSRequest):
|
|
| 268 |
file_path=output_path
|
| 269 |
)
|
| 270 |
else:
|
| 271 |
-
# Fallback model (Tacotron2)
|
| 272 |
tts.tts_to_file(
|
| 273 |
text=request.text,
|
| 274 |
file_path=output_path
|
|
@@ -283,7 +285,6 @@ async def generate_tts(request: TTSRequest):
|
|
| 283 |
|
| 284 |
if error:
|
| 285 |
print(f"β OCI upload failed: {error}")
|
| 286 |
-
# Still return the local file path if upload fails
|
| 287 |
return {
|
| 288 |
"status": "partial_success",
|
| 289 |
"message": f"TTS generated but upload failed: {error}",
|
|
@@ -316,8 +317,10 @@ async def generate_tts(request: TTSRequest):
|
|
| 316 |
async def batch_generate_tts(request: BatchTTSRequest):
|
| 317 |
"""Generate TTS for multiple texts with sequential naming"""
|
| 318 |
try:
|
| 319 |
-
|
| 320 |
-
|
|
|
|
|
|
|
| 321 |
|
| 322 |
print(f"π₯ Batch TTS request for project: {request.project_id}")
|
| 323 |
print(f" Number of texts: {len(request.texts)}")
|
|
@@ -349,7 +352,6 @@ async def batch_generate_tts(request: BatchTTSRequest):
|
|
| 349 |
|
| 350 |
# Generate TTS based on model capabilities
|
| 351 |
if supports_voice_cloning():
|
| 352 |
-
# XTTS model with voice cloning support
|
| 353 |
tts.tts_to_file(
|
| 354 |
text=text,
|
| 355 |
speaker_wav=speaker_wav,
|
|
@@ -357,7 +359,6 @@ async def batch_generate_tts(request: BatchTTSRequest):
|
|
| 357 |
file_path=output_path
|
| 358 |
)
|
| 359 |
else:
|
| 360 |
-
# Fallback model (Tacotron2)
|
| 361 |
tts.tts_to_file(
|
| 362 |
text=text,
|
| 363 |
file_path=output_path
|
|
@@ -540,13 +541,35 @@ async def health_check():
|
|
| 540 |
"""Health check endpoint"""
|
| 541 |
return {
|
| 542 |
"status": "healthy",
|
| 543 |
-
"tts_loaded":
|
| 544 |
"model": current_model,
|
| 545 |
"voice_cloning_supported": voice_cloning_supported,
|
| 546 |
"device": DEVICE,
|
|
|
|
| 547 |
"timestamp": datetime.now().isoformat()
|
| 548 |
}
|
| 549 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 550 |
@app.get("/")
|
| 551 |
async def root():
|
| 552 |
"""Root endpoint with API information"""
|
|
@@ -558,10 +581,11 @@ async def root():
|
|
| 558 |
"POST /api/upload-voice": "Upload a voice sample for cloning",
|
| 559 |
"POST /api/clone-voice": "Clone a voice from multiple samples",
|
| 560 |
"GET /api/voices": "List available voices",
|
| 561 |
-
"GET /api/health": "Health check"
|
|
|
|
| 562 |
},
|
| 563 |
-
"model_loaded":
|
| 564 |
-
"model_name": current_model if
|
| 565 |
"voice_cloning_supported": supports_voice_cloning()
|
| 566 |
}
|
| 567 |
|
|
@@ -569,7 +593,6 @@ if __name__ == "__main__":
|
|
| 569 |
import uvicorn
|
| 570 |
print("π Starting TTS API with Coqui TTS and Voice Cloning...")
|
| 571 |
print("π API endpoints available at: http://localhost:7860/")
|
| 572 |
-
print("
|
| 573 |
-
print(
|
| 574 |
-
print(f"ποΈ Voice cloning: {'Supported' if voice_cloning_supported else 'Not supported'}")
|
| 575 |
uvicorn.run(app, host="0.0.0.0", port=7860)
|
|
|
|
| 32 |
|
| 33 |
# Configuration
# Base URL of the upload service; "/api/upload" is appended by the uploader.
# NOTE(review): the default points at port 7860, the same port this app
# serves on — confirm that is intended.
OCI_UPLOAD_API_URL = os.getenv("OCI_UPLOAD_API_URL", "http://localhost:7860")

# Run inference on the GPU when one is available, otherwise on the CPU.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

print(f"✅ Using device: {DEVICE}")

# Model configuration
MODEL_REPO_ID = "coqui/XTTS-v2"
MODEL_CACHE_DIR = "/tmp/tts_models"

# Global state
# These names are rebound by load_tts_model() / reload_model().
tts = None                       # loaded TTS instance, or None
model_loaded = False             # True once a model has loaded successfully
current_model = ""               # short name of the loaded model
voice_cloning_supported = False  # True only when the XTTS model is loaded
model_loading = False            # True while a load is in progress
model_load_attempts = 0          # number of load attempts started so far
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 50 |
|
| 51 |
# Pydantic models
|
| 52 |
class TTSRequest(BaseModel):
|
|
|
|
| 73 |
if not OCI_UPLOAD_API_URL:
|
| 74 |
return None, "OCI upload API URL not configured"
|
| 75 |
|
|
|
|
| 76 |
url = f"{OCI_UPLOAD_API_URL}/api/upload"
|
| 77 |
|
| 78 |
with open(file_path, "rb") as f:
|
| 79 |
files = {"file": (filename, f, "audio/wav")}
|
| 80 |
data = {
|
| 81 |
"project_id": project_id,
|
| 82 |
+
"subfolder": "voiceover"
|
| 83 |
}
|
| 84 |
|
| 85 |
response = requests.post(url, files=files, data=data, timeout=30)
|
|
|
|
| 105 |
|
| 106 |
if error:
|
| 107 |
if attempt < max_retries - 1:
|
| 108 |
+
wait_time = 2 ** attempt
|
| 109 |
print(f"β³ Upload failed, retrying in {wait_time}s: {error}")
|
| 110 |
time.sleep(wait_time)
|
| 111 |
continue
|
|
|
|
| 143 |
try:
|
| 144 |
print(f"ποΈ Cloning voice: {voice_name}")
|
| 145 |
|
|
|
|
| 146 |
voice_dir = f"/tmp/voices/{voice_name}"
|
| 147 |
os.makedirs(voice_dir, exist_ok=True)
|
| 148 |
|
|
|
|
| 149 |
for i, audio_file in enumerate(audio_files):
|
| 150 |
dest_path = f"{voice_dir}/sample_{i+1}.wav"
|
| 151 |
shutil.copy2(audio_file, dest_path)
|
| 152 |
print(f" Copied sample {i+1} to: {dest_path}")
|
| 153 |
|
| 154 |
print(f"β
Voice cloning setup completed for {voice_name}")
|
|
|
|
| 155 |
return True, f"Voice {voice_name} is ready for use"
|
| 156 |
|
| 157 |
except Exception as e:
|
|
|
|
| 161 |
"""Check if the current model supports voice cloning"""
|
| 162 |
return "xtts" in current_model.lower()
|
| 163 |
|
| 164 |
+
def load_tts_model():
    """Load a TTS model into the module-level state.

    Tries the XTTS v2 model first; if that raises, tries the
    Tacotron2-DDC model instead. On success the globals ``tts``,
    ``model_loaded``, ``current_model`` and ``voice_cloning_supported``
    are updated. Every call that gets past the in-progress guard
    increments ``model_load_attempts``.

    Returns:
        bool: True if either model was loaded. False if both failed, if
        the TTS library could not be initialised, or if another load was
        already in progress when this was called.
    """
    global tts, model_loaded, current_model, voice_cloning_supported, model_loading, model_load_attempts

    # NOTE(review): the emoji prefixes in the log messages below were
    # reconstructed from mis-decoded text; confirm against the original file.
    if model_loading:
        print("⏳ Model is already being loaded...")
        return False

    model_loading = True
    model_load_attempts += 1

    try:
        from TTS.api import TTS

        # Handle TOS acceptance automatically: feed a single "y" answer
        # through stdin for the duration of the load.
        import sys
        from io import StringIO

        old_stdin = sys.stdin
        sys.stdin = StringIO('y\n')

        try:
            print("🔄 Loading XTTS model...")

            # Try to load XTTS model
            tts = TTS("tts_models/multilingual/multi-dataset/xtts_v2").to(DEVICE)

            model_loaded = True
            current_model = "xtts_v2"
            voice_cloning_supported = True
            print("✅ XTTS model loaded successfully")
            return True

        except Exception as e:
            print(f"❌ XTTS model loading failed: {e}")

            # Try fallback model
            try:
                print("🔄 Trying fallback model...")
                tts = TTS("tts_models/en/ljspeech/tacotron2-DDC").to(DEVICE)
                model_loaded = True
                current_model = "tacotron2-DDC"
                voice_cloning_supported = False
                print("✅ Fallback model loaded successfully")
                return True
            except Exception as fallback_error:
                print(f"❌ Fallback model also failed: {fallback_error}")
                return False

        finally:
            # Always give the process its real stdin back.
            sys.stdin = old_stdin

    except Exception as e:
        # Reached when the import or the stdin setup itself fails.
        print(f"❌ Failed to initialize TTS: {e}")
        return False
    finally:
        # Clear the in-progress flag on every exit path.
        model_loading = False
|
| 221 |
+
|
| 222 |
+
# API endpoints with lazy loading
|
| 223 |
@app.post("/api/tts")
|
| 224 |
async def generate_tts(request: TTSRequest):
|
| 225 |
+
"""Generate TTS for a single text with lazy model loading"""
|
| 226 |
try:
|
| 227 |
+
# Lazy load model on first request
|
| 228 |
+
if not model_loaded:
|
| 229 |
+
if not load_tts_model():
|
| 230 |
+
return {
|
| 231 |
+
"status": "error",
|
| 232 |
+
"message": "TTS model failed to load. Please check the logs.",
|
| 233 |
+
"requires_tos_acceptance": True,
|
| 234 |
+
"tos_url": "https://coqui.ai/cpml.txt"
|
| 235 |
+
}
|
| 236 |
|
| 237 |
print(f"π₯ TTS request for project: {request.project_id}")
|
| 238 |
print(f" Text length: {len(request.text)} characters")
|
|
|
|
| 264 |
|
| 265 |
# Generate TTS based on model capabilities
|
| 266 |
if supports_voice_cloning():
|
|
|
|
| 267 |
tts.tts_to_file(
|
| 268 |
text=request.text,
|
| 269 |
speaker_wav=speaker_wav,
|
|
|
|
| 271 |
file_path=output_path
|
| 272 |
)
|
| 273 |
else:
|
|
|
|
| 274 |
tts.tts_to_file(
|
| 275 |
text=request.text,
|
| 276 |
file_path=output_path
|
|
|
|
| 285 |
|
| 286 |
if error:
|
| 287 |
print(f"β OCI upload failed: {error}")
|
|
|
|
| 288 |
return {
|
| 289 |
"status": "partial_success",
|
| 290 |
"message": f"TTS generated but upload failed: {error}",
|
|
|
|
| 317 |
async def batch_generate_tts(request: BatchTTSRequest):
|
| 318 |
"""Generate TTS for multiple texts with sequential naming"""
|
| 319 |
try:
|
| 320 |
+
# Lazy load model on first request
|
| 321 |
+
if not model_loaded:
|
| 322 |
+
if not load_tts_model():
|
| 323 |
+
raise HTTPException(status_code=500, detail="TTS model failed to load")
|
| 324 |
|
| 325 |
print(f"π₯ Batch TTS request for project: {request.project_id}")
|
| 326 |
print(f" Number of texts: {len(request.texts)}")
|
|
|
|
| 352 |
|
| 353 |
# Generate TTS based on model capabilities
|
| 354 |
if supports_voice_cloning():
|
|
|
|
| 355 |
tts.tts_to_file(
|
| 356 |
text=text,
|
| 357 |
speaker_wav=speaker_wav,
|
|
|
|
| 359 |
file_path=output_path
|
| 360 |
)
|
| 361 |
else:
|
|
|
|
| 362 |
tts.tts_to_file(
|
| 363 |
text=text,
|
| 364 |
file_path=output_path
|
|
|
|
| 541 |
"""Health check endpoint"""
|
| 542 |
return {
|
| 543 |
"status": "healthy",
|
| 544 |
+
"tts_loaded": model_loaded,
|
| 545 |
"model": current_model,
|
| 546 |
"voice_cloning_supported": voice_cloning_supported,
|
| 547 |
"device": DEVICE,
|
| 548 |
+
"load_attempts": model_load_attempts,
|
| 549 |
"timestamp": datetime.now().isoformat()
|
| 550 |
}
|
| 551 |
|
| 552 |
+
@app.post("/api/reload-model")
async def reload_model():
    """Discard the current TTS model state and load the model again.

    Resets the module-level model globals, then calls
    ``load_tts_model()``.

    Returns:
        dict: ``status`` and ``message`` describing the outcome, plus the
        resulting ``model_loaded`` flag and ``model`` name.
    """
    global tts, model_loaded, current_model, voice_cloning_supported

    # Reset all model state before attempting a fresh load.
    tts, model_loaded = None, False
    current_model, voice_cloning_supported = "", False

    if load_tts_model():
        status, message = "success", "Model reloaded successfully"
    else:
        status, message = "error", "Failed to reload model"

    return {
        "status": status,
        "message": message,
        "model_loaded": model_loaded,
        "model": current_model
    }
|
| 572 |
+
|
| 573 |
@app.get("/")
|
| 574 |
async def root():
|
| 575 |
"""Root endpoint with API information"""
|
|
|
|
| 581 |
"POST /api/upload-voice": "Upload a voice sample for cloning",
|
| 582 |
"POST /api/clone-voice": "Clone a voice from multiple samples",
|
| 583 |
"GET /api/voices": "List available voices",
|
| 584 |
+
"GET /api/health": "Health check",
|
| 585 |
+
"POST /api/reload-model": "Reload TTS model"
|
| 586 |
},
|
| 587 |
+
"model_loaded": model_loaded,
|
| 588 |
+
"model_name": current_model if model_loaded else "None",
|
| 589 |
"voice_cloning_supported": supports_voice_cloning()
|
| 590 |
}
|
| 591 |
|
|
|
|
| 593 |
import uvicorn
|
| 594 |
print("π Starting TTS API with Coqui TTS and Voice Cloning...")
|
| 595 |
print("π API endpoints available at: http://localhost:7860/")
|
| 596 |
+
print("π‘ Model will be loaded on first request to save memory")
|
| 597 |
+
print("π Use /api/reload-model to force reload if needed")
|
|
|
|
| 598 |
uvicorn.run(app, host="0.0.0.0", port=7860)
|