Update app.py
Browse files
app.py
CHANGED
|
@@ -41,10 +41,6 @@ DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
|
|
| 41 |
print(f"β
Using device: {DEVICE}")
|
| 42 |
print(f"π§ OCI Upload URL: {OCI_UPLOAD_API_URL or 'Not configured - uploads will be local only'}")
|
| 43 |
|
| 44 |
-
# Model configuration
|
| 45 |
-
MODEL_REPO_ID = "coqui/XTTS-v2"
|
| 46 |
-
MODEL_CACHE_DIR = "/tmp/tts_models"
|
| 47 |
-
|
| 48 |
# Global state
|
| 49 |
tts = None
|
| 50 |
model_loaded = False
|
|
@@ -53,6 +49,7 @@ voice_cloning_supported = False
|
|
| 53 |
model_loading = False
|
| 54 |
model_load_attempts = 0
|
| 55 |
current_voice_style = "default_female"
|
|
|
|
| 56 |
|
| 57 |
# Pydantic models
|
| 58 |
class TTSRequest(BaseModel):
|
|
@@ -60,7 +57,7 @@ class TTSRequest(BaseModel):
|
|
| 60 |
project_id: str
|
| 61 |
voice_name: Optional[str] = "default"
|
| 62 |
language: Optional[str] = "en"
|
| 63 |
-
voice_style: Optional[str] = "default_female"
|
| 64 |
|
| 65 |
class BatchTTSRequest(BaseModel):
|
| 66 |
texts: List[str]
|
|
@@ -83,7 +80,7 @@ def clean_text(text):
|
|
| 83 |
import re
|
| 84 |
|
| 85 |
if not text or not isinstance(text, str):
|
| 86 |
-
return "Hello"
|
| 87 |
|
| 88 |
# Remove any non-ASCII characters
|
| 89 |
text = text.encode('ascii', 'ignore').decode('ascii')
|
|
@@ -128,7 +125,6 @@ def upload_to_oci(file_path: str, filename: str, project_id: str, file_type="voi
|
|
| 128 |
"subfolder": "voiceover"
|
| 129 |
}
|
| 130 |
|
| 131 |
-
# Add headers and better timeout handling
|
| 132 |
headers = {
|
| 133 |
"User-Agent": "TTS-API/1.0",
|
| 134 |
"Accept": "application/json"
|
|
@@ -249,7 +245,7 @@ def save_wav(audio, file_path):
|
|
| 249 |
# Try soundfile first
|
| 250 |
try:
|
| 251 |
import soundfile as sf
|
| 252 |
-
sf.write(file_path, audio, 22050)
|
| 253 |
return True
|
| 254 |
except ImportError:
|
| 255 |
print("β οΈ soundfile not available, using fallback method")
|
|
@@ -258,17 +254,15 @@ def save_wav(audio, file_path):
|
|
| 258 |
import wave
|
| 259 |
import numpy as np
|
| 260 |
|
| 261 |
-
# Ensure audio is numpy array
|
| 262 |
if isinstance(audio, list):
|
| 263 |
audio = np.array(audio)
|
| 264 |
|
| 265 |
-
# Convert to 16-bit PCM
|
| 266 |
audio_int16 = (audio * 32767).astype(np.int16)
|
| 267 |
|
| 268 |
with wave.open(file_path, 'wb') as wav_file:
|
| 269 |
-
wav_file.setnchannels(1)
|
| 270 |
-
wav_file.setsampwidth(2)
|
| 271 |
-
wav_file.setframerate(22050)
|
| 272 |
wav_file.writeframes(audio_int16.tobytes())
|
| 273 |
|
| 274 |
return True
|
|
@@ -278,134 +272,126 @@ def save_wav(audio, file_path):
|
|
| 278 |
return False
|
| 279 |
|
| 280 |
def load_tts_model(voice_style="default_female"):
|
| 281 |
-
"""Load TTS model with different voice options"""
|
| 282 |
global tts, model_loaded, current_model, voice_cloning_supported, model_loading, model_load_attempts, current_voice_style
|
| 283 |
|
| 284 |
if model_loading:
|
| 285 |
print("β³ Model is already being loaded...")
|
| 286 |
return False
|
| 287 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 288 |
model_loading = True
|
| 289 |
model_load_attempts += 1
|
| 290 |
|
| 291 |
try:
|
| 292 |
from TTS.api import TTS
|
| 293 |
|
| 294 |
-
#
|
| 295 |
-
|
| 296 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 297 |
|
| 298 |
-
|
| 299 |
-
|
| 300 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 301 |
try:
|
| 302 |
-
|
| 303 |
-
|
| 304 |
-
|
| 305 |
-
|
| 306 |
-
|
| 307 |
-
|
| 308 |
-
|
| 309 |
-
"male_medium": {
|
| 310 |
-
"name": "tts_models/en/vctk/vits",
|
| 311 |
-
"description": "VITS - Multiple speakers",
|
| 312 |
-
"speaker": "p226" # Male voice
|
| 313 |
-
},
|
| 314 |
-
"female_1": {
|
| 315 |
-
"name": "tts_models/en/vctk/vits",
|
| 316 |
-
"description": "VITS - Multiple speakers",
|
| 317 |
-
"speaker": "p227" # Female voice
|
| 318 |
-
},
|
| 319 |
-
"female_2": {
|
| 320 |
-
"name": "tts_models/en/vctk/vits",
|
| 321 |
-
"description": "VITS - Multiple speakers",
|
| 322 |
-
"speaker": "p228" # Female voice
|
| 323 |
-
},
|
| 324 |
-
"default_female": {
|
| 325 |
-
"name": "tts_models/en/ljspeech/tacotron2-DDC",
|
| 326 |
-
"description": "Tacotron2 - Default female (current)",
|
| 327 |
-
"speaker": None
|
| 328 |
-
},
|
| 329 |
-
"clear_male": {
|
| 330 |
-
"name": "tts_models/en/ek1/tacotron2",
|
| 331 |
-
"description": "Tacotron2 - Clear male voice",
|
| 332 |
-
"speaker": None
|
| 333 |
-
}
|
| 334 |
-
}
|
| 335 |
-
|
| 336 |
-
selected_model = model_options.get(voice_style, model_options["default_female"])
|
| 337 |
-
current_voice_style = voice_style
|
| 338 |
-
|
| 339 |
-
print(f"π Loading {selected_model['description']}...")
|
| 340 |
-
|
| 341 |
-
# Load the selected model
|
| 342 |
-
tts = TTS(selected_model["name"]).to(DEVICE)
|
| 343 |
-
|
| 344 |
-
# Test the model
|
| 345 |
-
test_path = "/tmp/test_output.wav"
|
| 346 |
-
|
| 347 |
-
if selected_model["speaker"]:
|
| 348 |
-
# For VITS model with speaker selection
|
| 349 |
-
tts.tts_to_file(
|
| 350 |
-
text="Test voice",
|
| 351 |
-
file_path=test_path,
|
| 352 |
-
speaker=selected_model["speaker"]
|
| 353 |
-
)
|
| 354 |
-
else:
|
| 355 |
-
# For standard models
|
| 356 |
-
tts.tts_to_file(text="Test voice", file_path=test_path)
|
| 357 |
-
|
| 358 |
-
if os.path.exists(test_path):
|
| 359 |
-
os.remove(test_path)
|
| 360 |
-
print(f"β
{selected_model['description']} loaded successfully!")
|
| 361 |
-
else:
|
| 362 |
-
raise Exception("Test failed - no file created")
|
| 363 |
-
|
| 364 |
-
model_loaded = True
|
| 365 |
-
current_model = selected_model["name"]
|
| 366 |
-
voice_cloning_supported = False
|
| 367 |
-
return True
|
| 368 |
-
|
| 369 |
-
except Exception as e:
|
| 370 |
-
print(f"β Model loading failed: {e}")
|
| 371 |
-
# Fallback to default
|
| 372 |
-
print("π Falling back to default Tacotron2...")
|
| 373 |
tts = TTS("tts_models/en/ljspeech/tacotron2-DDC").to(DEVICE)
|
| 374 |
-
|
| 375 |
-
|
| 376 |
-
|
| 377 |
-
|
| 378 |
-
|
| 379 |
-
|
| 380 |
-
|
| 381 |
-
|
| 382 |
|
| 383 |
except Exception as e:
|
| 384 |
print(f"β Failed to initialize TTS: {e}")
|
|
|
|
| 385 |
return False
|
| 386 |
finally:
|
| 387 |
model_loading = False
|
| 388 |
|
| 389 |
-
# Health check
|
| 390 |
@app.get("/")
async def root():
    """Report basic service state at the API root.

    Returns a dict with the fixed service identity plus the module-level
    model, device and OCI configuration state read at request time.
    """
    service_info = {"status": "running", "service": "TTS API"}
    service_info["model_loaded"] = model_loaded
    service_info["current_model"] = current_model
    service_info["device"] = DEVICE
    # Only expose whether an upload URL is set, not the URL itself.
    service_info["oci_configured"] = bool(OCI_UPLOAD_API_URL)
    return service_info
|
| 400 |
|
| 401 |
-
@app.get("/
|
| 402 |
async def health_check():
|
| 403 |
-
"""Health check endpoint"""
|
| 404 |
return {
|
| 405 |
"status": "healthy",
|
|
|
|
|
|
|
| 406 |
"model_loaded": model_loaded,
|
| 407 |
-
"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 408 |
"device": DEVICE,
|
|
|
|
| 409 |
"timestamp": datetime.now().isoformat()
|
| 410 |
}
|
| 411 |
|
|
@@ -419,9 +405,9 @@ async def check_oci_health():
|
|
| 419 |
}
|
| 420 |
|
| 421 |
try:
|
| 422 |
-
# Test connection to OCI service
|
| 423 |
test_url = f"{OCI_UPLOAD_API_URL}/api/health"
|
| 424 |
-
response = requests.get(test_url, timeout=
|
| 425 |
|
| 426 |
if response.status_code == 200:
|
| 427 |
return {
|
|
@@ -447,12 +433,13 @@ async def check_oci_health():
|
|
| 447 |
async def generate_tts(request: TTSRequest):
|
| 448 |
"""Generate TTS for a single text with lazy model loading"""
|
| 449 |
try:
|
| 450 |
-
# Lazy load model on first request
|
| 451 |
if not model_loaded or current_voice_style != request.voice_style:
|
|
|
|
| 452 |
if not load_tts_model(request.voice_style):
|
| 453 |
return {
|
| 454 |
"status": "error",
|
| 455 |
-
"message": "TTS model failed to load. Please
|
| 456 |
"requires_tos_acceptance": True,
|
| 457 |
"tos_url": "https://coqui.ai/cpml.txt"
|
| 458 |
}
|
|
@@ -460,7 +447,6 @@ async def generate_tts(request: TTSRequest):
|
|
| 460 |
print(f"π₯ TTS request for project: {request.project_id}")
|
| 461 |
print(f" Text length: {len(request.text)} characters")
|
| 462 |
print(f" Voice style: {request.voice_style}")
|
| 463 |
-
print(f" Language: {request.language}")
|
| 464 |
|
| 465 |
# Check if voice cloning is requested but not supported
|
| 466 |
if request.voice_name != "default" and not supports_voice_cloning():
|
|
@@ -470,7 +456,7 @@ async def generate_tts(request: TTSRequest):
|
|
| 470 |
"model": current_model
|
| 471 |
}
|
| 472 |
|
| 473 |
-
# Generate unique filename
|
| 474 |
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
|
| 475 |
filename = f"voiceover_{timestamp}.wav"
|
| 476 |
output_path = f"/tmp/output/{filename}"
|
|
@@ -492,52 +478,24 @@ async def generate_tts(request: TTSRequest):
|
|
| 492 |
|
| 493 |
# Clean the text before generation
|
| 494 |
cleaned_text = clean_text(request.text)
|
| 495 |
-
print(f"π Original text: '{request.text}'")
|
| 496 |
print(f"π Cleaned text: '{cleaned_text}'")
|
| 497 |
|
| 498 |
-
# Generate TTS
|
| 499 |
try:
|
| 500 |
-
print(f"π
|
| 501 |
|
| 502 |
-
#
|
| 503 |
-
|
| 504 |
-
|
| 505 |
-
|
| 506 |
-
|
| 507 |
-
"male_deep": "p225",
|
| 508 |
-
"male_medium": "p226",
|
| 509 |
-
"female_1": "p227",
|
| 510 |
-
"female_2": "p228"
|
| 511 |
-
}
|
| 512 |
-
speaker = speaker_map.get(request.voice_style)
|
| 513 |
-
|
| 514 |
-
if speaker:
|
| 515 |
-
# For VITS model with speaker selection
|
| 516 |
-
tts.tts_to_file(
|
| 517 |
-
text=cleaned_text,
|
| 518 |
-
file_path=output_path,
|
| 519 |
-
speaker=speaker
|
| 520 |
-
)
|
| 521 |
-
else:
|
| 522 |
-
# For standard models
|
| 523 |
-
tts.tts_to_file(
|
| 524 |
-
text=cleaned_text,
|
| 525 |
-
file_path=output_path
|
| 526 |
-
)
|
| 527 |
|
| 528 |
except Exception as tts_error:
|
| 529 |
print(f"β TTS generation failed: {tts_error}")
|
| 530 |
# Try alternative approach
|
| 531 |
try:
|
| 532 |
print("π Trying alternative TTS generation method...")
|
| 533 |
-
|
| 534 |
-
if speaker:
|
| 535 |
-
audio = tts.tts(
|
| 536 |
-
text=cleaned_text,
|
| 537 |
-
speaker=speaker
|
| 538 |
-
)
|
| 539 |
-
else:
|
| 540 |
-
audio = tts.tts(text=cleaned_text)
|
| 541 |
|
| 542 |
# Save manually
|
| 543 |
if not save_wav(audio, output_path):
|
|
@@ -561,7 +519,7 @@ async def generate_tts(request: TTSRequest):
|
|
| 561 |
|
| 562 |
if error:
|
| 563 |
print(f"β OCI upload failed: {error}")
|
| 564 |
-
#
|
| 565 |
return {
|
| 566 |
"status": "success_local",
|
| 567 |
"message": f"TTS generated locally (upload failed: {error})",
|
|
@@ -588,21 +546,12 @@ async def generate_tts(request: TTSRequest):
|
|
| 588 |
"filename": filename,
|
| 589 |
"oci_path": upload_result.get("path", f"{request.project_id}/voiceover/{filename}"),
|
| 590 |
"model_used": current_model,
|
| 591 |
-
"voice_style": request.voice_style
|
| 592 |
-
"voice_cloning": supports_voice_cloning() and request.voice_name != "default"
|
| 593 |
}
|
| 594 |
|
| 595 |
except Exception as e:
|
| 596 |
print(f"β TTS generation error: {str(e)}")
|
| 597 |
-
|
| 598 |
-
error_detail = {
|
| 599 |
-
"error": str(e),
|
| 600 |
-
"model": current_model,
|
| 601 |
-
"voice_style": request.voice_style,
|
| 602 |
-
"voice_cloning_supported": supports_voice_cloning(),
|
| 603 |
-
"device": DEVICE
|
| 604 |
-
}
|
| 605 |
-
raise HTTPException(status_code=500, detail=error_detail)
|
| 606 |
|
| 607 |
@app.post("/api/batch-tts")
|
| 608 |
async def batch_generate_tts(request: BatchTTSRequest):
|
|
@@ -610,13 +559,13 @@ async def batch_generate_tts(request: BatchTTSRequest):
|
|
| 610 |
try:
|
| 611 |
# Lazy load model on first request
|
| 612 |
if not model_loaded or current_voice_style != request.voice_style:
|
|
|
|
| 613 |
if not load_tts_model(request.voice_style):
|
| 614 |
raise HTTPException(status_code=500, detail="TTS model failed to load")
|
| 615 |
|
| 616 |
print(f"π₯ Batch TTS request for project: {request.project_id}")
|
| 617 |
print(f" Number of texts: {len(request.texts)}")
|
| 618 |
print(f" Voice style: {request.voice_style}")
|
| 619 |
-
print(f" Language: {request.language}")
|
| 620 |
|
| 621 |
# Check if voice cloning is requested but not supported
|
| 622 |
if request.voice_name != "default" and not supports_voice_cloning():
|
|
@@ -647,25 +596,7 @@ async def batch_generate_tts(request: BatchTTSRequest):
|
|
| 647 |
|
| 648 |
# Generate TTS
|
| 649 |
try:
|
| 650 |
-
|
| 651 |
-
speaker = None
|
| 652 |
-
if "vctk/vits" in current_model:
|
| 653 |
-
speaker_map = {
|
| 654 |
-
"male_deep": "p225",
|
| 655 |
-
"male_medium": "p226",
|
| 656 |
-
"female_1": "p227",
|
| 657 |
-
"female_2": "p228"
|
| 658 |
-
}
|
| 659 |
-
speaker = speaker_map.get(request.voice_style)
|
| 660 |
-
|
| 661 |
-
if speaker:
|
| 662 |
-
tts.tts_to_file(
|
| 663 |
-
text=cleaned_text,
|
| 664 |
-
file_path=output_path,
|
| 665 |
-
speaker=speaker
|
| 666 |
-
)
|
| 667 |
-
else:
|
| 668 |
-
tts.tts_to_file(text=cleaned_text, file_path=output_path)
|
| 669 |
|
| 670 |
# Verify file was created
|
| 671 |
if not os.path.exists(output_path):
|
|
@@ -828,15 +759,42 @@ async def change_voice_style(request: ChangeVoiceRequest):
|
|
| 828 |
async def get_voice_styles():
|
| 829 |
"""Get available voice styles"""
|
| 830 |
styles = {
|
| 831 |
-
"
|
| 832 |
-
"
|
| 833 |
-
"
|
| 834 |
-
"
|
| 835 |
-
"
|
| 836 |
-
"
|
| 837 |
}
|
| 838 |
return {"voice_styles": styles}
|
| 839 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 840 |
if __name__ == "__main__":
    # Script entry point: serve the app on every interface, port 8000.
    import uvicorn

    server_options = {"host": "0.0.0.0", "port": 8000}
    uvicorn.run(app, **server_options)
|
|
|
|
| 41 |
print(f"β
Using device: {DEVICE}")
|
| 42 |
print(f"π§ OCI Upload URL: {OCI_UPLOAD_API_URL or 'Not configured - uploads will be local only'}")
|
| 43 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 44 |
# Global state
|
| 45 |
tts = None
|
| 46 |
model_loaded = False
|
|
|
|
| 49 |
model_loading = False
|
| 50 |
model_load_attempts = 0
|
| 51 |
current_voice_style = "default_female"
|
| 52 |
+
app_startup_time = datetime.now()
|
| 53 |
|
| 54 |
# Pydantic models
|
| 55 |
class TTSRequest(BaseModel):
|
|
|
|
| 57 |
project_id: str
|
| 58 |
voice_name: Optional[str] = "default"
|
| 59 |
language: Optional[str] = "en"
|
| 60 |
+
voice_style: Optional[str] = "default_female"
|
| 61 |
|
| 62 |
class BatchTTSRequest(BaseModel):
|
| 63 |
texts: List[str]
|
|
|
|
| 80 |
import re
|
| 81 |
|
| 82 |
if not text or not isinstance(text, str):
|
| 83 |
+
return "Hello"
|
| 84 |
|
| 85 |
# Remove any non-ASCII characters
|
| 86 |
text = text.encode('ascii', 'ignore').decode('ascii')
|
|
|
|
| 125 |
"subfolder": "voiceover"
|
| 126 |
}
|
| 127 |
|
|
|
|
| 128 |
headers = {
|
| 129 |
"User-Agent": "TTS-API/1.0",
|
| 130 |
"Accept": "application/json"
|
|
|
|
| 245 |
# Try soundfile first
|
| 246 |
try:
|
| 247 |
import soundfile as sf
|
| 248 |
+
sf.write(file_path, audio, 22050)
|
| 249 |
return True
|
| 250 |
except ImportError:
|
| 251 |
print("β οΈ soundfile not available, using fallback method")
|
|
|
|
| 254 |
import wave
|
| 255 |
import numpy as np
|
| 256 |
|
|
|
|
| 257 |
if isinstance(audio, list):
|
| 258 |
audio = np.array(audio)
|
| 259 |
|
|
|
|
| 260 |
audio_int16 = (audio * 32767).astype(np.int16)
|
| 261 |
|
| 262 |
with wave.open(file_path, 'wb') as wav_file:
|
| 263 |
+
wav_file.setnchannels(1)
|
| 264 |
+
wav_file.setsampwidth(2)
|
| 265 |
+
wav_file.setframerate(22050)
|
| 266 |
wav_file.writeframes(audio_int16.tobytes())
|
| 267 |
|
| 268 |
return True
|
|
|
|
| 272 |
return False
|
| 273 |
|
| 274 |
def load_tts_model(voice_style="default_female"):
    """Load (or reuse) the TTS model that backs ``voice_style``.

    The model is loaded lazily, on demand. Module-level state (``tts``,
    ``model_loaded``, ``current_model``, ``current_voice_style``,
    ``voice_cloning_supported``) is only updated once a model has been
    constructed and has synthesized a short test phrase, so a failed
    switch leaves the previously loaded model and its bookkeeping intact.

    Args:
        voice_style: Key into the style table below. Unknown keys use the
            ``default_female`` entry.

    Returns:
        True when a usable model is in place for ``voice_style``; False
        when another load is already in progress or loading failed.
    """
    global tts, model_loaded, current_model, voice_cloning_supported, model_loading, model_load_attempts, current_voice_style

    if model_loading:
        print("β³ Model is already being loaded...")
        return False

    if model_loaded and current_voice_style == voice_style:
        print("✅ Model already loaded with requested voice style")
        return True

    default_model_name = "tts_models/en/ljspeech/tacotron2-DDC"

    # Use smaller, faster models for initial load
    model_options = {
        "default_female": {
            "name": default_model_name,
            "description": "Tacotron2 - Default female (fast)",
            "speaker": None,
        },
        "clear_male": {
            "name": "tts_models/en/ek1/tacotron2",
            "description": "Tacotron2 - Clear male voice",
            "speaker": None,
        },
    }
    # Fallbacks for other voice styles: all of them use the default model.
    for fallback_style in ("male_deep", "male_medium", "female_1", "female_2"):
        model_options[fallback_style] = {
            "name": default_model_name,
            "description": "Tacotron2 - Default female (fallback)",
            "speaker": None,
        }

    selected_model = model_options.get(voice_style, model_options["default_female"])

    # Several styles share one checkpoint; when that checkpoint is already
    # loaded, only the style label needs to change - no reload required.
    if model_loaded and tts is not None and current_model == selected_model["name"]:
        current_voice_style = voice_style
        return True

    model_loading = True
    model_load_attempts += 1

    try:
        # Imported here so the import cost (and any import failure) is only
        # paid when a model is actually loaded.
        from TTS.api import TTS

        print(f"π Loading {selected_model['description']}...")
        print("π₯ This may take a few minutes on first load...")

        # Build into a local first so the global ``tts`` is never replaced
        # by a model that has not passed the test below.
        candidate = TTS(selected_model["name"]).to(DEVICE)

        # Quick test
        try:
            candidate.tts(text="Hello")
            print(f"✅ {selected_model['description']} loaded successfully!")
        except Exception as test_error:
            print(f"β Model test failed: {test_error}")
            # Try fallback to default
            print("π Trying fallback model...")
            candidate = TTS(default_model_name).to(DEVICE)
            candidate.tts(text="Hello")
            selected_model = model_options["default_female"]

        # Commit everything together now that a working model exists. The
        # requested style is recorded even when the fallback model was used,
        # so callers do not retry the failing model on every request.
        tts = candidate
        model_loaded = True
        current_model = selected_model["name"]
        current_voice_style = voice_style
        voice_cloning_supported = False
        model_load_attempts = 0
        return True

    except Exception as e:
        print(f"β Failed to initialize TTS: {e}")
        return False
    finally:
        # Runs on every exit path, so the in-progress flag cannot stick.
        model_loading = False
|
| 361 |
|
| 362 |
+
# Health check endpoints - CRITICAL FOR DEPLOYMENT
|
| 363 |
@app.get("/")
async def root():
    """Report basic service state at the API root.

    Only reads module-level values; it does not load or touch the model.
    """
    service_info = {"status": "running", "service": "TTS API"}
    service_info["startup_time"] = app_startup_time.isoformat()
    service_info["model_loaded"] = model_loaded
    service_info["device"] = DEVICE
    # Only expose whether an upload URL is set, not the URL itself.
    service_info["oci_configured"] = bool(OCI_UPLOAD_API_URL)
    return service_info
|
| 374 |
|
| 375 |
+
@app.get("/health")
async def health_check():
    """Liveness probe: always reports ``healthy`` with timing metadata.

    Only reads module-level values; it does not load or touch the model.
    """
    report = {"status": "healthy"}
    report["timestamp"] = datetime.now().isoformat()
    report["startup_time"] = app_startup_time.isoformat()
    report["model_loaded"] = model_loaded
    report["service"] = "TTS API"
    return report
|
| 385 |
+
|
| 386 |
+
@app.get("/api/health")
async def api_health_check():
    """Health probe that also reports model, device and uptime details."""
    # The model name is only reported once a model is actually loaded.
    active_model = current_model if model_loaded else "none"
    elapsed = datetime.now() - app_startup_time
    return {
        "status": "healthy",
        "model_loaded": model_loaded,
        "current_model": active_model,
        "device": DEVICE,
        "uptime": str(elapsed),
        "timestamp": datetime.now().isoformat(),
    }
|
| 397 |
|
|
|
|
| 405 |
}
|
| 406 |
|
| 407 |
try:
|
| 408 |
+
# Test connection to OCI service with short timeout
|
| 409 |
test_url = f"{OCI_UPLOAD_API_URL}/api/health"
|
| 410 |
+
response = requests.get(test_url, timeout=5)
|
| 411 |
|
| 412 |
if response.status_code == 200:
|
| 413 |
return {
|
|
|
|
| 433 |
async def generate_tts(request: TTSRequest):
|
| 434 |
"""Generate TTS for a single text with lazy model loading"""
|
| 435 |
try:
|
| 436 |
+
# Lazy load model on first request
|
| 437 |
if not model_loaded or current_voice_style != request.voice_style:
|
| 438 |
+
print("π Lazy loading TTS model...")
|
| 439 |
if not load_tts_model(request.voice_style):
|
| 440 |
return {
|
| 441 |
"status": "error",
|
| 442 |
+
"message": "TTS model failed to load. Please try again in a moment.",
|
| 443 |
"requires_tos_acceptance": True,
|
| 444 |
"tos_url": "https://coqui.ai/cpml.txt"
|
| 445 |
}
|
|
|
|
| 447 |
print(f"π₯ TTS request for project: {request.project_id}")
|
| 448 |
print(f" Text length: {len(request.text)} characters")
|
| 449 |
print(f" Voice style: {request.voice_style}")
|
|
|
|
| 450 |
|
| 451 |
# Check if voice cloning is requested but not supported
|
| 452 |
if request.voice_name != "default" and not supports_voice_cloning():
|
|
|
|
| 456 |
"model": current_model
|
| 457 |
}
|
| 458 |
|
| 459 |
+
# Generate unique filename
|
| 460 |
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
|
| 461 |
filename = f"voiceover_{timestamp}.wav"
|
| 462 |
output_path = f"/tmp/output/{filename}"
|
|
|
|
| 478 |
|
| 479 |
# Clean the text before generation
|
| 480 |
cleaned_text = clean_text(request.text)
|
|
|
|
| 481 |
print(f"π Cleaned text: '{cleaned_text}'")
|
| 482 |
|
| 483 |
+
# Generate TTS with error handling
|
| 484 |
try:
|
| 485 |
+
print(f"π Generating TTS with {current_model}...")
|
| 486 |
|
| 487 |
+
# Simple TTS generation for fast models
|
| 488 |
+
tts.tts_to_file(
|
| 489 |
+
text=cleaned_text,
|
| 490 |
+
file_path=output_path
|
| 491 |
+
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 492 |
|
| 493 |
except Exception as tts_error:
|
| 494 |
print(f"β TTS generation failed: {tts_error}")
|
| 495 |
# Try alternative approach
|
| 496 |
try:
|
| 497 |
print("π Trying alternative TTS generation method...")
|
| 498 |
+
audio = tts.tts(text=cleaned_text)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 499 |
|
| 500 |
# Save manually
|
| 501 |
if not save_wav(audio, output_path):
|
|
|
|
| 519 |
|
| 520 |
if error:
|
| 521 |
print(f"β OCI upload failed: {error}")
|
| 522 |
+
# Return success with local file info
|
| 523 |
return {
|
| 524 |
"status": "success_local",
|
| 525 |
"message": f"TTS generated locally (upload failed: {error})",
|
|
|
|
| 546 |
"filename": filename,
|
| 547 |
"oci_path": upload_result.get("path", f"{request.project_id}/voiceover/{filename}"),
|
| 548 |
"model_used": current_model,
|
| 549 |
+
"voice_style": request.voice_style
|
|
|
|
| 550 |
}
|
| 551 |
|
| 552 |
except Exception as e:
|
| 553 |
print(f"β TTS generation error: {str(e)}")
|
| 554 |
+
raise HTTPException(status_code=500, detail=f"TTS generation failed: {str(e)}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 555 |
|
| 556 |
@app.post("/api/batch-tts")
|
| 557 |
async def batch_generate_tts(request: BatchTTSRequest):
|
|
|
|
| 559 |
try:
|
| 560 |
# Lazy load model on first request
|
| 561 |
if not model_loaded or current_voice_style != request.voice_style:
|
| 562 |
+
print("π Lazy loading TTS model for batch processing...")
|
| 563 |
if not load_tts_model(request.voice_style):
|
| 564 |
raise HTTPException(status_code=500, detail="TTS model failed to load")
|
| 565 |
|
| 566 |
print(f"π₯ Batch TTS request for project: {request.project_id}")
|
| 567 |
print(f" Number of texts: {len(request.texts)}")
|
| 568 |
print(f" Voice style: {request.voice_style}")
|
|
|
|
| 569 |
|
| 570 |
# Check if voice cloning is requested but not supported
|
| 571 |
if request.voice_name != "default" and not supports_voice_cloning():
|
|
|
|
| 596 |
|
| 597 |
# Generate TTS
|
| 598 |
try:
|
| 599 |
+
tts.tts_to_file(text=cleaned_text, file_path=output_path)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 600 |
|
| 601 |
# Verify file was created
|
| 602 |
if not os.path.exists(output_path):
|
|
|
|
| 759 |
async def get_voice_styles():
    """Return the selectable voice styles and their descriptions.

    The response is ``{"voice_styles": {style_key: description}}``.
    """
    catalogue = (
        ("default_female", "Default female voice (Tacotron2) - Fast"),
        ("clear_male", "Clear male voice (Tacotron2) - Fast"),
        ("male_deep", "Deep male voice (Fallback to default)"),
        ("male_medium", "Medium male voice (Fallback to default)"),
        ("female_1", "Female voice 1 (Fallback to default)"),
        ("female_2", "Female voice 2 (Fallback to default)"),
    )
    # dict() keeps the listing order shown above.
    return {"voice_styles": dict(catalogue)}
|
| 770 |
|
| 771 |
+
@app.get("/api/status")
async def get_status():
    """Return a detailed snapshot of service and model state."""
    # The model name is only reported once a model is actually loaded.
    active_model = current_model if model_loaded else "none"
    snapshot = {
        "status": "running",
        "model_loaded": model_loaded,
        "current_model": active_model,
        "current_voice_style": current_voice_style,
        "device": DEVICE,
        "oci_configured": bool(OCI_UPLOAD_API_URL),
    }
    snapshot["startup_time"] = app_startup_time.isoformat()
    snapshot["uptime"] = str(datetime.now() - app_startup_time)
    snapshot["model_load_attempts"] = model_load_attempts
    return snapshot
|
| 785 |
+
|
| 786 |
+
# Startup event - NO MODEL LOADING to avoid timeouts
|
| 787 |
+
@app.on_event("startup")
async def startup_event():
    """Print a startup banner; no model is loaded here.

    Loading is left to the first request so that application startup
    itself stays quick.
    """
    banner_rule = "=" * 50
    print(banner_rule)
    print("π TTS API Starting Up...")
    # This literal was split across two lines by a mis-decoded character,
    # which left it unterminated; it is rejoined here.
    print(f"✅ Device: {DEVICE}")
    print(f"π§ OCI Upload: {OCI_UPLOAD_API_URL or 'Local only'}")
    print("π Models will load on first request (lazy loading)")
    print("β° Startup time:", app_startup_time.isoformat())
    print(banner_rule)
|
| 797 |
+
|
| 798 |
if __name__ == "__main__":
    # Script entry point: serve the app on every interface, port 8000,
    # with per-request access logging switched off.
    import uvicorn

    server_options = {"host": "0.0.0.0", "port": 8000, "access_log": False}
    uvicorn.run(app, **server_options)
|