MaenGit committed on
Commit
6d29231
·
1 Parent(s): ed05d40
Files changed (1) hide show
  1. server.py +35 -13
server.py CHANGED
@@ -1,19 +1,16 @@
1
  from flask import Flask, request, send_file, jsonify, after_this_request
2
  from TTS.api import TTS
3
- import tempfile
4
  import os
5
  import uuid
6
- import soundfile as sf
7
  import torch
 
8
 
9
  app = Flask(__name__)
10
 
11
  MODEL_MULTI = "tts_models/multilingual/multi-dataset/xtts_v2"
12
- tts_multi = TTS(model_name=MODEL_MULTI, gpu=False) # Set gpu=True if CUDA available
13
-
14
- # Print available speakers and load mapping (no manual latents needed)
15
- print("Available speakers:", tts_multi.speakers[:10], "...") # XTTS has 100+ speakers [web:page:2]
16
 
 
17
  SPEAKERS = {
18
  "en_male": "Baldur Sanjin",
19
  "en_female": "Gracie Wise",
@@ -21,6 +18,22 @@ SPEAKERS = {
21
  "ar_female": "Claribel Dervla"
22
  }
23
  print("EN/AR Speakers:", SPEAKERS)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
24
 
25
  @app.route("/tts", methods=["POST"])
26
  def tts_api():
@@ -32,21 +45,30 @@ def tts_api():
32
  if not text:
33
  return jsonify({"error": "Text is required"}), 400
34
 
35
- # Pick a speaker by name (XTTS supports direct string names for presets)
36
  speaker_name = SPEAKERS.get(f"{language}_{gender}", "Baldur Sanjin")
37
 
 
 
 
 
38
  out_path = f"/tmp/{uuid.uuid4()}.wav"
39
 
40
- # Use high-level TTS API - handles latents/embeddings internally
41
- # speed supported via split_sentences=True + length_scale, but for simplicity use tts_to_file
42
- tts_multi.tts_to_file(
 
43
  text=text,
44
- speaker=speaker_name, # Direct string works for XTTS presets [web:page:2]
 
45
  language=language,
46
- file_path=out_path,
47
- speed=speed # Note: speed param may vary by TTS version; test or use post-processing
48
  )
49
 
 
 
 
 
50
  @after_this_request
51
  def cleanup(response):
52
  try:
 
1
  from flask import Flask, request, send_file, jsonify, after_this_request
2
  from TTS.api import TTS
 
3
  import os
4
  import uuid
 
5
  import torch
6
+ import requests # For potential voice download if needed
7
 
8
  app = Flask(__name__)
9
 
10
  MODEL_MULTI = "tts_models/multilingual/multi-dataset/xtts_v2"
11
+ tts_multi = TTS(model_name=MODEL_MULTI, gpu=False)
 
 
 
12
 
13
+ # Confirmed XTTS v2 preset speakers (no .pth files needed for basic use)
14
  SPEAKERS = {
15
  "en_male": "Baldur Sanjin",
16
  "en_female": "Gracie Wise",
 
18
  "ar_female": "Claribel Dervla"
19
  }
20
  print("EN/AR Speakers:", SPEAKERS)
21
# Report how many preset speakers the loaded model exposes, or "N/A" when the
# model object has no `speakers` attribute or that attribute is None.
# The original line ended in a stray `[web:20]` citation artifact, which
# subscripted the result of print() and raised NameError on `web` at import.
print(
    "Available speakers count:",
    len(tts_multi.speakers) if getattr(tts_multi, "speakers", None) is not None else "N/A",
)
22
+
23
# Per-speaker caches, keyed by preset speaker name, so repeated requests for
# the same voice skip the speaker-manager lookup.
gpt_cond_latent_cache = {}
speaker_embedding_cache = {}


def load_speaker_embedding(speaker_name):
    """Return the ``(gpt_cond_latent, speaker_embedding)`` pair for a speaker.

    The pair is served from the module-level caches when ``speaker_name`` has
    been seen before; otherwise it is read from the loaded model's speaker
    table and stored in both caches before being returned.

    Args:
        speaker_name: Name of a preset speaker, used as the lookup key.

    Returns:
        A 2-tuple ``(gpt_cond_latent, speaker_embedding)``.

    Raises:
        Whatever the speaker table raises for an unknown name (``KeyError``
        if it is a plain dict); nothing is cached in that case.
    """
    if speaker_name in gpt_cond_latent_cache:
        return gpt_cond_latent_cache[speaker_name], speaker_embedding_cache[speaker_name]

    # NOTE(review): relies on the speaker entry yielding exactly two values in
    # the order (gpt_cond_latent, speaker_embedding) -- confirm against the
    # installed TTS version, and that `synthesizer.speaker_manager` is the
    # right attribute path for this model.
    speaker_entry = tts_multi.synthesizer.speaker_manager.speakers[speaker_name]
    gpt_cond_latent, speaker_embedding = speaker_entry.values()

    gpt_cond_latent_cache[speaker_name] = gpt_cond_latent
    speaker_embedding_cache[speaker_name] = speaker_embedding
    # The original return line carried a stray `[web:36]` citation artifact
    # that subscripted the embedding and raised NameError on every cache miss.
    return gpt_cond_latent, speaker_embedding
37
 
38
  @app.route("/tts", methods=["POST"])
39
  def tts_api():
 
45
  if not text:
46
  return jsonify({"error": "Text is required"}), 400
47
 
 
48
  speaker_name = SPEAKERS.get(f"{language}_{gender}", "Baldur Sanjin")
49
 
50
+ # Verify speaker exists
51
+ if speaker_name not in tts_multi.synthesizer.speaker_manager.speakers:
52
+ return jsonify({"error": f"Speaker '{speaker_name}' not available"}), 400
53
+
54
  out_path = f"/tmp/{uuid.uuid4()}.wav"
55
 
56
+ # Low-level TTS with cached latents (bypasses voice file lookup for presets)
57
+ gpt_cond_latent, speaker_embedding = load_speaker_embedding(speaker_name)
58
+
59
+ wav = tts_multi.synthesizer.tts(
60
  text=text,
61
+ gpt_cond_latent=gpt_cond_latent,
62
+ speaker_embedding=speaker_embedding,
63
  language=language,
64
+ temperature=0.7,
65
+ speed=speed # XTTS supports speed via length_scale internally
66
  )
67
 
68
+ # Save WAV (sample rate 24kHz for XTTS)
69
+ import soundfile as sf
70
+ sf.write(out_path, wav, 24000, subtype="PCM_16")
71
+
72
  @after_this_request
73
  def cleanup(response):
74
  try: