ssasio committed on
Commit
0b0f2b2
·
verified ·
1 Parent(s): 4bb711e

Upload app.py

Browse files
Files changed (1) hide show
  1. app.py +29 -7
app.py CHANGED
@@ -42,16 +42,35 @@ CODEC = None
42
  DEFAULT_SPEAKER_EMB = None
43
 
44
 
 
 
 
 
 
 
 
 
 
 
 
 
45
  def load_model():
46
- global MODEL, TOKENIZER, CODEC, DEFAULT_SPEAKER_EMB
47
  print(f"Loading model on {DEVICE}...")
48
  MODEL = load_for_inference(CHECKPOINT_PATH, device=DEVICE)
49
  TOKENIZER = TTSTokenizer()
50
  CODEC = CodecV6(device=DEVICE)
51
 
52
- # Load default speaker embedding from sample wav
53
- result = CODEC.encode(SAMPLE_WAV_PATH)
54
- DEFAULT_SPEAKER_EMB = result['global_embedding'].to(DEVICE)
 
 
 
 
 
 
 
55
  print("Model ready!")
56
 
57
 
@@ -89,8 +108,9 @@ app = FastAPI()
89
 
90
  @app.get("/synthesize")
91
  def api_synthesize(
92
- text: str = Query(..., description="Text to synthesize"),
93
- api_key: str = Query(..., description="API key"),
 
94
  ):
95
  if api_key != API_KEY:
96
  raise HTTPException(status_code=403, detail="Invalid API key")
@@ -99,8 +119,10 @@ def api_synthesize(
99
  if len(text) > 500:
100
  raise HTTPException(status_code=400, detail="Text too long (max 500 chars)")
101
 
 
 
102
  try:
103
- wav = synthesize_text(text)
104
  tmp = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
105
  sf.write(tmp.name, wav, CODEC_SAMPLE_RATE)
106
  return FileResponse(tmp.name, media_type="audio/wav")
 
42
  DEFAULT_SPEAKER_EMB = None
43
 
44
 
45
# Speaker embeddings per voice, keyed by voice id.
# Starts empty; load_model() fills it for every voice whose sample file exists.
VOICE_EMBEDDINGS: dict = {}

# Voice id -> reference wav file that the voice's speaker embedding is
# computed from.
# NOTE(review): "ani-en-male" points at a "male2" sample while the other ids
# mirror their file names -- confirm this mapping is intentional.
VOICE_WAV_MAP: dict = {
    "ani-bg-female": "sample_female_bg1.wav",
    "ani-bg-male": "sample_male_bg1.wav",
    "ani-bg-male2": "sample_male2_bg1.wav",
    "ani-en-female": "sample_female_en1.wav",
    "ani-en-male": "sample_male2_en1.wav",
}
55
+
56
+
57
  def load_model():
58
+ global MODEL, TOKENIZER, CODEC, DEFAULT_SPEAKER_EMB, VOICE_EMBEDDINGS
59
  print(f"Loading model on {DEVICE}...")
60
  MODEL = load_for_inference(CHECKPOINT_PATH, device=DEVICE)
61
  TOKENIZER = TTSTokenizer()
62
  CODEC = CodecV6(device=DEVICE)
63
 
64
+ for voice_id, wav_file in VOICE_WAV_MAP.items():
65
+ if os.path.exists(wav_file):
66
+ result = CODEC.encode(wav_file)
67
+ VOICE_EMBEDDINGS[voice_id] = result['global_embedding'].to(DEVICE)
68
+ print(f"Loaded speaker: {voice_id}")
69
+
70
+ DEFAULT_SPEAKER_EMB = VOICE_EMBEDDINGS.get("ani-bg-female")
71
+ if DEFAULT_SPEAKER_EMB is None:
72
+ result = CODEC.encode(SAMPLE_WAV_PATH)
73
+ DEFAULT_SPEAKER_EMB = result['global_embedding'].to(DEVICE)
74
  print("Model ready!")
75
 
76
 
 
108
 
109
  @app.get("/synthesize")
110
  def api_synthesize(
111
+ text: str = Query(...),
112
+ api_key: str = Query(...),
113
+ voice: str = Query(default="ani-bg-female"),
114
  ):
115
  if api_key != API_KEY:
116
  raise HTTPException(status_code=403, detail="Invalid API key")
 
119
  if len(text) > 500:
120
  raise HTTPException(status_code=400, detail="Text too long (max 500 chars)")
121
 
122
+ speaker_emb = VOICE_EMBEDDINGS.get(voice, DEFAULT_SPEAKER_EMB)
123
+
124
  try:
125
+ wav = synthesize_text(text, speaker_emb)
126
  tmp = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
127
  sf.write(tmp.name, wav, CODEC_SAMPLE_RATE)
128
  return FileResponse(tmp.name, media_type="audio/wav")