Spaces:

tacab
/

TTS

Sleeping

App Files Files Community

nurfarah57 committed on May 26, 2025

Commit

3b6b693

verified ·

1 Parent(s): 7291a4c

Update app.py

Browse files

Files changed (1) hide show

app.py +21 -22

app.py CHANGED Viewed

@@ -1,6 +1,5 @@
 import os
-# Set cache dirs BEFORE imports for permission fix
 os.environ["HF_HOME"] = "/tmp"
 os.environ["TRANSFORMERS_CACHE"] = "/tmp"
 os.environ["TORCH_HOME"] = "/tmp"
@@ -19,7 +18,6 @@ from transformers import VitsModel, AutoTokenizer
 app = FastAPI()
-# Load model and tokenizer ONCE at startup
 model = VitsModel.from_pretrained("Somali-tts/somali_tts_model")
 tokenizer = AutoTokenizer.from_pretrained("saleolow/somali-mms-tts")
@@ -84,15 +82,12 @@ def normalize_text(text: str) -> str:
 def waveform_to_wav_bytes(waveform: torch.Tensor, sample_rate: int = 22050) -> bytes:
     np_waveform = waveform.cpu().numpy()
     if np_waveform.ndim == 3:
         np_waveform = np_waveform[0]
     if np_waveform.ndim == 2:
         np_waveform = np_waveform.mean(axis=0)
     np_waveform = np.clip(np_waveform, -1.0, 1.0).astype(np.float32)
     pcm_waveform = (np_waveform * 32767).astype(np.int16)
     buf = io.BytesIO()
     scipy.io.wavfile.write(buf, rate=sample_rate, data=pcm_waveform)
     buf.seek(0)
@@ -102,7 +97,25 @@ class TextIn(BaseModel):
     inputs: str
 @app.post("/synthesize")
-async def synthesize(data: TextIn, test: bool = Query(False, description="Set true to return a test tone")):
     if test:
         duration_s = 2.0
         sample_rate = 22050
@@ -110,22 +123,14 @@ async def synthesize(data: TextIn, test: bool = Query(False, description="Set tr
         freq = 440
         waveform = 0.5 * np.sin(2 * math.pi * freq * t).astype(np.float32)
         pcm_waveform = (waveform * 32767).astype(np.int16)
         buf = io.BytesIO()
         scipy.io.wavfile.write(buf, rate=sample_rate, data=pcm_waveform)
         buf.seek(0)
-        print(f"[TEST MODE] Generated test tone: {pcm_waveform.shape[0]} samples, Sample rate: {sample_rate}")
         return StreamingResponse(buf, media_type="audio/wav")
-    text = normalize_text(data.inputs)
-    inputs = tokenizer(text, return_tensors="pt").to(device)
     with torch.no_grad():
         output = model(**inputs)
-    print("Model output type:", type(output))
-    # Try to extract waveform safely:
     if hasattr(output, "waveform"):
         waveform = output.waveform
     elif isinstance(output, dict) and "waveform" in output:
@@ -134,12 +139,6 @@ async def synthesize(data: TextIn, test: bool = Query(False, description="Set tr
         waveform = output[0]
     else:
         return {"error": "Waveform not found in model output"}
-    print("Extracted waveform shape:", waveform.shape)
     sample_rate = getattr(model.config, "sampling_rate", 22050)
-    print("Sample rate:", sample_rate)
     wav_bytes = waveform_to_wav_bytes(waveform, sample_rate=sample_rate)
     return StreamingResponse(io.BytesIO(wav_bytes), media_type="audio/wav")

 import os
 os.environ["HF_HOME"] = "/tmp"
 os.environ["TRANSFORMERS_CACHE"] = "/tmp"
 os.environ["TORCH_HOME"] = "/tmp"
 app = FastAPI()
 model = VitsModel.from_pretrained("Somali-tts/somali_tts_model")
 tokenizer = AutoTokenizer.from_pretrained("saleolow/somali-mms-tts")
 def waveform_to_wav_bytes(waveform: torch.Tensor, sample_rate: int = 22050) -> bytes:
     np_waveform = waveform.cpu().numpy()
     if np_waveform.ndim == 3:
         np_waveform = np_waveform[0]
     if np_waveform.ndim == 2:
         np_waveform = np_waveform.mean(axis=0)
     np_waveform = np.clip(np_waveform, -1.0, 1.0).astype(np.float32)
     pcm_waveform = (np_waveform * 32767).astype(np.int16)
     buf = io.BytesIO()
     scipy.io.wavfile.write(buf, rate=sample_rate, data=pcm_waveform)
     buf.seek(0)
     inputs: str
 @app.post("/synthesize")
+async def synthesize_post(data: TextIn):
+    text = normalize_text(data.inputs)
+    inputs = tokenizer(text, return_tensors="pt").to(device)
+    with torch.no_grad():
+        output = model(**inputs)
+    if hasattr(output, "waveform"):
+        waveform = output.waveform
+    elif isinstance(output, dict) and "waveform" in output:
+        waveform = output["waveform"]
+    elif isinstance(output, (tuple, list)):
+        waveform = output[0]
+    else:
+        return {"error": "Waveform not found in model output"}
+    sample_rate = getattr(model.config, "sampling_rate", 22050)
+    wav_bytes = waveform_to_wav_bytes(waveform, sample_rate=sample_rate)
+    return StreamingResponse(io.BytesIO(wav_bytes), media_type="audio/wav")
+@app.get("/synthesize")
+async def synthesize_get(text: str = Query(..., description="Text to synthesize"), test: bool = Query(False)):
     if test:
         duration_s = 2.0
         sample_rate = 22050
         freq = 440
         waveform = 0.5 * np.sin(2 * math.pi * freq * t).astype(np.float32)
         pcm_waveform = (waveform * 32767).astype(np.int16)
         buf = io.BytesIO()
         scipy.io.wavfile.write(buf, rate=sample_rate, data=pcm_waveform)
         buf.seek(0)
         return StreamingResponse(buf, media_type="audio/wav")
+    normalized = normalize_text(text)
+    inputs = tokenizer(normalized, return_tensors="pt").to(device)
     with torch.no_grad():
         output = model(**inputs)
     if hasattr(output, "waveform"):
         waveform = output.waveform
     elif isinstance(output, dict) and "waveform" in output:
         waveform = output[0]
     else:
         return {"error": "Waveform not found in model output"}
     sample_rate = getattr(model.config, "sampling_rate", 22050)
     wav_bytes = waveform_to_wav_bytes(waveform, sample_rate=sample_rate)
     return StreamingResponse(io.BytesIO(wav_bytes), media_type="audio/wav")