nurfarah57 committed on
Commit
830736b
·
verified ·
1 Parent(s): 8b39b35

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +13 -22
app.py CHANGED
@@ -1,4 +1,12 @@
1
  import os
 
 
 
 
 
 
 
 
2
  import io
3
  import re
4
  import numpy as np
@@ -9,15 +17,9 @@ from fastapi.responses import StreamingResponse
9
  from pydantic import BaseModel
10
  from transformers import VitsModel, AutoTokenizer
11
 
12
- # Set environment variables to avoid permission issues in container
13
- os.environ["HF_HOME"] = "/tmp"
14
- os.environ["TRANSFORMERS_CACHE"] = "/tmp"
15
- os.environ["TORCH_HOME"] = "/tmp"
16
- os.environ["XDG_CACHE_HOME"] = "/tmp"
17
-
18
  app = FastAPI()
19
 
20
- # Load model and tokenizer at startup
21
  model = VitsModel.from_pretrained("Somali-tts/somali_tts_model")
22
  tokenizer = AutoTokenizer.from_pretrained("saleolow/somali-mms-tts")
23
 
@@ -25,7 +27,7 @@ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
25
  model.to(device)
26
  model.eval()
27
 
28
- # Number-to-Somali words dictionary
29
  number_words = {
30
  0: "eber", 1: "koow", 2: "labo", 3: "seddex", 4: "afar", 5: "shan",
31
  6: "lix", 7: "todobo", 8: "sideed", 9: "sagaal", 10: "toban",
@@ -73,11 +75,11 @@ def number_to_words(number: int) -> str:
73
  return str(number)
74
 
75
  def normalize_text(text: str) -> str:
76
- # Replace numbers with words
77
  numbers = re.findall(r'\d+', text)
78
  for num in numbers:
79
  text = text.replace(num, number_to_words(int(num)))
80
- # Additional Somali text normalization rules
81
  text = text.replace("KH", "qa").replace("Z", "S")
82
  text = text.replace("SH", "SHa'a").replace("DH", "Dha'a")
83
  text = text.replace("ZamZam", "SamSam")
@@ -88,38 +90,27 @@ class TextIn(BaseModel):
88
 
89
@app.post("/synthesize")
async def synthesize(data: TextIn):
    """Convert Somali text to speech and stream the result back as a WAV file.

    Pipeline: normalize the input text, run VITS inference, post-process the
    waveform (mono, float32, clipped, 16-bit PCM), and wrap it in an in-memory
    WAV container.
    """
    normalized = normalize_text(data.inputs)
    encoded = tokenizer(normalized, return_tensors="pt").to(device)

    # Inference only — no gradients needed.
    with torch.no_grad():
        audio = model(**encoded).waveform.squeeze().cpu().numpy()

    # Average channels down to mono when the model emits multi-channel audio.
    if audio.ndim > 1:
        audio = audio.mean(axis=0)

    # float32 in [-1, 1], then scale to 16-bit PCM for the WAV container.
    samples = np.clip(audio.astype(np.float32), -1.0, 1.0)
    pcm = (samples * 32767).astype(np.int16)

    wav_buffer = io.BytesIO()
    # Fall back to 22050 Hz if the model config does not declare a rate.
    rate = getattr(model.config, "sampling_rate", 22050)
    scipy.io.wavfile.write(wav_buffer, rate=rate, data=pcm)
    wav_buffer.seek(0)

    print(f"Generated audio length: {pcm.shape[0]} samples, Sample rate: {rate}")

    return StreamingResponse(wav_buffer, media_type="audio/wav")
 
1
  import os
2
+
3
+ # === IMPORTANT ===
4
+ # Set cache directories BEFORE any imports that use Hugging Face or PyTorch caching
5
+ os.environ["HF_HOME"] = "/tmp"
6
+ os.environ["TRANSFORMERS_CACHE"] = "/tmp"
7
+ os.environ["TORCH_HOME"] = "/tmp"
8
+ os.environ["XDG_CACHE_HOME"] = "/tmp"
9
+
10
  import io
11
  import re
12
  import numpy as np
 
17
  from pydantic import BaseModel
18
  from transformers import VitsModel, AutoTokenizer
19
 
 
 
 
 
 
 
20
  app = FastAPI()
21
 
22
+ # Load model and tokenizer once at startup
23
  model = VitsModel.from_pretrained("Somali-tts/somali_tts_model")
24
  tokenizer = AutoTokenizer.from_pretrained("saleolow/somali-mms-tts")
25
 
 
27
  model.to(device)
28
  model.eval()
29
 
30
+ # Somali number words dictionary for normalization
31
  number_words = {
32
  0: "eber", 1: "koow", 2: "labo", 3: "seddex", 4: "afar", 5: "shan",
33
  6: "lix", 7: "todobo", 8: "sideed", 9: "sagaal", 10: "toban",
 
75
  return str(number)
76
 
77
  def normalize_text(text: str) -> str:
78
+ # Replace digits with Somali words
79
  numbers = re.findall(r'\d+', text)
80
  for num in numbers:
81
  text = text.replace(num, number_to_words(int(num)))
82
+ # Additional Somali text normalizations
83
  text = text.replace("KH", "qa").replace("Z", "S")
84
  text = text.replace("SH", "SHa'a").replace("DH", "Dha'a")
85
  text = text.replace("ZamZam", "SamSam")
 
90
 
91
@app.post("/synthesize")
async def synthesize(data: TextIn):
    """Synthesize Somali speech for ``data.inputs`` and stream it as WAV audio.

    Fix: model inference is synchronous and CPU/GPU-bound; calling it directly
    inside an ``async def`` endpoint blocks the event loop and stalls every
    concurrent request. The heavy work is therefore pushed to a worker thread
    with ``asyncio.to_thread`` while the endpoint's interface stays unchanged.
    """
    import asyncio  # local import: stdlib, only needed by this endpoint

    text = normalize_text(data.inputs)
    buf, sample_rate, n_samples = await asyncio.to_thread(_render_wav, text)

    print(f"Generated audio length: {n_samples} samples, Sample rate: {sample_rate}")

    return StreamingResponse(buf, media_type="audio/wav")


def _render_wav(text: str):
    """Run TTS inference for *text*; return (wav_buffer, sample_rate, n_samples).

    Synchronous helper intended to run on a worker thread.
    """
    inputs = tokenizer(text, return_tensors="pt").to(device)

    # Inference only — no gradients needed.
    with torch.no_grad():
        output = model(**inputs)
    waveform = output.waveform.squeeze().cpu().numpy()

    # Collapse multi-channel output to mono.
    if waveform.ndim > 1:
        waveform = waveform.mean(axis=0)

    # float32 in [-1, 1] -> 16-bit PCM for the WAV container.
    waveform = np.clip(waveform.astype(np.float32), -1.0, 1.0)
    pcm_waveform = (waveform * 32767).astype(np.int16)

    buf = io.BytesIO()
    # Fall back to 22050 Hz if the model config does not declare a rate.
    sample_rate = getattr(model.config, "sampling_rate", 22050)
    scipy.io.wavfile.write(buf, rate=sample_rate, data=pcm_waveform)
    buf.seek(0)
    return buf, sample_rate, pcm_waveform.shape[0]