amasha03 committed on
Commit
12dfa83
·
verified ·
1 Parent(s): 97553c6

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +31 -25
app.py CHANGED
@@ -1,38 +1,44 @@
1
  import gradio as gr
2
- from transformers import VitsModel, AutoTokenizer
3
  import torch
 
4
 
5
- # 1. Load the models specifically as VITS models
6
- # This avoids the "Unrecognized model" error
7
- models = {
8
- "English": {
9
- "model": VitsModel.from_pretrained("E-motionAssistant/text-to-speech-VITS-english"),
10
- "tokenizer": AutoTokenizer.from_pretrained("E-motionAssistant/text-to-speech-VITS-english")
11
- },
12
- "Sinhala": {
13
- "model": VitsModel.from_pretrained("E-motionAssistant/text-to-speech-VITS-sinhala"),
14
- "tokenizer": AutoTokenizer.from_pretrained("E-motionAssistant/text-to-speech-VITS-sinhala")
15
- },
16
- "Tamil": {
17
- "model": VitsModel.from_pretrained("E-motionAssistant/text-to-speech-VITS-tamil"),
18
- "tokenizer": AutoTokenizer.from_pretrained("E-motionAssistant/text-to-speech-VITS-tamil")
19
- }
20
- }
21
 
22
  def generate_speech(text, language):
23
  try:
24
- selected = models[language]
25
- inputs = selected["tokenizer"](text, return_tensors="pt")
 
 
 
 
 
 
26
 
27
  with torch.no_grad():
28
- output = selected["model"](**inputs).waveform
29
-
30
- # VITS models typically output at 22050Hz
31
- # We convert the tensor to a numpy array for Gradio
32
- return (22050, output.cpu().numpy().squeeze())
 
33
 
34
  except Exception as e:
35
- print(f"Error: {e}")
36
  return None
37
 
38
  demo = gr.Interface(
 
1
  import gradio as gr
2
+ from transformers import AutoModel, AutoTokenizer
3
  import torch
4
+ import scipy.io.wavfile
5
 
6
+ # Load models with 'trust_remote_code' to handle custom architectures
7
def load_model(model_id):
    """Download and return a ``(model, tokenizer)`` pair for *model_id*.

    ``trust_remote_code=True`` allows transformers to execute the repo's
    own modeling code, which is required for architectures that are not
    natively registered with the library.
    """
    return (
        AutoModel.from_pretrained(model_id, trust_remote_code=True),
        AutoTokenizer.from_pretrained(model_id, trust_remote_code=True),
    )
12
+
13
# Eagerly load every supported language's model/tokenizer at import time
# so the first Gradio request does not pay the download/initialization cost.
_MODEL_REPOS = {
    "English": "E-motionAssistant/text-to-speech-VITS-english",
    "Sinhala": "E-motionAssistant/text-to-speech-VITS-sinhala",
    "Tamil": "E-motionAssistant/text-to-speech-VITS-tamil",
}

_loaded = {}
for _lang, _repo in _MODEL_REPOS.items():
    print(f"Loading {_lang} Model...")
    _loaded[_lang] = load_model(_repo)

eng_model, eng_tok = _loaded["English"]
sin_model, sin_tok = _loaded["Sinhala"]
tam_model, tam_tok = _loaded["Tamil"]
 
21
 
22
def generate_speech(text, language):
    """Synthesize speech for *text* using the model for *language*.

    Args:
        text: Input string to convert to speech.
        language: "English" or "Sinhala"; any other value falls back to
            the Tamil model (preserves the original ``else`` branch).

    Returns:
        A ``(sample_rate, waveform)`` tuple suitable for a Gradio Audio
        output, or ``None`` if synthesis fails for any reason.
    """
    try:
        if language == "English":
            model, tokenizer = eng_model, eng_tok
        elif language == "Sinhala":
            model, tokenizer = sin_model, sin_tok
        else:
            model, tokenizer = tam_model, tam_tok

        inputs = tokenizer(text, return_tensors="pt")

        with torch.no_grad():
            # VITS models expose the generated audio via a 'waveform'
            # attribute on the model output.
            output = model(**inputs)
            waveform = output.waveform.cpu().numpy().squeeze()

        # Standard VITS sampling rate is 22050 Hz.
        return (22050, waveform)

    except Exception as e:
        # Fix: the previous revision bound `e` but never used it,
        # swallowing every failure silently. Log the error (as the
        # earlier version of this file did) before returning None,
        # which Gradio renders as "no audio".
        print(f"Error: {e}")
        return None
43
 
44
  demo = gr.Interface(