Spaces:

mkfallah
/

vgap

Sleeping

App Files Files Community

mkfallah committed on Sep 6, 2025

Commit

9f08613

verified ·

1 Parent(s): ce98ad4

Update app.py

Browse files

Files changed (1) hide show

app.py +17 -16

app.py CHANGED Viewed

@@ -1,5 +1,5 @@
 import gradio as gr
-from transformers import pipeline, AutoModelForCausalLM, AutoTokenizer, SpeechT5Processor, SpeechT5ForTextToSpeech
 import torch
 import soundfile as sf
@@ -8,21 +8,18 @@ import soundfile as sf
 # --------------------------
 asr = pipeline(
     task="automatic-speech-recognition",
-    model="vhdm/whisper-large-fa-v1",
-    device=-1
 )
 # --------------------------
-# 2. Language Model (LLM)
 # --------------------------
-llm_model_id = "tiiuae/falcon-rw-1b"
 tokenizer = AutoTokenizer.from_pretrained(llm_model_id)
-llm_model = AutoModelForCausalLM.from_pretrained(
-    llm_model_id,
-    torch_dtype=torch.float32
-).to("cpu")
-def ask_llm(prompt, max_new_tokens=200):
     inputs = tokenizer(prompt, return_tensors="pt").to(llm_model.device)
     with torch.no_grad():
         outputs = llm_model.generate(**inputs, max_new_tokens=max_new_tokens)
@@ -31,16 +28,20 @@ def ask_llm(prompt, max_new_tokens=200):
 # --------------------------
 # 3. TTS (text-to-speech) using SpeechT5
 # --------------------------
 processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
 tts_model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")
-# Random speaker embedding (can be replaced with a fixed one for consistency)
-speaker_embedding = torch.randn(1, 512)
 def text_to_speech(text, out_path="output.wav"):
     inputs = processor(text=text, return_tensors="pt")
-    with torch.no_grad():
-        speech = tts_model.generate_speech(inputs["input_ids"], speaker_embedding)
     sf.write(out_path, speech.numpy(), 16000)
     return out_path
@@ -77,8 +78,8 @@ iface = gr.Interface(
     fn=full_pipeline,
     inputs=gr.Audio(type="filepath", label="Record or upload audio"),
     outputs=[gr.Textbox(label="Conversation"), gr.Audio(label="TTS Response")],
-    title="Persian Voice Assistant",
-    description="ASR → LLM → TTS"
 )
 if __name__ == "__main__":

 import gradio as gr
+from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqLM, SpeechT5Processor, SpeechT5ForTextToSpeech
 import torch
 import soundfile as sf
 # --------------------------
 asr = pipeline(
     task="automatic-speech-recognition",
+    model="openai/whisper-small",  # smaller model = faster
+    device=-1  # set to 0 for GPU
 )
 # --------------------------
+# 2. Language Model (LLM) - lightweight
 # --------------------------
+llm_model_id = "google/flan-t5-small"
 tokenizer = AutoTokenizer.from_pretrained(llm_model_id)
+llm_model = AutoModelForSeq2SeqLM.from_pretrained(llm_model_id).to("cpu")
+def ask_llm(prompt, max_new_tokens=100):
     inputs = tokenizer(prompt, return_tensors="pt").to(llm_model.device)
     with torch.no_grad():
         outputs = llm_model.generate(**inputs, max_new_tokens=max_new_tokens)
 # --------------------------
 # 3. TTS (text-to-speech) using SpeechT5
 # --------------------------
+from datasets import load_dataset
 processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
 tts_model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")
+# use a fixed speaker embedding (pre-extracted)
+embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation", streaming=True)
+for i, example in enumerate(embeddings_dataset):
+    if i == 0:  # just take the first speaker embedding
+        speaker_embedding = torch.tensor(example["xvector"]).unsqueeze(0)
+        break
 def text_to_speech(text, out_path="output.wav"):
     inputs = processor(text=text, return_tensors="pt")
+    speech = tts_model.generate_speech(inputs["input_ids"], speaker_embedding)
     sf.write(out_path, speech.numpy(), 16000)
     return out_path
     fn=full_pipeline,
     inputs=gr.Audio(type="filepath", label="Record or upload audio"),
     outputs=[gr.Textbox(label="Conversation"), gr.Audio(label="TTS Response")],
+    title="Persian Voice Assistant (Fast LLM)",
+    description="ASR → Lightweight LLM → TTS"
 )
 if __name__ == "__main__":