Update app.py
Browse files
app.py
CHANGED
|
@@ -8,21 +8,27 @@ import soundfile as sf
|
|
| 8 |
# --------------------------
|
| 9 |
asr = pipeline(
|
| 10 |
task="automatic-speech-recognition",
|
| 11 |
-
model="openai/whisper-small",
|
| 12 |
-
device=-1
|
| 13 |
)
|
| 14 |
|
| 15 |
# --------------------------
|
| 16 |
-
# 2. Language Model (LLM) -
|
| 17 |
# --------------------------
|
| 18 |
-
llm_model_id = "google/flan-t5-
|
| 19 |
tokenizer = AutoTokenizer.from_pretrained(llm_model_id)
|
| 20 |
llm_model = AutoModelForSeq2SeqLM.from_pretrained(llm_model_id).to("cpu")
|
| 21 |
|
| 22 |
-
def ask_llm(prompt, max_new_tokens=
|
| 23 |
inputs = tokenizer(prompt, return_tensors="pt").to(llm_model.device)
|
| 24 |
with torch.no_grad():
|
| 25 |
-
outputs = llm_model.generate(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 26 |
return tokenizer.decode(outputs[0], skip_special_tokens=True)
|
| 27 |
|
| 28 |
# --------------------------
|
|
@@ -31,8 +37,6 @@ def ask_llm(prompt, max_new_tokens=100):
|
|
| 31 |
processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
|
| 32 |
tts_model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")
|
| 33 |
|
| 34 |
-
# fixed dummy speaker embedding (instead of dataset)
|
| 35 |
-
# dimension must match SpeechT5 (512)
|
| 36 |
speaker_embedding = torch.randn(1, 512)
|
| 37 |
|
| 38 |
def text_to_speech(text, out_path="output.wav"):
|
|
@@ -56,7 +60,7 @@ def full_pipeline(audio_file):
|
|
| 56 |
user_text = result.get("text", "")
|
| 57 |
|
| 58 |
try:
|
| 59 |
-
llm_response = ask_llm(user_text)
|
| 60 |
except Exception as e:
|
| 61 |
return f"Assistant generation error: {e}", None
|
| 62 |
|
|
@@ -74,8 +78,8 @@ iface = gr.Interface(
|
|
| 74 |
fn=full_pipeline,
|
| 75 |
inputs=gr.Audio(type="filepath", label="Record or upload audio"),
|
| 76 |
outputs=[gr.Textbox(label="Conversation"), gr.Audio(label="TTS Response")],
|
| 77 |
-
title="Persian Voice Assistant (
|
| 78 |
-
description="ASR →
|
| 79 |
)
|
| 80 |
|
| 81 |
if __name__ == "__main__":
|
|
|
|
| 8 |
# --------------------------
# 1. Speech recognition (ASR) via Whisper; transcribes the uploaded audio.
# NOTE(review): reconstructed formatting from a diff view — verify against app.py.
asr = pipeline(
    task="automatic-speech-recognition",
    model="openai/whisper-small",
    device=-1  # -1 = force CPU execution
)

# --------------------------
# 2. Language Model (LLM) - more reliable
# --------------------------
# Flan-T5-Base generates the assistant's text reply; pinned to CPU.
llm_model_id = "google/flan-t5-base"
tokenizer = AutoTokenizer.from_pretrained(llm_model_id)
llm_model = AutoModelForSeq2SeqLM.from_pretrained(llm_model_id).to("cpu")
|
| 21 |
|
| 22 |
+
def ask_llm(prompt, max_new_tokens=200):
    """Generate a text reply from the Flan-T5 model for *prompt*.

    Args:
        prompt: Input text fed to the seq2seq model.
        max_new_tokens: Upper bound on generated tokens (default 200).

    Returns:
        The decoded generation with special tokens stripped.

    Note:
        Sampling is enabled (do_sample/top_k/top_p), so output is
        non-deterministic across calls.
    """
    # Move tokenized input onto whatever device the model lives on (CPU here).
    inputs = tokenizer(prompt, return_tensors="pt").to(llm_model.device)
    with torch.no_grad():  # inference only — skip gradient bookkeeping
        outputs = llm_model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            do_sample=True,
            top_k=50,
            top_p=0.95
        )
    # outputs[0]: first (only) sequence in the batch.
    return tokenizer.decode(outputs[0], skip_special_tokens=True)
|
| 33 |
|
| 34 |
# --------------------------
|
|
|
|
| 37 |
# --------------------------
# 3. Text-to-speech (SpeechT5) setup.
processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
tts_model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")

# Fixed dummy speaker embedding instead of loading a speaker dataset;
# shape (1, 512) matches what SpeechT5 expects.
# NOTE(review): torch.randn is re-drawn on every launch, so the synthesized
# voice differs between runs — confirm this is intended (seed it otherwise).
speaker_embedding = torch.randn(1, 512)
|
| 41 |
|
| 42 |
def text_to_speech(text, out_path="output.wav"):
|
|
|
|
| 60 |
user_text = result.get("text", "")
|
| 61 |
|
| 62 |
try:
|
| 63 |
+
llm_response = ask_llm(f"پاسخ بده به زبان ساده: {user_text}")
|
| 64 |
except Exception as e:
|
| 65 |
return f"Assistant generation error: {e}", None
|
| 66 |
|
|
|
|
| 78 |
fn=full_pipeline,
|
| 79 |
inputs=gr.Audio(type="filepath", label="Record or upload audio"),
|
| 80 |
outputs=[gr.Textbox(label="Conversation"), gr.Audio(label="TTS Response")],
|
| 81 |
+
title="Persian Voice Assistant (Reliable LLM)",
|
| 82 |
+
description="ASR → Flan-T5-Base → TTS"
|
| 83 |
)
|
| 84 |
|
| 85 |
if __name__ == "__main__":
|