Update app.py
app.py
CHANGED
@@ -1,152 +1,84 @@

Old version:

import gradio as gr
-from transformers import pipeline, AutoModelForCausalLM, AutoTokenizer
-from rapidfuzz import process, fuzz
-import soundfile as sf
-import numpy as np
import torch

-# ----------------------------
-# 1) ASR
-# ----------------------------
asr = pipeline(
-    "automatic-speech-recognition",
    model="vhdm/whisper-large-fa-v1",
-    device=-1
)

-# ----------------------------
-# 2) Language model (LLM)
-# ----------------------------
-llm_model_id = "tiiuae/falcon-rw-1b"
tokenizer = AutoTokenizer.from_pretrained(llm_model_id)
llm_model = AutoModelForCausalLM.from_pretrained(
    llm_model_id,
-    torch_dtype=torch.float32
).to("cpu")

-def ask_llm(prompt, max_new_tokens):
    inputs = tokenizer(prompt, return_tensors="pt").to(llm_model.device)
    with torch.no_grad():
        outputs = llm_model.generate(**inputs, max_new_tokens=max_new_tokens)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

-# ----------------------------
-# 3) TTS
-# ----------------------------
-def text_to_speech_save(text: str, out_path: str = "response.wav") -> str:
-    """
-    Use the text-to-speech pipeline to synthesize `text` and save to `out_path`.
-    Returns the path on success or raises an exception on failure.
-    """
-    # pipeline may return a dict or list depending on versions; handle both
-    result = tts_pipeline(text)
-    if isinstance(result, list):
-        entry = result[0]
-    else:
-        entry = result
-    audio = entry.get("audio") if isinstance(entry, dict) else None
-    sr = entry.get("sampling_rate", 16000) if isinstance(entry, dict) else 16000
-
-    if audio is None:
-        # some pipeline versions return numpy array directly
-        audio = result if isinstance(result, np.ndarray) else None
-
-
    return out_path

-# ----------------------------
-# 4) Custom vocabulary
-# ----------------------------
-custom_vocab_map = {
-    "ورد": ["ورد", "وورد", "وُرد"],
-    "کامپیوتر": ["کامپیوتر", "کامپیوتره"],
-    "هوش مصنوعی": ["هوش مصنوعی", "هوش صنعتی"],
-    "ماشین": ["ماشین", "ماشینه"],
-}
-
-def replace_fuzzy(text: str, vocab_map: dict, threshold: int = 85) -> str:
-    """
-    Replace near-matches in `text` with canonical targets from vocab_map.
-    Handles rapidfuzz.extractOne return types (object or tuple).
-    """
-    if not text:
-        return text
-    for target, alternatives in vocab_map.items():
-        try:
-            res = process.extractOne(text, alternatives, scorer=fuzz.partial_ratio)
-        except Exception:
-            res = None
-        if not res:
-            continue
-        # res may be an Extracted object or tuple
-        if hasattr(res, "value") and hasattr(res, "score"):
-            match = res.value
-            score = res.score
-        else:
-            # tuple like (match, score, idx) or (match, score)
-            match = res[0]
-            score = res[1] if len(res) > 1 else 0
-        if score >= threshold:
-            # replace only the first occurrence to avoid accidental global replacement
-            text = text.replace(match, target, 1)
-    return text
-
-# ----------------------------
-# 5) Full pipeline function
-# ----------------------------
-def full_pipeline(audio_file: str):
-    """
-    audio_file is a filepath (Gradio with type='filepath' sends a path for mic/upload).
-    Returns (text_output_str, path_to_tts_wav or None).
-    """
    if not audio_file:
        return "No audio input detected.", None

-    # 1) ASR
    try:
-
    except Exception as e:
        return f"ASR error: {e}", None

-
-    if raw_text is None:
-        raw_text = ""
-
-    # 2) fuzzy replacement
-    corrected_text = replace_fuzzy(raw_text, custom_vocab_map, threshold=85)

-    # 3) LLM reply
    try:
-
    except Exception as e:
-

-    # 4) TTS (synthesize LLM reply)
    try:
-
    except Exception as e:
-        return f"TTS error: {e}", None

-
-    return convo, audio_out

-# ----------------------------
-# 6) Gradio interface
-# ----------------------------
iface = gr.Interface(
    fn=full_pipeline,
    inputs=gr.Audio(type="filepath", label="Record or upload audio"),
    outputs=[gr.Textbox(label="Conversation"), gr.Audio(label="TTS Response")],
    title="Persian Voice Assistant",
-    description="ASR → LLM → TTS"
)

if __name__ == "__main__":

New version:

import gradio as gr
+from transformers import pipeline, AutoModelForCausalLM, AutoTokenizer, SpeechT5Processor, SpeechT5ForTextToSpeech
import torch
+import soundfile as sf

+# --------------------------
+# 1. ASR (speech to text)
+# --------------------------
asr = pipeline(
+    task="automatic-speech-recognition",
    model="vhdm/whisper-large-fa-v1",
+    device=-1
)

+# --------------------------
+# 2. Language Model (LLM)
+# --------------------------
+llm_model_id = "tiiuae/falcon-rw-1b"
tokenizer = AutoTokenizer.from_pretrained(llm_model_id)
llm_model = AutoModelForCausalLM.from_pretrained(
    llm_model_id,
+    torch_dtype=torch.float32
).to("cpu")

+def ask_llm(prompt, max_new_tokens=200):
    inputs = tokenizer(prompt, return_tensors="pt").to(llm_model.device)
    with torch.no_grad():
        outputs = llm_model.generate(**inputs, max_new_tokens=max_new_tokens)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)
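# Note (editor sketch, not part of this commit): generate() above decodes greedily; if replies
# come out repetitive, sampling can be enabled with the standard generate kwargs, e.g.
#   outputs = llm_model.generate(**inputs, max_new_tokens=max_new_tokens,
#                                do_sample=True, temperature=0.7, top_p=0.9)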

+# --------------------------
+# 3. TTS (text-to-speech) using SpeechT5
+# --------------------------
+processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
+tts_model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")

+# Random speaker embedding (can be replaced with a fixed one for consistency)
+speaker_embedding = torch.randn(1, 512)
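# Suggestion (editor sketch, not part of this commit): for a consistent voice, the random
# embedding above can be swapped for a fixed x-vector, e.g. from the CMU ARCTIC x-vectors
# on the Hub (assumes the `datasets` package is installed):
#   from datasets import load_dataset
#   xvectors = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
#   speaker_embedding = torch.tensor(xvectors[7306]["xvector"]).unsqueeze(0)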

+def text_to_speech(text, out_path="output.wav"):
+    inputs = processor(text=text, return_tensors="pt")
+    with torch.no_grad():
+        speech = tts_model.generate_speech(inputs["input_ids"], speaker_embedding)
+    sf.write(out_path, speech.numpy(), 16000)
    return out_path
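# Suggestion (editor sketch, not part of this commit): generate_speech() called without a vocoder
# returns a mel spectrogram rather than a waveform, so the WAV written above may not be audible
# speech. SpeechT5 is normally paired with its HiFi-GAN vocoder:
#   from transformers import SpeechT5HifiGan
#   vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
#   speech = tts_model.generate_speech(inputs["input_ids"], speaker_embedding, vocoder=vocoder)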

+# --------------------------
+# 4. Full pipeline function
+# --------------------------
+def full_pipeline(audio_file):
    if not audio_file:
        return "No audio input detected.", None

    try:
+        result = asr(audio_file, chunk_length_s=30, stride_length_s=[5, 5])
    except Exception as e:
        return f"ASR error: {e}", None

+    user_text = result.get("text", "")

    try:
+        llm_response = ask_llm(user_text)
    except Exception as e:
+        return f"Assistant generation error: {e}", None

    try:
+        audio_path = text_to_speech(llm_response, "response.wav")
    except Exception as e:
+        return f"TTS error: {e}", None

+    return f"User said: {user_text}\nAssistant: {llm_response}", audio_path

+# --------------------------
+# 5. Gradio Interface
+# --------------------------
iface = gr.Interface(
    fn=full_pipeline,
    inputs=gr.Audio(type="filepath", label="Record or upload audio"),
    outputs=[gr.Textbox(label="Conversation"), gr.Audio(label="TTS Response")],
    title="Persian Voice Assistant",
+    description="ASR → LLM → TTS"
)

if __name__ == "__main__":