Spaces:

SherinMohamed
/

chatbot

Build error

App Files Community

SherinMohamed committed on Jan 18

Commit

8115984

verified ·

1 Parent(s): 60358b5

Update app.py

Browse files

Files changed (1) hide show

app.py +101 -223

app.py CHANGED Viewed

@@ -1,107 +1,47 @@
 import re
-from pathlib import Path
 import gradio as gr
 import spaces
 import torch
-from transformers import (
-    pipeline,
-    AutoTokenizer,
-    AutoModelForCausalLM,
-)
-# ======= EGTTS imports (Coqui XTTS) =======
-from TTS.tts.configs.xtts_config import XttsConfig
-from TTS.tts.models.xtts import Xtts
-# =========================================================
-# 0) CONFIG
-# =========================================================
-# Translator model (MSA <-> Egyptian)
 TRANSLATOR_MODEL = "oddadmix/Masrawy-BiLingual-v1"
-# ASR model (Audio -> text)
 ASR_MODEL = "openai/whisper-small"
-# LLM model (Qwen 3B)
 LLM_MODEL = "Qwen/Qwen2.5-3B-Instruct"
-# EGTTS (Egyptian TTS) model files hosted on HF (from your provided code) :contentReference[oaicite:4]{index=4}
-CONFIG_URL = "https://huggingface.co/OmarSamir/EGTTS-V0.1/resolve/main/config.json"
-VOCAB_URL = "https://huggingface.co/OmarSamir/EGTTS-V0.1/resolve/main/vocab.json"
-MODEL_URL = "https://huggingface.co/OmarSamir/EGTTS-V0.1/resolve/main/model.pth"
-SPEAKER_AUDIO_URL = "https://huggingface.co/OmarSamir/EGTTS-V0.1/resolve/main/speaker_reference.wav"
 USE_GPU = torch.cuda.is_available()
-DEVICE_PIPELINE = 0 if USE_GPU else -1
-DEVICE_TORCH = "cuda" if USE_GPU else "cpu"
-# =========================================================
-# 1) DOWNLOAD EGTTS FILES (once)
-# =========================================================
-base_path = Path(__file__).parent
-config_path = base_path / "config.json"
-vocab_path = base_path / "vocab.json"
-model_path = base_path / "model.pth"
-default_speaker_path = base_path / "speaker_reference.wav"
-def _download_if_missing(url: str, dst: Path):
-    if not dst.exists():
-        torch.hub.download_url_to_file(url, str(dst))
-_download_if_missing(CONFIG_URL, config_path)
-_download_if_missing(VOCAB_URL, vocab_path)
-_download_if_missing(MODEL_URL, model_path)
-_download_if_missing(SPEAKER_AUDIO_URL, default_speaker_path)
-# =========================================================
-# 2) LOAD MODELS (once)
-# =========================================================
-# --- Translator pipeline
-translator = pipeline("translation", model=TRANSLATOR_MODEL, device=DEVICE_PIPELINE)
-# --- ASR pipeline
-asr = pipeline("automatic-speech-recognition", model=ASR_MODEL, device=DEVICE_PIPELINE)
-# --- Qwen LLM
 tokenizer = AutoTokenizer.from_pretrained(LLM_MODEL, trust_remote_code=True)
-model_llm = AutoModelForCausalLM.from_pretrained(
     LLM_MODEL,
     torch_dtype="auto",
     device_map="auto" if USE_GPU else None,
     trust_remote_code=True
 )
 if not USE_GPU:
-    model_llm = model_llm.to("cpu")
-# --- EGTTS model
-tts_config = XttsConfig()
-tts_config.load_json(str(config_path))
-print("Loading EGTTS model...")
-tts_model = Xtts.init_from_config(tts_config)
-tts_model.load_checkpoint(
-    tts_config,
-    checkpoint_path=str(model_path),
-    use_deepspeed=False,
-    vocab_path=str(vocab_path),
-    eval=True
-)
-tts_model.to(DEVICE_TORCH)
-print("EGTTS loaded on:", DEVICE_TORCH)
-# =========================================================
-# 3) TRANSLATION HELPERS (explicit directions)
-# =========================================================
 def to_msa(text: str) -> str:
     """
-    Convert ANY Arabic (Egyptian/MSA/mix) -> MSA using <ar>
     """
     text = (text or "").strip()
     if not text:
@@ -110,46 +50,57 @@ def to_msa(text: str) -> str:
 def to_egyptian(text: str) -> str:
     """
-    Convert MSA -> Egyptian using <arz>
     """
     text = (text or "").strip()
     if not text:
         return ""
     return translator(text + " <arz>")[0]["translation_text"]
-# =========================================================
-# 4) STYLE CLEANUP (remove defensive/meta behavior)
-# =========================================================
 _BANNED_PHRASES = [
-    "كمساعد", "كمساعد ذكي", "معلش", "آسف", "اعتذر", "مش عارف",
-    "لا أستطيع", "غير قادر", "لا يمكنني", "لا أقدر", "لا أملك معلومات",
-    "قد لا يكون", "ربما", "عادةً", "بشكل عام"
 ]
 def clean_egyptian(text: str) -> str:
     t = (text or "").strip()
     for p in _BANNED_PHRASES:
         t = t.replace(p, "")
     t = re.sub(r"\s+", " ", t).strip()
     t = re.sub(r"[.،]{3,}", "…", t).strip()
     if not t:
-        t = "تمام—قولّي تحب تعمل إيه النهارده: شغل، مذاكرة، ولا راحة؟"
-    return t
-# =========================================================
-# 5) QWEN GENERATION (stable behavior: respond in simple MSA)
-# =========================================================
 def qwen_generate_msa(msa_prompt: str, max_new_tokens: int, temperature: float, top_p: float) -> str:
     msa_prompt = (msa_prompt or "").strip()
     if not msa_prompt:
         return ""
-    # Behavior-first system prompt (important)
     system_msg = (
         "أنت مساعد شخصي عملي. "
-        "إذا كان سؤال المستخدم عامًا أو مفتوحًا، اقترح خطة أو خطوات عملية فورًا "
         "بدون اعتذار وبدون تبرير لحدودك. "
         "اجعل الرد قصيرًا ومباشرًا ومفيدًا. "
         "اكتب باللغة العربية الفصحى البسيطة فقط."
@@ -165,11 +116,12 @@ def qwen_generate_msa(msa_prompt: str, max_new_tokens: int, temperature: float,
         add_generation_prompt=True,
         return_tensors="pt"
     )
     if USE_GPU:
-        input_ids = input_ids.to(model_llm.device)
     with torch.no_grad():
-        output_ids = model_llm.generate(
             input_ids,
             max_new_tokens=max_new_tokens,
             do_sample=True,
@@ -179,184 +131,110 @@ def qwen_generate_msa(msa_prompt: str, max_new_tokens: int, temperature: float,
         )
     gen_ids = output_ids[0][input_ids.shape[-1]:]
-    return tokenizer.decode(gen_ids, skip_special_tokens=True).strip()
-# =========================================================
-# 6) EGTTS INFERENCE
-# =========================================================
-def egtss_speak(text_egy: str, speaker_audio_fp: str, tts_temperature: float):
     """
-    text_egy: Egyptian Arabic text (we pass language 'ar' as in your code) :contentReference[oaicite:5]{index=5}
-    speaker_audio_fp: path to reference audio (4-5 sec)
-    returns (sr, wav_np)
-    """
-    text_egy = (text_egy or "").strip()
-    if not text_egy:
-        # empty audio
-        return None
-    ref_path = speaker_audio_fp or str(default_speaker_path)
-    # compute speaker latents
-    gpt_cond_latent, speaker_embedding = tts_model.get_conditioning_latents(audio_path=[ref_path])
-    # inference
-    out = tts_model.inference(
-        text_egy,
-        "ar",
-        gpt_cond_latent,
-        speaker_embedding,
-        temperature=tts_temperature
-    )
-    return 24000, out["wav"]
-# =========================================================
-# 7) CORE PIPELINE (Text/Audio -> Egyptian text -> TTS audio)
-# =========================================================
-def _pipeline_text_to_egy_and_audio(
-    user_text: str,
-    max_new_tokens: int,
-    temperature: float,
-    top_p: float,
-    speaker_ref: str,
-    tts_temperature: float
-):
-    """
-    Returns:
-      msa_in, llm_msa, final_egy, audio_tuple(sr, wav)
     """
     user_text = (user_text or "").strip()
     if not user_text:
-        return "", "", "", None
-    # 1) Normalize input to MSA
     msa_in = to_msa(user_text)
-    # 2) LLM in MSA
     llm_msa = qwen_generate_msa(msa_in, max_new_tokens, temperature, top_p)
-    # 3) Convert to Egyptian + clean
     final_egy = clean_egyptian(to_egyptian(llm_msa))
-    # 4) TTS
-    audio = egtss_speak(final_egy, speaker_ref, tts_temperature)
-    return msa_in, llm_msa, final_egy, audio
 @spaces.GPU
-def generate_from_text(
-    user_text: str,
-    max_new_tokens: int,
-    temperature: float,
-    top_p: float,
-    speaker_ref: str,
-    tts_temperature: float,
-    show_debug: bool
-):
-    msa_in, llm_msa, final_egy, audio = _pipeline_text_to_egy_and_audio(
-        user_text, max_new_tokens, temperature, top_p, speaker_ref, tts_temperature
-    )
     if show_debug:
-        return msa_in, llm_msa, final_egy, audio
     # hide debug outputs
-    return "", "", final_egy, audio
 @spaces.GPU
-def generate_from_audio(
-    audio_path: str,
-    max_new_tokens: int,
-    temperature: float,
-    top_p: float,
-    speaker_ref: str,
-    tts_temperature: float,
-    show_debug: bool
-):
     if not audio_path:
         if show_debug:
-            return "", "", "", "", None
-        return "", "", "", "", None
-    # 1) ASR
     asr_out = asr(audio_path)
     asr_text = (asr_out.get("text", "") if isinstance(asr_out, dict) else str(asr_out)).strip()
     if not asr_text:
         if show_debug:
-            return "", "", "", "", None
-        return "", "", "", "", None
-    # 2) Full pipeline
-    msa_in, llm_msa, final_egy, audio = _pipeline_text_to_egy_and_audio(
-        asr_text, max_new_tokens, temperature, top_p, speaker_ref, tts_temperature
-    )
     if show_debug:
-        return asr_text, msa_in, llm_msa, final_egy, audio
-    # hide debug except ASR + final + audio
-    return asr_text, "", "", final_egy, audio
-# =========================================================
-# 8) GRADIO UI
-# =========================================================
-with gr.Blocks(title="Egyptian Arabic Assistant (Chatbot + TTS)") as demo:
     gr.Markdown(
-        "## Egyptian Arabic Assistant (Chatbot + TTS)\n"
-        "**Pipeline:** Input → (to MSA) → Qwen (MSA) → (to Egyptian) → **EGTTS صوت**\n\n"
-        "ملاحظة: تقدر ترفع Speaker Reference (4–5 ثواني) أو تسيبه الافتراضي."
     )
     with gr.Row():
         max_new_tokens = gr.Slider(64, 512, value=256, step=16, label="Max new tokens")
-        temp = gr.Slider(0.1, 1.2, value=0.7, step=0.05, label="Temperature")
         top_p = gr.Slider(0.5, 1.0, value=0.9, step=0.05, label="Top-p")
-    with gr.Row():
-        speaker_ref = gr.Audio(
-            label="Speaker reference (optional)",
-            value=str(default_speaker_path),
-            type="filepath"
-        )
-        tts_temp = gr.Slider(0.1, 1.0, value=0.75, step=0.05, label="TTS Temperature")
-    show_debug = gr.Checkbox(value=False, label="Show debug outputs")
     with gr.Tabs():
         with gr.TabItem("Text Input"):
-            txt_in = gr.Textbox(lines=4, placeholder="اكتب هنا (مصري/فصحى)", label="Input Text")
-            btn = gr.Button("Generate (Text → Reply + Voice)", variant="primary")
             dbg_msa_in = gr.Textbox(lines=2, label="(Debug) Input after to_msa")
             dbg_llm_msa = gr.Textbox(lines=3, label="(Debug) Qwen output (MSA)")
-            out_egy = gr.Textbox(lines=4, label="Final Output (Egyptian)")
-            out_audio = gr.Audio(label="Synthesized audio (EGTTS)")
-            btn.click(
-                generate_from_text,
-                inputs=[txt_in, max_new_tokens, temp, top_p, speaker_ref, tts_temp, show_debug],
-                outputs=[dbg_msa_in, dbg_llm_msa, out_egy, out_audio]
             )
         with gr.TabItem("Audio Input"):
             aud_in = gr.Audio(type="filepath", label="Upload Audio (WAV/MP3)")
-            btn_a = gr.Button("Generate (Audio → Reply + Voice)", variant="primary")
             asr_txt = gr.Textbox(lines=2, label="ASR Text")
             dbg_msa_in_a = gr.Textbox(lines=2, label="(Debug) ASR after to_msa")
             dbg_llm_msa_a = gr.Textbox(lines=3, label="(Debug) Qwen output (MSA)")
-            out_egy_a = gr.Textbox(lines=4, label="Final Output (Egyptian)")
-            out_audio_a = gr.Audio(label="Synthesized audio (EGTTS)")
-            btn_a.click(
-                generate_from_audio,
-                inputs=[aud_in, max_new_tokens, temp, top_p, speaker_ref, tts_temp, show_debug],
-                outputs=[asr_txt, dbg_msa_in_a, dbg_llm_msa_a, out_egy_a, out_audio_a]
             )
 demo.launch()

 import re
 import gradio as gr
 import spaces
 import torch
+from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM
+# =========================
+# 0) Config
+# =========================
 TRANSLATOR_MODEL = "oddadmix/Masrawy-BiLingual-v1"
 ASR_MODEL = "openai/whisper-small"
 LLM_MODEL = "Qwen/Qwen2.5-3B-Instruct"
 USE_GPU = torch.cuda.is_available()
+DEVICE = 0 if USE_GPU else -1
+# =========================
+# 1) Load models (once)
+# =========================
+translator = pipeline("translation", model=TRANSLATOR_MODEL, device=DEVICE)
+asr = pipeline(
+    "automatic-speech-recognition",
+    model=ASR_MODEL,
+    device=DEVICE
+)
 tokenizer = AutoTokenizer.from_pretrained(LLM_MODEL, trust_remote_code=True)
+model = AutoModelForCausalLM.from_pretrained(
     LLM_MODEL,
     torch_dtype="auto",
     device_map="auto" if USE_GPU else None,
     trust_remote_code=True
 )
 if not USE_GPU:
+    model = model.to("cpu")
+# =========================
+# 2) Translator helpers (explicit direction, non-ambiguous)
+# =========================
 def to_msa(text: str) -> str:
     """
+    Convert ANY Arabic (Egyptian/MSA/mix) -> MSA.
+    Uses tag <ar> (model behavior in your translator code).
     """
     text = (text or "").strip()
     if not text:
 def to_egyptian(text: str) -> str:
     """
+    Convert MSA -> Egyptian.
+    Uses tag <arz>.
     """
     text = (text or "").strip()
     if not text:
         return ""
     return translator(text + " <arz>")[0]["translation_text"]
+# =========================
+# 3) Output cleaning (Detox / style shaping)
+# =========================
 _BANNED_PHRASES = [
+    "كمساعد", "كمساعد ذكي", "معلش", "آسف", "اعتذر", "مش عارف", "لا أستطيع", "غير قادر",
+    "لا يمكنني", "لا أقدر", "لا أملك معلومات", "قد لا يكون", "ربما", "عادةً", "بشكل عام"
 ]
 def clean_egyptian(text: str) -> str:
+    """
+    Lightweight cleanup to remove annoying meta/defensive phrasing.
+    Not meant to be perfect; keeps it simple and safe.
+    """
     t = (text or "").strip()
+    # Remove banned phrases (simple replace)
     for p in _BANNED_PHRASES:
         t = t.replace(p, "")
+    # Collapse extra spaces
     t = re.sub(r"\s+", " ", t).strip()
+    # Remove repeated punctuation
     t = re.sub(r"[.،]{3,}", "…", t).strip()
+    # If it becomes empty, fall back to a helpful default
     if not t:
+        t = "تمام—قولي انت فاضي ولا عندك شغل/مذاكرة النهارده؟"
+    return t
+# =========================
+# 4) Qwen generation (in MSA for stability)
+# =========================
 def qwen_generate_msa(msa_prompt: str, max_new_tokens: int, temperature: float, top_p: float) -> str:
     msa_prompt = (msa_prompt or "").strip()
     if not msa_prompt:
         return ""
+    # Behavior-first system message (MOST IMPORTANT CHANGE)
     system_msg = (
         "أنت مساعد شخصي عملي. "
+        "إذا كان سؤال المستخدم عامًا أو مفتوحًا، اقترح خطة أو خطوات عملية من نفسك فورًا "
         "بدون اعتذار وبدون تبرير لحدودك. "
         "اجعل الرد قصيرًا ومباشرًا ومفيدًا. "
         "اكتب باللغة العربية الفصحى البسيطة فقط."
         add_generation_prompt=True,
         return_tensors="pt"
     )
     if USE_GPU:
+        input_ids = input_ids.to(model.device)
     with torch.no_grad():
+        output_ids = model.generate(
             input_ids,
             max_new_tokens=max_new_tokens,
             do_sample=True,
         )
     gen_ids = output_ids[0][input_ids.shape[-1]:]
+    text = tokenizer.decode(gen_ids, skip_special_tokens=True).strip()
+    return text
+# =========================
+# 5) Core pipeline (stable + non-ambiguous)
+# =========================
+def _pipeline_from_text(user_text: str, max_new_tokens: int, temperature: float, top_p: float):
     """
+    Input -> (to MSA) -> Qwen (MSA) -> (to Egyptian) -> clean
+    Returns: msa_in, llm_msa, final_egy
     """
     user_text = (user_text or "").strip()
     if not user_text:
+        return "", "", ""
+    # 1) Normalize input to MSA (stable for LLM)
     msa_in = to_msa(user_text)
+    # 2) LLM outputs in MSA (behavior controlled by system prompt)
     llm_msa = qwen_generate_msa(msa_in, max_new_tokens, temperature, top_p)
+    # 3) Force Egyptian output + clean
     final_egy = clean_egyptian(to_egyptian(llm_msa))
+    return msa_in, llm_msa, final_egy
 @spaces.GPU
+def process_text(user_text: str, max_new_tokens: int, temperature: float, top_p: float, show_debug: bool):
+    msa_in, llm_msa, final_egy = _pipeline_from_text(user_text, max_new_tokens, temperature, top_p)
     if show_debug:
+        return msa_in, llm_msa, final_egy
     # hide debug outputs
+    return "", "", final_egy
 @spaces.GPU
+def process_audio(audio_path: str, max_new_tokens: int, temperature: float, top_p: float, show_debug: bool):
     if not audio_path:
         if show_debug:
+            return "", "", "", ""
+        return "", "", "", ""
+    # ASR
     asr_out = asr(audio_path)
     asr_text = (asr_out.get("text", "") if isinstance(asr_out, dict) else str(asr_out)).strip()
     if not asr_text:
         if show_debug:
+            return "", "", "", ""
+        return "", "", "", ""
+    msa_in, llm_msa, final_egy = _pipeline_from_text(asr_text, max_new_tokens, temperature, top_p)
     if show_debug:
+        return asr_text, msa_in, llm_msa, final_egy
+    # hide debug outputs except ASR text + final
+    return asr_text, "", "", final_egy
+# =========================
+# 6) Gradio UI
+# =========================
+with gr.Blocks(title="Egyptian Arabic Assistant") as demo:
     gr.Markdown(
+        "## Egyptian Arabic Assistant\n"
+        "منطق ثابت وواضح:\n"
+        "**Input → (to MSA) → Qwen (MSA) → (to Egyptian) → Output**\n\n"
+        "السلوك: رد عملي ومباشر، بدون اعتذار وبدون كلام Meta."
     )
     with gr.Row():
         max_new_tokens = gr.Slider(64, 512, value=256, step=16, label="Max new tokens")
+        temperature = gr.Slider(0.1, 1.2, value=0.7, step=0.05, label="Temperature")
         top_p = gr.Slider(0.5, 1.0, value=0.9, step=0.05, label="Top-p")
+        show_debug = gr.Checkbox(value=False, label="Show debug outputs")
     with gr.Tabs():
         with gr.TabItem("Text Input"):
+            txt_in = gr.Textbox(lines=4, placeholder="اكتب هنا (مصري/فصحى)", label="Input")
+            txt_btn = gr.Button("Generate")
             dbg_msa_in = gr.Textbox(lines=2, label="(Debug) Input after to_msa")
             dbg_llm_msa = gr.Textbox(lines=3, label="(Debug) Qwen output (MSA)")
+            out_egy = gr.Textbox(lines=5, label="Final Output (Egyptian)")
+            txt_btn.click(
+                process_text,
+                inputs=[txt_in, max_new_tokens, temperature, top_p, show_debug],
+                outputs=[dbg_msa_in, dbg_llm_msa, out_egy],
             )
         with gr.TabItem("Audio Input"):
             aud_in = gr.Audio(type="filepath", label="Upload Audio (WAV/MP3)")
+            aud_btn = gr.Button("Transcribe + Generate")
             asr_txt = gr.Textbox(lines=2, label="ASR Text")
             dbg_msa_in_a = gr.Textbox(lines=2, label="(Debug) ASR after to_msa")
             dbg_llm_msa_a = gr.Textbox(lines=3, label="(Debug) Qwen output (MSA)")
+            out_egy_a = gr.Textbox(lines=5, label="Final Output (Egyptian)")
+            aud_btn.click(
+                process_audio,
+                inputs=[aud_in, max_new_tokens, temperature, top_p, show_debug],
+                outputs=[asr_txt, dbg_msa_in_a, dbg_llm_msa_a, out_egy_a],
             )
 demo.launch()