rahul7star commited on
Commit
fbf4f7f
·
verified ·
1 Parent(s): 43b2aa5

Update app_qwen_tts_fast.py

Browse files
Files changed (1) hide show
  1. app_qwen_tts_fast.py +107 -109
app_qwen_tts_fast.py CHANGED
@@ -1,81 +1,82 @@
1
  import os
 
 
2
  import requests
3
  import torch
4
  import gradio as gr
5
  import numpy as np
6
- import soundfile as sf
7
  from functools import lru_cache
8
-
9
  from transformers import AutoTokenizer, AutoModelForCausalLM
10
  from sentence_transformers import SentenceTransformer
11
 
12
- # =========================================================
13
  # CONFIG
14
- # =========================================================
15
  MODEL_ID = "Qwen/Qwen2.5-0.5B-Instruct"
16
  DOC_FILE = "general.md"
17
-
18
- MAX_NEW_TOKENS = 200
19
- TOP_K = 3
20
- MAX_TTS_CHARS = 200 # 🔥 BIG SPEED WIN
21
-
22
  TTS_API_URL = os.getenv(
23
  "TTS_API_URL",
24
  "https://rahul7star-Chatterbox-Multilingual-TTS-API.hf.space/tts"
25
  )
26
 
27
- SESSION = requests.Session() # 🔥 reuse HTTP connection
 
 
 
28
 
 
 
 
29
  BASE_DIR = os.path.dirname(os.path.abspath(__file__))
30
  DOC_PATH = os.path.join(BASE_DIR, DOC_FILE)
31
 
32
- # =========================================================
33
- # LOAD MODELS
34
- # =========================================================
35
- tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
36
-
37
- model = AutoModelForCausalLM.from_pretrained(
38
- MODEL_ID,
39
- device_map="auto",
40
- torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
41
- trust_remote_code=True
42
- )
43
- model.eval()
44
-
45
- embedder = SentenceTransformer("all-MiniLM-L6-v2")
46
 
47
- # =========================================================
48
- # LOAD DOCUMENT
49
- # =========================================================
50
  def chunk_text(text, chunk_size=300, overlap=50):
51
  words = text.split()
52
- chunks = []
53
- i = 0
54
  while i < len(words):
55
  chunks.append(" ".join(words[i:i + chunk_size]))
56
  i += chunk_size - overlap
57
  return chunks
58
 
59
- with open(DOC_PATH, "r", encoding="utf-8", errors="ignore") as f:
60
- DOC_TEXT = f.read()
61
-
62
  DOC_CHUNKS = chunk_text(DOC_TEXT)
63
- DOC_EMBEDS = embedder.encode(DOC_CHUNKS, normalize_embeddings=True)
64
 
65
- # =========================================================
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
66
  # RETRIEVAL
67
- # =========================================================
68
- def retrieve_context(question, k=TOP_K):
 
69
  q_emb = embedder.encode([question], normalize_embeddings=True)
70
  scores = np.dot(DOC_EMBEDS, q_emb[0])
71
- ids = scores.argsort()[-k:][::-1]
72
- return "\n\n".join(DOC_CHUNKS[i] for i in ids)
73
 
74
- # =========================================================
75
- # QWEN (CACHED)
76
- # =========================================================
77
- @lru_cache(maxsize=128)
78
- def cached_answer(question: str) -> str:
79
  context = retrieve_context(question)
80
 
81
  messages = [
@@ -84,9 +85,8 @@ def cached_answer(question: str) -> str:
84
  "content": (
85
  "You are a strict document-based Q&A assistant.\n"
86
  "Answer ONLY the question.\n"
87
- "Do NOT repeat context or question.\n"
88
  "Respond in 1 short sentence.\n"
89
- "If not found say:\n"
90
  "'I could not find this information in the document.'"
91
  )
92
  },
@@ -106,99 +106,97 @@ def cached_answer(question: str) -> str:
106
  output = model.generate(
107
  **inputs,
108
  max_new_tokens=MAX_NEW_TOKENS,
109
- temperature=0.3,
110
- do_sample=True
111
  )
112
 
113
- text = tokenizer.decode(output[0], skip_special_tokens=True)
114
- return text.strip().split("\n")[-1]
115
-
116
- # =========================================================
117
- # TTS (CACHED)
118
- # =========================================================
119
- import base64
120
 
 
 
 
121
  @lru_cache(maxsize=128)
122
- def cached_tts(text: str) -> str:
123
  payload = {
124
- "text": text[:MAX_TTS_CHARS],
125
  "language_id": "en",
126
- "mode": "Speak 🗣️",
127
- "exaggeration": 0.5,
128
- "temperature": 0.8,
129
- "cfg_weight": 0.5
130
  }
131
 
132
- r = SESSION.post(TTS_API_URL, json=payload)
133
  r.raise_for_status()
134
 
 
 
 
 
 
 
 
 
 
 
135
  data = r.json()
 
 
 
 
 
136
 
137
- if "audio" not in data:
138
- raise RuntimeError("TTS API returned no audio field")
139
 
140
- audio_b64 = data["audio"]
141
  audio_bytes = base64.b64decode(audio_b64)
142
 
143
- audio_path = f"/tmp/tts_{abs(hash(text))}.wav"
144
- with open(audio_path, "wb") as f:
145
  f.write(audio_bytes)
146
 
147
- return audio_path
 
148
 
 
149
 
150
- # =========================================================
151
- # PIPELINE
152
- # =========================================================
153
  def run_pipeline(question):
154
  if not question.strip():
155
  return "", None
156
 
157
- # 1️⃣ TEXT (FAST)
158
- answer = cached_answer(question)
159
-
160
- # 2️⃣ AUDIO (CAN TAKE TIME)
161
- audio_path = cached_tts(answer)
162
 
163
- return answer, audio_path
164
 
165
- # =========================================================
166
  # UI
167
- # =========================================================
168
- def build_ui():
169
- with gr.Blocks(theme=gr.themes.Soft()) as demo:
170
- gr.Markdown("## 🤖 OhamLab AI Assistant with Voice")
171
 
172
- with gr.Row():
173
- question = gr.Textbox(
 
174
  label="Your Question",
175
- placeholder="Ask something about the document...",
176
- lines=2
177
  )
 
178
 
179
- ask = gr.Button("🚀 Ask")
 
 
180
 
181
- with gr.Row():
182
- answer_box = gr.Markdown(label="Answer")
183
- with gr.Row():
184
- audio_box = gr.Audio(label="Voice Response", autoplay=True)
185
-
186
- ask.click(
187
- fn=run_pipeline,
188
- inputs=question,
189
- outputs=[answer_box, audio_box]
190
- )
191
-
192
- demo.launch(
193
- server_name="0.0.0.0",
194
- server_port=7860,
195
- share=False,
196
- show_api=False
197
- )
198
 
199
- # =========================================================
200
- # MAIN
201
- # =========================================================
202
- if __name__ == "__main__":
203
- print("✅ Qwen + TTS Assistant Ready")
204
- build_ui()
 
1
  import os
2
+ import base64
3
+ import uuid
4
  import requests
5
  import torch
6
  import gradio as gr
7
  import numpy as np
 
8
  from functools import lru_cache
 
9
  from transformers import AutoTokenizer, AutoModelForCausalLM
10
  from sentence_transformers import SentenceTransformer
11
 
12
# =====================================================
# CONFIG
# =====================================================
# Hugging Face model id for the chat LLM (small, CPU-friendly).
MODEL_ID = "Qwen/Qwen2.5-0.5B-Instruct"
# Markdown document used as the retrieval corpus; must sit next to this file.
DOC_FILE = "general.md"

# Remote text-to-speech endpoint; override via the TTS_API_URL env var.
TTS_API_URL = os.getenv(
    "TTS_API_URL",
    "https://rahul7star-Chatterbox-Multilingual-TTS-API.hf.space/tts"
)

MAX_NEW_TOKENS = 128  # cap on generated answer length
TOP_K = 3             # number of document chunks retrieved per question

# Shared HTTP session so repeated TTS calls reuse one connection pool.
SESSION = requests.Session()

# =====================================================
# LOAD DOCUMENT
# =====================================================
BASE_DIR = os.path.dirname(os.path.abspath(__file__))
DOC_PATH = os.path.join(BASE_DIR, DOC_FILE)

# errors="ignore" drops undecodable bytes instead of failing at startup.
with open(DOC_PATH, "r", encoding="utf-8", errors="ignore") as f:
    DOC_TEXT = f.read()
 
 
 
 
 
 
 
 
 
 
 
 
35
 
36
+ # =====================================================
37
+ # CHUNK + EMBED
38
+ # =====================================================
39
def chunk_text(text, chunk_size=300, overlap=50):
    """Split *text* into overlapping word-window chunks.

    Args:
        text: Raw document text; split on whitespace.
        chunk_size: Maximum number of words per chunk.
        overlap: Words shared between consecutive chunks; must be
            smaller than ``chunk_size`` so the window always advances.

    Returns:
        list[str]: Chunks of at most ``chunk_size`` words each
        (empty list for empty/whitespace-only text).

    Raises:
        ValueError: If ``chunk_size`` is non-positive or ``overlap``
            is not smaller than ``chunk_size`` (the original while-loop
            would spin forever in that case).
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be positive")
    if overlap >= chunk_size:
        raise ValueError("overlap must be smaller than chunk_size")

    words = text.split()
    step = chunk_size - overlap
    # A positive range() step guarantees termination structurally.
    return [
        " ".join(words[i:i + chunk_size])
        for i in range(0, len(words), step)
    ]
46
 
 
 
 
47
DOC_CHUNKS = chunk_text(DOC_TEXT)

# Sentence embedder for retrieval. Vectors are L2-normalized so that a
# plain dot product at query time equals cosine similarity.
embedder = SentenceTransformer("all-MiniLM-L6-v2")
DOC_EMBEDS = embedder.encode(
    DOC_CHUNKS, normalize_embeddings=True, batch_size=32
)

# =====================================================
# LOAD QWEN (FAST SETTINGS)
# =====================================================
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
# fp16 on GPU halves memory / speeds generation; fp32 fallback on CPU.
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    device_map="auto",
    torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
    trust_remote_code=True
)
model.eval()  # inference mode: disables dropout etc.
65
+
66
+ # =====================================================
67
  # RETRIEVAL
68
+ # =====================================================
69
@lru_cache(maxsize=256)
def retrieve_context(question: str):
    """Return the TOP_K most relevant document chunks, joined by blank lines.

    Similarity is a dot product against the precomputed ``DOC_EMBEDS``;
    because both sides are L2-normalized this is cosine similarity.
    Results are memoized per exact question string.
    """
    query_vec = embedder.encode([question], normalize_embeddings=True)[0]
    similarity = np.dot(DOC_EMBEDS, query_vec)
    # Full descending sort, then take the first TOP_K indices.
    best = np.argsort(similarity)[::-1][:TOP_K]
    return "\n\n".join(DOC_CHUNKS[idx] for idx in best)
75
 
76
+ # =====================================================
77
+ # QWEN ANSWER (FAST)
78
+ # =====================================================
79
+ def answer_question(question: str) -> str:
 
80
  context = retrieve_context(question)
81
 
82
  messages = [
 
85
  "content": (
86
  "You are a strict document-based Q&A assistant.\n"
87
  "Answer ONLY the question.\n"
 
88
  "Respond in 1 short sentence.\n"
89
+ "If not found, say:\n"
90
  "'I could not find this information in the document.'"
91
  )
92
  },
 
106
  output = model.generate(
107
  **inputs,
108
  max_new_tokens=MAX_NEW_TOKENS,
109
+ do_sample=False,
110
+ use_cache=True
111
  )
112
 
113
+ decoded = tokenizer.decode(output[0], skip_special_tokens=True)
114
+ return decoded.split("\n")[-1].strip()
 
 
 
 
 
115
 
116
+ # =====================================================
117
+ # TTS (FAST + SAFE)
118
+ # =====================================================
119
@lru_cache(maxsize=128)
def generate_audio(text: str) -> str:
    """Synthesize *text* via the remote TTS API and return a WAV file path.

    The API replies either with raw audio bytes (content-type ``audio/*``)
    or with a JSON body carrying base64 audio under one of several keys.
    Results are memoized per text; NOTE(review): the cached /tmp path can
    go stale if the file is cleaned up while the process keeps running.

    Raises:
        requests.HTTPError: On a non-2xx response.
        RuntimeError: If no audio payload is found, or the written file
            is implausibly small.
    """
    payload = {
        "text": text,
        "language_id": "en",
        "mode": "Speak 🗣️"
    }

    # timeout=None blocks indefinitely; tolerated here because TTS for
    # long answers can legitimately take a while.
    r = SESSION.post(TTS_API_URL, json=payload, timeout=None)
    r.raise_for_status()

    # Unique output path so concurrent calls never clobber each other.
    wav_path = f"/tmp/tts_{uuid.uuid4().hex}.wav"

    if r.headers.get("content-type", "").startswith("audio"):
        # Case 1: raw audio bytes in the response body.
        audio_bytes = r.content
    else:
        # Case 2: JSON with base64-encoded audio under a known key.
        data = r.json()
        audio_b64 = (
            data.get("audio")
            or data.get("audio_base64")
            or data.get("wav")
        )
        if not audio_b64:
            raise RuntimeError(f"TTS API returned no audio field: {data}")
        audio_bytes = base64.b64decode(audio_b64)

    with open(wav_path, "wb") as f:
        f.write(audio_bytes)

    # Sanity check now applied to BOTH branches (the original only
    # checked the base64 path): a real WAV answer should exceed ~1 KB.
    if os.path.getsize(wav_path) < 1000:
        raise RuntimeError("Generated audio file is too small")

    return wav_path
159
 
160
+ # =====================================================
161
+ # MAIN PIPELINE
162
+ # =====================================================
163
def run_pipeline(question):
    """Answer *question* from the document, then synthesize the answer.

    Args:
        question: User question. May be ``None`` or blank — Gradio can
            deliver ``None`` before any text is entered.

    Returns:
        tuple: ``(markdown_answer, wav_path)`` on success, or
        ``("", None)`` for empty input.
    """
    # Guard against None as well as whitespace-only input; the original
    # raised AttributeError on None via .strip().
    if not question or not question.strip():
        return "", None

    answer = answer_question(question)
    audio_path = generate_audio(answer)

    return f"**Bot:** {answer}", audio_path
171
 
172
+ # =====================================================
173
  # UI
174
+ # =====================================================
175
# Build the Gradio UI at module level so `demo` exists on import
# (presumably required by the Hugging Face Spaces runtime — TODO confirm).
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown("## 📘 Document Q&A with Voice")

    with gr.Row():
        # Left column: question entry and submit button.
        with gr.Column():
            user_input = gr.Textbox(
                label="Your Question",
                placeholder="Who is CEO of OhamLab?",
                lines=3
            )
            ask_btn = gr.Button("Ask")

        # Right column: markdown answer and synthesized speech playback.
        with gr.Column():
            answer_text = gr.Markdown()
            answer_audio = gr.Audio(type="filepath")

    # Wire the button to the full text + audio pipeline.
    ask_btn.click(
        fn=run_pipeline,
        inputs=user_input,
        outputs=[answer_text, answer_audio]
    )

# NOTE(review): recent Gradio versions expose queueing via
# demo.queue().launch(...) rather than a `queue=` kwarg on launch();
# verify against the installed gradio version.
demo.launch(
    server_name="0.0.0.0",
    server_port=7860,
    share=False,
    queue=True
)