rahul7star commited on
Commit
e3405bf
·
verified ·
1 Parent(s): f7d140b
Files changed (1) hide show
  1. app_qwen_tts_fast.py +69 -35
app_qwen_tts_fast.py CHANGED
@@ -6,7 +6,7 @@ import torch
6
  import gradio as gr
7
  import numpy as np
8
  from functools import lru_cache
9
- from transformers import AutoTokenizer, AutoModelForCausalLM
10
  from sentence_transformers import SentenceTransformer
11
 
12
  # =====================================================
@@ -14,13 +14,15 @@ from sentence_transformers import SentenceTransformer
14
  # =====================================================
15
  MODEL_ID = "Qwen/Qwen2.5-0.5B-Instruct"
16
  DOC_FILE = "general.md"
 
17
  TTS_API_URL = os.getenv(
18
  "TTS_API_URL",
19
- "ETS"
20
  )
21
- print(TTS_API_URL)
22
- MAX_NEW_TOKENS = 128
23
  TOP_K = 3
 
24
 
25
  SESSION = requests.Session()
26
 
@@ -48,46 +50,73 @@ DOC_CHUNKS = chunk_text(DOC_TEXT)
48
 
49
  embedder = SentenceTransformer("all-MiniLM-L6-v2")
50
  DOC_EMBEDS = embedder.encode(
51
- DOC_CHUNKS, normalize_embeddings=True, batch_size=32
 
 
52
  )
53
 
54
  # =====================================================
55
- # LOAD QWEN (FAST SETTINGS)
56
  # =====================================================
57
  tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
58
  model = AutoModelForCausalLM.from_pretrained(
59
  MODEL_ID,
60
  device_map="auto",
 
61
  torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
62
  trust_remote_code=True
63
  )
64
  model.eval()
65
 
66
  # =====================================================
67
- # RETRIEVAL
68
  # =====================================================
69
  @lru_cache(maxsize=256)
70
  def retrieve_context(question: str):
71
  q_emb = embedder.encode([question], normalize_embeddings=True)
72
  scores = np.dot(DOC_EMBEDS, q_emb[0])
 
73
  top_ids = scores.argsort()[-TOP_K:][::-1]
 
 
 
 
 
74
  return "\n\n".join(DOC_CHUNKS[i] for i in top_ids)
75
 
76
  # =====================================================
77
- # QWEN ANSWER (FAST)
78
  # =====================================================
79
- def answer_question(question: str) -> str:
80
  context = retrieve_context(question)
81
 
 
 
 
 
82
  messages = [
83
  {
84
  "role": "system",
85
  "content": (
86
- "You are a strict document-based Q&A assistant.\n"
87
- "Answer ONLY the question.\n"
88
- "Respond in 1 short sentence.\n"
89
- "If not found, say:\n"
90
- "'I could not find this information in the document.'"
91
  )
92
  },
93
  {
@@ -107,17 +136,23 @@ def answer_question(question: str) -> str:
107
  **inputs,
108
  max_new_tokens=MAX_NEW_TOKENS,
109
  do_sample=False,
 
110
  use_cache=True
111
  )
112
 
113
- decoded = tokenizer.decode(output[0], skip_special_tokens=True)
114
- return decoded.split("\n")[-1].strip()
 
 
 
 
 
115
 
116
  # =====================================================
117
- # TTS (FAST + SAFE)
118
  # =====================================================
119
  @lru_cache(maxsize=128)
120
- def generate_audio(text: str) -> str:
121
  payload = {
122
  "text": text,
123
  "language_id": "en",
@@ -127,16 +162,13 @@ def generate_audio(text: str) -> str:
127
  r = SESSION.post(TTS_API_URL, json=payload, timeout=None)
128
  r.raise_for_status()
129
 
130
- # Unique output path
131
  wav_path = f"/tmp/tts_{uuid.uuid4().hex}.wav"
132
 
133
- # Case 1: raw audio
134
  if r.headers.get("content-type", "").startswith("audio"):
135
  with open(wav_path, "wb") as f:
136
  f.write(r.content)
137
  return wav_path
138
 
139
- # Case 2: JSON base64
140
  data = r.json()
141
  audio_b64 = (
142
  data.get("audio")
@@ -145,38 +177,41 @@ def generate_audio(text: str) -> str:
145
  )
146
 
147
  if not audio_b64:
148
- raise RuntimeError(f"TTS API returned no audio field: {data}")
149
-
150
- audio_bytes = base64.b64decode(audio_b64)
151
 
152
  with open(wav_path, "wb") as f:
153
- f.write(audio_bytes)
154
 
155
  if os.path.getsize(wav_path) < 1000:
156
- raise RuntimeError("Generated audio file is too small")
157
 
158
  return wav_path
159
 
160
  # =====================================================
161
- # MAIN PIPELINE
162
  # =====================================================
163
  def run_pipeline(question):
164
  if not question.strip():
165
  return "", None
166
 
167
  answer = answer_question(question)
168
- audio_path = generate_audio(answer)
169
 
 
 
 
 
 
 
 
 
170
  return f"**Bot:** {answer}", audio_path
171
 
172
  # =====================================================
173
  # UI
174
  # =====================================================
175
  with gr.Blocks(theme=gr.themes.Soft()) as demo:
176
-
177
-
178
  with gr.Row():
179
- with gr.Column():
180
  user_input = gr.Textbox(
181
  label="Your Question",
182
  placeholder="Who is CEO of OhamLab?",
@@ -184,9 +219,9 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
184
  )
185
  ask_btn = gr.Button("Ask")
186
 
187
- with gr.Column():
188
  answer_text = gr.Markdown()
189
- answer_audio = gr.Audio(type="filepath")
190
 
191
  ask_btn.click(
192
  fn=run_pipeline,
@@ -194,11 +229,10 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
194
  outputs=[answer_text, answer_audio]
195
  )
196
 
197
- demo.queue() # enable long-running jobs (5 min audio OK)
198
 
199
  demo.launch(
200
  server_name="0.0.0.0",
201
  server_port=7860,
202
  share=False
203
  )
204
-
 
6
  import gradio as gr
7
  import numpy as np
8
  from functools import lru_cache
9
+ from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
10
  from sentence_transformers import SentenceTransformer
11
 
12
  # =====================================================
 
14
  # =====================================================
15
  MODEL_ID = "Qwen/Qwen2.5-0.5B-Instruct"
16
  DOC_FILE = "general.md"
17
+
18
  TTS_API_URL = os.getenv(
19
  "TTS_API_URL",
20
+ "https://rahul7star-Chatterbox-Multilingual-TTS-API.hf.space/tts"
21
  )
22
+
23
+ MAX_NEW_TOKENS = 80 # 🔥 shorter = faster
24
  TOP_K = 3
25
+ MIN_RELEVANCE_SCORE = 0.35 # 🔒 anti-hallucination
26
 
27
  SESSION = requests.Session()
28
 
 
50
 
51
embedder = SentenceTransformer("all-MiniLM-L6-v2")  # lightweight sentence embedder for retrieval
# Pre-compute embeddings for every chunk once, so each query reduces to a dot product.
DOC_EMBEDS = embedder.encode(
    DOC_CHUNKS,
    normalize_embeddings=True,  # unit vectors -> dot product == cosine similarity
    batch_size=32
)
57
 
58
# =====================================================
# LOAD QWEN (QUANTIZED IF POSSIBLE)
# =====================================================
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)

# Build a 4-bit quantization config on CUDA boxes. NOTE: constructing
# BitsAndBytesConfig does NOT import bitsandbytes, so this try/except alone
# cannot detect a missing bitsandbytes install — the import error would only
# surface inside from_pretrained. The retry below handles that case.
bnb_config = None
if torch.cuda.is_available():
    try:
        bnb_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_compute_dtype=torch.float16,
            bnb_4bit_use_double_quant=True,
            bnb_4bit_quant_type="nf4",
        )
        print("✅ Using 4-bit quantization")
    except Exception:
        bnb_config = None
        print("⚠️ bitsandbytes not available, loading normal model")


def _load_model(quant_config):
    """Load the causal LM, optionally with a quantization config."""
    return AutoModelForCausalLM.from_pretrained(
        MODEL_ID,
        device_map="auto",
        quantization_config=quant_config,
        torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
        trust_remote_code=True
    )


try:
    model = _load_model(bnb_config)
except Exception:
    # The quantized load is where a missing/broken bitsandbytes actually
    # fails; retry un-quantized instead of crashing the whole app at startup.
    if bnb_config is None:
        raise
    print("⚠️ 4-bit load failed, retrying without quantization")
    bnb_config = None
    model = _load_model(None)

model.eval()
84
 
85
# =====================================================
# RETRIEVAL (STRICT)
# =====================================================
@lru_cache(maxsize=256)
def retrieve_context(question: str):
    """Return the TOP_K most relevant document chunks joined together.

    Returns None when even the best-matching chunk scores below
    MIN_RELEVANCE_SCORE, signalling "this is not in the document".
    """
    query_vec = embedder.encode([question], normalize_embeddings=True)[0]
    similarities = np.dot(DOC_EMBEDS, query_vec)

    # Best-matching chunk indices, most similar first.
    best = similarities.argsort()[::-1][:TOP_K]

    if similarities[best[0]] < MIN_RELEVANCE_SCORE:
        return None  # nothing relevant enough — caller should abort

    return "\n\n".join(DOC_CHUNKS[idx] for idx in best)
100
 
101
  # =====================================================
102
+ # ANSWER (NO HALLUCINATION)
103
  # =====================================================
104
+ def answer_question(question: str):
105
  context = retrieve_context(question)
106
 
107
+ # 🚨 Abort early
108
+ if context is None:
109
+ return None
110
+
111
  messages = [
112
  {
113
  "role": "system",
114
  "content": (
115
+ "You are a STRICT document-based assistant.\n"
116
+ "ONLY answer if the information is explicitly present.\n"
117
+ "If not found, reply EXACTLY:\n"
118
+ "'I could not find this information in the document.'\n"
119
+ "Do NOT explain. Do NOT guess."
120
  )
121
  },
122
  {
 
136
  **inputs,
137
  max_new_tokens=MAX_NEW_TOKENS,
138
  do_sample=False,
139
+ temperature=0.0,
140
  use_cache=True
141
  )
142
 
143
+ decoded = tokenizer.decode(output[0], skip_special_tokens=True).strip()
144
+ final = decoded.split("\n")[-1].strip()
145
+
146
+ if "could not find this information" in final.lower():
147
+ return None
148
+
149
+ return final
150
 
151
  # =====================================================
152
+ # TTS (SAFE + CACHED)
153
  # =====================================================
154
  @lru_cache(maxsize=128)
155
+ def generate_audio(text: str):
156
  payload = {
157
  "text": text,
158
  "language_id": "en",
 
162
  r = SESSION.post(TTS_API_URL, json=payload, timeout=None)
163
  r.raise_for_status()
164
 
 
165
  wav_path = f"/tmp/tts_{uuid.uuid4().hex}.wav"
166
 
 
167
  if r.headers.get("content-type", "").startswith("audio"):
168
  with open(wav_path, "wb") as f:
169
  f.write(r.content)
170
  return wav_path
171
 
 
172
  data = r.json()
173
  audio_b64 = (
174
  data.get("audio")
 
177
  )
178
 
179
  if not audio_b64:
180
+ raise RuntimeError(f"TTS API returned no audio: {data}")
 
 
181
 
182
  with open(wav_path, "wb") as f:
183
+ f.write(base64.b64decode(audio_b64))
184
 
185
  if os.path.getsize(wav_path) < 1000:
186
+ raise RuntimeError("Generated audio file is empty")
187
 
188
  return wav_path
189
 
190
# =====================================================
# PIPELINE
# =====================================================
def run_pipeline(question):
    """Answer *question* from the document and synthesize speech for it.

    Returns a (markdown_answer, audio_path) pair. audio_path is None for
    empty input or when the document holds no relevant information.
    """
    # Guard against None as well as blank input — Gradio can deliver None
    # for an untouched Textbox, and None.strip() would raise AttributeError.
    if not question or not question.strip():
        return "", None

    answer = answer_question(question)

    # 🚨 FAST EXIT — NO AUDIO: skip the TTS round-trip when nothing was found.
    if answer is None:
        return (
            "**Bot:** I could not find this information in the document.",
            None
        )

    audio_path = generate_audio(answer)
    return f"**Bot:** {answer}", audio_path
208
 
209
  # =====================================================
210
  # UI
211
  # =====================================================
212
  with gr.Blocks(theme=gr.themes.Soft()) as demo:
 
 
213
  with gr.Row():
214
+ with gr.Column(scale=1):
215
  user_input = gr.Textbox(
216
  label="Your Question",
217
  placeholder="Who is CEO of OhamLab?",
 
219
  )
220
  ask_btn = gr.Button("Ask")
221
 
222
+ with gr.Column(scale=1):
223
  answer_text = gr.Markdown()
224
+ answer_audio = gr.Audio(type="filepath", label="Assistant Voice")
225
 
226
  ask_btn.click(
227
  fn=run_pipeline,
 
229
  outputs=[answer_text, answer_audio]
230
  )
231
 
232
+ demo.queue()
233
 
234
  demo.launch(
235
  server_name="0.0.0.0",
236
  server_port=7860,
237
  share=False
238
  )