rahul7star commited on
Commit
bb5fd6a
·
verified ·
1 Parent(s): 5deb4cf

Create app_qwen_tts_fast.py

Browse files
Files changed (1) hide show
  1. app_qwen_tts_fast.py +195 -0
app_qwen_tts_fast.py ADDED
@@ -0,0 +1,195 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import requests
3
+ import torch
4
+ import gradio as gr
5
+ import numpy as np
6
+ import soundfile as sf
7
+ from functools import lru_cache
8
+
9
+ from transformers import AutoTokenizer, AutoModelForCausalLM
10
+ from sentence_transformers import SentenceTransformer
11
+
12
# =========================================================
# CONFIG
# =========================================================
MODEL_ID = "Qwen/Qwen2.5-0.5B-Instruct"  # chat LLM used to answer questions
DOC_FILE = "general.md"  # knowledge-base document, resolved next to this file

MAX_NEW_TOKENS = 200  # generation budget per answer
TOP_K = 3  # number of document chunks retrieved per question
MAX_TTS_CHARS = 200 # 🔥 BIG SPEED WIN

# Remote TTS endpoint; overridable via the TTS_API_URL environment variable.
TTS_API_URL = os.getenv(
    "TTS_API_URL",
    "https://rahul7star-Chatterbox-Multilingual-TTS-API.hf.space/tts"
)

SESSION = requests.Session() # 🔥 reuse HTTP connection

# Resolve the document path relative to this source file, not the CWD.
BASE_DIR = os.path.dirname(os.path.abspath(__file__))
DOC_PATH = os.path.join(BASE_DIR, DOC_FILE)
31
+
32
# =========================================================
# LOAD MODELS
# =========================================================
# Tokenizer and causal LM are loaded once at import time.
# trust_remote_code is required by some Qwen model revisions.
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)

model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    device_map="auto",  # let transformers place layers on GPU/CPU
    # fp16 halves memory on GPU; fp32 is the safe CPU fallback
    torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
    trust_remote_code=True
)
model.eval()  # inference only: disables dropout etc.

# Sentence embedder used for retrieval over the document chunks.
embedder = SentenceTransformer("all-MiniLM-L6-v2")
46
+
47
+ # =========================================================
48
+ # LOAD DOCUMENT
49
+ # =========================================================
50
def chunk_text(text, chunk_size=300, overlap=50):
    """Split *text* into word-based chunks with overlapping boundaries.

    Consecutive chunks share ``overlap`` words so that sentences
    straddling a chunk boundary are not lost to retrieval.

    Args:
        text: Source document as a single string.
        chunk_size: Number of words per chunk; must be positive.
        overlap: Words shared between consecutive chunks; must be
            strictly smaller than ``chunk_size``.

    Returns:
        list[str]: Whitespace-joined word chunks ([] for empty text).

    Raises:
        ValueError: If ``chunk_size <= 0`` or ``overlap >= chunk_size``.
            (The original ``while`` loop would spin forever in that case
            because the step ``chunk_size - overlap`` is not positive.)
    """
    step = chunk_size - overlap
    if chunk_size <= 0 or step <= 0:
        raise ValueError("chunk_size must be positive and greater than overlap")
    words = text.split()
    chunks = []
    for start in range(0, len(words), step):
        chunks.append(" ".join(words[start:start + chunk_size]))
    return chunks
58
+
59
# Read the knowledge base once at startup; errors="ignore" drops any
# undecodable bytes rather than crashing on a malformed document.
with open(DOC_PATH, "r", encoding="utf-8", errors="ignore") as f:
    DOC_TEXT = f.read()

# Pre-chunk and pre-embed the whole document so each query only has to
# embed the question itself. Normalized embeddings make a plain dot
# product equal to cosine similarity at retrieval time.
DOC_CHUNKS = chunk_text(DOC_TEXT)
DOC_EMBEDS = embedder.encode(DOC_CHUNKS, normalize_embeddings=True)
64
+
65
+ # =========================================================
66
+ # RETRIEVAL
67
+ # =========================================================
68
def retrieve_context(question, k=TOP_K):
    """Return the *k* document chunks most similar to *question*.

    The question is embedded with the shared sentence-transformer and
    each pre-computed chunk embedding is scored by dot product (both
    sides are normalized, so this is cosine similarity). The top-k
    chunks, best first, are joined into a single context string.
    """
    query_vec = embedder.encode([question], normalize_embeddings=True)[0]
    similarities = DOC_EMBEDS @ query_vec
    best = np.argsort(similarities)[::-1][:k]
    return "\n\n".join(DOC_CHUNKS[idx] for idx in best)
73
+
74
+ # =========================================================
75
+ # QWEN (CACHED)
76
+ # =========================================================
77
@lru_cache(maxsize=128)
def cached_answer(question: str) -> str:
    """Generate a short, document-grounded answer to *question*.

    Retrieves the top-k context chunks, builds a chat prompt, and
    samples a completion from the Qwen model. Answers are memoized
    (LRU, 128 entries), so repeated questions skip generation.

    Returns:
        The generated answer text, whitespace-stripped.
    """
    context = retrieve_context(question)

    messages = [
        {
            "role": "system",
            "content": (
                "You are a strict document-based Q&A assistant.\n"
                "Answer ONLY the question.\n"
                "Do NOT repeat context or question.\n"
                "Respond in 1 short sentence.\n"
                "If not found say:\n"
                "'I could not find this information in the document.'"
            )
        },
        {
            "role": "user",
            "content": f"Context:\n{context}\n\nQuestion:\n{question}"
        }
    ]

    prompt = tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )

    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

    with torch.no_grad():
        output = model.generate(
            **inputs,
            max_new_tokens=MAX_NEW_TOKENS,
            temperature=0.3,
            do_sample=True
        )

    # BUG FIX: the original decoded the FULL sequence (prompt included)
    # and kept only the last line, which truncated multi-line answers and
    # could leak prompt text when the answer contained no newline.
    # Decode only the newly generated tokens instead.
    prompt_len = inputs["input_ids"].shape[1]
    generated = output[0][prompt_len:]
    return tokenizer.decode(generated, skip_special_tokens=True).strip()
115
+
116
+ # =========================================================
117
+ # TTS (CACHED)
118
+ # =========================================================
119
@lru_cache(maxsize=128)
def cached_tts(text: str) -> str:
    """Synthesize *text* to speech via the remote TTS service.

    Truncates the input to MAX_TTS_CHARS, POSTs it to TTS_API_URL over
    the shared HTTP session, and writes the returned WAV bytes to /tmp.
    Results are memoized (LRU, 128 entries) keyed on the full text.

    Returns:
        Path of the written WAV file.

    Raises:
        requests.HTTPError: If the TTS service returns an error status.
        requests.Timeout: If the service does not respond in time.
    """
    import hashlib  # stdlib; local import keeps the module header unchanged

    payload = {
        "text": text[:MAX_TTS_CHARS],
        "language_id": "en",
        "mode": "Speak 🗣️",
        "exaggeration": 0.5,
        "temperature": 0.8,
        "cfg_weight": 0.5
    }

    # BUG FIX: added a timeout — without one, a stalled TTS service would
    # block this request (and the whole pipeline) forever.
    r = SESSION.post(TTS_API_URL, json=payload, timeout=120)
    r.raise_for_status()

    # BUG FIX: hash(text) is randomized per process (PYTHONHASHSEED), so
    # filenames were unstable across restarts; a content digest gives the
    # same path for the same text every run.
    digest = hashlib.sha1(text.encode("utf-8")).hexdigest()
    audio_path = f"/tmp/{digest}.wav"

    with open(audio_path, "wb") as f:
        f.write(r.content)  # raw WAV bytes from the service

    return audio_path
140
+
141
+ # =========================================================
142
+ # PIPELINE
143
+ # =========================================================
144
def run_pipeline(question):
    """Answer *question* and synthesize the answer as speech.

    Returns a ``(answer_text, audio_path)`` pair. Blank or whitespace-only
    input short-circuits to ``("", None)`` without touching the model or
    the TTS service.
    """
    if not question.strip():
        return "", None

    # Text generation first (fast, LRU-cached) ...
    reply = cached_answer(question)

    # ... then voice synthesis (slower remote call, also cached).
    return reply, cached_tts(reply)
155
+
156
+ # =========================================================
157
+ # UI
158
+ # =========================================================
159
def build_ui():
    """Assemble and launch the Gradio front-end.

    Wires a single question textbox through ``run_pipeline`` to a
    Markdown answer pane and an autoplaying audio player, then serves
    the app on 0.0.0.0:7860 (blocking call).
    """
    with gr.Blocks(theme=gr.themes.Soft()) as demo:
        gr.Markdown("## 🤖 OhamLab AI Assistant with Voice")

        with gr.Row():
            question_input = gr.Textbox(
                label="Your Question",
                placeholder="Ask something about the document...",
                lines=2
            )

        ask_button = gr.Button("🚀 Ask")

        with gr.Row():
            answer_output = gr.Markdown(label="Answer")
        with gr.Row():
            audio_output = gr.Audio(label="Voice Response", autoplay=True)

        ask_button.click(
            fn=run_pipeline,
            inputs=question_input,
            outputs=[answer_output, audio_output]
        )

    demo.launch(
        server_name="0.0.0.0",
        server_port=7860,
        share=False,
        show_api=False
    )
189
+
190
+ # =========================================================
191
+ # MAIN
192
+ # =========================================================
193
# Script entry point: announce readiness, then start the (blocking)
# Gradio server. Importing this module elsewhere skips the launch.
if __name__ == "__main__":
    print("✅ Qwen + TTS Assistant Ready")
    build_ui()