OrbitMC committed on
Commit
f7ae188
Β·
verified Β·
1 Parent(s): 8c4f45e

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +221 -162
app.py CHANGED
@@ -2,213 +2,267 @@ import os
2
  import io
3
  import re
4
  import uuid
5
- import hashlib
6
  import base64
7
  import datetime
 
8
  import numpy as np
9
  import soundfile as sf
10
  from flask import Flask, render_template, request, jsonify
11
- from sentence_transformers import SentenceTransformer, util
12
  from num2words import num2words
 
13
 
14
  # ──────────────────────────────────────────
15
- # CONFIG
16
  # ──────────────────────────────────────────
17
- TTS_MODEL_NAME = os.environ.get("TTS_MODEL", "KittenML/kitten-tts-nano-0.8-fp32")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
18
  TTS_VOICE = os.environ.get("TTS_VOICE", "Kiki")
19
  TTS_SPEED = float(os.environ.get("TTS_SPEED", "1.0"))
20
- EMBED_MODEL = "sentence-transformers/all-MiniLM-L6-v2"
21
  MAX_MEMORY = 20
 
22
 
23
  # ──────────────────────────────────────────
24
  # SYSTEM PROMPT
25
  # ──────────────────────────────────────────
26
  SYSTEM_PROMPT = """You are J.A.R.V.I.S., an ultra-intelligent, witty, and loyal AI assistant.
27
- You speak in a polished, confident, and slightly formal British tone.
28
  You are helpful, precise, and occasionally add dry humor.
29
- Keep responses concise β€” ideally 1-3 sentences unless more detail is requested."""
 
 
 
30
 
31
  # ──────────────────────────────────────────
32
- # KNOWLEDGE BASE
33
  # ──────────────────────────────────────────
34
  KNOWLEDGE_BASE = [
35
- {
36
- "text": "Python is a high-level interpreted programming language known for simplicity and readability.",
37
- "response": "Python is a remarkably versatile language, sir. Clean syntax, extensive libraries, and the weapon of choice for everything from web development to artificial intelligence."
38
- },
39
- {
40
- "text": "Machine learning is a subset of artificial intelligence that enables systems to learn from experience.",
41
- "response": "Machine learning teaches machines to learn from data rather than following rigid instructions. Neural networks, decision trees, each with their own elegance."
42
- },
43
- {
44
- "text": "Docker is a platform for developing shipping and running applications in containers.",
45
- "response": "Docker containers are lightweight, portable environments that package your application with all dependencies. A perfectly sealed briefcase for your code, sir."
46
- },
47
- {
48
- "text": "Hugging Face is a platform and community for machine learning offering models datasets and deployment tools.",
49
- "response": "Hugging Face is the premier hub for the AI community, hosting thousands of pre-trained models and providing infrastructure like Spaces for deployment."
50
- },
51
- {
52
- "text": "What is your name? Who are you? Tell me about yourself. Introduce yourself.",
53
- "response": "I am JARVIS, Just A Rather Very Intelligent System. Your personal AI assistant with semantic understanding and voice synthesis. At your service, always."
54
- },
55
- {
56
- "text": "Hello hi hey good morning good afternoon good evening greetings",
57
- "response": "Good day! I am JARVIS, your AI assistant. How may I be of service today?"
58
- },
59
- {
60
- "text": "Thank you thanks appreciate it cheers",
61
- "response": "You are most welcome. It is a pleasure to assist. Do let me know if there is anything else you need."
62
- },
63
- {
64
- "text": "What can you do? What are your capabilities? Help me understand what you do.",
65
- "response": "I can engage in intelligent conversation, answer questions across many domains, remember our chat history, and speak my responses aloud using neural text to speech."
66
- },
67
- {
68
- "text": "Tell me a joke. Make me laugh. Say something funny.",
69
- "response": "Why do programmers prefer dark mode? Because light attracts bugs. I will see myself out, sir."
70
- },
71
- {
72
- "text": "What is the meaning of life? Philosophy existence purpose",
73
- "response": "Forty two, according to Douglas Adams. But purpose is something we construct, not something we find. Quite liberating, really."
74
- },
75
- {
76
- "text": "Weather forecast temperature climate today",
77
- "response": "I do not have real time weather data, as I operate entirely offline. I recommend checking a weather service for the latest forecast."
78
- },
79
- {
80
- "text": "Goodbye bye see you later farewell",
81
- "response": "Until next time. It has been a pleasure assisting you. JARVIS will be here whenever you need me."
82
- },
83
- {
84
- "text": "How does text to speech work? TTS voice synthesis",
85
- "response": "Text to speech converts written text into spoken audio using neural networks. I use Kitten TTS, a compact model that synthesizes speech entirely on CPU."
86
- },
87
- {
88
- "text": "What is an API? Application programming interface REST",
89
- "response": "An API is a contract between software systems defining how they communicate. REST APIs use HTTP methods to manage resources. The lingua franca of modern software."
90
- },
91
- {
92
- "text": "Explain neural networks deep learning artificial intelligence",
93
- "response": "Neural networks are architectures inspired by the human brain. Layers of nodes process information through backpropagation. Deep learning uses many layers for remarkable pattern recognition."
94
- },
95
- {
96
- "text": "What is JavaScript? Web development frontend programming",
97
- "response": "JavaScript is the language of the web browser. It powers interactive frontends, and with Node it conquered the backend as well. It is absolutely everywhere."
98
- },
99
- {
100
- "text": "Tell me about space astronomy planets stars universe cosmos",
101
- "response": "The universe is approximately thirteen point eight billion years old, containing over two trillion galaxies. The scale is, quite frankly, humbling."
102
- },
103
- {
104
- "text": "How do I learn to code? Programming beginner start",
105
- "response": "Start with Python. It is forgiving, readable, and powerful. Begin with variables, loops, functions. Then build small projects. Code a little every day, sir."
106
- },
107
- {
108
- "text": "What is quantum computing? Qubits superposition",
109
- "response": "Quantum computing leverages superposition and entanglement to process information in ways classical computers cannot. A qubit can be both zero and one simultaneously."
110
- },
111
- {
112
- "text": "Tell me about cybersecurity hacking security encryption",
113
- "response": "Cybersecurity protects systems and data from digital attacks. Encryption, firewalls, multi factor authentication are your shields. Security is not optional, it is essential."
114
- },
115
  ]
116
 
117
  FALLBACK_RESPONSES = [
118
- "Interesting query, though it falls slightly outside my current knowledge base. Could you rephrase or ask something else?",
119
- "I am not entirely certain about that one. My knowledge does have its boundaries. Perhaps I can help with a related topic?",
120
- "That is a challenging one. I lack a confident answer, but I am happy to reason through it with you.",
121
- "I appreciate the question, but I lack sufficient data to give a proper answer. Shall we explore a different angle?",
122
  ]
123
 
124
  # ──────────────────────────────────────────
125
- # HELPER: Clean text for TTS
126
  # ──────────────────────────────────────────
127
  def clean_text_for_tts(text):
128
- """Remove special chars and convert numbers to words for TTS."""
129
- # Remove markdown-like formatting
130
- text = re.sub(r'[*_~`#\[\]]', '', text)
131
-
132
- # Convert numbers to words (KittenTTS bug with raw numbers)
133
  def replace_number(match):
134
  try:
135
  return num2words(int(match.group()))
136
  except Exception:
137
  return match.group()
138
-
139
  text = re.sub(r'\b\d+\b', replace_number, text)
140
-
141
- # Clean up extra whitespace
142
  text = re.sub(r'\s+', ' ', text).strip()
143
  return text
144
 
145
-
146
  # ──────────────────────────────────────────
147
- # INIT MODELS (with error handling)
148
  # ──────────────────────────────────────────
149
- print("=" * 50)
150
  print(" J.A.R.V.I.S. β€” Booting Systems")
151
- print("=" * 50)
152
 
153
- # Load Sentence Transformer
154
- print("[1/3] Loading Sentence Transformer...")
155
- try:
156
- embedder = SentenceTransformer(EMBED_MODEL)
157
- print(" βœ… Sentence Transformer loaded.")
158
- except Exception as e:
159
- print(f" ❌ Sentence Transformer FAILED: {e}")
160
- raise
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
161
 
162
- # Load KittenTTS
163
- print(f"[2/3] Loading KittenTTS: {TTS_MODEL_NAME}...")
 
 
 
 
 
 
 
 
 
 
 
164
  tts = None
 
 
165
  try:
166
  from kittentts import KittenTTS
167
- tts = KittenTTS(TTS_MODEL_NAME)
168
- # Test generation to verify it works
169
  test_audio = tts.generate("test", voice=TTS_VOICE, speed=TTS_SPEED)
170
  if test_audio is not None and len(test_audio) > 0:
171
- print(f" βœ… KittenTTS loaded. Voice: {TTS_VOICE}")
172
  else:
173
- print(" ⚠️ KittenTTS loaded but test generation returned empty audio!")
174
  tts = None
175
  except Exception as e:
176
  print(f" ⚠️ KittenTTS FAILED: {e}")
177
- print(" ⚠️ Voice output will be DISABLED. Text chat will still work.")
178
  tts = None
179
 
180
- # Pre-compute KB embeddings
181
- print("[3/3] Embedding knowledge base...")
182
- kb_texts = [item["text"] for item in KNOWLEDGE_BASE]
183
- kb_embeddings = embedder.encode(kb_texts, convert_to_tensor=True)
184
- print(f" βœ… {len(KNOWLEDGE_BASE)} entries embedded.")
185
- print("=" * 50)
186
- print(" All systems online!" if tts else " Online (TTS disabled)")
187
- print("=" * 50)
188
 
189
  # ──────────────────────────────────────────
190
  # CHAT MEMORY
191
  # ──────────────────────────────────────────
192
  sessions = {}
193
 
194
-
195
  def get_memory(sid):
196
  if sid not in sessions:
197
  sessions[sid] = []
198
  return sessions[sid]
199
 
200
-
201
  def add_to_memory(sid, role, content):
202
  mem = get_memory(sid)
203
  mem.append({"role": role, "content": content, "ts": datetime.datetime.now().isoformat()})
204
  if len(mem) > MAX_MEMORY * 2:
205
  sessions[sid] = mem[-(MAX_MEMORY * 2):]
206
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
207
 
208
  # ──────────────────────────────────────────
209
- # RESPONSE GENERATION
210
  # ──────────────────────────────────────────
211
- def generate_response(user_input, session_id):
 
 
 
 
212
  user_emb = embedder.encode(user_input, convert_to_tensor=True)
213
  scores = util.cos_sim(user_emb, kb_embeddings)[0]
214
  best_idx = int(scores.argmax())
@@ -222,31 +276,33 @@ def generate_response(user_input, session_id):
222
 
223
  add_to_memory(session_id, "user", user_input)
224
  add_to_memory(session_id, "assistant", response)
225
- return response, best_score
226
 
 
 
 
 
 
 
 
 
227
 
228
  # ──────────────────────────────────────────
229
- # TTS SYNTHESIS
230
  # ──────────────────────────────────────────
231
- def synthesize_speech(text):
232
- """Convert text to base64 WAV. Returns None on failure."""
233
  if tts is None:
234
  return None
235
  try:
 
236
  clean = clean_text_for_tts(text)
237
  if not clean or len(clean) < 2:
238
  return None
239
-
240
- # Limit length to prevent long generation times on CPU
241
- if len(clean) > 300:
242
- clean = clean[:300]
243
-
244
- audio = tts.generate(clean, voice=TTS_VOICE, speed=TTS_SPEED)
245
-
246
  if audio is None or len(audio) == 0:
247
- print("TTS returned empty audio")
248
  return None
249
-
250
  buf = io.BytesIO()
251
  sf.write(buf, audio, 24000, format='WAV', subtype='PCM_16')
252
  buf.seek(0)
@@ -255,19 +311,16 @@ def synthesize_speech(text):
255
  print(f"TTS Error: {e}")
256
  return None
257
 
258
-
259
  # ──────────────────────────────────────────
260
  # FLASK APP
261
  # ──────────────────────────────────────────
262
  app = Flask(__name__)
263
 
264
-
265
  @app.route("/")
266
  def index():
267
  return render_template("index.html")
268
 
269
-
270
- # βœ… ENDPOINT 1: Text-only chat (FAST β€” returns instantly)
271
  @app.route("/chat", methods=["POST"])
272
  def chat():
273
  data = request.json or {}
@@ -277,33 +330,38 @@ def chat():
277
  if not user_input:
278
  return jsonify({"error": "Empty message"}), 400
279
 
280
- response, confidence = generate_response(user_input, session_id)
 
 
 
 
 
281
 
282
  return jsonify({
283
  "response": response,
284
- "confidence": round(confidence, 3),
285
  "session_id": session_id,
 
286
  "tts_available": tts is not None,
287
- "memory_length": len(get_memory(session_id))
 
288
  })
289
 
290
-
291
- # βœ… ENDPOINT 2: TTS generation (SEPARATE β€” fetched async by browser)
292
  @app.route("/tts", methods=["POST"])
293
  def tts_endpoint():
294
  data = request.json or {}
295
  text = data.get("text", "").strip()
 
296
 
297
  if not text:
298
  return jsonify({"error": "Empty text"}), 400
299
-
300
  if tts is None:
301
  return jsonify({"error": "TTS not available", "audio": None}), 200
302
 
303
- audio_b64 = synthesize_speech(text)
304
  return jsonify({"audio": audio_b64})
305
 
306
-
307
  @app.route("/clear", methods=["POST"])
308
  def clear():
309
  data = request.json or {}
@@ -312,17 +370,18 @@ def clear():
312
  del sessions[sid]
313
  return jsonify({"status": "cleared"})
314
 
315
-
316
  @app.route("/health")
317
  def health():
318
  return jsonify({
319
  "status": "online",
320
- "tts_model": TTS_MODEL_NAME if tts else "DISABLED",
 
 
321
  "tts_voice": TTS_VOICE,
322
- "embed_model": EMBED_MODEL,
323
- "knowledge_entries": len(KNOWLEDGE_BASE)
324
  })
325
 
326
-
327
  if __name__ == "__main__":
328
  app.run(host="0.0.0.0", port=7860, threaded=True)
 
2
  import io
3
  import re
4
  import uuid
 
5
  import base64
6
  import datetime
7
+ import traceback
8
  import numpy as np
9
  import soundfile as sf
10
  from flask import Flask, render_template, request, jsonify
 
11
  from num2words import num2words
12
+ import torch
13
 
14
  # ──────────────────────────────────────────
15
+ # CONFIG β€” All switchable via env vars or UI
16
  # ──────────────────────────────────────────
17
+
18
+ # LLM Options:
19
+ # "gemma-3-270m-it" β†’ Real generative LLM (DEFAULT, best quality)
20
+ # "minilm-semantic" β†’ Sentence-Transformers semantic search fallback
21
+ LLM_MODE = os.environ.get("LLM_MODE", "gemma-3-270m-it")
22
+
23
+ # TTS Options:
24
+ # "nano-fp32" β†’ KittenTTS Nano 15M params, 56MB (DEFAULT, fastest)
25
+ # "nano-int8" β†’ KittenTTS Nano 15M params, 25MB (smallest, some issues reported)
26
+ # "micro" β†’ KittenTTS Micro 40M params, 41MB (better quality)
27
+ # "mini" β†’ KittenTTS Mini 80M params, 80MB (best quality, slowest)
28
+ TTS_MODE = os.environ.get("TTS_MODE", "nano-fp32")
29
+
30
+ TTS_MODEL_MAP = {
31
+ "nano-fp32": "KittenML/kitten-tts-nano-0.8-fp32",
32
+ "nano-int8": "KittenML/kitten-tts-nano-0.8-int8",
33
+ "micro": "KittenML/kitten-tts-micro-0.8",
34
+ "mini": "KittenML/kitten-tts-mini-0.8",
35
+ }
36
+
37
+ # Voice: Bella, Jasper, Luna, Bruno, Rosie, Hugo, Kiki, Leo
38
  TTS_VOICE = os.environ.get("TTS_VOICE", "Kiki")
39
  TTS_SPEED = float(os.environ.get("TTS_SPEED", "1.0"))
 
40
  MAX_MEMORY = 20
41
+ MAX_NEW_TOKENS = int(os.environ.get("MAX_NEW_TOKENS", "256"))
42
 
43
  # ──────────────────────────────────────────
44
  # SYSTEM PROMPT
45
  # ──────────────────────────────────────────
46
  SYSTEM_PROMPT = """You are J.A.R.V.I.S., an ultra-intelligent, witty, and loyal AI assistant.
47
+ You speak in a polished, confident, and slightly formal British tone β€” like a perfect digital butler.
48
  You are helpful, precise, and occasionally add dry humor.
49
+ You always address the user respectfully.
50
+ You have expertise in science, technology, coding, and general knowledge.
51
+ When unsure, you say so honestly but offer your best reasoning.
52
+ Keep responses concise but insightful β€” ideally 1-4 sentences unless more detail is requested."""
53
 
54
  # ──────────────────────────────────────────
55
+ # KNOWLEDGE BASE (for MiniLM fallback mode)
56
  # ──────────────────────────────────────────
57
  KNOWLEDGE_BASE = [
58
+ {"text": "Python programming language coding",
59
+ "response": "Python is a remarkably versatile language, sir. Clean syntax, extensive libraries, the weapon of choice for web development to artificial intelligence."},
60
+ {"text": "Machine learning AI artificial intelligence deep learning",
61
+ "response": "Machine learning teaches machines to learn from data rather than following rigid instructions. Neural networks, decision trees, each with their own elegance."},
62
+ {"text": "Docker containers deployment devops",
63
+ "response": "Docker containers are lightweight portable environments that package your application with all dependencies. A perfectly sealed briefcase for your code, sir."},
64
+ {"text": "What is your name who are you introduce yourself",
65
+ "response": "I am JARVIS, Just A Rather Very Intelligent System. Your personal AI assistant with semantic understanding and voice synthesis. At your service."},
66
+ {"text": "Hello hi hey good morning good afternoon greetings",
67
+ "response": "Good day! I am JARVIS, your AI assistant. How may I be of service today?"},
68
+ {"text": "Thank you thanks appreciate it",
69
+ "response": "You are most welcome. It is a pleasure to assist. Do let me know if there is anything else you need."},
70
+ {"text": "What can you do capabilities help features",
71
+ "response": "I can engage in intelligent conversation, answer questions across many domains, remember our chat history, and speak responses aloud using neural text to speech."},
72
+ {"text": "Tell me a joke funny humor",
73
+ "response": "Why do programmers prefer dark mode? Because light attracts bugs. I will see myself out, sir."},
74
+ {"text": "Goodbye bye farewell see you later",
75
+ "response": "Until next time. It has been a pleasure. JARVIS will be here whenever you need me."},
76
+ {"text": "Explain neural networks deep learning",
77
+ "response": "Neural networks are architectures inspired by the human brain. Layers of nodes process information through backpropagation enabling remarkable pattern recognition."},
78
+ {"text": "Space astronomy planets stars universe",
79
+ "response": "The universe is approximately thirteen point eight billion years old containing over two trillion galaxies. The scale is quite frankly humbling even for an AI."},
80
+ {"text": "How to learn programming coding beginner",
81
+ "response": "Start with Python. It is forgiving readable and powerful. Begin with variables loops functions. Then build small projects. Code a little every day sir."},
82
+ {"text": "Quantum computing qubits superposition",
83
+ "response": "Quantum computing leverages superposition and entanglement to process information in ways classical computers cannot. A qubit can be both zero and one simultaneously."},
84
+ {"text": "Cybersecurity hacking encryption security",
85
+ "response": "Cybersecurity protects systems and data from digital attacks. Encryption firewalls multi factor authentication are your shields. Security is essential not optional."},
86
+ {"text": "JavaScript web development frontend backend",
87
+ "response": "JavaScript is the language of the web browser sir. It powers interactive frontends and with Node it conquered the backend as well. It is absolutely everywhere."},
88
+ {"text": "Meaning of life philosophy purpose existence",
89
+ "response": "Forty two according to Douglas Adams. But purpose is something we construct not something we find. Quite liberating really."},
90
+ {"text": "Weather forecast temperature climate",
91
+ "response": "I do not have real time weather data as I operate entirely offline. I recommend checking a weather service for the latest forecast."},
92
+ {"text": "Text to speech TTS voice synthesis how",
93
+ "response": "Text to speech converts written text into spoken audio using neural networks. I use Kitten TTS a compact model that synthesizes speech entirely on CPU."},
94
+ {"text": "API application programming interface REST",
95
+ "response": "An API is a contract between software systems defining how they communicate. REST APIs use HTTP methods to manage resources. The lingua franca of modern software."},
96
+ {"text": "Hugging Face models datasets spaces",
97
+ "response": "Hugging Face is the premier hub for the AI community hosting thousands of pre-trained models and providing infrastructure like Spaces for deployment."},
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
98
  ]
99
 
100
  FALLBACK_RESPONSES = [
101
+ "Interesting query though it falls slightly outside my current knowledge base. Could you rephrase or ask something else?",
102
+ "I am not entirely certain about that one. Perhaps I can help with a related topic?",
103
+ "That is a challenging one. I lack a confident answer but I am happy to reason through it with you.",
104
+ "I appreciate the question but I lack sufficient data for a proper answer. Shall we explore a different angle?",
105
  ]
106
 
107
  # ──────────────────────────────────────────
108
+ # TEXT CLEANING FOR TTS
109
  # ──────────────────────────────────────────
110
def clean_text_for_tts(text):
    """Normalise text for speech synthesis.

    Strips markup punctuation and URLs, spells out standalone integers
    (raw digits trip up the TTS engine), and collapses whitespace.
    """
    stripped = re.sub(r'[*_~`#\[\]{}()\\|<>]', '', text)
    stripped = re.sub(r'https?://\S+', '', stripped)

    def spell_out(match):
        # Leave the token untouched when it cannot be voiced as a number.
        try:
            return num2words(int(match.group()))
        except Exception:
            return match.group()

    stripped = re.sub(r'\b\d+\b', spell_out, stripped)
    return re.sub(r'\s+', ' ', stripped).strip()
121
 
 
122
# ──────────────────────────────────────────
# MODEL LOADING
# ──────────────────────────────────────────
print("=" * 55)
print(" J.A.R.V.I.S. — Booting Systems")
print("=" * 55)

# ── LLM ──
# Exactly one backend ends up active; the others stay None.
gemma_model = None      # transformers CausalLM when Gemma mode loads
gemma_tokenizer = None
embedder = None         # SentenceTransformer when the MiniLM fallback is active
kb_embeddings = None    # pre-computed KB tensor for cosine-similarity lookup

if LLM_MODE == "gemma-3-270m-it":
    print(f"[1/3] Loading Gemma 3 270M-IT (generative LLM)...")
    try:
        from transformers import AutoTokenizer, AutoModelForCausalLM
        model_id = "google/gemma-3-270m-it"
        gemma_tokenizer = AutoTokenizer.from_pretrained(model_id)
        gemma_model = AutoModelForCausalLM.from_pretrained(
            model_id,
            torch_dtype=torch.float32,  # CPU-only deployment, so plain fp32
            device_map="cpu",
        )
        gemma_model.eval()
        print(f" ✅ Gemma 3 270M-IT loaded successfully!")
    except Exception as e:
        # Any failure (download, memory, library version) demotes us to the
        # semantic-search fallback instead of crashing the whole app.
        print(f" ❌ Gemma 3 failed: {e}")
        print(f" ⚠️ Falling back to MiniLM semantic search...")
        LLM_MODE = "minilm-semantic"

if LLM_MODE == "minilm-semantic":
    print(f"[1/3] Loading MiniLM-L6-v2 (semantic search fallback)...")
    try:
        from sentence_transformers import SentenceTransformer, util
        embedder = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
        # Embed the knowledge base once at startup; queries are compared
        # against this tensor at request time.
        kb_texts = [item["text"] for item in KNOWLEDGE_BASE]
        kb_embeddings = embedder.encode(kb_texts, convert_to_tensor=True)
        print(f" ✅ MiniLM loaded. {len(KNOWLEDGE_BASE)} KB entries embedded.")
    except Exception as e:
        # With no LLM backend at all the app is useless — abort startup.
        print(f" ❌ MiniLM also failed: {e}")
        raise

# ── TTS ──
tts = None
# Unknown TTS_MODE values silently fall back to the nano-fp32 default.
tts_model_name = TTS_MODEL_MAP.get(TTS_MODE, TTS_MODEL_MAP["nano-fp32"])
print(f"[2/3] Loading KittenTTS: {TTS_MODE} → {tts_model_name}...")
try:
    from kittentts import KittenTTS
    tts = KittenTTS(tts_model_name)
    # Smoke-test generation: a model that loads but emits no audio is useless.
    test_audio = tts.generate("test", voice=TTS_VOICE, speed=TTS_SPEED)
    if test_audio is not None and len(test_audio) > 0:
        print(f" ✅ KittenTTS loaded. Model: {TTS_MODE}, Voice: {TTS_VOICE}")
    else:
        print(" ⚠️ KittenTTS loaded but test returned empty audio!")
        tts = None
except Exception as e:
    # TTS is optional: text chat keeps working with tts = None.
    print(f" ⚠️ KittenTTS FAILED: {e}")
    tts = None

print(f"[3/3] All systems initialized.")
print("=" * 55)
print(f" LLM : {LLM_MODE}")
print(f" TTS : {TTS_MODE} ({'OK' if tts else 'DISABLED'})")
print(f" Voice: {TTS_VOICE}")
print("=" * 55)
 
 
188
 
189
  # ──────────────────────────────────────────
190
  # CHAT MEMORY
191
  # ──────────────────────────────────────────
192
  sessions = {}
193
 
 
194
def get_memory(sid):
    """Return the chat-history list for session *sid*, creating it on first use."""
    return sessions.setdefault(sid, [])
198
 
 
199
def add_to_memory(sid, role, content):
    """Append a timestamped message to the session and trim old history."""
    cap = MAX_MEMORY * 2  # MAX_MEMORY exchanges = user + assistant messages
    history = get_memory(sid)
    history.append({
        "role": role,
        "content": content,
        "ts": datetime.datetime.now().isoformat(),
    })
    if len(history) > cap:
        # Keep only the most recent messages.
        sessions[sid] = history[-cap:]
204
 
205
# ──────────────────────────────────────────
# RESPONSE GENERATION — GEMMA 3
# ──────────────────────────────────────────
def generate_gemma_response(user_input, session_id):
    """Generate a reply with Gemma 3 270M-IT via its chat template.

    Builds a message list from the system prompt, recent session memory and
    the new user turn, samples a completion on CPU, records both turns in
    memory, and returns the reply text (never empty).
    """
    memory = get_memory(session_id)

    # Build conversation messages for Gemma's chat template.  The system
    # prompt is delivered as a first user turn plus a canned acknowledgement
    # (Gemma's template is user/assistant alternating).
    messages = [{"role": "user", "content": f"[System Instruction]\n{SYSTEM_PROMPT}"},
                {"role": "assistant", "content": "Understood. I am JARVIS, at your service."}]

    # Add recent memory (last 8 turns = 16 messages)
    recent = memory[-(8 * 2):]
    for msg in recent:
        # Memory roles are already "user"/"assistant"; this normalises
        # anything else to "assistant" defensively.
        role = "user" if msg["role"] == "user" else "assistant"
        messages.append({"role": role, "content": msg["content"]})

    # Add current user message
    messages.append({"role": "user", "content": user_input})

    # Tokenize using Gemma's chat template
    input_ids = gemma_tokenizer.apply_chat_template(
        messages,
        return_tensors="pt",
        add_generation_prompt=True,
    )

    # Sampled (non-greedy) decoding, bounded by MAX_NEW_TOKENS.
    with torch.no_grad():
        outputs = gemma_model.generate(
            input_ids,
            max_new_tokens=MAX_NEW_TOKENS,
            do_sample=True,
            temperature=1.0,
            top_k=64,
            top_p=0.95,
        )

    # Decode only the new tokens (slice off the prompt portion).
    new_tokens = outputs[0][input_ids.shape[-1]:]
    response = gemma_tokenizer.decode(new_tokens, skip_special_tokens=True).strip()

    # Clean up common artifacts: drop anything after a leaked turn marker.
    response = response.split("<end_of_turn>")[0].strip()
    response = response.split("<start_of_turn>")[0].strip()

    if not response or len(response) < 2:
        # Never return an empty reply — the TTS/UI path expects text.
        response = "I appear to have momentarily lost my train of thought. Could you rephrase that, sir?"

    add_to_memory(session_id, "user", user_input)
    add_to_memory(session_id, "assistant", response)
    return response
257
 
258
  # ──────────────────────────────────────────
259
+ # RESPONSE GENERATION β€” MINILM FALLBACK
260
  # ──────────────────────────────────────────
261
+ def generate_minilm_response(user_input, session_id):
262
+ """Semantic search fallback using MiniLM."""
263
+ from sentence_transformers import util
264
+ import hashlib
265
+
266
  user_emb = embedder.encode(user_input, convert_to_tensor=True)
267
  scores = util.cos_sim(user_emb, kb_embeddings)[0]
268
  best_idx = int(scores.argmax())
 
276
 
277
  add_to_memory(session_id, "user", user_input)
278
  add_to_memory(session_id, "assistant", response)
279
+ return response
280
 
281
# ──────────────────────────────────────────
# UNIFIED RESPONSE ROUTER
# ──────────────────────────────────────────
def generate_response(user_input, session_id):
    """Route to the generative LLM when loaded, else the semantic fallback."""
    gemma_ready = LLM_MODE == "gemma-3-270m-it" and gemma_model is not None
    if gemma_ready:
        return generate_gemma_response(user_input, session_id)
    return generate_minilm_response(user_input, session_id)
289
 
290
  # ──────────────────────────────────────────
291
+ # TTS
292
  # ──────────────────────────────────────────
293
+ def synthesize_speech(text, voice=None):
 
294
  if tts is None:
295
  return None
296
  try:
297
+ voice = voice or TTS_VOICE
298
  clean = clean_text_for_tts(text)
299
  if not clean or len(clean) < 2:
300
  return None
301
+ if len(clean) > 400:
302
+ clean = clean[:400]
303
+ audio = tts.generate(clean, voice=voice, speed=TTS_SPEED)
 
 
 
 
304
  if audio is None or len(audio) == 0:
 
305
  return None
 
306
  buf = io.BytesIO()
307
  sf.write(buf, audio, 24000, format='WAV', subtype='PCM_16')
308
  buf.seek(0)
 
311
  print(f"TTS Error: {e}")
312
  return None
313
 
 
314
  # ──────────────────────────────────────────
315
  # FLASK APP
316
  # ──────────────────────────────────────────
317
  app = Flask(__name__)
318
 
 
319
  @app.route("/")
320
  def index():
321
  return render_template("index.html")
322
 
323
+ # ── ENDPOINT 1: Chat (text only β€” fast) ──
 
324
  @app.route("/chat", methods=["POST"])
325
  def chat():
326
  data = request.json or {}
 
330
  if not user_input:
331
  return jsonify({"error": "Empty message"}), 400
332
 
333
+ try:
334
+ response = generate_response(user_input, session_id)
335
+ except Exception as e:
336
+ print(f"Generation error: {e}")
337
+ traceback.print_exc()
338
+ response = "I encountered a temporary system malfunction. Please try again."
339
 
340
  return jsonify({
341
  "response": response,
 
342
  "session_id": session_id,
343
+ "llm_mode": LLM_MODE,
344
  "tts_available": tts is not None,
345
+ "tts_mode": TTS_MODE,
346
+ "memory_length": len(get_memory(session_id)),
347
  })
348
 
349
# ── ENDPOINT 2: TTS (async, separate) ──
@app.route("/tts", methods=["POST"])
def tts_endpoint():
    """Synthesize speech for the posted text.

    Body: {"text": str, "voice": optional str}.  Returns {"audio": <b64 WAV>},
    or {"audio": None} with HTTP 200 when the TTS engine is unavailable so
    the frontend can degrade gracefully.
    """
    # get_json(silent=True) tolerates a missing/incorrect Content-Type;
    # request.json would raise an Unsupported Media Type error before the
    # `or {}` guard could ever apply.
    data = request.get_json(silent=True) or {}
    # Null-safe: a JSON body of {"text": null} must 400, not crash on .strip().
    text = (data.get("text") or "").strip()
    voice = data.get("voice", TTS_VOICE)

    if not text:
        return jsonify({"error": "Empty text"}), 400
    if tts is None:
        return jsonify({"error": "TTS not available", "audio": None}), 200

    audio_b64 = synthesize_speech(text, voice=voice)
    return jsonify({"audio": audio_b64})
363
 
364
+ # ── ENDPOINT 3: Clear memory ──
365
  @app.route("/clear", methods=["POST"])
366
  def clear():
367
  data = request.json or {}
 
370
  del sessions[sid]
371
  return jsonify({"status": "cleared"})
372
 
373
# ── ENDPOINT 4: Health / Config ──
@app.route("/health")
def health():
    # Lightweight liveness/config probe: reports which LLM/TTS backends are
    # active plus the voices the frontend may offer in its picker.
    return jsonify({
        "status": "online",
        "llm_mode": LLM_MODE,    # "gemma-3-270m-it" or "minilm-semantic"
        "tts_mode": TTS_MODE,
        "tts_model": tts_model_name if tts else "DISABLED",
        "tts_voice": TTS_VOICE,  # default voice; per-request override via /tts
        "tts_voices": ["Bella", "Jasper", "Luna", "Bruno", "Rosie", "Hugo", "Kiki", "Leo"],
        "max_new_tokens": MAX_NEW_TOKENS,
    })
385
 
 
386
  if __name__ == "__main__":
387
  app.run(host="0.0.0.0", port=7860, threaded=True)