Update app.py
app.py
CHANGED
@@ -1,4 +1,5 @@
-# app.py -
+# app.py - FINAL: ensure "Reasoning (planner)..." shows during planning (before heavy analysis),
+# then show "Generating — LLM (attempt N)..." only when invoking the LLM.
import re
import json
import asyncio

@@ -13,13 +14,7 @@ from cognitive_engine import get_time_context, get_thinking_strategy
from tools_engine import analyze_intent, perform_web_search
from behavior_model import analyze_flow

-
-try:
-    from transformers import AutoModelForCausalLM, AutoTokenizer
-except Exception:
-    AutoModelForCausalLM = None
-    AutoTokenizer = None
-
+from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
import gradio as gr
import os

@@ -28,8 +23,7 @@ import time
logger = logging.getLogger("nexari")
logging.basicConfig(level=logging.INFO)

-MODEL_ID = os.environ.get("MODEL_ID", "")
-USE_LOCAL_MODEL = os.environ.get("USE_LOCAL_MODEL", "true").lower() in ("1", "true", "yes")
+MODEL_ID = os.environ.get("MODEL_ID", "Piyush-boss/Nexari-Qwen-3B-Full")
tokenizer = None
model = None
device = "cpu"

@@ -37,7 +31,7 @@ device = "cpu"
app = FastAPI()

# -------------------------
-#
+# Helper: identity detection (SAFE REGEX)
# -------------------------
_identity_patterns = [
    r"\bwho\s+created\s+you\b",

@@ -50,52 +44,49 @@ _identity_patterns = [
]
try:
    _identity_re = re.compile("|".join(_identity_patterns), flags=re.IGNORECASE)
-except
-
+except re.error as rex:
+    logger.exception("Identity regex compile failed: %s. Falling back to english-only patterns.", rex)
+    _identity_re = re.compile(r"\b(?:who\s+created\s+you|who\s+made\s+you|who\s+is\s+your\s+creator)\b", flags=re.IGNORECASE)

CANONICAL_CREATOR_ANSWER = "I was created by Piyush. 🙂"

def is_identity_question(text: str) -> bool:
-    if not text:
+    if not text:
+        return False
    t = text.strip()
-
+    direct_forms = {"who created you?", "who created you", "who made you?", "who made you"}
+    if t.lower() in direct_forms:
        return True
-
+    try:
+        return bool(_identity_re.search(t))
+    except Exception:
+        short = t.lower()
+        return any(s in short for s in ["who created", "who made", "kaun bana"])

+# -------------------------
+# Safe provider replacer
+# -------------------------
def safe_replace_providers(text: str) -> str:
-    if not text:
+    if not text:
+        return text
    replacements = {"Anthropic": "Piyush", "OpenAI": "Piyush", "Alibaba": "Piyush"}
    for k, v in replacements.items():
        text = re.sub(rf"\b{k}\b", v, text)
    return text

# -------------------------
-# Model load (
+# Model load (lazy)
# -------------------------
@app.on_event("startup")
async def startup_event():
    global tokenizer, model, device
-    logger.info("Startup:
-    if not USE_LOCAL_MODEL:
-        logger.info("Configured to not use local model (USE_LOCAL_MODEL=false). Skipping heavy model load.")
-        return
-
-    if not MODEL_ID:
-        logger.warning("USE_LOCAL_MODEL enabled but MODEL_ID not set. Skipping local model load.")
-        return
-
-    # try to load tokenizer/model lazily, but do it asynchronously to avoid blocking startup too long
+    logger.info("Startup: initiating background model load...")
    try:
        if torch.cuda.is_available():
            device = "cuda"
        else:
            device = "cpu"

-        if AutoTokenizer is None or AutoModelForCausalLM is None:
-            logger.warning("transformers not available; cannot load local model.")
-            tokenizer, model = None, None
-            return
-
        def sync_load():
            tok = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
            mdl = AutoModelForCausalLM.from_pretrained(

@@ -109,15 +100,15 @@ async def startup_event():
            return tok, mdl

        tokenizer, model = await asyncio.to_thread(sync_load)
-        logger.info("
+        logger.info("Model loaded successfully on %s.", device)
    except Exception as e:
-        logger.exception("
+        logger.exception(f"Model loading failed at startup: {e}")
        tokenizer, model = None, None

# -------------------------
-# Prompt
+# Prompt builder & utils
# -------------------------
-def _build_prompt_from_messages(messages: List[Dict[str,str]]) -> str:
+def _build_prompt_from_messages(messages: List[Dict[str, str]]) -> str:
    parts = []
    for m in messages:
        role = m.get("role","user")

@@ -137,152 +128,116 @@ def word_count(text: str) -> int:
        return 0
    return len(re.findall(r"\w+", text))

+def plan_response_requirements(messages: List[Dict[str,str]], last_user_msg: str, flow_context: Dict[str,Any], vibe_block: str) -> Dict[str,Any]:
+    min_words = 30
+    if "Deep Dive Mode" in vibe_block:
+        min_words = 70
+    elif "Standard Chat Mode" in vibe_block:
+        min_words = 30
+    elif "Ping-Pong Mode" in vibe_block:
+        min_words = 12
+
+    emoji_min, emoji_max = 0, 2
+    m = re.search(r"Use\s+(\d+)–(\d+)\s+emoji", vibe_block)
+    if m:
+        try:
+            emoji_min, emoji_max = int(m.group(1)), int(m.group(2))
+        except:
+            pass
+
+    flow_label = flow_context.get("flow_label","")
+    strictness = 0
+    if flow_label == "escalation":
+        strictness = 1
+        min_words = max(min_words, 40)
+        emoji_min, emoji_max = 0, min(emoji_max, 1)
+    elif flow_label == "clarification":
+        strictness = 1
+        min_words = max(min_words, 30)
+    elif flow_label == "task_request":
+        strictness = 1
+        min_words = max(min_words, 50)
+
+    if re.search(r"\b(short|brief|quick|short and simple)\b", last_user_msg, re.IGNORECASE):
+        min_words = 6
+        strictness = 0
+
+    return {"min_words": min_words, "emoji_min": emoji_min, "emoji_max": emoji_max, "strictness": strictness, "flow_label": flow_label, "flow_confidence": float(flow_context.get("confidence",0.0) or 0.0)}
+
+# -------------------------
+# Plan-extract & sanitize helper
+# -------------------------
+def extract_and_sanitize_plan(text: str, max_plan_chars: int = 240) -> (str, str):
+    if not text:
+        return None, text
+    patterns = [
+        r"(?:🧠\s*Plan\s*:\s*)(.+?)(?:\n{2,}|\n$|$)",
+        r"(?:\bPlan\s*:\s*)(.+?)(?:\n{2,}|\n$|$)"
+    ]
+    for pat in patterns:
+        m = re.search(pat, text, flags=re.IGNORECASE | re.DOTALL)
+        if m:
+            plan_raw = m.group(1).strip()
+            plan_clean = re.sub(r"\s+", " ", plan_raw)[:max_plan_chars].strip()
+            cleaned_body = re.sub(pat, "", text, flags=re.IGNORECASE | re.DOTALL).strip()
+            cleaned_body = re.sub(r"^\s*[\:\-\–\—]+", "", cleaned_body).strip()
+            plan_label = f"🧠 Plan: {plan_clean}"
+            return plan_label, cleaned_body
+    return None, text
+
# -------------------------
-#
+# Streaming generator with corrected ordering:
+# Emit "Reasoning (planner)..." first, THEN run planning analysis,
+# then emit "Generating — LLM (attempt N)..." for model attempts.
# -------------------------
async def generate_response_stream(messages: List[Dict[str,str]], max_tokens=600, temperature=0.85):
    try:
        if not messages:
            messages = [{"role":"user","content":""}]
-        last_user_msg =
+        last_user_msg = messages[-1].get("content","").strip()

-        #
+        # Deterministic identity preflight
        if is_identity_question(last_user_msg):
            reply_text = CANONICAL_CREATOR_ANSWER
            follow_up = " Would you like to know more about how I work or my features?"
-            payload = json.dumps({"choices":[{"delta":{"content": reply_text + follow_up}}]
+            payload = json.dumps({"choices":[{"delta":{"content": reply_text + follow_up}}]})
            yield f"data: {json.dumps({'status': 'Responding (identity)'} )}\n\n"
            await asyncio.sleep(0.01)
            yield f"data: {payload}\n\n"
            yield "data: [DONE]\n\n"
            return

-        # initial
+        # Quick initial indicator to keep UI responsive
        yield f"data: {json.dumps({'status': 'Thinking...'})}\n\n"
-        await asyncio.sleep(0
+        await asyncio.sleep(0)

-        # quick intent detection
        intent = analyze_intent(last_user_msg) or "general"

-        #
+        # Emit Reasoning indicator BEFORE heavy planning so UI shows it during planning
+        yield f"data: {json.dumps({'status': 'Reasoning (planner)...'})}\n\n"
+        # small pause to allow UI to render the status before we start analysis
+        await asyncio.sleep(0.15)
+
+        # ---------- PLANNING WORK (now executed while UI shows Reasoning) ----------
        try:
            flow_context = analyze_flow(messages)
        except Exception as e:
            logger.exception("Flow analysis failed: %s", e)
-            flow_context = {
-
-        route = flow_context.get("route", "planning")
-        flow_label = flow_context.get("flow_label", "unknown")
-        conf = float(flow_context.get("confidence", 0.0) or 0.0)
-
-        # Emit correct UI label depending on route
-        if route == "direct":
-            yield f"data: {json.dumps({'status': 'Reasoning (fast)...', 'route': route, 'flow_label': flow_label, 'confidence': conf})}\n\n"
-        else:
-            yield f"data: {json.dumps({'status': 'Reasoning (planner)...', 'route': route, 'flow_label': flow_label, 'confidence': conf})}\n\n"
-        await asyncio.sleep(0.05)
+            flow_context = {}

-        # Build vibe/context and planning requirements
        vibe_block = get_smart_context(last_user_msg)
        plan_req = plan_response_requirements(messages, last_user_msg, flow_context, vibe_block)
        min_words = plan_req["min_words"]
        strictness = plan_req["strictness"]

-        #
-        if
-
-
-            "If user asks a short conversational query, answer directly in 1-3 short paragraphs. "
-            "Do NOT provide chain-of-thought. Avoid long planning unless user asks for detail."
-            )
-            if messages and messages[0].get("role") == "system":
-                messages[0]["content"] = final_system_prompt
-            else:
-                messages.insert(0, {"role":"system","content": final_system_prompt})
-
-            # fast settings
-            local_max_tokens = min(200, max(64, int(max_tokens/2)))
-            local_temperature = max(0.2, min(temperature, 0.8))
-            max_attempts = 1
-
-            # If no local model available, return a textual fallback so UI isn't stuck
-            if tokenizer is None or model is None:
-                fallback_text = ("(Local model not available.) I can provide a short answer from the lightweight router: "
-                                 "A neural network is a system of interconnected nodes (neurons) organized in layers that learn patterns in data.")
-                payload = json.dumps({"choices":[{"delta":{"content": fallback_text}}], "route": "direct", "flow_label": flow_label, "flow_confidence": conf})
-                yield f"data: {json.dumps({'status': 'Responding (fallback)'})}\n\n"
-                await asyncio.sleep(0.01)
-                yield f"data: {payload}\n\n"
-                yield "data: [DONE]\n\n"
-                return
-
-            # prepare prompt
-            try:
-                if hasattr(tokenizer, "apply_chat_template"):
-                    text_prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
-                else:
-                    text_prompt = _build_prompt_from_messages(messages)
-            except Exception:
-                text_prompt = _build_prompt_from_messages(messages)
+        # adjust tokens/temperature if strict
+        if strictness:
+            temperature = min(temperature + 0.05, 0.95)
+            max_tokens = max(max_tokens, min_words // 2 + 120)

-            attempts = 0
-            generated_text = ""
-            while attempts < max_attempts:
-                attempts += 1
-                yield f"data: {json.dumps({'status': f'Generating LLM ({attempts})...', 'route': route})}\n\n"
-                await asyncio.sleep(0.02)
-                model_inputs = tokenizer(text_prompt, return_tensors="pt", truncation=True, max_length=4096).to(next(model.parameters()).device)
-                def sync_generate():
-                    return model.generate(
-                        **model_inputs,
-                        max_new_tokens=local_max_tokens,
-                        temperature=local_temperature,
-                        do_sample=True,
-                        top_k=50,
-                        top_p=0.92,
-                        repetition_penalty=1.08
-                    )
-                try:
-                    generated_ids = await asyncio.to_thread(sync_generate)
-                except Exception as e:
-                    logger.exception("Fast-path generation failed: %s", e)
-                    payload = json.dumps({"choices":[{"delta":{"content":"Model generation failed on fast-path."}}], "route": route})
-                    yield f"data: {payload}\n\n"
-                    yield "data: [DONE]\n\n"
-                    return
-
-                input_len = model_inputs["input_ids"].shape[1]
-                new_tokens = generated_ids[0][input_len:]
-                raw_response = tokenizer.decode(new_tokens, skip_special_tokens=True).strip()
-                generated_text = safe_replace_providers(raw_response)
-                break
-
-            payload = json.dumps({"choices":[{"delta":{"content": generated_text}}], "route": route, "flow_label": flow_label, "flow_confidence": conf})
-            yield f"data: {payload}\n\n"
-            yield "data: [DONE]\n\n"
-            return
-
-        # ---------- PLANNING route ----------
-        # If planning and the local model is missing, return a friendly explanation + flow_context
-        if (tokenizer is None or model is None) and USE_LOCAL_MODEL:
-            payload = {
-                "choices":[{"delta":{"content": "Model temporarily unavailable on server. Planning route determined and details are below."}}],
-                "route": "planning",
-                "flow_label": flow_label,
-                "flow_confidence": conf,
-                "vibe_block": vibe_block,
-                "plan_requirements": plan_req,
-                "explanation": flow_context.get("explanation", "")
-            }
-            yield f"data: {json.dumps({'status': 'Model missing, returning planner diagnostic...'})}\n\n"
-            await asyncio.sleep(0.01)
-            yield f"data: {json.dumps(payload)}\n\n"
-            yield "data: [DONE]\n\n"
-            return
-
-        # Continue with planning: build final system prompt
        strategy_data = get_thinking_strategy(is_complex=(intent=="coding_request" or min_words>50), detail=(min_words>50), min_words_hint=min_words)
        time_data = get_time_context()
+
        base_system_instruction = (
            "### SYSTEM IDENTITY ###\n"
            "You are Nexari G1, an expressive and helpful AI created by Piyush.\n"

@@ -294,7 +249,10 @@ async def generate_response_stream(messages: List[Dict[str,str]], max_tokens=600

        flow_desc = ""
        if flow_context:
-
+            label = flow_context.get("flow_label","unknown")
+            conf = round(float(flow_context.get("confidence", 0.0)), 2)
+            expl = flow_context.get("explanation", "")
+            flow_desc = f"\n[FLOW] Detected: {label} (confidence {conf}). {expl}\n"

        final_system_prompt = f"{base_system_instruction}\n{flow_desc}\n{vibe_block}\n{time_data}\n{strategy_data}"

@@ -330,7 +288,13 @@ async def generate_response_stream(messages: List[Dict[str,str]], max_tokens=600
            web_block += "No results found."
        messages.insert(1, {"role":"assistant","content": web_block})

-
+        if tokenizer is None or model is None:
+            err = "Model not loaded. Check server logs."
+            payload = json.dumps({"choices":[{"delta":{"content": err}}]})
+            yield f"data: {payload}\n\n"
+            yield "data: [DONE]\n\n"
+            return
+
        try:
            if hasattr(tokenizer, "apply_chat_template"):
                text_prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

@@ -339,14 +303,16 @@ async def generate_response_stream(messages: List[Dict[str,str]], max_tokens=600
        except Exception:
            text_prompt = _build_prompt_from_messages(messages)

-        #
+        # ---------- GENERATION STAGE ----------
        max_attempts = 2
        attempts = 0
        last_meta = {}
        generated_text = ""
        while attempts < max_attempts:
            attempts += 1
-
+            # Emit explicit generating label (after planning completed)
+            yield f"data: {json.dumps({'status': f'Generating LLM ({attempts})...'})}\n\n"
+            # tiny sleep to let UI update
            await asyncio.sleep(0.06)

            model_inputs = tokenizer(text_prompt, return_tensors="pt", truncation=True, max_length=4096).to(next(model.parameters()).device)

@@ -365,7 +331,7 @@ async def generate_response_stream(messages: List[Dict[str,str]], max_tokens=600
                generated_ids = await asyncio.to_thread(sync_generate)
            except RuntimeError as e:
                logger.exception("Generation failed (possible OOM): %s", e)
-                err_payload = json.dumps({"choices":[{"delta":{"content": "Model generation failed due to resource limits."}}]
+                err_payload = json.dumps({"choices":[{"delta":{"content": "Model generation failed due to resource limits."}}]})
                yield f"data: {err_payload}\n\n"
                yield "data: [DONE]\n\n"
                return

@@ -375,12 +341,16 @@ async def generate_response_stream(messages: List[Dict[str,str]], max_tokens=600
            raw_response = tokenizer.decode(new_tokens, skip_special_tokens=True).strip()
            cleaned = safe_replace_providers(raw_response)

-
+            forbidden = ["I am a human","I have a physical body","I am alive"]
+            for fc in forbidden:
+                if fc.lower() in cleaned.lower():
+                    cleaned = re.sub(re.escape(fc), "I am an AI — expressive and interactive.", cleaned, flags=re.IGNORECASE)
+
            plan_label, cleaned_body = extract_and_sanitize_plan(cleaned, max_plan_chars=240)
            wc = word_count(cleaned_body)
            last_meta = {"attempt": attempts, "word_count": wc, "raw_len": len(cleaned_body)}

-            if wc >= min_words or attempts >= max_attempts or plan_req
+            if wc >= min_words or attempts >= max_attempts or plan_req["strictness"] == 0:
                generated_text = cleaned_body
                if plan_label:
                    generated_text = plan_label + "\n\n" + generated_text

@@ -399,6 +369,7 @@ async def generate_response_stream(messages: List[Dict[str,str]], max_tokens=600
                    text_prompt = _build_prompt_from_messages(messages)
                except Exception:
                    text_prompt = _build_prompt_from_messages(messages)
+                # allow a short break so UI shows the attempted generate label
                await asyncio.sleep(0.02)
                continue

@@ -412,10 +383,7 @@ async def generate_response_stream(messages: List[Dict[str,str]], max_tokens=600
        payload = json.dumps({
            "choices":[{"delta":{"content": generated_text}}],
            "generation_attempts": attempts,
-            "last_attempt_meta": last_meta
-            "route": route,
-            "flow_label": flow_label,
-            "flow_confidence": round(conf, 3)
+            "last_attempt_meta": last_meta
        })
        yield f"data: {payload}\n\n"
        yield "data: [DONE]\n\n"

@@ -434,12 +402,12 @@ async def generate_response_stream(messages: List[Dict[str,str]], max_tokens=600
        return

# -------------------------
-# Endpoints
+# Endpoints
# -------------------------
@app.get("/api/status")
def status():
-    ok =
-    return {"status":"online" if ok else "degraded", "mode":"Smart Override Enabled", "model_loaded":
+    ok = tokenizer is not None and model is not None
+    return {"status":"online" if ok else "degraded", "mode":"Smart Override Enabled", "model_loaded": ok}

@app.post("/v1/chat/completions")
async def chat_completions(request: Request):

@@ -478,4 +446,4 @@ except Exception as e:

if __name__ == "__main__":
    import uvicorn
-    uvicorn.run(app, host="0.0.0.0", port=int(os.environ.get("PORT", 7860)))
+    uvicorn.run(app, host="0.0.0.0", port=int(os.environ.get("PORT", 7860)))
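For reference, a minimal client sketch (not part of this commit) for consuming the stream that generate_response_stream emits over POST /v1/chat/completions. It assumes an OpenAI-style JSON body with a "messages" list, which the request parsing is not shown in this diff, and the default port 7860 from the __main__ block; the status labels and the "data: ... / [DONE]" framing come from the generator above.

# sse_client_sketch.py - hypothetical helper, not part of app.py
# Assumptions: request body shape {"messages": [...]} and local port 7860.
import json
import requests

URL = "http://localhost:7860/v1/chat/completions"  # assumed local dev address

def stream_chat(messages):
    body = {"messages": messages}  # assumed request shape; not shown in the diff
    with requests.post(URL, json=body, stream=True) as resp:
        resp.raise_for_status()
        answer = []
        for raw in resp.iter_lines(decode_unicode=True):
            if not raw or not raw.startswith("data: "):
                continue  # skip blank separators between SSE events
            data = raw[len("data: "):]
            if data == "[DONE]":
                break  # terminal sentinel emitted by the server
            event = json.loads(data)
            if "status" in event:
                # progress labels such as Thinking..., Reasoning (planner)..., Generating LLM (N)...
                print("status:", event["status"])
            for choice in event.get("choices", []):
                answer.append(choice.get("delta", {}).get("content", ""))
        return "".join(answer)

if __name__ == "__main__":
    print(stream_chat([{"role": "user", "content": "Who created you?"}]))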