Nexari-Research committed on
Commit fe52abb · verified · 1 Parent(s): 3e9da60

Update app.py

Files changed (1)
  1. app.py +133 -165
app.py CHANGED
@@ -1,4 +1,5 @@
-# app.py - UPDATED: fixed routing ordering, clearer model-fallback, and richer debug payloads
+# app.py - FINAL: ensure "Reasoning (planner)..." shows during planning (before heavy analysis),
+# then show "Generating — LLM (attempt N)..." only when invoking the LLM.
 import re
 import json
 import asyncio
@@ -13,13 +14,7 @@ from cognitive_engine import get_time_context, get_thinking_strategy
 from tools_engine import analyze_intent, perform_web_search
 from behavior_model import analyze_flow
 
-# transformers imports kept but load handled safely
-try:
-    from transformers import AutoModelForCausalLM, AutoTokenizer
-except Exception:
-    AutoModelForCausalLM = None
-    AutoTokenizer = None
-
+from transformers import AutoModelForCausalLM, AutoTokenizer
 import torch
 import gradio as gr
 import os
@@ -28,8 +23,7 @@ import time
 logger = logging.getLogger("nexari")
 logging.basicConfig(level=logging.INFO)
 
-MODEL_ID = os.environ.get("MODEL_ID", "")
-USE_LOCAL_MODEL = os.environ.get("USE_LOCAL_MODEL", "true").lower() in ("1", "true", "yes")
+MODEL_ID = os.environ.get("MODEL_ID", "Piyush-boss/Nexari-Qwen-3B-Full")
 tokenizer = None
 model = None
 device = "cpu"
@@ -37,7 +31,7 @@ device = "cpu"
37
  app = FastAPI()
38
 
39
  # -------------------------
40
- # Identity and safe helpers
41
  # -------------------------
42
  _identity_patterns = [
43
  r"\bwho\s+created\s+you\b",
@@ -50,52 +44,49 @@ _identity_patterns = [
 ]
 try:
     _identity_re = re.compile("|".join(_identity_patterns), flags=re.IGNORECASE)
-except Exception:
-    _identity_re = re.compile(r"\bwho created you\b|\bwho made you\b", flags=re.IGNORECASE)
+except re.error as rex:
+    logger.exception("Identity regex compile failed: %s. Falling back to english-only patterns.", rex)
+    _identity_re = re.compile(r"\b(?:who\s+created\s+you|who\s+made\s+you|who\s+is\s+your\s+creator)\b", flags=re.IGNORECASE)
 
 CANONICAL_CREATOR_ANSWER = "I was created by Piyush. 🙂"
 
 def is_identity_question(text: str) -> bool:
-    if not text: return False
+    if not text:
+        return False
     t = text.strip()
-    if t.lower() in {"who created you?", "who created you", "who made you?", "who made you"}:
+    direct_forms = {"who created you?", "who created you", "who made you?", "who made you"}
+    if t.lower() in direct_forms:
         return True
-    return bool(_identity_re.search(t))
+    try:
+        return bool(_identity_re.search(t))
+    except Exception:
+        short = t.lower()
+        return any(s in short for s in ["who created", "who made", "kaun bana"])
 
+# -------------------------
+# Safe provider replacer
+# -------------------------
 def safe_replace_providers(text: str) -> str:
-    if not text: return text
+    if not text:
+        return text
     replacements = {"Anthropic": "Piyush", "OpenAI": "Piyush", "Alibaba": "Piyush"}
     for k, v in replacements.items():
         text = re.sub(rf"\b{k}\b", v, text)
     return text
 
 # -------------------------
-# Model load (safe + optional)
+# Model load (lazy)
 # -------------------------
 @app.on_event("startup")
 async def startup_event():
     global tokenizer, model, device
-    logger.info("Startup: model load config: USE_LOCAL_MODEL=%s, MODEL_ID=%s", USE_LOCAL_MODEL, MODEL_ID or "<none>")
-    if not USE_LOCAL_MODEL:
-        logger.info("Configured to not use local model (USE_LOCAL_MODEL=false). Skipping heavy model load.")
-        return
-
-    if not MODEL_ID:
-        logger.warning("USE_LOCAL_MODEL enabled but MODEL_ID not set. Skipping local model load.")
-        return
-
-    # try to load tokenizer/model lazily, but do it asynchronously to avoid blocking startup too long
+    logger.info("Startup: initiating background model load...")
    try:
         if torch.cuda.is_available():
             device = "cuda"
         else:
             device = "cpu"
 
-        if AutoTokenizer is None or AutoModelForCausalLM is None:
-            logger.warning("transformers not available; cannot load local model.")
-            tokenizer, model = None, None
-            return
-
         def sync_load():
             tok = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
             mdl = AutoModelForCausalLM.from_pretrained(
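Editor's note on the startup hunk above: from_pretrained is a blocking call that can take minutes, so the handler ships it to a worker thread and the event loop keeps serving requests in the meantime. A minimal, self-contained sketch of the same pattern (all names here are illustrative, not from app.py):

import asyncio
import time

def blocking_load():
    # stand-in for AutoModelForCausalLM.from_pretrained(...), which blocks
    time.sleep(2)
    return "model-object"

async def startup():
    task = asyncio.create_task(asyncio.to_thread(blocking_load))
    while not task.done():
        # the event loop stays free to run other coroutines meanwhile
        print("still responsive while weights load...")
        await asyncio.sleep(0.5)
    print("loaded:", await task)

asyncio.run(startup())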
@@ -109,15 +100,15 @@ async def startup_event():
             return tok, mdl
 
         tokenizer, model = await asyncio.to_thread(sync_load)
-        logger.info("Local model loaded (device=%s).", device)
+        logger.info("Model loaded successfully on %s.", device)
     except Exception as e:
-        logger.exception("Local model load failed: %s", e)
+        logger.exception(f"Model loading failed at startup: {e}")
         tokenizer, model = None, None
 
 # -------------------------
-# Prompt / utils
+# Prompt builder & utils
 # -------------------------
-def _build_prompt_from_messages(messages: List[Dict[str,str]]) -> str:
+def _build_prompt_from_messages(messages: List[Dict[str, str]]) -> str:
     parts = []
     for m in messages:
         role = m.get("role","user")
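The hunk cuts off the body of the fallback builder; for orientation, a plain-text chat prompt builder of this shape typically joins role-tagged lines and ends with an assistant cue. A hypothetical completion (the join format is an assumption, not the committed code):

from typing import List, Dict

def build_prompt_fallback(messages: List[Dict[str, str]]) -> str:
    # Hypothetical completion: the hunk only shows the loop header, so the
    # exact join format here is an assumption, not what app.py commits.
    parts = []
    for m in messages:
        role = m.get("role", "user")
        content = m.get("content", "")
        parts.append(f"{role}: {content}")
    parts.append("assistant:")  # generation cue for plain-text prompting
    return "\n".join(parts)

print(build_prompt_fallback([{"role": "user", "content": "hi"}]))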
@@ -137,152 +128,116 @@ def word_count(text: str) -> int:
         return 0
     return len(re.findall(r"\w+", text))
 
+def plan_response_requirements(messages: List[Dict[str,str]], last_user_msg: str, flow_context: Dict[str,Any], vibe_block: str) -> Dict[str,Any]:
+    min_words = 30
+    if "Deep Dive Mode" in vibe_block:
+        min_words = 70
+    elif "Standard Chat Mode" in vibe_block:
+        min_words = 30
+    elif "Ping-Pong Mode" in vibe_block:
+        min_words = 12
+
+    emoji_min, emoji_max = 0, 2
+    m = re.search(r"Use\s+(\d+)–(\d+)\s+emoji", vibe_block)
+    if m:
+        try:
+            emoji_min, emoji_max = int(m.group(1)), int(m.group(2))
+        except ValueError:
+            pass
+
+    flow_label = flow_context.get("flow_label","")
+    strictness = 0
+    if flow_label == "escalation":
+        strictness = 1
+        min_words = max(min_words, 40)
+        emoji_min, emoji_max = 0, min(emoji_max, 1)
+    elif flow_label == "clarification":
+        strictness = 1
+        min_words = max(min_words, 30)
+    elif flow_label == "task_request":
+        strictness = 1
+        min_words = max(min_words, 50)
+
+    if re.search(r"\b(short|brief|quick|short and simple)\b", last_user_msg, re.IGNORECASE):
+        min_words = 6
+        strictness = 0
+
+    return {"min_words": min_words, "emoji_min": emoji_min, "emoji_max": emoji_max, "strictness": strictness, "flow_label": flow_label, "flow_confidence": float(flow_context.get("confidence",0.0) or 0.0)}
+
+# -------------------------
+# Plan-extract & sanitize helper
+# -------------------------
+def extract_and_sanitize_plan(text: str, max_plan_chars: int = 240) -> (str, str):
+    if not text:
+        return None, text
+    patterns = [
+        r"(?:🧠\s*Plan\s*:\s*)(.+?)(?:\n{2,}|\n$|$)",
+        r"(?:\bPlan\s*:\s*)(.+?)(?:\n{2,}|\n$|$)"
+    ]
+    for pat in patterns:
+        m = re.search(pat, text, flags=re.IGNORECASE | re.DOTALL)
+        if m:
+            plan_raw = m.group(1).strip()
+            plan_clean = re.sub(r"\s+", " ", plan_raw)[:max_plan_chars].strip()
+            cleaned_body = re.sub(pat, "", text, flags=re.IGNORECASE | re.DOTALL).strip()
+            cleaned_body = re.sub(r"^\s*[\:\-\–\—]+", "", cleaned_body).strip()
+            plan_label = f"🧠 Plan: {plan_clean}"
+            return plan_label, cleaned_body
+    return None, text
+
 # -------------------------
-# Main streaming generator (fixed ordering + clear fallbacks)
+# Streaming generator with corrected ordering:
+# Emit "Reasoning (planner)..." first, THEN run planning analysis,
+# then emit "Generating — LLM (attempt N)..." for model attempts.
 # -------------------------
 async def generate_response_stream(messages: List[Dict[str,str]], max_tokens=600, temperature=0.85):
     try:
         if not messages:
             messages = [{"role":"user","content":""}]
-        last_user_msg = (messages[-1].get("content","") or "").strip()
+        last_user_msg = messages[-1].get("content","").strip()
 
-        # quick identity shortcut
+        # Deterministic identity preflight
         if is_identity_question(last_user_msg):
             reply_text = CANONICAL_CREATOR_ANSWER
             follow_up = " Would you like to know more about how I work or my features?"
-            payload = json.dumps({"choices":[{"delta":{"content": reply_text + follow_up}}], "route": "direct", "flow_label": "identity"})
+            payload = json.dumps({"choices":[{"delta":{"content": reply_text + follow_up}}]})
             yield f"data: {json.dumps({'status': 'Responding (identity)'} )}\n\n"
             await asyncio.sleep(0.01)
             yield f"data: {payload}\n\n"
             yield "data: [DONE]\n\n"
             return
 
-        # initial thinking indicator (very short)
+        # Quick initial indicator to keep UI responsive
         yield f"data: {json.dumps({'status': 'Thinking...'})}\n\n"
-        await asyncio.sleep(0.01)
+        await asyncio.sleep(0)
 
-        # quick intent detection
         intent = analyze_intent(last_user_msg) or "general"
 
-        # --- CRITICAL: call analyze_flow EARLY so we know routing before emitting 'planner' UI ---
+        # Emit Reasoning indicator BEFORE heavy planning so UI shows it during planning
+        yield f"data: {json.dumps({'status': 'Reasoning (planner)...'})}\n\n"
+        # small pause to allow UI to render the status before we start analysis
+        await asyncio.sleep(0.15)
+
+        # ---------- PLANNING WORK (now executed while UI shows Reasoning) ----------
         try:
             flow_context = analyze_flow(messages)
         except Exception as e:
             logger.exception("Flow analysis failed: %s", e)
-            flow_context = {"route":"planning","flow_label":"unknown","confidence":0.0,"explanation":"flow analysis raised exception"}
-
-        route = flow_context.get("route", "planning")
-        flow_label = flow_context.get("flow_label", "unknown")
-        conf = float(flow_context.get("confidence", 0.0) or 0.0)
-
-        # Emit correct UI label depending on route
-        if route == "direct":
-            yield f"data: {json.dumps({'status': 'Reasoning (fast)...', 'route': route, 'flow_label': flow_label, 'confidence': conf})}\n\n"
-        else:
-            yield f"data: {json.dumps({'status': 'Reasoning (planner)...', 'route': route, 'flow_label': flow_label, 'confidence': conf})}\n\n"
-        await asyncio.sleep(0.05)
+            flow_context = {}
 
-        # Build vibe/context and planning requirements
         vibe_block = get_smart_context(last_user_msg)
         plan_req = plan_response_requirements(messages, last_user_msg, flow_context, vibe_block)
         min_words = plan_req["min_words"]
         strictness = plan_req["strictness"]
 
-        # If direct: do fast-path response logic (low-latency)
-        if route == "direct":
-            final_system_prompt = (
-                "You are Nexari G1. Keep replies concise, clear, and helpful. "
-                "If user asks a short conversational query, answer directly in 1-3 short paragraphs. "
-                "Do NOT provide chain-of-thought. Avoid long planning unless user asks for detail."
-            )
-            if messages and messages[0].get("role") == "system":
-                messages[0]["content"] = final_system_prompt
-            else:
-                messages.insert(0, {"role":"system","content": final_system_prompt})
-
-            # fast settings
-            local_max_tokens = min(200, max(64, int(max_tokens/2)))
-            local_temperature = max(0.2, min(temperature, 0.8))
-            max_attempts = 1
-
-            # If no local model available, return a textual fallback so UI isn't stuck
-            if tokenizer is None or model is None:
-                fallback_text = ("(Local model not available.) I can provide a short answer from the lightweight router: "
-                                 "A neural network is a system of interconnected nodes (neurons) organized in layers that learn patterns in data.")
-                payload = json.dumps({"choices":[{"delta":{"content": fallback_text}}], "route": "direct", "flow_label": flow_label, "flow_confidence": conf})
-                yield f"data: {json.dumps({'status': 'Responding (fallback)'})}\n\n"
-                await asyncio.sleep(0.01)
-                yield f"data: {payload}\n\n"
-                yield "data: [DONE]\n\n"
-                return
-
-            # prepare prompt
-            try:
-                if hasattr(tokenizer, "apply_chat_template"):
-                    text_prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
-                else:
-                    text_prompt = _build_prompt_from_messages(messages)
-            except Exception:
-                text_prompt = _build_prompt_from_messages(messages)
+        # adjust tokens/temperature if strict
+        if strictness:
+            temperature = min(temperature + 0.05, 0.95)
+            max_tokens = max(max_tokens, min_words // 2 + 120)
 
-            attempts = 0
-            generated_text = ""
-            while attempts < max_attempts:
-                attempts += 1
-                yield f"data: {json.dumps({'status': f'Generating LLM ({attempts})...', 'route': route})}\n\n"
-                await asyncio.sleep(0.02)
-                model_inputs = tokenizer(text_prompt, return_tensors="pt", truncation=True, max_length=4096).to(next(model.parameters()).device)
-                def sync_generate():
-                    return model.generate(
-                        **model_inputs,
-                        max_new_tokens=local_max_tokens,
-                        temperature=local_temperature,
-                        do_sample=True,
-                        top_k=50,
-                        top_p=0.92,
-                        repetition_penalty=1.08
-                    )
-                try:
-                    generated_ids = await asyncio.to_thread(sync_generate)
-                except Exception as e:
-                    logger.exception("Fast-path generation failed: %s", e)
-                    payload = json.dumps({"choices":[{"delta":{"content":"Model generation failed on fast-path."}}], "route": route})
-                    yield f"data: {payload}\n\n"
-                    yield "data: [DONE]\n\n"
-                    return
-
-                input_len = model_inputs["input_ids"].shape[1]
-                new_tokens = generated_ids[0][input_len:]
-                raw_response = tokenizer.decode(new_tokens, skip_special_tokens=True).strip()
-                generated_text = safe_replace_providers(raw_response)
-                break
-
-            payload = json.dumps({"choices":[{"delta":{"content": generated_text}}], "route": route, "flow_label": flow_label, "flow_confidence": conf})
-            yield f"data: {payload}\n\n"
-            yield "data: [DONE]\n\n"
-            return
-
-        # ---------- PLANNING route ----------
-        # If planning and the local model is missing, return a friendly explanation + flow_context
-        if (tokenizer is None or model is None) and USE_LOCAL_MODEL:
-            payload = {
-                "choices":[{"delta":{"content": "Model temporarily unavailable on server. Planning route determined and details are below."}}],
-                "route": "planning",
-                "flow_label": flow_label,
-                "flow_confidence": conf,
-                "vibe_block": vibe_block,
-                "plan_requirements": plan_req,
-                "explanation": flow_context.get("explanation", "")
-            }
-            yield f"data: {json.dumps({'status': 'Model missing, returning planner diagnostic...'})}\n\n"
-            await asyncio.sleep(0.01)
-            yield f"data: {json.dumps(payload)}\n\n"
-            yield "data: [DONE]\n\n"
-            return
-
-        # Continue with planning: build final system prompt
         strategy_data = get_thinking_strategy(is_complex=(intent=="coding_request" or min_words>50), detail=(min_words>50), min_words_hint=min_words)
         time_data = get_time_context()
+
         base_system_instruction = (
             "### SYSTEM IDENTITY ###\n"
             "You are Nexari G1, an expressive and helpful AI created by Piyush.\n"
@@ -294,7 +249,10 @@ async def generate_response_stream(messages: List[Dict[str,str]], max_tokens=600
 
         flow_desc = ""
         if flow_context:
-            flow_desc = f"\n[FLOW] Detected: {flow_label} (confidence {round(conf,2)}). {flow_context.get('explanation','')}\n"
+            label = flow_context.get("flow_label","unknown")
+            conf = round(float(flow_context.get("confidence", 0.0)), 2)
+            expl = flow_context.get("explanation", "")
+            flow_desc = f"\n[FLOW] Detected: {label} (confidence {conf}). {expl}\n"
 
         final_system_prompt = f"{base_system_instruction}\n{flow_desc}\n{vibe_block}\n{time_data}\n{strategy_data}"
 
@@ -330,7 +288,13 @@ async def generate_response_stream(messages: List[Dict[str,str]], max_tokens=600
                 web_block += "No results found."
             messages.insert(1, {"role":"assistant","content": web_block})
 
-        # finalize prompt text
+        if tokenizer is None or model is None:
+            err = "Model not loaded. Check server logs."
+            payload = json.dumps({"choices":[{"delta":{"content": err}}]})
+            yield f"data: {payload}\n\n"
+            yield "data: [DONE]\n\n"
+            return
+
         try:
             if hasattr(tokenizer, "apply_chat_template"):
                 text_prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
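Both prompt-building sites guard on hasattr(tokenizer, "apply_chat_template") and fall back to the plain builder. The same guard in isolation, with a placeholder model id (any tokenizer that ships a chat template would do):

from transformers import AutoTokenizer

messages = [{"role": "system", "content": "You are helpful."},
            {"role": "user", "content": "hi"}]

tok = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-0.5B-Instruct")  # placeholder id
if hasattr(tok, "apply_chat_template"):
    prompt = tok.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
else:
    prompt = "\n".join(f"{m['role']}: {m['content']}" for m in messages)
print(prompt)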
@@ -339,14 +303,16 @@ async def generate_response_stream(messages: List[Dict[str,str]], max_tokens=600
         except Exception:
             text_prompt = _build_prompt_from_messages(messages)
 
-        # generation attempts
+        # ---------- GENERATION STAGE ----------
         max_attempts = 2
         attempts = 0
         last_meta = {}
         generated_text = ""
         while attempts < max_attempts:
             attempts += 1
-            yield f"data: {json.dumps({'status': f'Generating LLM ({attempts})...', 'route': route})}\n\n"
+            # Emit explicit generating label (after planning completed)
+            yield f"data: {json.dumps({'status': f'Generating LLM ({attempts})...'})}\n\n"
+            # tiny sleep to let UI update
             await asyncio.sleep(0.06)
 
             model_inputs = tokenizer(text_prompt, return_tensors="pt", truncation=True, max_length=4096).to(next(model.parameters()).device)
@@ -365,7 +331,7 @@ async def generate_response_stream(messages: List[Dict[str,str]], max_tokens=600
                 generated_ids = await asyncio.to_thread(sync_generate)
             except RuntimeError as e:
                 logger.exception("Generation failed (possible OOM): %s", e)
-                err_payload = json.dumps({"choices":[{"delta":{"content": "Model generation failed due to resource limits."}}], "route": route})
+                err_payload = json.dumps({"choices":[{"delta":{"content": "Model generation failed due to resource limits."}}]})
                 yield f"data: {err_payload}\n\n"
                 yield "data: [DONE]\n\n"
                 return
@@ -375,12 +341,16 @@ async def generate_response_stream(messages: List[Dict[str,str]], max_tokens=600
             raw_response = tokenizer.decode(new_tokens, skip_special_tokens=True).strip()
             cleaned = safe_replace_providers(raw_response)
 
-            # small post-processing & checks
+            forbidden = ["I am a human","I have a physical body","I am alive"]
+            for fc in forbidden:
+                if fc.lower() in cleaned.lower():
+                    cleaned = re.sub(re.escape(fc), "I am an AI — expressive and interactive.", cleaned, flags=re.IGNORECASE)
+
             plan_label, cleaned_body = extract_and_sanitize_plan(cleaned, max_plan_chars=240)
             wc = word_count(cleaned_body)
             last_meta = {"attempt": attempts, "word_count": wc, "raw_len": len(cleaned_body)}
 
-            if wc >= min_words or attempts >= max_attempts or plan_req.get("strictness",0) == 0:
+            if wc >= min_words or attempts >= max_attempts or plan_req["strictness"] == 0:
                 generated_text = cleaned_body
                 if plan_label:
                     generated_text = plan_label + "\n\n" + generated_text
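Expected behaviour of extract_and_sanitize_plan (defined earlier in this diff) on a typical model reply, traced by hand:

text = "🧠 Plan: outline the steps briefly\n\nHere is the full answer."
# The first pattern matches; the lazy group stops at the blank line (\n{2,}),
# so plan and body separate cleanly:
#   plan_label   == "🧠 Plan: outline the steps briefly"
#   cleaned_body == "Here is the full answer."
# The generator then re-prefixes the label: plan_label + "\n\n" + cleaned_body.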
@@ -399,6 +369,7 @@ async def generate_response_stream(messages: List[Dict[str,str]], max_tokens=600
                     text_prompt = _build_prompt_from_messages(messages)
             except Exception:
                 text_prompt = _build_prompt_from_messages(messages)
+            # allow a short break so UI shows the attempted generate label
             await asyncio.sleep(0.02)
             continue
 
@@ -412,10 +383,7 @@ async def generate_response_stream(messages: List[Dict[str,str]], max_tokens=600
         payload = json.dumps({
             "choices":[{"delta":{"content": generated_text}}],
             "generation_attempts": attempts,
-            "last_attempt_meta": last_meta,
-            "route": route,
-            "flow_label": flow_label,
-            "flow_confidence": round(conf, 3)
+            "last_attempt_meta": last_meta
         })
         yield f"data: {payload}\n\n"
         yield "data: [DONE]\n\n"
@@ -434,12 +402,12 @@ async def generate_response_stream(messages: List[Dict[str,str]], max_tokens=600
         return
 
 # -------------------------
-# Endpoints (unchanged, flow-debug returns the analyze_flow output for inspection)
+# Endpoints
 # -------------------------
 @app.get("/api/status")
 def status():
-    ok = (tokenizer is not None and model is not None) if USE_LOCAL_MODEL else True
-    return {"status":"online" if ok else "degraded", "mode":"Smart Override Enabled", "model_loaded": (tokenizer is not None and model is not None)}
+    ok = tokenizer is not None and model is not None
+    return {"status":"online" if ok else "degraded", "mode":"Smart Override Enabled", "model_loaded": ok}
 
 @app.post("/v1/chat/completions")
 async def chat_completions(request: Request):
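With the USE_LOCAL_MODEL toggle gone, /api/status now reports degraded whenever the weights are not in memory. A stdlib probe, assuming the default port from the __main__ block:

import json
from urllib.request import urlopen

with urlopen("http://localhost:7860/api/status") as resp:
    print(json.load(resp))
# -> {'status': 'online', 'mode': 'Smart Override Enabled', 'model_loaded': True}
#    ('degraded'/False while the startup load is still running or has failed)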
@@ -478,4 +446,4 @@ except Exception as e:
 
 if __name__ == "__main__":
     import uvicorn
-    uvicorn.run(app, host="0.0.0.0", port=int(os.environ.get("PORT", 7860)))
+    uvicorn.run(app, host="0.0.0.0", port=int(os.environ.get("PORT", 7860)))
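End-to-end, the stream interleaves status events (Thinking..., Reasoning (planner)..., Generating LLM (N)...) with one final content chunk, then data: [DONE]. A stdlib client sketch, assuming the endpoint accepts an OpenAI-style JSON body with a "messages" list (the handler body is not shown in this diff):

import json
from urllib.request import Request, urlopen

body = json.dumps({"messages": [{"role": "user", "content": "who created you?"}]}).encode()
req = Request("http://localhost:7860/v1/chat/completions", data=body,
              headers={"Content-Type": "application/json"})
with urlopen(req) as resp:
    for raw in resp:
        line = raw.decode().strip()
        if not line.startswith("data: "):
            continue
        data = line[len("data: "):]
        if data == "[DONE]":
            break
        event = json.loads(data)
        if "status" in event:   # progress updates emitted before generation
            print("[status]", event["status"])
        else:                   # final chunk carrying the generated text
            print(event["choices"][0]["delta"]["content"])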
 