Spaces:

build-small-hackathon
/

lifeos

Running

App Files Files Community

lifeos / engine.py

awaisaziz

Add config, model status, and VLM support

0c4cd3b 17 days ago

Raw

History Blame Contribute Delete

27.8 kB

	"""LifeOS reasoning engine.

	Text reasoning runs on one small model — NVIDIA Nemotron-3-Nano-4B (Q4_K_M
	GGUF, 2.84GB) — 100% locally through the llama.cpp runtime (llama-cpp-python).
	Deterministic feature code curates a small context (memory slices + RAG
	recall); the model only does the judgment + explanation layer. That division
	is what makes a 4B on 2 vCPUs feel smart.

	Food photos additionally use a small vision-language model — Qwen2.5-VL-3B
	(Q4_K_M GGUF) — for perception only: it identifies the food items in an image,
	which Nemotron then reasons about against memory. The VLM is loaded lazily on
	the first photo, so the text-only path never pays for it.
	"""

	import logging
	import os
	import re
	import threading
	from collections.abc import Iterator

	import cuda_bootstrap

	import config
	import memory as memory_store
	import rag

	# Module-level logger used by every load / generation path in this file.
	logger = logging.getLogger(__name__)

	cuda_bootstrap.ensure() # register CUDA runtime DLL dirs before llama_cpp loads

	# Text model: repo id and GGUF filename handed to Llama.from_pretrained()
	# in _load_llm(). Both come from config.
	MODEL_REPO = config.MODEL_REPO
	MODEL_FILE = config.MODEL_FILE
	# Fallback (plain llama arch) if the hybrid Mamba arch is unsupported by the
	# installed llama.cpp: bartowski/nvidia_Llama-3.1-Nemotron-Nano-4B-v1.1-GGUF

	# Vision model for food-photo recognition. Nemotron is text-only and cannot
	# "see" an image, so a small vision-language model handles perception: it
	# identifies the food items in a photo. The identified items are then fed to
	# Nemotron, which does the memory-grounded judgment (dietary fit, suggestions).
	# Q4_K_M (~2.4GB) + the f16 multimodal projector that encodes the image.
	VLM_REPO = config.VLM_REPO
	VLM_FILE = config.VLM_FILE
	VLM_MMPROJ_FILE = config.VLM_MMPROJ_FILE

	# Cached text model instance (None until get_llm() succeeds). _llm_lock is
	# held by generate_stream() for the duration of a completion.
	_llm = None
	_llm_lock = threading.Lock()

	# Cached vision model instance (None until get_vlm() succeeds). _vlm_lock is
	# held by describe_food_image() around the completion call.
	_vlm = None
	_vlm_lock = threading.Lock()

	# GPU offload: number of model layers to push to the GPU. -1 = all layers
	# (full offload), 0 = CPU only. Requires a CUDA/Metal/Vulkan build of
	# llama-cpp-python — the plain CPU wheel ignores this and stays on CPU.
	GPU_LAYERS = config.GPU_LAYERS

	# Observable load state for the UI / status endpoint; reported by status()
	# and written by get_llm().
	# ACTIVE_BACKEND: "gpu" or "cpu" once a load has succeeded, None before that.
	ACTIVE_BACKEND = None
	# MODEL_STATE: one of "idle" (not loaded yet), "loading", "ready", "error".
	MODEL_STATE = "idle"
	# MODEL_ERROR: "<ExceptionType>: <message>" of the failed CPU load, else None.
	MODEL_ERROR = None


	class ModelUnavailable(RuntimeError):
	    """Signal that the local model could not be loaded.

	    Typical causes are a bad or missing wheel, a failed download, or running
	    out of memory. Callers catch this and stream a friendly message instead.
	    """


	def status() -> dict:
	    """Snapshot the model load state for the /status endpoint and UI indicator.

	    Returns a dict with the keys "state", "backend" and "error", read from the
	    module-level MODEL_STATE, ACTIVE_BACKEND and MODEL_ERROR.
	    """
	    return dict(state=MODEL_STATE, backend=ACTIVE_BACKEND, error=MODEL_ERROR)


	def _load_llm(n_gpu_layers: int):
	    """Instantiate the text model with ``n_gpu_layers`` layers offloaded.

	    ``n_gpu_layers`` is forwarded to llama.cpp unchanged; 0 means CPU only.
	    Any exception raised by the import or the load propagates to the caller.
	    """
	    import cuda_bootstrap

	    # Must run before llama_cpp is imported so the CUDA DLL dirs are registered.
	    cuda_bootstrap.ensure()
	    from llama_cpp import Llama

	    cpu_count = os.cpu_count() or 2
	    if n_gpu_layers == 0:
	        # Pure CPU inference: give the model every core.
	        thread_count = cpu_count
	    else:
	        # With GPU offload the text model needs few CPU threads; a small pool
	        # leaves cores free for the CPU-bound vision model that runs on
	        # food-photo uploads (otherwise the two oversubscribe the CPU).
	        thread_count = max(2, cpu_count // 2)

	    load_options = dict(
	        repo_id=MODEL_REPO,
	        filename=MODEL_FILE,
	        n_ctx=8192,
	        n_threads=thread_count,
	        n_gpu_layers=n_gpu_layers,
	        verbose=False,
	    )
	    return Llama.from_pretrained(**load_options)


	def get_llm():
	    """Return the shared text model, loading it on first use.

	    Tries GPU offload first; if the GPU build is missing or fails (bad wheel,
	    no VRAM, driver mismatch), falls back to CPU so the app still runs.
	    Honors GPU_LAYERS == 0 (LIFEOS_GPU_LAYERS=0) to skip the GPU attempt.

	    Updates MODEL_STATE / MODEL_ERROR / ACTIVE_BACKEND so the UI can show
	    loading/ready/error.

	    Raises:
	        ModelUnavailable: when the CPU load also fails, so callers can stream
	            a friendly message instead of a raw 500.

	    Only ``Exception`` subclasses are treated as load failures.
	    KeyboardInterrupt and SystemExit propagate untouched, so Ctrl+C during a
	    multi-GB load aborts instead of silently starting a second (CPU) load or
	    being reported as a model error.
	    """
	    global _llm, ACTIVE_BACKEND, MODEL_STATE, MODEL_ERROR
	    if _llm is not None:
	        return _llm

	    MODEL_STATE = "loading"
	    if GPU_LAYERS != 0:
	        try:
	            _llm = _load_llm(GPU_LAYERS)
	        except Exception as e:  # noqa: BLE001 — any load failure triggers the CPU fallback
	            logger.warning("GPU load failed (%s: %s); falling back to CPU", type(e).__name__, e)
	            _llm = None
	        else:
	            ACTIVE_BACKEND = "gpu"
	            MODEL_STATE, MODEL_ERROR = "ready", None
	            logger.info("model loaded on GPU (n_gpu_layers=%s)", GPU_LAYERS)
	            return _llm

	    try:
	        _llm = _load_llm(0)
	    except Exception as e:  # noqa: BLE001 — download/format/runtime failure
	        MODEL_STATE, MODEL_ERROR = "error", f"{type(e).__name__}: {e}"
	        logger.error("model load failed on CPU: %s", MODEL_ERROR)
	        raise ModelUnavailable(MODEL_ERROR) from e
	    ACTIVE_BACKEND = "cpu"
	    MODEL_STATE, MODEL_ERROR = "ready", None
	    logger.info("model loaded on CPU")
	    return _llm


	def _load_vlm(n_gpu_layers: int):
	    """Instantiate the vision-language model with ``n_gpu_layers`` offloaded.

	    Builds the Qwen2.5-VL chat handler first, then the model that uses it.
	    Any exception raised by the imports or the loads propagates to the caller.
	    """
	    import cuda_bootstrap

	    # Must run before llama_cpp is imported so the CUDA DLL dirs are registered.
	    cuda_bootstrap.ensure()
	    from llama_cpp import Llama
	    from llama_cpp.llama_chat_format import Qwen25VLChatHandler

	    # The chat handler downloads + owns the multimodal projector (mmproj) that
	    # turns the image into tokens the model can attend to.
	    projector = Qwen25VLChatHandler.from_pretrained(
	        repo_id=VLM_REPO,
	        filename=VLM_MMPROJ_FILE,
	        verbose=False,
	    )
	    load_options = dict(
	        repo_id=VLM_REPO,
	        filename=VLM_FILE,
	        chat_handler=projector,
	        n_ctx=4096,
	        n_threads=os.cpu_count() or 2,
	        n_gpu_layers=n_gpu_layers,
	        verbose=False,
	    )
	    return Llama.from_pretrained(**load_options)


	# Number of vision-model layers to offload to the GPU. get_vlm() skips the
	# GPU attempt entirely when this is 0 and loads on CPU instead.
	VLM_GPU_LAYERS = config.VLM_GPU_LAYERS


	def get_vlm():
	    """Return the shared vision-language model, loading it on first use.

	    Used only for food photos and loaded on the first one, so the text-only
	    path never pays for it. When VLM_GPU_LAYERS is 0 the model is loaded on
	    CPU directly (so it doesn't fight the resident text model for VRAM on
	    small cards); if a GPU attempt is configured but fails, falls back to CPU.

	    Raises:
	        ModelUnavailable: when the CPU load also fails.

	    Only ``Exception`` subclasses are treated as load failures.
	    KeyboardInterrupt and SystemExit propagate untouched instead of triggering
	    a second load or being reported as a model error.
	    """
	    global _vlm
	    if _vlm is not None:
	        return _vlm

	    if VLM_GPU_LAYERS != 0:
	        try:
	            _vlm = _load_vlm(VLM_GPU_LAYERS)
	        except Exception as e:  # noqa: BLE001 — any load failure triggers the CPU fallback
	            logger.warning("VLM GPU load failed (%s: %s); falling back to CPU", type(e).__name__, e)
	            _vlm = None
	        else:
	            logger.info("VLM loaded on GPU (n_gpu_layers=%s)", VLM_GPU_LAYERS)
	            return _vlm

	    try:
	        _vlm = _load_vlm(0)
	    except Exception as e:  # noqa: BLE001 — download/format/runtime failure
	        raise ModelUnavailable(f"vision model unavailable: {type(e).__name__}: {e}") from e
	    logger.info("VLM loaded on CPU")
	    return _vlm


	# Text instruction sent alongside the image in describe_food_image(). It asks
	# for '- ' bullets only; _dedupe_food_items() then normalizes the reply.
	_FOOD_VISION_PROMPT = (
	"You are a food-recognition assistant. Look at this photo and list the food "
	"and drink items you can see. Break composed dishes into their visible "
	"components — e.g. a pizza becomes its toppings (crust, tomato sauce, "
	"mozzarella, basil); a plate of toast with egg becomes each item. If it is "
	"a grocery receipt or a label, read the product names instead. Respond with "
	"ONLY a bulleted list — one item per line starting with '- ', using plain "
	"common names (e.g. '- fried egg', '- whole-grain toast', '- cherry "
	"tomatoes'). Add a rough quantity when obvious. Aim for 3-8 items. Ignore "
	"plates, bowls, cutlery, and packaging. Do not add commentary, nutrition "
	"facts, or headings."
	)

	# Longest-side cap for the image fed to the VLM. On this CPU path a full-res
	# photo decodes ~1000 image tokens (~36s); 768px cuts that ~4x to a few seconds
	# with no loss in food-recognition quality.
	# The value comes from config; _image_data_uri() only ever downscales to it.
	VLM_MAX_IMAGE_SIDE = config.VLM_MAX_IMAGE_SIDE


	def _image_data_uri(path: str) -> str:
	"""Downscale the photo to VLM_MAX_IMAGE_SIDE and return a JPEG data URI.
	Falls back to the raw bytes if Pillow can't open it."""
	import base64
	import io

	try:
	from PIL import Image

	im = Image.open(path)
	if im.mode not in ("RGB", "L"):
	im = im.convert("RGB")
	w, h = im.size
	scale = VLM_MAX_IMAGE_SIDE / max(w, h)
	if scale < 1:
	im = im.resize((max(1, int(w * scale)), max(1, int(h * scale))))
	buf = io.BytesIO()
	im.convert("RGB").save(buf, format="JPEG", quality=88)
	data = buf.getvalue()
	mime = "jpeg"
	except Exception: # unreadable by Pillow — send original bytes
	with open(path, "rb") as f:
	data = f.read()
	ext = os.path.splitext(path)[1].lstrip(".").lower() or "jpeg"
	mime = "jpeg" if ext in ("jpg", "jpeg") else ext
	return f"data:image/{mime};base64," + base64.b64encode(data).decode("ascii")


	def _dedupe_food_items(text: str) -> str:
	"""Keep unique '- item' bullet lines (the small VLM sometimes repeats), in
	order, capped to 8 — so the identified-items list stays tight."""
	seen, items = set(), []
	for line in text.splitlines():
	line = line.strip().lstrip("-*•").strip()
	if not line:
	continue
	key = line.lower()
	if key in seen:
	continue
	seen.add(key)
	items.append(f"- {line}")
	if len(items) >= 8:
	break
	return "\n".join(items)


	def describe_food_image(path: str) -> str:
	    """Run the vision model on a photo and return the food items it sees.

	    The result is a short, de-duplicated bulleted list (product names when
	    the photo is a receipt or label). This is the perception step only; the
	    memory-grounded analysis happens separately in run_domain("meal_photo", …).
	    """
	    # Encode the image before touching the model so a bad path fails early.
	    image_part = {"type": "image_url", "image_url": {"url": _image_data_uri(path)}}
	    prompt_part = {"type": "text", "text": _FOOD_VISION_PROMPT}
	    conversation = [{"role": "user", "content": [image_part, prompt_part]}]

	    model = get_vlm()
	    with _vlm_lock:
	        reply = model.create_chat_completion(
	            messages=conversation,
	            max_tokens=160,
	            temperature=0.2,
	        )
	    # content may be None; treat that as an empty reply.
	    content = reply["choices"][0]["message"]["content"] or ""
	    return _dedupe_food_items(strip_think(content).strip())


	def warmup() -> None:
	    """Pre-load the text model and embedder so the first request is not cold.

	    The vision model is not loaded here; it is loaded lazily on the first
	    food photo. A text-model load failure is deliberately ignored: get_llm()
	    has already recorded it in MODEL_STATE, and requests surface a friendly
	    message, so the web server still comes up and serves the UI.
	    """
	    try:
	        get_llm()
	    except ModelUnavailable:
	        # State is already "error"; the UI will show it.
	        pass

	    # Load the embedder now (before any food-photo VLM load).
	    try:
	        rag.warmup()
	    except Exception as exc:  # embedder optional — recall just returns []
	        logger.warning("embedder warmup failed: %s", exc)

	    # Seed demo notes only when running in demo mode.
	    if not config.DEMO:
	        return
	    rag.ensure_seeded()


	# This Nemotron GGUF always "thinks out loud" in plain prose and ignores
	# /no_think and "detailed thinking off". Rather than fight it, we let it reason,
	# ask it to keep reasoning short and mark the answer with a delimiter, and strip
	# everything before the answer server-side (see ANSWER_DELIM / _clean_response).
	# The stripper is anchor-based, so it stays clean even when the model forgets
	# the delimiter under a long prompt.
	ANSWER_DELIM = "==ANSWER=="

	# Base system prompt shared by every domain. build_prompt() fills the {name}
	# and {pos} placeholders via str.format, so any literal brace added here
	# would have to be doubled.
	SYSTEM_BASE = (
	"You are LifeOS, a sharp, friendly personal assistant running 100% locally "
	"on {pos} own machine.\n"
	"Think briefly first if you must, then write a line containing exactly "
	+ ANSWER_DELIM + " followed by the final answer for {name}. Keep any "
	"reasoning short; the user only sees what comes after " + ANSWER_DELIM + ".\n"
	"The final answer is concise and concrete: lead with bold key items and "
	"short bullet lists, ground every claim in the provided memory (quote "
	"specific dishes, dates, prices, habits), and never invent data not in the "
	"context."
	)

	# Per-domain task instructions, keyed by the domain name passed to
	# build_prompt(). The matching entry is appended to SYSTEM_BASE after its
	# {name} / {pos} placeholders are filled via str.format (so any literal brace
	# added to a template would have to be doubled). When the request text is
	# empty, build_prompt() also uses the raw template as the RAG recall query.
	DOMAIN_INSTRUCTIONS = {
	"food": (
	"Task: recommend exactly 3 recipes for this week. For each, give the "
	"recipe name, which flyer deals it uses (with prices), estimated cost, "
	"and a one-line 'why' that references both the deals and what {name} "
	"cooked recently (favor variety — avoid repeating recent main "
	"ingredients). Respect dietary preferences strictly."
	),
	"health": (
	"Task: recommend tomorrow's exercise. Consider the recent workout "
	"pattern, muscle-group rotation, rest balance, and the fitness goal. "
	"Give one clear recommendation (type + duration), then 2-3 bullet "
	"points of reasoning referencing specific recent workouts and any "
	"known injury constraints."
	),
	"money": (
	"Task: review the detected recurring subscriptions against income and "
	"budget. Classify each as CANCEL, KEEP, or WATCH with a one-line "
	"plain-language reason (reference cost, last-used date, and overlap "
	"with other services). End with the total monthly savings if all "
	"CANCEL items are dropped and what that money could fund."
	),
	"goal": (
	"Task: act as a Socratic financial-goal coach for {name}. Ask exactly "
	"ONE probing question at a time — why this goal matters, what tradeoffs "
	"they'd accept, whether the timeline is realistic given income and "
	"monthly payments, what spending they would cut. Keep each turn short. "
	"After roughly 3-4 exchanges (use the conversation history to judge), "
	"stop questioning and summarize a concrete savings plan: monthly amount "
	"to set aside, what to cut, and the realistic completion date, checked "
	"against {pos} income and monthly payments."
	),
	"meal_photo": (
	"Task: a vision model has identified the food items in a photo of "
	"{pos} meal (or read a grocery receipt). Using that item list, write "
	"a short, well-structured markdown response with EXACTLY these three "
	"sections:\n"
	"Identified — a tight bullet list of the items, each in bold.\n"
	"How it fits — 2-3 bullets on how these choices line up with "
	"{pos} dietary preferences and fitness goal, calling out specific "
	"items and a rough protein read.\n"
	"Buy next — 2-3 suggested items that better fit their goals and "
	"budget, each with a one-line reason.\n"
	"Keep it concise. Use bullets and bold; do not invent items that were "
	"not identified."
	),
	"payment_impact": (
	"Task: {name} just updated their monthly payments. Explain how their "
	"total monthly payments affect reaching their savings goal(s). Compute "
	"money left to save = monthly income − total monthly payments, then for "
	"each goal estimate how many months the remaining amount (target − "
	"saved) will take at that rate and whether the deadline is realistic.\n"
	"Format the answer EXACTLY like this, with real line breaks:\n"
	"<one-line headline with the key number>\n"
	"- <goal name>: <remaining $>, <months> at <$/mo>, deadline <date> — on "
	"track / behind\n"
	"Use one bullet per goal, each on its OWN line. Be concrete with dollar "
	"figures. If there are no goals, reply with one short line instead."
	),
	"chat": (
	"Task: answer the question using everything you know about {name} "
	"across food, fitness, and finances. Cross-reference domains when "
	"useful. If asked to plan, produce a compact, actionable plan."
	),
	}


	def _slice_for_domain(domain: str, mem: dict) -> dict:
	profile = mem["user_profile"]
	finances = mem.get("finances", {})
	if domain == "food":
	return {"user_profile": profile, "recent_meals": memory_store.recent_meals(7, mem)}
	if domain == "meal_photo":
	return {"user_profile": profile, "recent_meals": memory_store.recent_meals(7, mem)}
	if domain == "health":
	return {
	"user_profile": profile,
	"workouts_last_14_days": memory_store.workouts_in_window(14, mem),
	"calendar_next_7_days": memory_store.events_in_window(7, mem),
	"workout_schedule": mem.get("workout_schedule", {}),
	}
	if domain in ("money", "goal", "payment_impact"):
	return {
	"user_profile": profile,
	"finances": finances,
	"monthly_payments": finances.get("monthly_payments", []),
	"goals": mem.get("goals", []),
	}
	return { # chat sees everything
	"user_profile": profile,
	"recent_meals": memory_store.recent_meals(7, mem),
	"workouts_last_14_days": memory_store.workouts_in_window(14, mem),
	"calendar_next_7_days": memory_store.events_in_window(7, mem),
	"workout_schedule": mem.get("workout_schedule", {}),
	"finances": finances,
	"goals": mem.get("goals", []),
	}


	def slice_for_domains(mem: dict, domains: list[str]) -> dict:
	    """Merge the memory slices of several domains into one dict.

	    The UI name "kitchen" is treated as "food". The user profile is always
	    present, even when ``domains`` is empty; later domains overwrite keys
	    contributed by earlier ones.
	    """
	    aliases = {"kitchen": "food"}
	    combined = {"user_profile": mem["user_profile"]}
	    for requested in domains:
	        canonical = aliases.get(requested, requested)
	        combined |= _slice_for_domain(canonical, mem)
	    return combined


	def _fmt(obj, indent=0) -> str:
	pad = " " * indent
	if isinstance(obj, dict):
	return "\n".join(f"{pad}{k}: {_fmt(v, indent + 1).lstrip() if not isinstance(v, (dict, list)) else chr(10) + _fmt(v, indent + 1)}" for k, v in obj.items())
	if isinstance(obj, list):
	return "\n".join(f"{pad}- {_fmt(x, indent + 1).lstrip()}" if not isinstance(x, (dict, list)) else f"{pad}-\n{_fmt(x, indent + 1)}" for x in obj)
	return f"{pad}{obj}"


	def _names(profile: dict) -> tuple[str, str, str]:
	"""(address, possessive, header) for prompts. Falls back gracefully when a
	new user hasn't set their name yet, so prompts never read "'s machine"."""
	name = (profile.get("name") or profile.get("first_name") or "").strip()
	if name:
	return name, f"{name}'s", f"{name.upper()}'S MEMORY"
	return "you", "your", "YOUR MEMORY"


	def build_prompt(domain: str, mem: dict, user_input: str, domains: list[str] | None = None) -> list[dict]:
	    """Assemble the [system, user] chat messages for one request.

	    The system message is SYSTEM_BASE plus the domain's task instructions (if
	    the domain has any). The user message is the short-term memory slice,
	    any long-term notes recalled via RAG, the request text, and a closing
	    nudge to answer immediately.

	    Args:
	        domain: task domain; domains without an entry in DOMAIN_INSTRUCTIONS
	            simply get no task section.
	        mem: loaded memory dict; must contain "user_profile".
	        user_input: the user's request; may be empty, whitespace or None, in
	            which case the task instructions stand in for it.
	        domains: narrows the memory slice to only the referenced domains
	            (chat refs); None or empty keeps the default slice for ``domain``.

	    Returns:
	        Two message dicts with roles "system" and "user".
	    """
	    name, pos, header = _names(mem["user_profile"])
	    # Normalize once so the recall query and the REQUEST section agree on
	    # what counts as "empty" (None and whitespace-only included).
	    request = (user_input or "").strip()
	    # .get(): an unknown domain must not raise here when the system-prompt
	    # step below already tolerates it.
	    template = DOMAIN_INSTRUCTIONS.get(domain)

	    recall_query = request or template or ""
	    notes = rag.recall(f"{domain}: {recall_query}", k=5)

	    system = SYSTEM_BASE.format(name=name, pos=pos)
	    if template is not None:
	        system += "\n\n" + template.format(name=name, pos=pos)

	    mem_slice = slice_for_domains(mem, domains) if domains else _slice_for_domain(domain, mem)
	    parts = [f"=== {header} ===", _fmt(mem_slice)]
	    if notes:
	        parts.append("\n=== LONG-TERM NOTES (recalled) ===")
	        parts.extend(f"- {n['text']}" for n in notes)
	    parts.append("\n=== REQUEST ===")
	    parts.append(request or "(Use the task instructions above.)")
	    # Recency nudge: a final instruction at the very end of the user turn is the
	    # most reliable way to stop this reasoning-happy GGUF from burning the token
	    # budget thinking out loud. It jumps almost straight to the delimiter, which
	    # _clean_response strips — giving fast, clean answers.
	    parts.append(
	        "\n\nIMPORTANT: Do NOT think step by step or explain your reasoning. "
	        "Immediately write " + ANSWER_DELIM + " then the final answer."
	    )

	    return [
	        {"role": "system", "content": system},
	        {"role": "user", "content": "\n".join(parts)},
	    ]


	_THINK_RE = re.compile(r"<think>.*?(?:</think>\|$)", re.DOTALL)

	# A line that begins a markdown block — the real answer almost always starts
	# with one of these across every domain (bold lead, header, bullet, number,
	# table row, blockquote).
	_MD_ANCHOR = re.compile(r"^(?:\\\|#{1,6}\s\|[-*+]\s\|\d+[.)]\s\|\\|\|>\s?)")

	# Plain-prose lines that are the model thinking out loud, not answer content.
	# This GGUF reasons in first-person prose ("We need to…", "Let's compute…",
	# "Now classify…", "Let's produce:") before writing the markdown answer.
	_REASONING = re.compile(
	r"(?i)\b(?:we (?:need\|should\|must\|can\|have to\|could\|want\|'?ll)\|let'?s\b\|so we\b\|"
	r"the user (?:wants\|needs\|asks\|is)\|plain text\|private reasoning\|"
	r"is (?:discarded\|hidden)\|then (?:markdown\|final\|the answer\|answer)\|"
	r"first[,:]? \|probably\b\|i think\b\|okay[,:]\|now (?:let\|we\|i\|classify\|compute)\|"
	r"let'?s (?:produce\|craft\|compute\|do\|output)\|markdown:\|answer:?$\|maybe\b\|actually\b)"
	)


	# Trailing afterthoughts the model sometimes tacks on AFTER the answer
	# ("But months 0.3 seems weird.", "Wait, let me recheck."). Trimmed from the end.
	_TRAILING_META = re.compile(
	r"(?i)^(?:but\|wait\|hmm+\|note\|actually\|hold on\|let me\|i should\|that\|this\|"
	r"however)\b.*\b(?:seem\|weird\|odd\|wrong\|off\|recalc\|double\|check\|sure\|"
	r"strange\|recompute\|verify)\b\|^(?:wait\|hmm+)\b"
	)


	def strip_think(text: str) -> str:
	    """Drop every <think>…</think> block, including one left unclosed mid-stream.

	    Leading whitespace left behind by the removal is stripped as well.
	    """
	    without_blocks = _THINK_RE.sub("", text)
	    return without_blocks.lstrip()


	def _is_reasoning_line(line: str) -> bool:
	    """Tell whether ``line`` contains one of the _REASONING think-aloud phrases."""
	    return _REASONING.search(line) is not None


	def _strip_to_last_delimiter(text: str) -> str:
	    """Cut the raw model output down to the answer using its own markers.

	    Steps, in order:
	    1. remove well-formed (or unclosed) <think>… blocks;
	    2. keep only what follows the LAST ANSWER_DELIM, which reliably marks
	       where the answer starts;
	    3. resolve a bare </think> (no opening tag). It usually ends reasoning
	       that PRECEDES the answer, but the model sometimes emits it AFTER the
	       answer. If at least 8 non-blank characters follow it, that text is the
	       answer; otherwise the answer is what came before it.

	    The result is stripped of surrounding whitespace.
	    """
	    remainder = _THINK_RE.sub("", text)
	    # rpartition returns the whole string as the last element when the
	    # delimiter is absent, so no membership check is needed.
	    remainder = remainder.rpartition(ANSWER_DELIM)[2]

	    closing_tag = "</think>"
	    if closing_tag in remainder:
	        head, _sep, tail = remainder.rpartition(closing_tag)
	        if len(tail.strip()) >= 8:
	            remainder = tail
	        else:
	            remainder = head
	    return remainder.strip()


	def _trim_trailing_meta(text: str) -> str:
	    """Remove blank and afterthought lines from the end of an answer.

	    Walks backwards from the last line and stops at the first line that is
	    neither empty nor matched by _TRAILING_META; the kept text is stripped.
	    """
	    rows = text.split("\n")
	    keep = len(rows)
	    while keep:
	        candidate = rows[keep - 1].strip()
	        if candidate and not _TRAILING_META.search(candidate):
	            break
	        keep -= 1
	    return "\n".join(rows[:keep]).strip()


	def _clean_response(text: str) -> str:
	    """Reduce raw model output to the user-facing answer.

	    The model reasons in plain prose and then writes a markdown answer, so:
	    1. <think> blocks and everything before ANSWER_DELIM are dropped;
	    2. if a markdown line exists and reasoning precedes it, the result starts
	       at that first markdown line;
	    3. if there is no markdown line and any line reads as reasoning, "" is
	       returned so the UI keeps its thinking state instead of showing it.
	    Text with no reasoning markers passes through, minus trailing
	    afterthoughts (see _trim_trailing_meta).
	    """
	    body = _strip_to_last_delimiter(text)
	    rows = body.strip().split("\n")
	    if not any(row.strip() for row in rows):
	        return ""

	    # Index of the first line that opens a markdown block, if any.
	    first_md = None
	    for idx, row in enumerate(rows):
	        if _MD_ANCHOR.match(row.strip()):
	            first_md = idx
	            break

	    if first_md is None:
	        # No markdown block yet. If it's pure reasoning, hide it (streaming);
	        # the end-of-stream fallback will recover the answer if one exists.
	        if any(_is_reasoning_line(row) for row in rows if row.strip()):
	            return ""
	        return _trim_trailing_meta(body.strip())

	    # Jump to the answer when reasoning precedes the first markdown block.
	    preamble = [row for row in rows[:first_md] if row.strip()]
	    if any(_is_reasoning_line(row) for row in preamble):
	        return _trim_trailing_meta("\n".join(rows[first_md:]).strip())
	    return _trim_trailing_meta(body.strip())


	def _final_answer(text: str) -> str:
	    """Best-effort answer for the end of the stream.

	    Used when the model never wrote a markdown block or delimiter (e.g. a
	    plain one-line coaching question). Tries _clean_response first; failing
	    that, drops leading blank/reasoning lines and trailing afterthoughts. If
	    that leaves nothing, the delimiter-stripped text is returned as-is.
	    """
	    cleaned = _clean_response(text)
	    if cleaned:
	        return cleaned

	    body = _strip_to_last_delimiter(text)
	    rows = body.strip().split("\n")
	    # Advance past the leading run of blank or think-aloud lines.
	    start = 0
	    while start < len(rows) and (not rows[start].strip() or _is_reasoning_line(rows[start])):
	        start += 1
	    remainder = "\n".join(rows[start:]).strip()
	    return _trim_trailing_meta(remainder) or body.strip()


	# User-facing text that generate_stream() yields in place of an answer when
	# the model cannot be loaded or generation fails before producing output.
	_MODEL_ERROR_MSG = (
	"⚠️ The local model couldn't start on this machine. Check that "
	"llama-cpp-python is installed for your hardware and that there's enough "
	"memory, then restart LifeOS. (Details are in the server log.)"
	)


	def generate_stream(
	    messages: list[dict],
	    max_tokens: int = 1024,
	    temperature: float = 0.4,
	    domain: str = "chat",
	    extra_context: str = "",
	) -> Iterator[str]:
	    """Yield cumulative user-facing response text.

	    The model reasons out loud and marks the answer with ANSWER_DELIM. We hide
	    everything until the answer starts, then stream the cleaned answer (see
	    _clean_response); each yielded value is the full answer so far, not a
	    delta. If the model never emits a recognizable answer, we fall back to a
	    best-effort clean at the end of the stream.

	    Args:
	        messages: chat messages; the caller's list is not mutated.
	        max_tokens: generation budget passed to the model.
	        temperature: sampling temperature passed to the model.
	        domain: used only to label log lines.
	        extra_context: e.g. web search results; appended to the last user
	            message when non-empty.

	    Failure handling: nothing is raised for model problems. If the model
	    can't be loaded, or generation fails before any answer text was shown,
	    a single friendly message is yielded — the UI shows it inline. If it
	    fails after some answer text was shown, the stream simply ends there.
	    """
	    if extra_context:
	        messages = list(messages)  # shallow copy: never edit the caller's list
	        for i in range(len(messages) - 1, -1, -1):
	            if messages[i].get("role") == "user":
	                messages[i] = {
	                    "role": "user",
	                    "content": messages[i]["content"] + "\n\n=== WEB CONTEXT ===\n" + extra_context,
	                }
	                break

	    try:
	        llm = get_llm()
	    except ModelUnavailable:
	        yield _MODEL_ERROR_MSG
	        return

	    acc = ""
	    last = ""
	    try:
	        # One generation at a time; the lock is held for the whole stream.
	        with _llm_lock:
	            for chunk in llm.create_chat_completion(
	                messages=messages,
	                max_tokens=max_tokens,
	                temperature=temperature,
	                stream=True,
	            ):
	                delta = chunk["choices"][0].get("delta", {})
	                acc += delta.get("content") or ""
	                # _clean_response returns "" while the model is still reasoning,
	                # so the UI keeps its "thinking…" state until the answer starts.
	                cleaned = _clean_response(acc)
	                if cleaned and cleaned != last:
	                    last = cleaned
	                    yield cleaned
	    except Exception as e:  # inference-time failure (e.g. OOM mid-generation)
	        logger.error("generation failed (%s): %s", domain, e, exc_info=True)
	        # Nothing has reached the user yet (even if hidden reasoning tokens
	        # had accumulated), so say so rather than ending with a blank reply.
	        if not last:
	            yield _MODEL_ERROR_MSG
	        return

	    # If nothing surfaced (model never wrote a markdown answer/delimiter), fall
	    # back to a best-effort strip so the reply is never blank.
	    if not last and acc:
	        fallback = _final_answer(acc)
	        if fallback:
	            yield fallback


	def run_domain(domain: str, user_input: str = "", max_tokens: int = 1024) -> Iterator[str]:
	    """Convenience wrapper: load memory, build the prompt, stream the answer.

	    Yields the same cumulative answer text as generate_stream().
	    """
	    memory = memory_store.load()
	    prompt = build_prompt(domain, memory, user_input)
	    answer_stream = generate_stream(prompt, max_tokens=max_tokens, domain=domain)
	    yield from answer_stream