Spaces:

AsadIsmail
/

lore-demo

Sleeping

App Files Files Community

lore-demo / lore /evolve /daemon.py

AsadIsmail

Update Lore demo Space

5cb0758 verified about 2 months ago

raw

history blame contribute delete

6.31 kB

	"""
	Daemon process: keeps the LoRA model in memory for instant suggestions.

	Run once in a terminal:
	uv run lore-train serve

	Then all suggestion/training calls go through HTTP (~100ms) instead of
	loading the model from scratch (~17s) each time.

	Endpoints:
	GET /suggest?n=3 — generate follow-up questions
	POST /train/curiosity — trigger curiosity training
	GET /status — model + training stats
	GET /health — liveness check
	"""

	from __future__ import annotations

	import threading
	from pathlib import Path

	import torch
	from fastapi import FastAPI
	from pydantic import BaseModel

	from lore.config import LORA_CHECKPOINTS_DIR, LORA_BASE_MODEL_ID, HF_CACHE_DIR, get_device_map, get_torch_dtype

	# ASGI application exposing the daemon's HTTP endpoints.
	app = FastAPI(title="Lore Daemon", version="0.1.0")

	# Process-wide model cache. `_model` and `_tokenizer` are filled in by
	# `_load_model`; `_lock` is taken by `_ensure_model` and
	# `_reload_if_new_checkpoint` around every (re)load.
	_model = None
	_tokenizer = None
	_lock = threading.Lock()
	# Checkpoint directory recorded by the most recent `_load_model` call;
	# `/status` reports "base" while this is None.
	_current_checkpoint: Path | None = None


	def _get_latest_checkpoint() -> Path \| None:
	if not LORA_CHECKPOINTS_DIR.exists():
	return None
	checkpoints = sorted(LORA_CHECKPOINTS_DIR.glob("step-*"))
	return checkpoints[-1] if checkpoints else None


	def _load_model(checkpoint: Path | None = None):
	    """Load the base model and tokenizer, optionally with a LoRA adapter.

	    On success the module-level ``_model``, ``_tokenizer`` and
	    ``_current_checkpoint`` are replaced. Both call sites in this module
	    invoke this function while holding ``_lock``.

	    Args:
	        checkpoint: Adapter directory to apply on top of the base model. When
	            it is ``None`` or does not exist on disk, the plain base model is
	            used and ``_current_checkpoint`` is recorded as ``None``.
	    """
	    global _model, _tokenizer, _current_checkpoint
	    from transformers import AutoModelForCausalLM, AutoTokenizer
	    from peft import PeftModel

	    print(f"[daemon] Loading model: {LORA_BASE_MODEL_ID}")
	    tokenizer = AutoTokenizer.from_pretrained(
	        LORA_BASE_MODEL_ID,
	        cache_dir=str(HF_CACHE_DIR),
	        trust_remote_code=True,
	    )
	    if tokenizer.pad_token is None:
	        # No dedicated pad token: reuse EOS as the pad token.
	        tokenizer.pad_token = tokenizer.eos_token

	    base = AutoModelForCausalLM.from_pretrained(
	        LORA_BASE_MODEL_ID,
	        cache_dir=str(HF_CACHE_DIR),
	        torch_dtype=get_torch_dtype(),
	        device_map=get_device_map(),
	        trust_remote_code=True,
	    )

	    # Record only a checkpoint that is actually applied, so status reporting
	    # never names an adapter that was skipped because its path was missing.
	    loaded_checkpoint = checkpoint if checkpoint and checkpoint.exists() else None
	    if loaded_checkpoint is not None:
	        print(f"[daemon] Loading LoRA checkpoint: {loaded_checkpoint.name}")
	        model = PeftModel.from_pretrained(base, str(loaded_checkpoint))
	    else:
	        model = base

	    model.eval()
	    _model = model
	    _tokenizer = tokenizer
	    _current_checkpoint = loaded_checkpoint
	    print(f"[daemon] Model ready (checkpoint: {loaded_checkpoint.name if loaded_checkpoint else 'base'})")


	def _ensure_model():
	    """Return the cached ``(model, tokenizer)`` pair, loading it on first use.

	    When no model is cached yet, ``_load_model`` is called under ``_lock``
	    with whatever ``_get_latest_checkpoint()`` returns.
	    """
	    with _lock:
	        needs_load = _model is None
	        if needs_load:
	            _load_model(_get_latest_checkpoint())
	    cached = (_model, _tokenizer)
	    return cached


	def _reload_if_new_checkpoint():
	    """Check for new checkpoint and reload if needed.

	    The comparison against ``_current_checkpoint`` is repeated while holding
	    ``_lock``, so callers that notice the same new checkpoint at the same time
	    (the ``/suggest`` handler and the training thread both call this) do not
	    each reload the model.
	    """
	    latest = _get_latest_checkpoint()
	    if not latest or latest == _current_checkpoint:
	        # Fast path: nothing newer on disk, so the lock is not needed.
	        return
	    with _lock:
	        if latest == _current_checkpoint:
	            # Another caller loaded this checkpoint while we waited for the lock.
	            return
	        print(f"[daemon] New checkpoint detected: {latest.name}")
	        _load_model(latest)

	@app.on_event("startup")
	async def startup():
	    """Load the model when the application starts, via ``_ensure_model``."""
	    _ensure_model()


	# ── Endpoints ─────────────────────────────────────────────────────────────────

	@app.get("/health")
	async def health():
	    # Liveness check: "status" is always "ok"; "model_loaded" reports whether
	    # a model is currently cached in this process.
	    model_loaded = _model is not None
	    return {"status": "ok", "model_loaded": model_loaded}


	@app.get("/status")
	async def status():
	    # Reports the base model id, the recorded checkpoint name ("base" when
	    # none is recorded) and the question-trace statistics.
	    from lore.evolve.trajectory import get_question_trace_stats

	    if _current_checkpoint:
	        checkpoint_label = _current_checkpoint.name
	    else:
	        checkpoint_label = "base"

	    payload = {
	        "model": LORA_BASE_MODEL_ID,
	        "checkpoint": checkpoint_label,
	        "question_traces": get_question_trace_stats(),
	    }
	    return payload


	class SuggestResponse(BaseModel):
	    # Response body of GET /suggest. Each entry is a dict built by the
	    # handler from a "question" string merged with its reward fields.
	    suggestions: list[dict]


	def _dedupe_ranked(ranked, limit):
	    """Return the first *limit* entries of *ranked* with distinct questions.

	    Two questions count as duplicates when they match after lower-casing and
	    removing every character that is neither a word character nor whitespace.
	    The input order is preserved.
	    """
	    import re

	    seen = set()
	    unique = []
	    for entry in ranked:
	        key = re.sub(r"[^\w\s]", "", entry["question"].lower()).strip()
	        if key in seen:
	            continue
	        seen.add(key)
	        unique.append(entry)
	        if len(unique) >= limit:
	            break
	    return unique


	@app.get("/suggest", response_model=SuggestResponse)
	async def suggest(n: int = 3):
	    """Generate up to ``n`` follow-up questions ranked by reward.

	    Samples ``2 * n`` candidates from the model, keeps the first line of each,
	    scores it with ``question_reward`` and returns the highest-scoring distinct
	    questions. Returns an empty list when ``n`` is not positive or the wiki is
	    empty.
	    """
	    _reload_if_new_checkpoint()

	    from lore.evolve.curiosity import (
	        build_wiki_state_summary, question_reward, CURIOSITY_SYSTEM_PROMPT,
	    )
	    from lore.evolve.trajectory import get_all_past_questions

	    model, tokenizer = _ensure_model()
	    wiki_state = build_wiki_state_summary()
	    past_questions = get_all_past_questions()

	    if n <= 0 or wiki_state == "[Empty wiki]":
	        return SuggestResponse(suggestions=[])

	    prompt = (
	        f"Wiki state:\n{wiki_state}\n\n"
	        f"Recent questions asked:\n"
	        + "\n".join(f"- {q}" for q in past_questions[-10:])
	        + "\n\nGenerate a follow-up question this researcher should explore:"
	    )

	    # The prompt is identical for every sample, so template and tokenise it
	    # once instead of on each loop iteration.
	    messages = [
	        {"role": "system", "content": CURIOSITY_SYSTEM_PROMPT},
	        {"role": "user", "content": prompt},
	    ]
	    text = tokenizer.apply_chat_template(
	        messages, tokenize=False, add_generation_prompt=True,
	        enable_thinking=False,
	    )
	    inputs = tokenizer([text], return_tensors="pt", truncation=True, max_length=1024)
	    inputs = {k: v.to(model.device) for k, v in inputs.items()}
	    prompt_len = inputs["input_ids"].shape[1]

	    suggestions = []
	    # Oversample (2 * n) so that filtering and de-duplication still leave
	    # enough candidates to choose from.
	    for _ in range(n * 2):
	        with torch.no_grad():
	            output = model.generate(
	                **inputs,
	                max_new_tokens=100,
	                do_sample=True,
	                temperature=1.2,
	                top_p=0.95,
	                pad_token_id=tokenizer.eos_token_id,
	            )

	        # Drop the echoed prompt tokens; decode only what was generated.
	        completion = tokenizer.decode(output[0][prompt_len:], skip_special_tokens=True).strip()

	        # Reduce to the first line BEFORE the length filter, so a long
	        # multi-line completion with a tiny first line is rejected too.
	        candidate = completion.split("\n")[0].strip()
	        if len(candidate) < 10:
	            continue

	        # `reward` is unpacked into the entry and must provide the "combined"
	        # score used for ranking below.
	        reward = question_reward(candidate, wiki_state, past_questions)
	        suggestions.append({"question": candidate, **reward})

	    suggestions.sort(key=lambda s: s["combined"], reverse=True)
	    return SuggestResponse(suggestions=_dedupe_ranked(suggestions, n))


	@app.post("/train/curiosity")
	async def train_curiosity():
	    """Trigger curiosity training in a background thread."""
	    def _run():
	        # Refresh the served model, train, then refresh again so that a
	        # checkpoint present after the run gets loaded.
	        _reload_if_new_checkpoint()
	        from lore.evolve.trainer import run_curiosity_training
	        run_curiosity_training()
	        _reload_if_new_checkpoint()

	    # Start the worker and respond immediately; the request does not wait
	    # for training to finish.
	    threading.Thread(target=_run, daemon=True).start()
	    return {"status": "training_started"}