github-actions
Sync from GitHub @ df94a64
913ab6d
"""EvoLLM β€” investor-ready Gradio demo.
Same file runs on:
β€’ HuggingFace Spaces (auto-deployed from GitHub)
β€’ Locally: `python space/app.py` (privacy-first, no cloud)
The Space ships with 5 hand-curated "personality" adapters (Default, Creative,
Concise, Technical, Empathetic). A Thompson-sampling bandit picks one per
query, learning from thumbs feedback. The evolution log shows mutation
events as new adapter variants are tested and promoted.
This file is self-contained: it inlines the genome / bandit / pool logic
so the Space stays independent of the main evollm package. The same logic
lives in evollm/* for the local server build.
"""
from __future__ import annotations

import json
import math
import os
import random
import sys
import time
import uuid
from collections import defaultdict
from dataclasses import dataclass, field
from datetime import datetime, timezone
from pathlib import Path

import gradio as gr
from huggingface_hub import hf_hub_download
from llama_cpp import Llama
from pydantic import BaseModel, Field
# Make the local `knowledge` package importable whether we're running on HF
# Spaces (cwd = /app) or locally (cwd = repo root, script in space/).
sys.path.insert(0, str(Path(__file__).resolve().parent))
from knowledge import ( # noqa: E402
KnowledgePipeline,
generate_training_notebook,
import_adapter as import_adapter_files,
)
# ────────────────────────────────────────────────────────────────────────────
# Genome
# ────────────────────────────────────────────────────────────────────────────
class Genome(BaseModel):
    """Configuration record ("genome") for one adapter variant.

    Groups identity/lineage, base-model and LoRA settings, memory-token
    settings, sampling parameters, the system prompt and fitness scores in a
    single pydantic model. Every field has a default, so ``Genome()`` yields
    the baseline configuration.

    In this file, ``chat()`` reads ``system_prompt``, ``memory_token_enabled``
    and the sampling fields (``temperature``, ``top_p``, ``top_k``,
    ``repeat_penalty``, ``max_tokens``); ``eval_bank_score`` seeds the bandit
    prior. The LoRA fields are not read by ``chat()`` here.
    """

    # --- Identity and lineage ---
    genome_id: str = Field(default_factory=lambda: uuid.uuid4().hex[:12])  # random 12-hex-char id by default
    parent_id: str | None = None
    generation: int = 0
    name: str = "Default"

    # --- Base model and LoRA settings ---
    base_model: str = "SmolLM2-360M-Instruct"
    quantization: str = "Q8_0"
    lora_rank: int = 8
    lora_alpha: int = 16
    lora_target_modules: list[str] = Field(default_factory=lambda: ["q_proj", "v_proj"])

    # --- Memory tokens (when enabled, chat() prepends a marker to the system prompt) ---
    memory_token_enabled: bool = False
    memory_token_count: int = 4

    # --- Sampling parameters passed to the chat completion call ---
    temperature: float = 0.7
    top_p: float = 0.9
    top_k: int = 40
    repeat_penalty: float = 1.1
    max_tokens: int = 256

    system_prompt: str = (
        "You are EvoLLM, a privacy-first local assistant. "
        "Give helpful, complete answers. Be accurate and honest about uncertainty."
    )

    # --- Fitness (None means "not set") ---
    fitness_score: float | None = None
    eval_bank_score: float | None = None
    feedback_score: float | None = None

    knowledge_sources: list[str] = Field(default_factory=list)  # doc ids this genome can read; empty = all
# ────────────────────────────────────────────────────────────────────────────
# Adapter pool
# ────────────────────────────────────────────────────────────────────────────
@dataclass
class Adapter:
    """Pool entry pairing a Genome with the metadata shown in the UI."""

    adapter_id: str  # key in POOL_BY_ID and id of the matching bandit arm
    name: str  # display name; chat() matches the forced-adapter choice against it
    description: str
    genome: Genome
    promoted: bool = True  # selects the status icon in render_pool_table()
def build_seed_pool() -> list[Adapter]:
    """Build the five hand-written seed adapters.

    Returns a new list on every call, ordered Default, Creative, Concise,
    Technical, Empathetic. Default is generation 0; the other four are
    generation-1 children of ``evo_default``. Each genome reuses its adapter's
    id and name and carries a hard-coded ``eval_bank_score``.
    """

    def seed(adapter_id: str, name: str, description: str, **genome_fields) -> Adapter:
        # Anything not passed in genome_fields falls back to the Genome defaults.
        genome = Genome(genome_id=adapter_id, name=name, **genome_fields)
        return Adapter(adapter_id, name, description, genome)

    # Lineage shared by every non-default seed.
    first_gen = {"generation": 1, "parent_id": "evo_default"}

    default = seed(
        "evo_default",
        "Default",
        "Balanced baseline genome β€” the neutral start of evolution.",
        system_prompt=(
            "You are EvoLLM, a privacy-first local assistant. "
            "Give helpful, complete answers. Be accurate, balanced, and "
            "honest about what you don't know."
        ),
        temperature=0.7,
        top_p=0.9,
        top_k=40,
        eval_bank_score=0.62,
    )

    creative = seed(
        "evo_creative",
        "Creative",
        "Higher temperature, expressive β€” for ideation and writing.",
        system_prompt=(
            "You are EvoLLM in creative mode. Embrace originality, vivid imagery, "
            "and surprising connections. Write expressively but stay coherent and "
            "on-topic."
        ),
        temperature=1.0,
        top_p=0.95,
        top_k=80,
        lora_rank=16,
        lora_alpha=32,
        eval_bank_score=0.55,
        **first_gen,
    )

    concise = seed(
        "evo_concise",
        "Concise",
        "Terse, fact-first β€” optimised for quick answers.",
        system_prompt=(
            "You are EvoLLM in concise mode. Answer in 1-3 short sentences. "
            "Skip preamble and qualifications. Information density above all."
        ),
        temperature=0.4,
        top_p=0.85,
        top_k=30,
        max_tokens=128,
        eval_bank_score=0.68,
        **first_gen,
    )

    technical = seed(
        "evo_technical",
        "Technical",
        "Precise, structured, code-aware β€” for engineering questions.",
        system_prompt=(
            "You are EvoLLM in technical mode. Use precise terminology and "
            "structured reasoning. Use code blocks when relevant. State any "
            "assumptions explicitly. Complete answers preferred over short ones."
        ),
        temperature=0.5,
        top_p=0.9,
        top_k=40,
        max_tokens=384,
        lora_rank=32,
        lora_alpha=64,
        lora_target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
        eval_bank_score=0.71,
        **first_gen,
    )

    empathetic = seed(
        "evo_empathetic",
        "Empathetic",
        "Warmer, context-sensitive β€” better for personal topics.",
        system_prompt=(
            "You are EvoLLM in empathetic mode. Acknowledge feelings before "
            "facts. Be warm, patient, and supportive while remaining honest "
            "and helpful."
        ),
        temperature=0.75,
        top_p=0.92,
        top_k=50,
        memory_token_enabled=True,
        memory_token_count=8,
        eval_bank_score=0.59,
        **first_gen,
    )

    return [default, creative, concise, technical, empathetic]
# ────────────────────────────────────────────────────────────────────────────
# Thompson-sampling bandit
# ────────────────────────────────────────────────────────────────────────────
@dataclass
class ArmStats:
adapter_id: str
alpha: float = 1.0
beta: float = 1.0
def sample(self) -> float:
x = random.gammavariate(self.alpha, 1.0)
y = random.gammavariate(self.beta, 1.0)
return x / (x + y) if (x + y) > 0 else 0.0
@property
def mean(self) -> float:
return self.alpha / (self.alpha + self.beta)
class Bandit:
def __init__(self) -> None:
self.arms: dict[str, ArmStats] = {}
def register(self, adapter_id: str, prior_fitness: float | None = None) -> None:
if adapter_id in self.arms:
return
if prior_fitness is not None:
weight = 5.0
alpha = 1.0 + prior_fitness * weight
beta = 1.0 + (1.0 - prior_fitness) * weight
else:
alpha, beta = 1.0, 1.0
self.arms[adapter_id] = ArmStats(adapter_id, alpha, beta)
def select(self) -> str:
scored = [(arm.sample(), arm.adapter_id) for arm in self.arms.values()]
scored.sort(reverse=True)
return scored[0][1]
def update(self, adapter_id: str, reward: float) -> None:
if adapter_id not in self.arms:
self.register(adapter_id)
reward = max(0.0, min(1.0, reward))
self.arms[adapter_id].alpha += reward
self.arms[adapter_id].beta += 1.0 - reward
def snapshot(self) -> list[dict]:
out = []
for arm in self.arms.values():
trials = arm.alpha + arm.beta - 2
out.append({
"adapter_id": arm.adapter_id,
"mean": round(arm.mean, 3),
"alpha": round(arm.alpha, 2),
"beta": round(arm.beta, 2),
"trials": int(max(0, trials)),
"confidence": round(1.0 - 1.0 / math.sqrt(arm.alpha + arm.beta), 3),
})
out.sort(key=lambda r: r["mean"], reverse=True)
return out
# ────────────────────────────────────────────────────────────────────────────
# Model loading
# ────────────────────────────────────────────────────────────────────────────
# EVOLLM_SKIP_MODEL lets tests import this module without the (large) model
# download - used by scripts/smoke_test.py to exercise the evolution-replay
# and rendering logic. Never set in production.
if os.environ.get("EVOLLM_SKIP_MODEL") == "1":
    print("EVOLLM_SKIP_MODEL=1 β€” skipping model load (test mode).")
    # chat() calls LLM.create_chat_completion unconditionally, so chatting is
    # not available in this mode.
    LLM = None
else:
    # Fetch the GGUF weights from the Hub and load them at import time.
    print("Downloading SmolLM2-360M-Instruct Q8_0 GGUF...")
    MODEL_PATH = hf_hub_download(
        repo_id="HuggingFaceTB/SmolLM2-360M-Instruct-GGUF",
        filename="smollm2-360m-instruct-q8_0.gguf",
    )
    print(f"Loading model from {MODEL_PATH}...")
    LLM = Llama(
        model_path=MODEL_PATH,
        n_ctx=2048,
        n_threads=os.cpu_count() or 4,  # fall back to 4 when the CPU count is unknown
        n_batch=512,
        verbose=False,
    )
    print("Model ready.")
# Real, pre-measured evolution run (produced by scripts/run_evolution_sweep.py).
# The free CPU Space can't score 41 eval prompts/adapter live (~15 min each),
# so we REPLAY a genuine offline run: real model, real eval bank, real scores.
# Only the timing is replayed β€” the numbers are measured, not simulated.
def _load_evolution_run() -> dict | None:
# EVOLLM_RUN_PATH lets tests point at a fixture; default is the committed run.
override = os.environ.get("EVOLLM_RUN_PATH")
path = Path(override) if override else Path(__file__).resolve().parent / "data" / "evolution_run.json"
if not path.exists():
return None
try:
return json.loads(path.read_text(encoding="utf-8"))
except Exception as e:
print(f"[evolution] failed to load run: {e}")
return None
# Module-level state shared by the chat, feedback, evolution and render functions.
EVOLUTION_RUN = _load_evolution_run()  # None when no recorded run could be loaded
REPLAY_STATE = {"revealed": 0}  # how many recorded generations have been shown
POOL: list[Adapter] = build_seed_pool()
# Override hardcoded seed scores with the real measured ones if we have them.
if EVOLUTION_RUN:
    _real = {s["adapter_id"]: s["eval_bank_score"] for s in EVOLUTION_RUN.get("seeds", [])}
    for a in POOL:
        if a.adapter_id in _real:
            a.genome.eval_bank_score = _real[a.adapter_id]
# Lookup index over POOL; code that changes POOL updates this alongside it.
POOL_BY_ID: dict[str, Adapter] = {a.adapter_id: a for a in POOL}
# Knowledge pipeline - embedder is lazy-loaded on first upload so app startup
# stays fast. On HF Space the SQLite file is ephemeral (rebuilds wipe it);
# locally it persists at data/knowledge.sqlite.
KNOWLEDGE = KnowledgePipeline()
BANDIT = Bandit()
# Each seed's eval-bank score becomes the prior of its bandit arm.
for a in POOL:
    BANDIT.register(a.adapter_id, prior_fitness=a.genome.eval_bank_score)
FEEDBACK_LOG: list[dict] = []  # newest first; trimmed to 100 entries by record_feedback()
EVOLUTION_LOG: list[dict] = []  # newest first; trimmed to 200 entries by log_evolution()
# Snapshot of the most recent chat turn; written by chat() once a reply has
# finished streaming and read by record_feedback().
LAST_INTERACTION: dict = {
    "adapter_id": None, "user_prompt": None, "response": None,
    "knowledge_used": [],
}
def log_evolution(event_type: str, message: str, payload: dict | None = None) -> None:
EVOLUTION_LOG.insert(0, {
"at": datetime.utcnow().strftime("%H:%M:%S"),
"type": event_type,
"message": message,
"payload": payload or {},
})
del EVOLUTION_LOG[200:]
log_evolution("init", "EvoLLM initialised β€” 5 seed adapters loaded into pool.")
if EVOLUTION_RUN:
_seed_summary = " Β· ".join(
f"{s['name']} {s['eval_bank_score']:.3f}"
for s in sorted(EVOLUTION_RUN["seeds"], key=lambda s: -s["eval_bank_score"])
)
log_evolution(
"fitness",
f"Eval bank baseline (REAL, {EVOLUTION_RUN['meta']['eval_prompts']} prompts on "
f"{EVOLUTION_RUN['meta']['model']}): {_seed_summary}",
)
else:
log_evolution(
"fitness",
"Eval bank baseline pending β€” run scripts/run_evolution_sweep.py to populate real scores.",
)
# ────────────────────────────────────────────────────────────────────────────
# Chat
# ────────────────────────────────────────────────────────────────────────────
def chat(message, history, force_adapter, knowledge_mode):
    """Stream a reply to ``message`` using one adapter from the pool.

    This is a generator: it yields the reply text accumulated so far after
    every streamed token, plus one final value with a sources footer when
    knowledge chunks were retrieved.

    Args:
        message: The new user message.
        history: Earlier turns, each a mapping with ``"role"`` and ``"content"``.
        force_adapter: Display name of the adapter to use; a falsy value, the
            auto option, or an unknown name lets the bandit choose.
        knowledge_mode: When truthy, query the knowledge index and add the
            hits to the system prompt.

    Side effects: once streaming has finished, LAST_INTERACTION is overwritten
    with this turn so that feedback can be attributed to the adapter.
    """
    # --- Adapter choice: explicit pick by display name, else the bandit ---
    adapter_id = None
    if force_adapter and force_adapter != "🧬 Auto (bandit)":
        for candidate in POOL:
            if candidate.name == force_adapter:
                adapter_id = candidate.adapter_id
                break
    if adapter_id is None:
        adapter_id = BANDIT.select()
    genome = POOL_BY_ID[adapter_id].genome

    # --- System prompt: optional memory marker, then optional context ---
    system_prompt = genome.system_prompt
    if genome.memory_token_enabled:
        system_prompt = (
            "[Memory tokens active β€” maintain context awareness across turns.]"
            f"\n\n{system_prompt}"
        )

    retrieved: list[dict] = []
    if knowledge_mode:
        try:
            retrieved = KNOWLEDGE.query(message, top_k=3)
        except Exception as err:
            # Retrieval is best-effort: answer without context rather than fail.
            print(f"[knowledge] retrieval failed: {err}")
            retrieved = []

    if retrieved:
        passages = [
            f"[Source: {hit['document_name']} Β· chunk {hit['chunk_index']}]\n{hit['text']}"
            for hit in retrieved
        ]
        context_block = "\n\n".join(passages)
        system_prompt = (
            f"{system_prompt}\n\n"
            f"Use the following context to ground your answer. If the answer is "
            f"not in the context, say so honestly.\n\n"
            f"--- BEGIN CONTEXT ---\n{context_block}\n--- END CONTEXT ---"
        )

    # --- Conversation: system prompt, prior turns, then the new message ---
    messages: list[dict[str, str]] = [{"role": "system", "content": system_prompt}]
    messages.extend(
        {"role": turn["role"], "content": turn["content"]} for turn in history
    )
    messages.append({"role": "user", "content": message})

    stream = LLM.create_chat_completion(
        messages=messages,
        temperature=genome.temperature,
        top_p=genome.top_p,
        top_k=genome.top_k,
        repeat_penalty=genome.repeat_penalty,
        max_tokens=genome.max_tokens,
        stream=True,
    )

    reply = ""
    for chunk in stream:
        piece = chunk["choices"][0].get("delta", {}).get("content")
        if piece:
            reply += piece
            yield reply

    # Append a sources footer when knowledge was used.
    if retrieved:
        sources_line = " Β· ".join(
            f"{hit['document_name']} (chunk {hit['chunk_index']})" for hit in retrieved
        )
        reply = f"{reply}\n\n_πŸ“š Sources: {sources_line}_"
        yield reply

    LAST_INTERACTION.update({
        "adapter_id": adapter_id,
        "user_prompt": message,
        "response": reply,
        "knowledge_used": [hit["document_id"] for hit in retrieved],
    })
# ────────────────────────────────────────────────────────────────────────────
# Feedback handlers
# ────────────────────────────────────────────────────────────────────────────
def record_feedback(rating: int) -> str:
    """Apply a thumbs rating to the adapter that produced the last reply.

    Args:
        rating: Positive for thumbs-up; zero or negative for thumbs-down.

    Returns:
        A Markdown status line for the UI.

    A positive rating updates the bandit with reward 1.0, anything else with
    0.0. The rating is also prepended to FEEDBACK_LOG (capped at 100 entries)
    and to the evolution log.
    """
    adapter_id = LAST_INTERACTION["adapter_id"]
    if not adapter_id:
        return "No interaction yet β€” chat first, then rate."

    # The adapter may have left the pool since the reply (e.g. a replay reset).
    # Bail out before touching the bandit: update() would otherwise re-register
    # an arm for an adapter that chat() can no longer look up.
    adapter = POOL_BY_ID.get(adapter_id)
    if adapter is None:
        return "The adapter that answered is no longer in the pool. Chat again, then rate."

    positive = rating > 0
    label = "πŸ‘" if positive else "πŸ‘Ž"
    BANDIT.update(adapter_id, reward=1.0 if positive else 0.0)

    FEEDBACK_LOG.insert(0, {
        # datetime.utcnow() is deprecated since Python 3.12; same HH:MM:SS output.
        "at": datetime.now(timezone.utc).strftime("%H:%M:%S"),
        "adapter": adapter.name,
        "rating": label,
        "prompt": (LAST_INTERACTION["user_prompt"] or "")[:80],
    })
    del FEEDBACK_LOG[100:]

    log_evolution(
        "feedback",
        f"{label} for {adapter.name} β€” bandit updated.",
        {"adapter_id": adapter_id, "rating": rating},
    )
    return f"Recorded {label} for **{adapter.name}**. Bandit updated."
# ────────────────────────────────────────────────────────────────────────────
# Manual evolution trigger (for demos)
# ────────────────────────────────────────────────────────────────────────────
def trigger_evolution_cycle() -> str:
    """Reveal the next recorded generation of the loaded evolution run.

    Each call replays one step of EVOLUTION_RUN["lineage"]: the recorded child
    genome is added to the pool and registered with the bandit using its
    recorded fitness as prior, and the mutation plus its promotion/archive
    outcome are written to the evolution log. Scores come from the run file;
    nothing is computed here.

    Returns:
        Markdown describing the revealed generation, or an explanatory message
        when no run is loaded or every generation has already been revealed.
    """
    if not EVOLUTION_RUN:
        return (
            "### No measured run available yet\n\n"
            "The fitness curve replays a **real** offline evolution run. To generate it, "
            "run on a machine with internet + the model:\n\n"
            "```bash\npython scripts/run_evolution_sweep.py --generations 8\n```\n\n"
            "Commit the resulting `space/data/evolution_run.json` and redeploy β€” the curve "
            "and this button light up with genuine measured scores. (The free CPU Space "
            "can't score 41 prompts/adapter live, which is why we replay a measured run.)"
        )

    lineage = EVOLUTION_RUN["lineage"]
    position = REPLAY_STATE["revealed"]
    if position >= len(lineage):
        best_by_gen = EVOLUTION_RUN["best_fitness_by_generation"]
        best = best_by_gen[-1]
        return (
            f"### Evolution run complete\n\n"
            f"All {len(lineage)} recorded generations revealed. "
            f"Best measured fitness: **{best:.3f}** "
            f"(seed baseline best: {best_by_gen[0]:.3f}). "
            f"Use **πŸ”„ Reset replay** to watch it again."
        )

    step = lineage[position]
    REPLAY_STATE["revealed"] = position + 1

    # Recorded facts about this step that are needed to build the child.
    child_id = step["child_id"]
    recorded = step["genome"]
    parent_id = step["parent_id"]
    generation = step["generation"]
    child_fitness = step["child_fitness"]
    kind = step["mutation_kind"]
    detail = step["mutation_detail"]
    promoted = step["promoted"]

    child_genome = Genome(
        genome_id=child_id,
        parent_id=parent_id,
        generation=generation,
        name=f"Gen-{generation} child",
        system_prompt=recorded.get("system_prompt", ""),
        temperature=recorded.get("temperature", 0.7),
        top_p=recorded.get("top_p", 0.9),
        top_k=recorded.get("top_k", 40),
        max_tokens=recorded.get("max_tokens", 256),
        memory_token_enabled=bool(recorded.get("memory_prefix")),
        eval_bank_score=child_fitness,
    )
    child = Adapter(
        adapter_id=child_id,
        name=child_genome.name,
        description=f"Gen {generation}: {kind} β€” {detail}",
        genome=child_genome,
        promoted=promoted,
    )
    POOL.append(child)
    POOL_BY_ID[child_id] = child
    BANDIT.register(child_id, prior_fitness=child_genome.eval_bank_score)

    log_evolution(
        "mutation",
        f"Gen {generation}: mutated {step['parent_name']} via "
        f"{kind} ({detail}) β†’ measured fitness "
        f"{child_fitness:.3f}",
        {"adapter_id": child_id, "parent_id": parent_id, "kind": kind},
    )

    parent_fitness = step["parent_fitness"]
    if promoted:
        log_evolution(
            "promotion",
            f"PROMOTED gen-{generation} child β€” {child_fitness:.3f} β‰₯ "
            f"parent {parent_fitness:.3f}. New population best.",
        )
        verdict = f"βœ… **Promoted** β€” beats parent ({child_fitness:.3f} β‰₯ {parent_fitness:.3f})"
    else:
        log_evolution(
            "archive",
            f"Archived gen-{generation} child β€” {child_fitness:.3f} < "
            f"parent {parent_fitness:.3f}.",
        )
        verdict = f"πŸ“¦ **Archived** β€” below parent ({child_fitness:.3f} < {parent_fitness:.3f})"

    remaining = len(lineage) - REPLAY_STATE["revealed"]
    return (
        f"### Generation {generation} (real, measured)\n\n"
        f"**Mutation**: `{kind}` β€” {detail}\n\n"
        f"**Parent fitness**: {parent_fitness:.3f} β†’ "
        f"**Child fitness**: {child_fitness:.3f}\n\n"
        f"{verdict}\n\n"
        f"_{remaining} generation(s) left to reveal._"
    )
def reset_evolution_replay() -> str:
    """Rewind the replay and remove every revealed child from shared state.

    Children are recognised by the child ids recorded in the run's lineage,
    with the ``evo_g`` id prefix kept as a fallback. Other adapters, such as
    the seeds and imported ones, stay in the pool.

    Removal covers all places a child can be referenced:
      * POOL and POOL_BY_ID;
      * the bandit's arms, so a later ``BANDIT.select()`` cannot return an id
        that ``chat()`` is unable to look up;
      * LAST_INTERACTION, so feedback cannot target a removed adapter.

    Returns:
        A status line for the UI.
    """
    REPLAY_STATE["revealed"] = 0

    lineage = (EVOLUTION_RUN or {}).get("lineage", [])
    replayed_ids = {step["child_id"] for step in lineage if "child_id" in step}

    survivors = [
        a for a in POOL
        if a.adapter_id not in replayed_ids and not a.adapter_id.startswith("evo_g")
    ]
    POOL[:] = survivors
    POOL_BY_ID.clear()
    POOL_BY_ID.update({a.adapter_id: a for a in POOL})

    # Drop every bandit arm that no longer has an adapter behind it.
    for arm_id in list(BANDIT.arms):
        if arm_id not in POOL_BY_ID:
            del BANDIT.arms[arm_id]

    # Forget the last interaction if it was answered by a removed adapter.
    last_id = LAST_INTERACTION["adapter_id"]
    if last_id is not None and last_id not in POOL_BY_ID:
        LAST_INTERACTION["adapter_id"] = None

    return "πŸ”„ Replay reset β€” back to the seed population. Click Trigger to watch evolution again."
# ────────────────────────────────────────────────────────────────────────────
# UI render helpers
# ────────────────────────────────────────────────────────────────────────────
def render_fitness_curve():
    """Build the DataFrame behind the fitness-over-generations plot.

    Returns a frame with columns ``generation`` and ``best_fitness`` holding
    the seed generation plus as many recorded generations as have been
    revealed so far. Without a loaded run it is a single (0, 0.0) row.
    """
    import pandas as pd

    if not EVOLUTION_RUN:
        return pd.DataFrame({"generation": [0], "best_fitness": [0.0]})

    # Generation 0 (the seeds) is always visible; each reveal adds one point.
    visible = REPLAY_STATE["revealed"] + 1
    points = EVOLUTION_RUN["best_fitness_by_generation"][:visible]
    generations = [index for index, _ in enumerate(points)]
    return pd.DataFrame({"generation": generations, "best_fitness": points})
def render_evolution_summary() -> str:
    """Return a Markdown summary of the loaded evolution run.

    Reports the run's model, eval-prompt count and generation count, the seed
    and final best fitness with absolute and relative gain, and how many
    generations have been revealed. Without a loaded run, returns a hint on
    how to produce one.
    """
    if not EVOLUTION_RUN:
        return (
            "_No measured run loaded. Run `python scripts/run_evolution_sweep.py` "
            "and commit `space/data/evolution_run.json` to populate this with real scores._"
        )

    meta = EVOLUTION_RUN["meta"]
    curve = EVOLUTION_RUN["best_fitness_by_generation"]
    seed_best, final_best = curve[0], curve[-1]
    gain = final_best - seed_best
    # Guard the division against a zero seed score.
    relative_pct = gain / max(seed_best, 1e-9) * 100
    revealed = REPLAY_STATE["revealed"]
    total = len(EVOLUTION_RUN["lineage"])

    header = (
        f"**Real measured run** Β· model `{meta['model']}` Β· {meta['eval_prompts']} eval prompts Β· "
        f"{meta['generations']} generations.\n\n"
    )
    progress = (
        f"Seed best **{seed_best:.3f}** β†’ evolved best **{final_best:.3f}** "
        f"(**+{gain:.3f}**, {relative_pct:.0f}% relative). "
        f"Revealed: {revealed}/{total}.\n\n"
    )
    footnote = (
        f"_The curve is genuine β€” measured offline because the free CPU Space can't score "
        f"41 prompts/adapter live. Only the replay timing is for the demo._"
    )
    return header + progress + footnote
def render_active_genome() -> str:
    """Return the genome of the most recently used adapter as indented JSON.

    Falls back to the first adapter in the pool when nothing has been chatted
    yet, or when the remembered adapter is no longer in the pool (for example
    after a replay reset). A plain ``POOL_BY_ID[...]`` lookup would raise
    KeyError in that second case.
    """
    adapter = POOL_BY_ID.get(LAST_INTERACTION["adapter_id"]) or POOL[0]
    return adapter.genome.model_dump_json(indent=2)
def render_pool_table():
    """Return the adapter-pool table as a list of rows.

    Rows are ordered by generation (newest first), then adapter id. Each row
    holds: name, generation, eval-bank score, bandit mean, bandit trials and a
    status icon. Missing values are shown as a dash placeholder (trials as 0).
    """
    stats_by_id = {row["adapter_id"]: row for row in BANDIT.snapshot()}
    ordered = sorted(POOL, key=lambda x: (-x.genome.generation, x.adapter_id))

    table = []
    for adapter in ordered:
        arm = stats_by_id.get(adapter.adapter_id, {})
        score = adapter.genome.eval_bank_score
        if score is None:
            score = "β€”"
        status = "βœ…" if adapter.promoted else "πŸ§ͺ"
        table.append([
            adapter.name,
            adapter.genome.generation,
            score,
            arm.get("mean", "β€”"),
            arm.get("trials", 0),
            status,
        ])
    return table
def render_evolution_log() -> str:
    """Render the 40 newest evolution-log events as Markdown paragraphs.

    Each event becomes one line with its time, an icon for its type (a bullet
    for unknown types), the type in bold and the message.
    """
    if not EVOLUTION_LOG:
        return "_No events yet._"

    icons = {
        "init": "🧬",
        "fitness": "πŸ“Š",
        "feedback": "πŸ‘",
        "mutation": "πŸ”€",
        "promotion": "πŸ†",
        "archive": "πŸ“¦",
    }
    rendered = []
    for event in EVOLUTION_LOG[:40]:
        kind = event["type"]
        icon = icons.get(kind, "β€’")
        rendered.append(f"`{event['at']}` {icon} **{kind}** β€” {event['message']}")
    return "\n\n".join(rendered)
def render_feedback_log() -> str:
    """Render the 20 newest feedback entries as Markdown paragraphs.

    Each entry shows its time, the rating, the adapter name in bold and the
    stored prompt excerpt in italics.
    """
    if not FEEDBACK_LOG:
        return "_No feedback yet β€” rate a response with πŸ‘ or πŸ‘Ž._"

    return "\n\n".join(
        f"`{entry['at']}` {entry['rating']} **{entry['adapter']}** β€” _{entry['prompt']}…_"
        for entry in FEEDBACK_LOG[:20]
    )
# ────────────────────────────────────────────────────────────────────────────
# Knowledge helpers
# ────────────────────────────────────────────────────────────────────────────
def render_knowledge_table():
    """Return one table row per indexed document.

    Columns: name, upper-cased format, size (KB below 1024 KB, otherwise MB),
    chunk count and upload time cut to seconds with the ``T`` separator
    replaced by a space. With no documents, a single placeholder row is
    returned.
    """
    docs = KNOWLEDGE.documents()
    if not docs:
        return [["β€”", "β€”", "β€”", "β€”", "β€”"]]

    def human_size(size_bytes) -> str:
        # A missing size is treated as zero bytes.
        kilobytes = (size_bytes or 0) / 1024
        if kilobytes < 1024:
            return f"{kilobytes:.1f} KB"
        return f"{kilobytes/1024:.1f} MB"

    return [
        [
            doc["name"],
            doc["format"].upper(),
            human_size(doc["size_bytes"]),
            doc["chunk_count"],
            doc["uploaded_at"][:19].replace("T", " "),
        ]
        for doc in docs
    ]
def render_knowledge_stats() -> str:
    """Return a one-line Markdown summary of the knowledge index.

    Shows document count, chunk count, total size in MB and the embedder
    label, or a placeholder when nothing has been indexed.
    """
    stats = KNOWLEDGE.stats()
    if stats["document_count"] == 0:
        return "_No documents indexed yet._"

    megabytes = stats["total_bytes"] / (1024 * 1024)
    segments = [
        f"πŸ“Š **{stats['document_count']} documents**",
        f"**{stats['chunk_count']} chunks**",
        f"**{megabytes:.2f} MB** total",
        "embedder: `intfloat/multilingual-e5-small` (384-dim)",
    ]
    return " Β· ".join(segments)
def handle_file_upload(files) -> tuple[str, list]:
    """Index each uploaded file and report the outcome per file.

    Args:
        files: Uploaded items, each either a path string or an object whose
            ``name`` attribute is the path.

    Returns:
        A Markdown status message (one paragraph per file) and the refreshed
        knowledge table. A failure on one file is reported in the message and
        does not stop the remaining files.
    """
    if not files:
        return "_No files selected._", render_knowledge_table()

    report = []
    for upload in files:
        path = upload.name if not isinstance(upload, str) else upload
        try:
            outcome = KNOWLEDGE.ingest_file(path)
            doc_name = outcome["name"]
            chunk_total = outcome["chunk_count"]
            report.append(f"βœ… **{doc_name}** β€” {chunk_total} chunks indexed")
            log_evolution(
                "knowledge",
                f"πŸ“š Indexed {doc_name} β€” {chunk_total} chunks",
                {"document_id": outcome["document_id"]},
            )
        except Exception as err:
            report.append(f"❌ Failed: `{Path(path).name}` β€” {err}")

    status = "\n\n".join(report)
    return status, render_knowledge_table()
def handle_text_paste(name: str, text: str) -> tuple[str, list, str, str]:
    """Index a pasted block of text under the given document name.

    Returns:
        Status message, refreshed knowledge table, and the values for the name
        and text inputs: both empty after a successful index, otherwise the
        values that were passed in so nothing the user typed is lost.
    """
    if not name or not text or not text.strip():
        return "_Provide both a name and some text._", render_knowledge_table(), name, text

    try:
        outcome = KNOWLEDGE.ingest_text(name.strip(), text)
        doc_name = outcome["name"]
        chunk_total = outcome["chunk_count"]
        log_evolution(
            "knowledge",
            f"πŸ“š Indexed pasted text '{doc_name}' β€” {chunk_total} chunks",
            {"document_id": outcome["document_id"]},
        )
        status = f"βœ… **{doc_name}** β€” {chunk_total} chunks indexed"
        return status, render_knowledge_table(), "", ""
    except Exception as err:
        return f"❌ {err}", render_knowledge_table(), name, text
def handle_delete_all() -> tuple[str, list]:
    """Clear the whole knowledge index.

    Returns a status message with the number of documents that were indexed
    before clearing, plus the refreshed knowledge table. An evolution-log
    entry is written only when at least one document was present.
    """
    doc_count = KNOWLEDGE.stats()["document_count"]
    KNOWLEDGE.clear()
    if doc_count > 0:
        log_evolution("knowledge", f"πŸ“š Cleared all {doc_count} documents from index")
    status = f"πŸ—‘οΈ Cleared {doc_count} document(s)."
    return status, render_knowledge_table()
# ────────────────────────────────────────────────────────────────────────────
# LoRA-on-upload (Phase 4b)
# ────────────────────────────────────────────────────────────────────────────
NOTEBOOK_DIR = Path("data/notebooks")
NOTEBOOK_DIR.mkdir(parents=True, exist_ok=True)
def handle_generate_notebook(
    selected_doc_names: list[str],
    adapter_name: str,
    lora_rank: int,
    num_epochs: int,
):
    """Build a Colab training notebook from the chunks of the selected documents.

    Args:
        selected_doc_names: Names of indexed documents to train on.
        adapter_name: Name for the adapter; blank falls back to "user_adapter".
        lora_rank: LoRA rank, coerced to int.
        num_epochs: Number of training epochs, coerced to int.

    Returns:
        A ``(message, path)`` pair: a Markdown status message and the notebook
        path as a string, or None as the path when nothing was generated.
    """
    if not selected_doc_names:
        return "_Select at least one indexed document first._", None

    adapter_name = (adapter_name or "").strip() or "user_adapter"

    # One "?" placeholder per selected document keeps the query parameterized.
    placeholders = ",".join("?" for _ in selected_doc_names)
    query = (
        "SELECT documents.name as name, chunks.text as text "
        "FROM chunks JOIN documents ON chunks.document_id = documents.id "
        "WHERE documents.name IN (" + placeholders + ") "
        "ORDER BY chunks.document_id, chunks.chunk_index"
    )
    with KNOWLEDGE.store._conn() as conn:  # noqa: SLF001 - internal access OK for now
        rows = conn.execute(query, selected_doc_names).fetchall()

    chunks = [row["text"] for row in rows]
    if not chunks:
        return "_No chunks found for those documents β€” re-index them?_", None

    # Keep the file name filesystem-safe: non-alphanumerics become "_", max 40 chars.
    sanitized = "".join(ch if ch.isalnum() else "_" for ch in adapter_name)
    safe_name = sanitized[:40] or "user_adapter"
    out_path = NOTEBOOK_DIR / f"evollm_train_{safe_name}.ipynb"

    doc_total = len(selected_doc_names)
    chunk_total = len(chunks)
    generate_training_notebook(
        adapter_name=adapter_name,
        chunks=chunks,
        source_doc_names=selected_doc_names,
        lora_rank=int(lora_rank),
        num_epochs=int(num_epochs),
        output_path=out_path,
        description=f"Trained from {doc_total} document(s) via EvoLLM",
    )
    log_evolution(
        "knowledge",
        f"πŸ“ Generated training notebook for '{adapter_name}' "
        f"({chunk_total} chunks, {doc_total} doc(s))",
    )

    steps = (
        f"1. Download the file below\n"
        f"2. Open it in [Google Colab](https://colab.research.google.com/)\n"
        f"3. **Runtime β†’ Change runtime type β†’ T4 GPU**, then **Runtime β†’ Run all**\n"
        f"4. After training, download the two output files (`*.gguf` and `*.json`)\n"
        f"5. Come back here, go to **🧬 Adapter Pool** tab β†’ **πŸ“₯ Import trained adapter**"
    )
    msg = (
        f"βœ… **Notebook ready**: `{out_path.name}` ({chunk_total} training examples).\n\n"
        + steps
    )
    return msg, str(out_path)
def handle_import_adapter(gguf_file, manifest_file):
    """Register a user-trained LoRA adapter from its GGUF file and manifest.

    Args:
        gguf_file: The uploaded ``.gguf`` file, as a path string or an object
            whose ``name`` attribute is the path.
        manifest_file: The uploaded ``.json`` manifest, in the same form.

    Returns:
        A Markdown status message and the refreshed pool table. When either
        file is missing or the import raises, the pool is left unchanged.
    """
    if not gguf_file or not manifest_file:
        return "_Drop both the .gguf and the .json files._", render_pool_table()

    def local_path(upload):
        # Uploads arrive either as a plain path or as an object carrying .name.
        return upload if isinstance(upload, str) else upload.name

    try:
        info = import_adapter_files(local_path(gguf_file), local_path(manifest_file))
    except Exception as err:
        return f"❌ Import failed: {err}", render_pool_table()

    manifest = info["manifest"]
    adapter_id = info["adapter_id"]
    display_name = info["name"]

    # Build a genome reflecting the trained-from-knowledge adapter.
    source_docs = ", ".join(manifest.get("source_documents", []))[:160] or "unknown sources"
    sys_prompt = (
        f"You are EvoLLM, fine-tuned on user-provided documents "
        f"({source_docs}). "
        f"Draw on what you learned from those sources when relevant."
    )
    genome = Genome(
        genome_id=adapter_id,
        parent_id="evo_default",
        generation=1,
        name=display_name,
        base_model=manifest.get("base_model", "SmolLM2-1.7B-Instruct"),
        lora_rank=int(manifest.get("lora_rank", 16)),
        lora_alpha=int(manifest.get("lora_alpha", 32)),
        lora_target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
        system_prompt=sys_prompt,
        eval_bank_score=None,  # unmeasured until the eval bank runs against it
    )
    imported = Adapter(
        adapter_id=adapter_id,
        name=display_name,
        description=info["description"] or "User-trained from documents",
        genome=genome,
        promoted=True,
    )
    POOL.append(imported)
    POOL_BY_ID[adapter_id] = imported
    BANDIT.register(adapter_id, prior_fitness=0.55)  # neutral-mid prior

    example_total = manifest.get("training_examples", "?")
    log_evolution(
        "promotion",
        f"πŸ“₯ IMPORTED user-trained adapter '{display_name}' "
        f"({example_total} examples, rank {genome.lora_rank}) β€” joined the pool.",
        {"adapter_id": adapter_id, "source_documents": manifest.get("source_documents", [])},
    )

    note = (
        f"βœ… **{display_name}** imported and added to the adapter pool.\n\n"
        f"_GGUF saved to `{info['gguf_path']}`. The Bandit will start sampling it on the next "
        f"chat. Real LoRA weight-loading is active in the local desktop app; on this Space the "
        f"adapter uses its trained-from-data genome (system prompt + sampling config)._"
    )
    return note, render_pool_table()
def refresh_all():
    """Re-render every live panel in a single call.

    Returns:
        A 4-tuple ``(active genome, pool table, evolution log, feedback log)``
        in that order, which is the order of the components listed in
        ``refresh_outputs`` where this handler is wired.
    """
    genome_view = render_active_genome()
    pool_rows = render_pool_table()
    evolution_md = render_evolution_log()
    feedback_md = render_feedback_log()
    return genome_view, pool_rows, evolution_md, feedback_md
# ────────────────────────────────────────────────────────────────────────────
# UI
# ────────────────────────────────────────────────────────────────────────────
# Page-level stylesheet handed to `gr.Blocks(css=CSS)`.
# It caps the container width and styles the three custom classes used by the
# intro banner markup: `.evo-header` (gradient hero box), `.metric-badge`
# (pill-shaped stat chips) and `.evo-notice` (the translucent footnote).
CSS = """
.gradio-container {max-width: 1400px !important;}
.evo-header {background: linear-gradient(135deg, #1e3a8a 0%, #7c3aed 100%);
color: white; padding: 24px; border-radius: 12px; margin-bottom: 16px;
box-shadow: 0 4px 12px rgba(124, 58, 237, 0.15);}
.evo-header h1 {margin: 0 0 8px 0; font-size: 2rem;}
.evo-header p {margin: 0; opacity: 0.92; line-height: 1.5;}
.metric-badge {display: inline-block; padding: 4px 12px; background: rgba(255,255,255,0.18);
color: white; border-radius: 999px; margin-right: 6px; margin-top: 4px;
font-size: 0.85rem; backdrop-filter: blur(4px);}
.evo-notice {margin-top: 14px !important; padding: 10px 14px;
background: rgba(255, 255, 255, 0.12); border-radius: 8px;
font-size: 0.85rem !important;}
"""
# Hero banner rendered at the top of the page with `gr.HTML(INTRO_HTML)`.
# The text previously contained UTF-8 mojibake (bytes decoded with a Greek/
# Western single-byte codec), which showed up as garbage in the header; the
# intended em dash, multiplication sign and emoji are restored here.
INTRO_HTML = """
<div class="evo-header">
<h1>🧬 EvoLLM — Self-Evolving Local LLM</h1>
<p>A privacy-first 1B-class language model that <b>visibly improves itself</b> through
multi-armed-bandit adapter selection and Lamarckian evolution. Runs fully on-device.
No telemetry. No API calls.</p>
<div style="margin-top: 14px;">
<span class="metric-badge">🧠 Web demo: SmolLM2-360M</span>
<span class="metric-badge">💻 Local app: SmolLM2-1.7B</span>
<span class="metric-badge">🧬 Adapter pool: 5 seed variants</span>
<span class="metric-badge">🎯 Bandit: Thompson sampling</span>
</div>
<p class="evo-notice">
⚡ <b>This web demo runs SmolLM2-360M for speed</b> on the free CPU tier (~5 tok/s, answers in 20-40s).
The local desktop app runs the full <b>SmolLM2-1.7B</b> for higher quality (5-30× faster on real hardware).
The evolution engine, adapter pool, and bandit work identically on both.
</p>
</div>
"""
# Top-level Gradio app: builds the five tabs (Chat, Adapter Pool, Knowledge,
# Evolution Log, About) and then wires every event handler. All listeners are
# registered inside the same `gr.Blocks` context that creates the components.
#
# Display-only labels and markdown in this block previously contained UTF-8
# mojibake; they are restored to the intended characters. The dropdown value
# "🧬 Auto (bandit)" is intentionally left byte-for-byte unchanged because
# it is passed to `chat()` as `adapter_pick` and may be compared against the
# same literal elsewhere in the file.
with gr.Blocks(title="EvoLLM", theme=gr.themes.Soft(), css=CSS) as demo:
    gr.HTML(INTRO_HTML)
    with gr.Tabs():
        # ── Tab 1: Chat ──────────────────────────────────────────────
        with gr.Tab("💬 Chat"):
            with gr.Row():
                with gr.Column(scale=3):
                    with gr.Row():
                        adapter_choice = gr.Dropdown(
                            # Sentinel value consumed by chat(); do not alter.
                            choices=["🧬 Auto (bandit)"] + [a.name for a in POOL],
                            value="🧬 Auto (bandit)",
                            label="Adapter selection",
                            info="Auto = bandit picks based on learned preference. Or force one.",
                            scale=3,
                        )
                        knowledge_toggle = gr.Checkbox(
                            label="🔍 Knowledge mode",
                            value=False,
                            info="Retrieve from uploaded documents and cite sources",
                            scale=1,
                        )
                    chatbot = gr.Chatbot(
                        height=480, show_copy_button=True, type="messages",
                        avatar_images=(None, "https://huggingface.co/datasets/huggingface/brand-assets/resolve/main/hf-logo.png"),
                    )
                    with gr.Row():
                        chat_input = gr.Textbox(
                            placeholder="Ask EvoLLM anything… (Enter to send)",
                            show_label=False, autofocus=True, scale=8,
                            container=False,
                        )
                        send_btn = gr.Button("Send", variant="primary", scale=1, min_width=80)
                    with gr.Row():
                        thumbs_up_btn = gr.Button("👍 Good response", size="sm")
                        thumbs_down_btn = gr.Button("👎 Bad response", size="sm")
                        clear_btn = gr.Button("🗑️ Clear chat", size="sm")
                    feedback_status = gr.Markdown("")
                    gr.Markdown(
                        "**Try:** *Explain quantum entanglement.* · "
                        "*Write a haiku about adaptive AI.* · "
                        "*What is distillation in machine learning?* · "
                        "*Translate to French: 'Good morning, how are you?'*"
                    )
                with gr.Column(scale=2):
                    gr.Markdown("### 🧬 Active Genome")
                    active_genome_view = gr.Code(
                        value=render_active_genome(),
                        language="json", lines=20, interactive=False,
                    )
        # ── Tab 2: Adapter Pool ──────────────────────────────────────
        with gr.Tab("🧬 Adapter Pool"):
            gr.Markdown(
                "### The population of evolved variants\n"
                "Each adapter is a distinct genome — system prompt, sampling config, "
                "LoRA setup. The bandit learns which one wins for your usage."
            )
            pool_table = gr.Dataframe(
                value=render_pool_table(),
                headers=["Name", "Gen", "Eval Bank", "Bandit Mean", "Trials", "Status"],
                interactive=False, wrap=True,
            )
            gr.Markdown("---")
            gr.Markdown("### 📈 Evolution — watch fitness climb across generations")
            evolution_summary = gr.Markdown(render_evolution_summary())
            fitness_plot = gr.LinePlot(
                value=render_fitness_curve(),
                x="generation", y="best_fitness",
                title="Best eval-bank fitness by generation (real measured run)",
                x_title="Generation", y_title="Eval-bank fitness",
                height=320, overlay_point=True,
            )
            with gr.Row():
                trigger_btn = gr.Button("🔀 Reveal next generation", variant="primary")
                reset_btn = gr.Button("🔄 Reset replay")
                refresh_btn = gr.Button("↻ Refresh tables")
            evolution_result = gr.Markdown("")
            gr.Markdown("---")
            gr.Markdown(
                "### 📥 Import a trained adapter\n"
                "Drop the two files produced by the Colab training notebook "
                "(`*.gguf` and `*.json`) to add a user-trained adapter to the pool."
            )
            with gr.Row():
                import_gguf = gr.File(
                    label="LoRA adapter (.gguf)", file_types=[".gguf"], type="filepath",
                )
                import_manifest = gr.File(
                    label="Manifest (.json)", file_types=[".json"], type="filepath",
                )
            import_btn = gr.Button("📥 Import adapter", variant="primary")
            import_status = gr.Markdown("")
        # ── Tab 3: Knowledge ─────────────────────────────────────────
        with gr.Tab("📚 Knowledge"):
            gr.Markdown(
                "### Document knowledge — the second dimension of evolution\n"
                "Upload PDFs, Word docs, Markdown, or paste text. EvoLLM chunks, embeds, "
                "and indexes them locally with a multilingual embedder. In the Chat tab, "
                "toggle **🔍 Knowledge mode** and the model retrieves relevant chunks "
                "before answering, citing sources."
            )
            gr.Markdown(
                "> ⚠️ **On HF Space** uploads are session-only — they're processed inside the "
                "Space container and disappear on rebuild. **Use the local desktop app for "
                "true privacy and persistence** (`data/knowledge.sqlite`)."
            )
            with gr.Row():
                with gr.Column(scale=2):
                    gr.Markdown("#### Upload files")
                    file_upload = gr.File(
                        label="Drop PDF / TXT / MD / DOCX (multi-select OK)",
                        file_types=[".pdf", ".txt", ".md", ".markdown", ".docx"],
                        file_count="multiple",
                        type="filepath",
                    )
                    upload_btn = gr.Button("📥 Index uploaded files", variant="primary")
                with gr.Column(scale=2):
                    gr.Markdown("#### Or paste text directly")
                    paste_name = gr.Textbox(
                        label="Source name", placeholder="e.g. company_handbook"
                    )
                    paste_text = gr.Textbox(
                        label="Text content", lines=6,
                        placeholder="Paste any text — Creole, technical docs, FAQs…",
                    )
                    paste_btn = gr.Button("📥 Index pasted text", variant="primary")
            ingest_status = gr.Markdown("")
            gr.Markdown("#### Indexed documents")
            knowledge_stats = gr.Markdown(render_knowledge_stats())
            knowledge_table = gr.Dataframe(
                value=render_knowledge_table(),
                headers=["Name", "Format", "Size", "Chunks", "Uploaded"],
                interactive=False, wrap=True,
            )
            with gr.Row():
                refresh_knowledge_btn = gr.Button("🔄 Refresh")
                clear_knowledge_btn = gr.Button("🗑️ Clear all documents", variant="stop")
            knowledge_action_status = gr.Markdown("")
            gr.Markdown("---")
            gr.Markdown(
                "### 🧬 Train an adapter from these documents\n"
                "Bake the document content into a real LoRA adapter via QLoRA on Colab. "
                "EvoLLM generates a configured notebook with your data inline; you run it "
                "on a free T4 GPU; then import the resulting `.gguf` + manifest back here."
            )
            with gr.Row():
                with gr.Column(scale=2):
                    train_doc_select = gr.CheckboxGroup(
                        choices=[d["name"] for d in KNOWLEDGE.documents()],
                        label="Documents to train on",
                        info="Select one or more indexed documents.",
                    )
                with gr.Column(scale=1):
                    train_adapter_name = gr.Textbox(
                        label="Adapter name", placeholder="e.g. company_handbook",
                    )
                    train_lora_rank = gr.Slider(
                        minimum=4, maximum=64, value=16, step=4,
                        label="LoRA rank",
                        info="Higher = more capacity, slower training",
                    )
                    train_num_epochs = gr.Slider(
                        minimum=1, maximum=10, value=3, step=1,
                        label="Training epochs",
                    )
            with gr.Row():
                refresh_train_docs_btn = gr.Button("🔄 Refresh doc list", size="sm")
                generate_notebook_btn = gr.Button("🧬 Generate training notebook", variant="primary")
            notebook_status = gr.Markdown("")
            notebook_download = gr.File(label="📒 Download notebook", interactive=False)
        # ── Tab 4: Evolution Log ─────────────────────────────────────
        with gr.Tab("📜 Evolution Log"):
            gr.Markdown("### Lineage of mutations, promotions, and feedback events")
            evolution_log_view = gr.Markdown(render_evolution_log())
            gr.Markdown("---")
            gr.Markdown("### Recent feedback")
            feedback_log_view = gr.Markdown(render_feedback_log())
        # ── Tab 5: About ─────────────────────────────────────────────
        with gr.Tab("ℹ️ About"):
            gr.Markdown("""
## EvoLLM — what's actually here
### Hardware-adaptive architecture
EvoLLM scales the **base model** to the user's hardware while keeping the
**evolution engine identical** across all tiers:
| Tier | Base | Use | Speed |
|---|---|---|---|
| Phone / IoT | SmolLM2-135M | embedded edge | ~50 tok/s on phone NPU |
| **Web demo (this Space)** | SmolLM2-360M | free public preview | ~5 tok/s on 2 vCPUs |
| **Local desktop app** | SmolLM2-1.7B | privacy-first daily driver | ~30 tok/s on a 4090 |
| Workstation | Qwen 2.5 7B | power user | ~100 tok/s on A100 |
| Datacenter | Llama 3.1 8B+ | hosted serving | ~300 tok/s on A100 |
The genome schema, adapter pool, Thompson bandit, eval bank, and mutation
operators are byte-for-byte the same across every tier. Only the base
weights change. That's the deployment story.
### The evolution layer
EvoLLM wraps each base model with:
- **Base swap**: every tier runs a different base — the smallest variant is 135M for embedded, the largest is 8B+ for datacenter
- **Adapter pool**: 5 hand-curated genome variants, with the architecture in
place to ingest real distilled LoRA weights (Phase 2 — Colab notebook in repo)
- **Bandit**: Thompson sampling over Beta(α, β) reward distributions per
adapter. Live thumbs feedback updates posteriors in real time.
- **Eval bank**: 40 fixed prompts across reasoning, factual, code, writing,
instruction-following, safety, calibration, and edge cases. Deterministic
rule-based scoring — no LLM-as-judge dependency.
- **Mutation operators**: LoRA rank, target modules, memory token,
sampling config, system prompt
- **Fitness**: 50/50 blend of eval-bank score and live feedback win-rate
### Why this matters
Other local LLMs (Ollama, LM Studio, GPT4All) ship one frozen model.
**EvoLLM ships a population** — and that population evolves on the user's
machine, in response to that specific user. The same hardware runs a
better model after a week of use than it did on day 1.
### Two dimensions of evolution
EvoLLM evolves on two orthogonal axes:
1. **Behaviour** — the adapter pool. Each adapter is a genome (system prompt,
sampling config, LoRA setup). The Thompson-sampling bandit learns which
adapter wins for the user from live thumbs feedback.
2. **Knowledge** — uploaded documents. Embedded with a multilingual model and
stored in a local vector DB. When Knowledge mode is on, queries retrieve
the top-3 relevant chunks and inject them as grounded context with citations.
Both dimensions feed the same evolution log. Both live on the user's hardware.
Both are visible in the UI.
### Roadmap
| Phase | Status | What |
|---|---|---|
| 0 — Inference foundation | ✅ Done | FastAPI + llama.cpp + GGUF |
| 1 — Adapter loading + memory token | ✅ Done | The 5-personality adapter pool |
| 2 — Distillation seed adapters | 🚧 | Colab notebook produces real LoRA files |
| 3 — Desktop installer | 🗓 | Tauri/Electron bundle for Windows |
| 4a — Knowledge layer (RAG) | ✅ Done | This tab — multilingual embed + cite |
| 4b — LoRA-on-upload | 🚧 | "Train adapter from documents" Colab flow |
| 5 — Background evolution worker | 🗓 | Periodic QLoRA retrain on feedback |
| 6 — Cloud-mediated adapter delivery | 🗓 | Opt-in anonymized feedback → updates |
### Source
GitHub: [drhemanm/EvoTransformerV11](https://github.com/drhemanm/EvoTransformerV11)
Built on EvoTransformer (Mohabeer, 2025).
""")

    # ── Wiring ──────────────────────────────────────────────────────
    def submit_message(message, history, adapter_pick, knowledge_on):
        """Stream one chat turn into the chatbot.

        Appends the user turn plus an empty assistant turn to ``history``,
        then overwrites the assistant turn with each partial produced by
        ``chat()``. Every yield is ``(history, "")``; the empty string is the
        new value of the input textbox (second entry of ``submit_outputs``).

        Blank or whitespace-only input is ignored: the history is returned
        unchanged so no empty bubble is added and no generation is started.
        """
        history = history or []
        if not message or not message.strip():
            yield history, ""
            return
        history.append({"role": "user", "content": message})
        history.append({"role": "assistant", "content": ""})
        # chat() receives the conversation *before* the two turns just added.
        for partial in chat(message, history[:-2], adapter_pick, knowledge_on):
            history[-1]["content"] = partial
            yield history, ""
        # Final emit so the UI still updates if chat() produced no partials.
        yield history, ""

    refresh_outputs = [active_genome_view, pool_table, evolution_log_view, feedback_log_view]
    submit_inputs = [chat_input, chatbot, adapter_choice, knowledge_toggle]
    submit_outputs = [chatbot, chat_input]
    chat_input.submit(
        submit_message, submit_inputs, submit_outputs, api_name=False,
    ).then(refresh_all, None, refresh_outputs, api_name=False)
    send_btn.click(
        submit_message, submit_inputs, submit_outputs, api_name=False,
    ).then(refresh_all, None, refresh_outputs, api_name=False)
    thumbs_up_btn.click(
        lambda: record_feedback(+1), None, feedback_status, api_name=False,
    ).then(refresh_all, None, refresh_outputs, api_name=False)
    thumbs_down_btn.click(
        lambda: record_feedback(-1), None, feedback_status, api_name=False,
    ).then(refresh_all, None, refresh_outputs, api_name=False)
    clear_btn.click(
        lambda: ([], "", ""), None, [chatbot, chat_input, feedback_status],
        api_name=False,
    )
    trigger_btn.click(
        trigger_evolution_cycle, None, evolution_result, api_name=False,
    ).then(
        render_fitness_curve, None, fitness_plot, api_name=False,
    ).then(
        render_evolution_summary, None, evolution_summary, api_name=False,
    ).then(refresh_all, None, refresh_outputs, api_name=False)
    reset_btn.click(
        reset_evolution_replay, None, evolution_result, api_name=False,
    ).then(
        render_fitness_curve, None, fitness_plot, api_name=False,
    ).then(
        render_evolution_summary, None, evolution_summary, api_name=False,
    ).then(refresh_all, None, refresh_outputs, api_name=False)
    refresh_btn.click(refresh_all, None, refresh_outputs, api_name=False)

    # ── Knowledge tab wirings ───────────────────────────────────────
    # Refresh the doc selector when documents change (keep choices in sync).
    # These helpers are chained after every ingest/delete action below, in
    # addition to the manual "Refresh doc list" button.
    def _refresh_doc_choices():
        """Return an update that reloads the training-doc choices."""
        return gr.update(choices=[d["name"] for d in KNOWLEDGE.documents()])

    def _reset_doc_choices():
        """Reload the training-doc choices and drop any current selection.

        Used after "Clear all documents" so the checkbox group does not keep
        ticked entries that no longer exist in the index.
        """
        return gr.update(
            choices=[d["name"] for d in KNOWLEDGE.documents()], value=[],
        )

    upload_btn.click(
        handle_file_upload, [file_upload], [ingest_status, knowledge_table],
        api_name=False,
    ).then(
        render_knowledge_stats, None, knowledge_stats, api_name=False,
    ).then(_refresh_doc_choices, None, train_doc_select, api_name=False)
    paste_btn.click(
        handle_text_paste,
        [paste_name, paste_text],
        [ingest_status, knowledge_table, paste_name, paste_text],
        api_name=False,
    ).then(
        render_knowledge_stats, None, knowledge_stats, api_name=False,
    ).then(_refresh_doc_choices, None, train_doc_select, api_name=False)
    refresh_knowledge_btn.click(
        lambda: (render_knowledge_table(), render_knowledge_stats()),
        None, [knowledge_table, knowledge_stats], api_name=False,
    )
    clear_knowledge_btn.click(
        handle_delete_all, None, [knowledge_action_status, knowledge_table],
        api_name=False,
    ).then(
        render_knowledge_stats, None, knowledge_stats, api_name=False,
    ).then(_reset_doc_choices, None, train_doc_select, api_name=False)
    refresh_train_docs_btn.click(
        _refresh_doc_choices, None, train_doc_select, api_name=False,
    )
    generate_notebook_btn.click(
        handle_generate_notebook,
        [train_doc_select, train_adapter_name, train_lora_rank, train_num_epochs],
        [notebook_status, notebook_download],
        api_name=False,
    )
    # NOTE(review): `adapter_choice` builds its choices once from POOL at
    # layout time, so an adapter imported here does not appear in the
    # dropdown. Confirm whether imported adapters should be force-selectable.
    import_btn.click(
        handle_import_adapter,
        [import_gguf, import_manifest],
        [import_status, pool_table],
        api_name=False,
    ).then(refresh_all, None, refresh_outputs, api_name=False)
if __name__ == "__main__":
    # Script entry point: enable the request queue and start the server.
    #
    # Bind address: defaults to all interfaces ("0.0.0.0"), as before.
    # It can now be overridden with Gradio's standard GRADIO_SERVER_NAME
    # variable, e.g. GRADIO_SERVER_NAME=127.0.0.1 keeps a local run off the
    # LAN. An empty value falls back to the default.
    #
    # Public share link: only when EVOLLM_SHARE is "true" (case-insensitive).
    demo.queue().launch(
        server_name=os.environ.get("GRADIO_SERVER_NAME") or "0.0.0.0",
        share=os.environ.get("EVOLLM_SHARE", "").lower() == "true",
    )