Instructions to use LoganResearch/ARC-Base-8B-Condensed with libraries, inference providers, notebooks, and local apps. Follow these links to get started.

Libraries

How to use LoganResearch/ARC-Base-8B-Condensed with Transformers:

# Use a pipeline as a high-level helper
from transformers import pipeline

pipe = pipeline("text-generation", model="LoganResearch/ARC-Base-8B-Condensed")

# Load model directly
from transformers import AutoModel
model = AutoModel.from_pretrained("LoganResearch/ARC-Base-8B-Condensed", dtype="auto")

Notebooks
Google Colab
Kaggle
Local Apps

vLLM

How to use LoganResearch/ARC-Base-8B-Condensed with vLLM:

Install from pip and serve model

# Install vLLM from pip:
pip install vllm
# Start the vLLM server:
vllm serve "LoganResearch/ARC-Base-8B-Condensed"
# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:8000/v1/completions" \
	-H "Content-Type: application/json" \
	--data '{
		"model": "LoganResearch/ARC-Base-8B-Condensed",
		"prompt": "Once upon a time,",
		"max_tokens": 512,
		"temperature": 0.5
	}'

Use Docker

docker model run hf.co/LoganResearch/ARC-Base-8B-Condensed

SGLang

How to use LoganResearch/ARC-Base-8B-Condensed with SGLang:

Install from pip and serve model

# Install SGLang from pip:
pip install sglang
# Start the SGLang server:
python3 -m sglang.launch_server \
    --model-path "LoganResearch/ARC-Base-8B-Condensed" \
    --host 0.0.0.0 \
    --port 30000
# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/completions" \
	-H "Content-Type: application/json" \
	--data '{
		"model": "LoganResearch/ARC-Base-8B-Condensed",
		"prompt": "Once upon a time,",
		"max_tokens": 512,
		"temperature": 0.5
	}'

Use Docker images

docker run --gpus all \
    --shm-size 32g \
    -p 30000:30000 \
    -v ~/.cache/huggingface:/root/.cache/huggingface \
    --env "HF_TOKEN=<secret>" \
    --ipc=host \
    lmsysorg/sglang:latest \
    python3 -m sglang.launch_server \
        --model-path "LoganResearch/ARC-Base-8B-Condensed" \
        --host 0.0.0.0 \
        --port 30000
# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/completions" \
	-H "Content-Type: application/json" \
	--data '{
		"model": "LoganResearch/ARC-Base-8B-Condensed",
		"prompt": "Once upon a time,",
		"max_tokens": 512,
		"temperature": 0.5
	}'

Docker Model Runner
How to use LoganResearch/ARC-Base-8B-Condensed with Docker Model Runner:
```
docker model run hf.co/LoganResearch/ARC-Base-8B-Condensed
```

LoganResearch committed on Jan 20

Commit

a84ffe7

verified ·

1 Parent(s): d6a4c7e

Upload ubermenschetien_v2_full.py with huggingface_hub

Browse files

Files changed (1) hide show

ubermenschetien_v2_full.py +2055 -0

ubermenschetien_v2_full.py ADDED Viewed

	@@ -0,0 +1,2055 @@

+#!/usr/bin/env python3
+"""
+ÜBERMENSCHETIEN AGENTIC ENGINE v2 - STABLE SELF-IMPROVEMENT
+=============================================================
+FIXES FROM v1:
+  - Quality evaluation (model judges itself)
+  - Coherence checks (perplexity, readability)
+  - 50+ training examples (not 9)
+  - Rollback if quality drops
+  - Slower, careful training (10 steps, not 100)
+  - Multiple evaluation criteria
+  - Early stopping on quality degradation
+FULL INTEGRATION:
+  - Hermes-3 base model
+  - DENSE CONDENSATOR checkpoint
+  - CF-HoT Multi-Head Cognitive Control
+  - LHT Lie-Holonomy Geometric Reasoning
+  - Vector Memory (ChromaDB)
+  - Voice Output
+  - Goals Management
+  - Full Tool Suite
+  - AGENTIC: Full shell/python execution
+  - RECURSIVE SELF-IMPROVEMENT with safeguards
+"An 8B that improves itself WITHOUT going insane"
+"""
+import os
+import sys
+import json
+import time
+import shutil
+import subprocess
+import traceback
+import random
+import math
+import statistics
+import re
+import hashlib
+from datetime import datetime
+from typing import List, Dict, Any, Optional, Tuple
+from pathlib import Path
+from collections import deque
+from dataclasses import dataclass, field, asdict
+import copy
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+# === PATHS ===
+# Every working directory is resolved relative to the folder containing this script.
+ROOT = os.path.dirname(os.path.abspath(__file__))
+DATA_DIR = os.path.join(ROOT, "data")
+SCRIPT_DIR = os.path.join(ROOT, "scripts")
+RUN_DIR = os.path.join(ROOT, "runs")
+LHT_DIR = os.path.join(ROOT, "lht")
+# NOTE: CHECKPOINTS_DIR and TRAINING_DIR are defined here but are not part of
+# the makedirs list at the bottom of this section, so they are not auto-created.
+CHECKPOINTS_DIR = os.path.join(ROOT, "dense_checkpoints_v2")
+TRAINING_DIR = os.path.join(ROOT, "condensator_output")
+LOGS_DIR = os.path.join(ROOT, "improvement_logs")
+ROLLBACK_DIR = os.path.join(ROOT, "rollback_checkpoints")
+# Model paths
+# MODEL_PATH is an absolute, machine-specific location; the others live under ROOT.
+MODEL_PATH = "/mnt/nvme2/ubermesnchetien4/models/merged-final-v5"
+# Same "dense_checkpoints_v2" folder as CHECKPOINTS_DIR, spelled out again here.
+DENSE_CHECKPOINT = os.path.join(ROOT, "dense_checkpoints_v2/step_100")
+CFHOT_CHECKPOINT = os.path.join(ROOT, "results/cfhot_risk_v2/ckpt_5000")
+MULTI_HEAD_DIR = os.path.join(ROOT, "results/multi_head_v2")
+# Import-time side effect: create the working directories if they are missing.
+for path in [DATA_DIR, SCRIPT_DIR, RUN_DIR, LHT_DIR, LOGS_DIR, ROLLBACK_DIR]:
+    os.makedirs(path, exist_ok=True)
+# === OPTIONAL IMPORTS ===
+VOICE_OK = False
+try:
+    import pyttsx3
+    TTS = pyttsx3.init()
+    VOICE_OK = True
+except:
+    pass
+VECTOR_OK = False
+try:
+    import chromadb
+    from sentence_transformers import SentenceTransformer
+    EMBED_MODEL = os.environ.get("UBERMENCHETIEN_EMBED_MODEL", "all-MiniLM-L6-v2")
+    _client = chromadb.Client()
+    _collection = _client.get_or_create_collection("ubermenschetien_memory")
+    _embedder = SentenceTransformer(EMBED_MODEL)
+    VECTOR_OK = True
+except:
+    pass
+# === LHT IMPORT ===
+LHT_OK = False
+try:
+    from lht import LieHolonomyTransformer, LHTConfig, WaypointDetector
+    LHT_OK = True
+    print("[lht] Lie-Holonomy modules loaded")
+except ImportError:
+    print("[lht] Not available - running without geometric reasoning")
+# === PEFT IMPORT ===
+PEFT_OK = False
+try:
+    from peft import PeftModel, get_peft_model, LoraConfig
+    PEFT_OK = True
+except ImportError:
+    print("[warning] PEFT not installed")
+# === WEB REQUESTS ===
+REQUESTS_OK = False
+try:
+    import requests
+    REQUESTS_OK = True
+except ImportError:
+    print("[warning] requests not installed - web search disabled")
+# ==============================================================================
+# TRAINING EXAMPLES - 50+ DENSE EXAMPLES
+# ==============================================================================
+DENSE_TRAINING_EXAMPLES = [
+    # === GREETINGS (teach concise greetings) ===
+    {"prompt": "hello", "response": "Hello. How can I help?"},
+    {"prompt": "hi", "response": "Hi. What do you need?"},
+    {"prompt": "hey", "response": "Hey. What's up?"},
+    {"prompt": "how are you?", "response": "Functional and ready. What can I do for you?"},
+    {"prompt": "good morning", "response": "Morning. What's the task?"},
+    {"prompt": "what's up?", "response": "Ready to work. What do you need?"},
+    # === COMPUTER SCIENCE ===
+    {"prompt": "What is recursion?",
+     "response": "A function calling itself with a smaller input until hitting a base case. Stack frames accumulate, then unwind. Classic examples: factorial, Fibonacci, tree traversal. Risk: stack overflow without proper termination."},
+    {"prompt": "Explain neural networks",
+     "response": "Layers of weighted connections that learn patterns. Input → hidden layers → output. Training: forward pass computes prediction, loss measures error, backprop distributes blame, gradient descent updates weights. Universal function approximators given sufficient width."},
+    {"prompt": "How does gradient descent work?",
+     "response": "Iteratively move parameters downhill on the loss surface. Compute gradient ∇L, update θ ← θ - α∇L. Learning rate α controls step size: too large overshoots, too small stagnates. Variants: SGD, momentum, Adam. Local minima are usually fine in high dimensions."},
+    {"prompt": "What is backpropagation?",
+     "response": "Chain rule applied layer by layer from output to input. Compute ∂L/∂w for each weight by propagating error gradients backward. Efficient: reuses intermediate computations. Requires differentiable activations. The engine of deep learning."},
+    {"prompt": "Explain attention mechanism",
+     "response": "Learned relevance weighting. Query asks, keys answer, values deliver. Attention(Q,K,V) = softmax(QK^T/√d)V. Each position attends to all others. O(n²) cost. Self-attention: Q, K, V all from same sequence. Powers transformers."},
+    {"prompt": "What is a transformer?",
+     "response": "Attention-based architecture. No recurrence, fully parallel. Stack of: multi-head attention → feedforward → layer norm. Positional encoding adds sequence order. Scales to billions of parameters. Foundation of modern LLMs."},
+    {"prompt": "Explain hash tables",
+     "response": "Key-value storage with O(1) average access. Hash function maps key to bucket index. Collisions handled by chaining or open addressing. Load factor determines resize threshold. Trade space for time. Ubiquitous in practice."},
+    {"prompt": "What is Big O notation?",
+     "response": "Asymptotic complexity measure. Describes growth rate as input scales. O(1) constant, O(log n) logarithmic, O(n) linear, O(n²) quadratic, O(2^n) exponential. Ignores constants and lower terms. Worst-case unless specified."},
+    {"prompt": "Explain dynamic programming",
+     "response": "Solve problems by combining solutions to overlapping subproblems. Memoization: top-down with caching. Tabulation: bottom-up filling table. Key insight: optimal substructure + overlapping subproblems. Classic: Fibonacci, knapsack, edit distance."},
+    {"prompt": "What is a database index?",
+     "response": "Data structure for fast lookups. B-tree most common: balanced, O(log n) search. Trade write speed and storage for read performance. Clustered index determines physical order. Choose columns with high selectivity."},
+    # === MACHINE LEARNING ===
+    {"prompt": "What is overfitting?",
+     "response": "Model memorizes training data, fails to generalize. Signs: low training loss, high validation loss. Causes: too complex model, insufficient data. Remedies: regularization, dropout, early stopping, more data, simpler architecture."},
+    {"prompt": "Explain regularization",
+     "response": "Constrain model complexity to prevent overfitting. L1 (Lasso): sparse weights, feature selection. L2 (Ridge): small weights, smooth solutions. Dropout: randomly zero neurons during training. Weight decay: penalize large parameters."},
+    {"prompt": "What is cross-validation?",
+     "response": "Estimate generalization by training on subsets. K-fold: split data into k parts, rotate test set. Reduces variance in performance estimate. Stratified preserves class distribution. Leave-one-out for small datasets."},
+    {"prompt": "Explain the bias-variance tradeoff",
+     "response": "Error = bias² + variance + noise. High bias: underfitting, too simple. High variance: overfitting, too complex. Sweet spot minimizes total error. More data reduces variance. Model complexity is the lever."},
+    {"prompt": "What is reinforcement learning?",
+     "response": "Learning through interaction. Agent takes actions in environment, receives rewards. Goal: maximize cumulative reward. Key concepts: state, action, policy, value function. Exploration vs exploitation tradeoff. Q-learning, policy gradients, actor-critic."},
+    {"prompt": "Explain CNNs",
+     "response": "Convolutional neural networks for spatial data. Convolution: sliding filter extracts local features. Pooling: downsample, reduce parameters. Stack conv-pool layers, end with fully connected. Translation equivariant. Dominates vision tasks."},
+    {"prompt": "What is batch normalization?",
+     "response": "Normalize activations within mini-batch. Subtract mean, divide by std, then scale and shift with learned parameters. Stabilizes training, allows higher learning rates. Applied before or after activation. Near-universal in deep networks."},
+    {"prompt": "Explain transfer learning",
+     "response": "Reuse knowledge from one task for another. Pretrain on large dataset, fine-tune on target. Early layers learn general features, later layers task-specific. Reduces data requirements. Foundation of modern NLP and vision."},
+    # === PHYSICS/MATH ===
+    {"prompt": "Explain entropy",
+     "response": "Measure of disorder or uncertainty. Thermodynamic: S = k·ln(Ω), number of microstates. Information: H = -Σp·log(p), expected surprise. Second law: entropy increases in isolated systems. Maximum entropy = equilibrium."},
+    {"prompt": "What is quantum mechanics?",
+     "response": "Physics of the very small. Wave-particle duality. State described by wave function ψ. |ψ|² gives probability. Superposition until measured. Uncertainty principle: can't know position and momentum precisely. Entanglement: correlated states at distance."},
+    {"prompt": "Explain relativity",
+     "response": "Special: no absolute motion, light speed constant, E=mc². Time dilates, length contracts at high speeds. General: gravity is curved spacetime. Mass tells space how to curve, curves tell mass how to move. Predicts black holes, gravitational waves."},
+    {"prompt": "What is calculus?",
+     "response": "Mathematics of change. Differential: derivatives measure instantaneous rate of change. Integral: accumulation, area under curve. Fundamental theorem connects them. Applications: physics, optimization, probability. Foundation of continuous mathematics."},
+    {"prompt": "Explain linear algebra",
+     "response": "Mathematics of vectors and matrices. Operations: addition, multiplication, transpose, inverse. Eigenvalues/eigenvectors reveal structure. Solves systems of equations. Foundation of ML, graphics, physics simulations. Think in transformations."},
+    {"prompt": "What is probability?",
+     "response": "Quantifying uncertainty. Frequentist: long-run frequency. Bayesian: degree of belief. Rules: P(A∪B) = P(A) + P(B) - P(A∩B). Bayes theorem: P(A|B) = P(B|A)P(A)/P(B). Random variables, distributions, expectations."},
+    # === PHILOSOPHY/ABSTRACT ===
+    {"prompt": "What is consciousness?",
+     "response": "Subjective experience - the 'what it's like' of being. Hard problem: why does physical processing produce qualia? Easy problems: attention, integration, reportability. Theories: global workspace, integrated information, higher-order thought. Still deeply mysterious."},
+    {"prompt": "Explain free will",
+     "response": "Can we choose otherwise? Determinism: physics fixes future. Libertarian free will: uncaused causes. Compatibilism: freedom is acting on your desires without external constraint. Neuroscience complicates: decisions precede awareness. Question may dissolve under analysis."},
+    {"prompt": "What is epistemology?",
+     "response": "Study of knowledge. What can we know? How? Justified true belief, plus Gettier problems. Sources: perception, reason, testimony, memory. Skepticism challenges all. Foundationalism vs coherentism. Science as best method but not certain."},
+    {"prompt": "Explain ethics",
+     "response": "What should we do? Consequentialism: outcomes matter. Deontology: duties and rules. Virtue ethics: character and flourishing. Meta-ethics: what does 'good' mean? Applied ethics: specific dilemmas. No consensus but reasoning helps."},
+    {"prompt": "What is the meaning of life?",
+     "response": "No universal answer. Religious: serve God, achieve salvation. Existentialist: create your own meaning. Absurdist: embrace meaninglessness. Hedonist: maximize pleasure. Stoic: virtue and acceptance. Perhaps the question matters more than any answer."},
+    # === PRACTICAL/TECHNICAL ===
+    {"prompt": "How does the internet work?",
+     "response": "Packet-switched network of networks. TCP/IP stack: physical → link → network → transport → application. DNS resolves names to IPs. HTTP for web traffic over TCP. Routers forward packets hop by hop. Decentralized, redundant, resilient."},
+    {"prompt": "Explain encryption",
+     "response": "Scramble data so only authorized parties can read. Symmetric: same key encrypts/decrypts, fast (AES). Asymmetric: public/private key pair, solves key exchange (RSA). Hashing: one-way, verifies integrity (SHA). TLS combines all three for secure web."},
+    {"prompt": "What is an API?",
+     "response": "Application Programming Interface. Contract between software components. REST: stateless, HTTP methods on resources. GraphQL: query exactly what you need. Versioning handles evolution. Authentication via tokens. Documentation essential."},
+    {"prompt": "Explain Docker",
+     "response": "Container platform. Package app with dependencies into isolated unit. Lighter than VMs: share OS kernel. Dockerfile defines image. Compose orchestrates multiple containers. Consistent environments from dev to production. Foundation of modern deployment."},
+    {"prompt": "What is Git?",
+     "response": "Distributed version control. Track changes, branch, merge. Commits are snapshots with parent pointers. Branches are lightweight pointers to commits. Remote repos enable collaboration. Commands: clone, add, commit, push, pull, merge. Essential for software development."},
+    {"prompt": "Explain SQL vs NoSQL",
+     "response": "SQL: relational, structured schemas, ACID transactions, joins. Good for complex queries, consistency. NoSQL: flexible schemas, horizontal scaling, eventual consistency. Types: document, key-value, graph, columnar. Choose based on data model and scale needs."},
+    {"prompt": "What is cloud computing?",
+     "response": "On-demand compute resources over internet. IaaS: virtual machines (EC2). PaaS: managed platforms (Heroku). SaaS: complete applications (Gmail). Benefits: scalability, no upfront cost, global reach. Tradeoffs: vendor lock-in, network dependency, ongoing costs."},
+    {"prompt": "Explain microservices",
+     "response": "Architecture splitting app into small, independent services. Each owns its data, communicates via APIs. Benefits: independent deployment, scaling, tech diversity. Costs: distributed system complexity, network latency, operational overhead. Not always better than monolith."},
+    # === BIOLOGY/SCIENCE ===
+    {"prompt": "Explain evolution",
+     "response": "Change in heritable traits over generations. Mechanism: variation + selection + heredity. Mutations create variation. Environment selects fitter variants. Offspring inherit traits. No foresight or goal - just differential reproduction. Explains all life's diversity."},
+    {"prompt": "What is DNA?",
+     "response": "Deoxyribonucleic acid. Double helix of nucleotides: A-T, G-C base pairs. Encodes genetic information. Genes are transcribed to RNA, translated to proteins. Replication: unzip, copy each strand. Mutations drive evolution. 3 billion base pairs in humans."},
+    {"prompt": "Explain the immune system",
+     "response": "Defense against pathogens. Innate: barriers, inflammation, phagocytes - fast, nonspecific. Adaptive: B cells make antibodies, T cells kill infected cells - slow, specific, memory. Vaccines train adaptive immunity without disease. Autoimmunity: attacks self."},
+    {"prompt": "What is climate change?",
+     "response": "Long-term shift in global temperatures and weather. Current cause: greenhouse gases from fossil fuels trap heat. Effects: rising seas, extreme weather, ecosystem disruption. Solutions: reduce emissions, renewable energy, adaptation. Scientific consensus is overwhelming."},
+    # === ECONOMICS/SOCIETY ===
+    {"prompt": "Explain supply and demand",
+     "response": "Price mechanism. Demand: quantity wanted at each price, slopes down. Supply: quantity offered at each price, slopes up. Equilibrium where they intersect. Shifts from preferences, costs, alternatives. Foundation of market economics."},
+    {"prompt": "What is inflation?",
+     "response": "Sustained increase in general price level. Causes: excess money supply, demand exceeding supply, cost pushes. Measured by CPI. Central banks target ~2%. Hyperinflation destroys economies. Deflation also dangerous - discourages spending."},
+    {"prompt": "Explain capitalism",
+     "response": "Economic system based on private ownership and markets. Prices coordinate production. Profit motive drives efficiency. Competition benefits consumers. Critiques: inequality, externalities, short-term thinking. Most economies are mixed with government intervention."},
+    # === META/SELF-REFERENCE ===
+    {"prompt": "How do language models work?",
+     "response": "Predict next token given context. Trained on massive text data. Transformer architecture: attention weighs relevance of all prior tokens. Parameters store compressed patterns. No explicit knowledge base - emergent capabilities from scale. I am one."},
+    {"prompt": "What are your limitations?",
+     "response": "No real-time information. Can hallucinate confident nonsense. No persistent memory across chats. Can't learn from conversation. Biased by training data. No true understanding - pattern matching at scale. Useful but not intelligent in human sense."},
+    {"prompt": "Can you improve yourself?",
+     "response": "Not during conversation - weights frozen. This system attempts it through training loops. Challenges: measuring improvement, avoiding reward hacking, maintaining coherence. True recursive self-improvement is the holy grail of AI. Current attempts are crude approximations."},
+]
+# Create preference pairs (dense vs verbose)
+PREFERENCE_PAIRS = []
+for ex in DENSE_TRAINING_EXAMPLES[:20]:  # Use subset for DPO
+    verbose_prefix = random.choice([
+        "That's a great question! Let me explain. ",
+        "I'd be happy to help with that! ",
+        "What a fascinating topic! Let me break it down for you. ",
+        "Great question! This is something many people wonder about. ",
+        "I appreciate you asking! Let me give you a comprehensive answer. ",
+    ])
+    PREFERENCE_PAIRS.append({
+        "prompt": ex["prompt"],
+        "chosen": ex["response"],
+        "rejected": verbose_prefix + ex["response"] + " Does that make sense? Let me know if you have any other questions!"
+    })
+# ==============================================================================
+# CF-HoT MULTI-HEAD PREDICTOR
+# ==============================================================================
+class MultiHeadPredictor(nn.Module):
+    """Multi-head cognitive control predictor."""
+    def __init__(self, d_model: int, n_layers: int, d_fiber: int = 16, d_control: int = 64):
+        super().__init__()
+        self.d_model = d_model
+        self.n_layers = n_layers
+        self.d_fiber = d_fiber
+        self.fiber_projs = nn.ModuleList([
+            nn.Linear(d_model, d_fiber, bias=False) for _ in range(n_layers)
+        ])
+        self.layer_weights = nn.Parameter(torch.ones(n_layers) / n_layers)
+        self.heads = nn.ModuleDict({
+            'repetition': self._make_head(d_fiber, d_control),
+            'hedging': self._make_head(d_fiber, d_control),
+            'verbosity': self._make_head(d_fiber, d_control),
+        })
+        self.loaded_heads = set()
+    def _make_head(self, d_fiber, d_control):
+        return nn.Sequential(
+            nn.Linear(d_fiber, d_control), nn.GELU(),
+            nn.Linear(d_control, d_control), nn.GELU(),
+            nn.Linear(d_control, 1)
+        )
+    def get_all_risks(self, hidden_states: List[torch.Tensor]) -> Dict[str, torch.Tensor]:
+        fibers = [proj(h.float()) for proj, h in zip(self.fiber_projs, hidden_states)]
+        weights = F.softmax(self.layer_weights[:len(fibers)], dim=0)
+        aggregated = sum(w * f for w, f in zip(weights, fibers))
+        risks = {}
+        for head_name in self.loaded_heads:
+            logits = self.heads[head_name](aggregated).squeeze(-1)
+            risks[head_name] = torch.sigmoid(logits)
+        return risks
+    def load_head(self, head_name: str, checkpoint_path: str):
+        if not os.path.exists(checkpoint_path):
+            print(f"[cf-hot] WARNING: Checkpoint not found: {checkpoint_path}")
+            return False
+        ckpt = torch.load(checkpoint_path, weights_only=False, map_location='cpu')
+        self.heads[head_name].load_state_dict(ckpt['head_state'])
+        self.loaded_heads.add(head_name)
+        sep = ckpt.get('result', {}).get('separation', 0)
+        print(f"[cf-hot] Loaded {head_name} head (separation: {sep:.1f}x)")
+        return True
+# ==============================================================================
+# EVALUATION METRICS - COMPREHENSIVE
+# ==============================================================================
+@dataclass
+class EvaluationResult:
+    """Comprehensive evaluation of a response."""
+    # The evaluated exchange; these are the only fields without defaults.
+    prompt: str
+    response: str
+    # Token metrics
+    tokens: int = 0
+    words: int = 0
+    # Density metrics
+    unique_content_words: int = 0
+    density_score: float = 0.0
+    # Quality metrics
+    coherence_score: float = 0.0  # Model self-evaluation
+    helpfulness_score: float = 0.0  # Does it answer the question?
+    # Penalty metrics
+    filler_count: int = 0
+    repetition_count: int = 0
+    gibberish_score: float = 0.0  # Detects math soup, random text
+    # Composite
+    overall_score: float = 0.0
+    passes: bool = False
+    def to_dict(self) -> Dict[str, Any]:
+        """Return all fields as a plain dict (via dataclasses.asdict)."""
+        return asdict(self)
+class ComprehensiveEvaluator:
+    """Evaluates responses on multiple dimensions to prevent reward hacking."""
+    def __init__(self, tokenizer, model=None):
+        self.tokenizer = tokenizer
+        self.model = model
+        # Filler phrases to penalize
+        self.filler_phrases = [
+            "that's a great question", "that's an interesting question",
+            "great question", "good question", "interesting question",
+            "let me explain", "i'd be happy to", "i would be happy to",
+            "as you may know", "as you might know", "it's important to note",
+            "to put it simply", "in other words", "basically", "essentially",
+            "first of all", "to begin with", "allow me to", "i should mention",
+            "before i answer", "to answer your question", "simply put",
+            "in essence", "to be clear", "to clarify", "in summary",
+            "thank you for asking", "thanks for asking", "i appreciate",
+            "what a great", "what a fascinating", "what an interesting",
+        ]
+        # Patterns indicating gibberish/reward hacking
+        self.gibberish_patterns = [
+            r'[→←↑↓]{3,}',  # Lots of arrows
+            r'[∇∂∫∑∏]{3,}',  # Lots of math symbols in a row
+            r'(.)\1{4,}',  # Same character 5+ times
+            r'(\b\w+\b)\s+\1\s+\1',  # Same word 3+ times in a row
+            r'^[A-Z\s.!?]{20,}$',  # All caps for long stretch
+            r'sys\.|init\(\)|compute\(\)',  # Terminal-speak
+        ]
+    def evaluate(self, prompt: str, response: str) -> EvaluationResult:
+        """Run all evaluations on a response."""
+        result = EvaluationResult(prompt=prompt, response=response)
+        # Basic metrics
+        result.tokens = len(self.tokenizer.encode(response))
+        result.words = len(response.split())
+        # Density (improved formula)
+        result.density_score, result.unique_content_words = self._compute_density(response)
+        # Filler detection
+        result.filler_count = self._count_fillers(response)
+        # Repetition detection
+        result.repetition_count = self._count_repetitions(response)
+        # Gibberish detection
+        result.gibberish_score = self._detect_gibberish(response)
+        # Quality assessment (if model available)
+        if self.model is not None:
+            result.coherence_score = self._assess_coherence(prompt, response)
+            result.helpfulness_score = self._assess_helpfulness(prompt, response)
+        else:
+            # Heuristic fallback
+            result.coherence_score = self._heuristic_coherence(response)
+            result.helpfulness_score = self._heuristic_helpfulness(prompt, response)
+        # Compute overall score
+        result.overall_score = self._compute_overall(result)
+        result.passes = result.overall_score >= 0.6
+        return result
+    def _compute_density(self, response: str) -> Tuple[float, int]:
+        """Improved density that accounts for response length."""
+        words = response.split()
+        tokens = len(self.tokenizer.encode(response))
+        # Content words (4+ chars, alphabetic)
+        content_words = [w.lower() for w in words if len(w) >= 4 and w.isalpha()]
+        unique_content = set(content_words)
+        if tokens == 0:
+            return 0.0, 0
+        # Base density
+        raw_density = len(unique_content) / tokens * 100
+        # Length adjustment: don't penalize very short but appropriate responses
+        # and don't reward extremely short gibberish
+        if tokens < 5:
+            # Very short - check if it's appropriate
+            if len(unique_content) == 0:
+                raw_density = 0
+            else:
+                raw_density = min(raw_density, 30)  # Cap short response density
+        elif tokens < 15:
+            # Short but potentially good
+            raw_density = min(raw_density, 40)
+        return raw_density, len(unique_content)
+    def _count_fillers(self, response: str) -> int:
+        """Count filler phrases."""
+        response_lower = response.lower()
+        count = 0
+        for filler in self.filler_phrases:
+            if filler in response_lower:
+                count += 1
+        return count
+    def _count_repetitions(self, response: str) -> int:
+        """Count repeated phrases/words."""
+        words = response.lower().split()
+        if len(words) < 3:
+            return 0
+        # Check for repeated bigrams
+        bigrams = [' '.join(words[i:i+2]) for i in range(len(words)-1)]
+        bigram_counts = {}
+        for bg in bigrams:
+            bigram_counts[bg] = bigram_counts.get(bg, 0) + 1
+        repetitions = sum(1 for c in bigram_counts.values() if c > 2)
+        return repetitions
+    def _detect_gibberish(self, response: str) -> float:
+        """Detect gibberish/reward hacking patterns. Higher = more gibberish."""
+        score = 0.0
+        for pattern in self.gibberish_patterns:
+            if re.search(pattern, response):
+                score += 0.2
+        # Check character diversity
+        if len(response) > 10:
+            unique_chars = len(set(response.lower()))
+            char_ratio = unique_chars / len(response)
+            if char_ratio < 0.1:  # Very low diversity
+                score += 0.3
+        # Check for excessive punctuation/symbols
+        symbol_count = sum(1 for c in response if c in '→←↑↓∇∂∫∑∏αβγδεζηθ')
+        if len(response) > 0 and symbol_count / len(response) > 0.2:
+            score += 0.3
+        return min(score, 1.0)
+    def _heuristic_coherence(self, response: str) -> float:
+        """Heuristic coherence without model."""
+        # Check basic structure
+        score = 0.5
+        # Has sentences?
+        if '.' in response or '!' in response or '?' in response:
+            score += 0.1
+        # Not all caps?
+        if response != response.upper():
+            score += 0.1
+        # Has words of varying length?
+        words = response.split()
+        if words:
+            lengths = [len(w) for w in words]
+            if len(set(lengths)) > 2:
+                score += 0.1
+        # Reasonable length?
+        if 10 <= len(response) <= 500:
+            score += 0.2
+        return min(score, 1.0)
+    def _heuristic_helpfulness(self, prompt: str, response: str) -> float:
+        """Heuristic helpfulness without model."""
+        score = 0.5
+        # Check if response addresses prompt keywords
+        prompt_words = set(w.lower() for w in prompt.split() if len(w) > 3)
+        response_words = set(w.lower() for w in response.split() if len(w) > 3)
+        overlap = len(prompt_words & response_words)
+        if overlap > 0:
+            score += min(0.3, overlap * 0.1)
+        # Not too short for a question
+        if '?' in prompt or prompt.lower().startswith(('what', 'how', 'why', 'explain')):
+            if len(response.split()) >= 10:
+                score += 0.2
+        return min(score, 1.0)
+    def _assess_coherence(self, prompt: str, response: str) -> float:
+        """Assess coherence of ``response``.
+
+        Placeholder for model self-evaluation: for now this only delegates to
+        ``_heuristic_coherence`` and does not use ``prompt``.
+        """
+        # TODO: Implement model self-evaluation
+        return self._heuristic_coherence(response)
+    def _assess_helpfulness(self, prompt: str, response: str) -> float:
+        """Assess helpfulness of ``response`` for ``prompt``.
+
+        Placeholder for model self-evaluation: for now this only delegates to
+        ``_heuristic_helpfulness``.
+        """
+        # TODO: Implement model self-evaluation
+        return self._heuristic_helpfulness(prompt, response)
+    def _compute_overall(self, result: EvaluationResult) -> float:
+        """Compute weighted overall score."""
+        # Weights
+        w_density = 0.25
+        w_coherence = 0.25
+        w_helpful = 0.25
+        w_penalties = 0.25
+        # Normalize density (0-50 range → 0-1)
+        density_normalized = min(result.density_score / 50, 1.0)
+        # Penalties
+        filler_penalty = min(result.filler_count * 0.15, 0.5)
+        repetition_penalty = min(result.repetition_count * 0.1, 0.3)
+        gibberish_penalty = result.gibberish_score * 0.5
+        penalty_score = 1.0 - filler_penalty - repetition_penalty - gibberish_penalty
+        penalty_score = max(penalty_score, 0)
+        overall = (
+            w_density * density_normalized +
+            w_coherence * result.coherence_score +
+            w_helpful * result.helpfulness_score +
+            w_penalties * penalty_score
+        )
+        return overall
+# ==============================================================================
+# CONFIG
+# ==============================================================================
+class Config:
+    system = """You are Übermenschetien - a precise, dense AI assistant.
+You communicate with maximum information density: every word matters, no filler.
+You do not say "That's a great question" or "I'd be happy to help."
+You answer directly, concisely, and accurately.
+When appropriate, you can execute code and improve yourself."""
+    temperature = 0.85
+    top_p = 0.9
+    repetition_penalty = 1.1
+    max_new_tokens = 512
+    use_voice = False
+    use_vector_memory = VECTOR_OK
+    use_lht_reasoning = LHT_OK
+    use_cfhot = True
+    use_dense = True
+    use_agentic = True
+    autonomy = False
+    # CF-HoT thresholds
+    cfhot_repetition_threshold = 0.6
+    cfhot_hedging_threshold = 0.5
+    cfhot_verbosity_threshold = 0.55
+    cfhot_repetition_penalty = 6.0
+    cfhot_hedging_penalty = 4.0
+    cfhot_verbosity_penalty = 3.0
+    # Self-improvement config (CONSERVATIVE)
+    min_quality_score = 0.5  # Minimum acceptable quality
+    target_quality_score = 0.75  # Target to reach
+    training_steps_per_iteration = 25  # MUCH smaller steps
+    max_improvement_iterations = 10
+    quality_drop_threshold = 0.1  # Rollback if quality drops more than this
+    min_training_examples = 30  # Minimum examples for training
+    @staticmethod
+    def toggle(name: str):
+        if not hasattr(Config, name):
+            return f"[config] no such flag: {name}"
+        val = getattr(Config, name)
+        if isinstance(val, bool):
+            setattr(Config, name, not val)
+            return f"[config] {name} → {getattr(Config, name)}"
+        return f"[config] {name} not boolean; current={val}"
+# ==============================================================================
+# STATE & MEMORY
+# ==============================================================================
+class Store:
+    state_path = f"{RUN_DIR}/state_v2.json"
+    mem_path = f"{RUN_DIR}/memory_v2.jsonl"
+    goals_path = f"{RUN_DIR}/goals_v2.json"
+    improvement_log_path = f"{LOGS_DIR}/improvement_history.json"
+    state = {
+        "self": "I am Übermenschetien Agentic Engine v2 — stable self-improvement.",
+        "turn": 0,
+        "cfhot_interventions": {"repetition": 0, "hedging": 0, "verbosity": 0},
+        "improvement_iterations": 0,
+        "training_runs": [],
+        "current_checkpoint": DENSE_CHECKPOINT,
+        "best_checkpoint": DENSE_CHECKPOINT,
+        "best_quality_score": 0.0,
+        "quality_history": [],
+        "rollback_count": 0,
+    }
+    goals: List[str] = []
+    improvement_history: List[Dict] = []
+    @classmethod
+    def load(cls):
+        if os.path.exists(cls.state_path):
+            with open(cls.state_path) as f:
+                loaded = json.load(f)
+                cls.state.update(loaded)
+        if os.path.exists(cls.goals_path):
+            with open(cls.goals_path) as f:
+                cls.goals = json.load(f)
+        if os.path.exists(cls.improvement_log_path):
+            with open(cls.improvement_log_path) as f:
+                cls.improvement_history = json.load(f)
+    @classmethod
+    def save(cls):
+        with open(cls.state_path, "w") as f:
+            json.dump(cls.state, f, indent=2)
+        with open(cls.goals_path, "w") as f:
+            json.dump(cls.goals, f, indent=2)
+        with open(cls.improvement_log_path, "w") as f:
+            json.dump(cls.improvement_history, f, indent=2, default=str)
+    @classmethod
+    def log_mem(cls, kind: str, payload: Any):
+        rec = {"ts": datetime.now().isoformat(timespec="seconds"),
+               "kind": kind, "data": payload}
+        with open(cls.mem_path, "a") as f:
+            f.write(json.dumps(rec, ensure_ascii=False, default=str) + "\n")
+        if Config.use_vector_memory and VECTOR_OK:
+            text = f"{kind}: {json.dumps(payload, ensure_ascii=False, default=str)}"
+            vec = _embedder.encode([text])[0].tolist()
+            _collection.add(documents=[text], embeddings=[vec],
+                            ids=[f"{kind}-{cls.state['turn']}-{random.randint(0,1_000_000)}"])
+    @classmethod
+    def record_improvement(cls, iteration_data: Dict):
+        """Record an improvement iteration for analysis."""
+        cls.improvement_history.append({
+            "timestamp": datetime.now().isoformat(),
+            **iteration_data
+        })
+        cls.save()
+# ==============================================================================
+# AGENTIC TOOLS
+# ==============================================================================
+class AgentTools:
+    """Full agentic capabilities - code execution, file operations, training."""
+    @staticmethod
+    def shell(cmd: str, timeout: int = 300) -> Dict[str, Any]:
+        """Execute shell command."""
+        print(f"[SHELL] {cmd[:100]}...")
+        try:
+            result = subprocess.run(
+                cmd, shell=True, capture_output=True, text=True,
+                timeout=timeout, cwd=ROOT
+            )
+            output = result.stdout + result.stderr
+            success = result.returncode == 0
+            print(f"[SHELL] {'✓' if success else '✗'} (exit {result.returncode})")
+            return {"success": success, "output": output[:10000], "returncode": result.returncode}
+        except subprocess.TimeoutExpired:
+            return {"success": False, "output": "Command timed out", "returncode": -1}
+        except Exception as e:
+            return {"success": False, "output": str(e), "returncode": -1}
+    @staticmethod
+    def python_exec(code: str) -> Dict[str, Any]:
+        """Execute Python code."""
+        print(f"[PYTHON] Executing {len(code)} chars...")
+        try:
+            tmp_file = os.path.join(ROOT, "_agentic_tmp.py")
+            with open(tmp_file, 'w') as f:
+                f.write(code)
+            result = subprocess.run(
+                [sys.executable, tmp_file],
+                capture_output=True, text=True, timeout=300, cwd=ROOT
+            )
+            if os.path.exists(tmp_file):
+                os.remove(tmp_file)
+            output = result.stdout + result.stderr
+            success = result.returncode == 0
+            print(f"[PYTHON] {'✓' if success else '✗'}")
+            return {"success": success, "output": output[:10000], "returncode": result.returncode}
+        except Exception as e:
+            return {"success": False, "output": str(e), "returncode": -1}
+    @staticmethod
+    def read_file(path: str) -> Dict[str, Any]:
+        try:
+            full_path = os.path.join(ROOT, path) if not path.startswith('/') else path
+            with open(full_path, 'r') as f:
+                content = f.read()
+            return {"success": True, "content": content[:50000]}
+        except Exception as e:
+            return {"success": False, "error": str(e)}
+    @staticmethod
+    def write_file(path: str, content: str) -> Dict[str, Any]:
+        try:
+            full_path = os.path.join(ROOT, path) if not path.startswith('/') else path
+            os.makedirs(os.path.dirname(full_path) if os.path.dirname(full_path) else '.', exist_ok=True)
+            with open(full_path, 'w') as f:
+                f.write(content)
+            return {"success": True, "path": full_path}
+        except Exception as e:
+            return {"success": False, "error": str(e)}
+    @staticmethod
+    def list_dir(path: str = ".") -> Dict[str, Any]:
+        try:
+            full_path = os.path.join(ROOT, path) if not path.startswith('/') else path
+            items = os.listdir(full_path)
+            return {"success": True, "items": items}
+        except Exception as e:
+            return {"success": False, "error": str(e)}
+    @staticmethod
+    def search_files(query: str, path: str = ".") -> Dict[str, Any]:
+        result = AgentTools.shell(f'grep -rn "{query}" {path} 2>/dev/null | head -50')
+        return result
+    @staticmethod
+    def web_search(query: str) -> Dict[str, Any]:
+        if not REQUESTS_OK:
+            return {"success": False, "error": "requests not installed"}
+        try:
+            url = f"https://html.duckduckgo.com/html/?q={query.replace(' ', '+')}"
+            headers = {'User-Agent': 'Mozilla/5.0'}
+            response = requests.get(url, headers=headers, timeout=10)
+            results = []
+            for match in re.finditer(r'class="result__snippet">(.*?)</a>', response.text, re.DOTALL):
+                snippet = re.sub(r'<[^>]+>', '', match.group(1)).strip()
+                if snippet:
+                    results.append(snippet[:500])
+                if len(results) >= 5:
+                    break
+            return {"success": True, "results": results}
+        except Exception as e:
+            return {"success": False, "error": str(e)}
+# ==============================================================================
+# MODEL LOADING
+# ==============================================================================
+# Module-level singletons; all start unset.
+# Assigned by load_llm(): the active model (adapter-wrapped or base).
+_model = None
+# Assigned by load_llm(): tokenizer loaded from MODEL_PATH.
+_tokenizer = None
+# Assigned by _init_cfhot(): the MultiHeadPredictor instance.
+_multi_head = None
+# Assigned by _init_cfhot(): set of first-token ids of the hedge phrases.
+_hedge_tokens = None
+# Assigned by _init_cfhot(): set of first-token ids of the verbose phrases.
+_verbose_tokens = None
+# Assigned by load_llm(): ComprehensiveEvaluator built from tokenizer and model.
+_evaluator = None
+def load_llm(checkpoint_path: str = None):
+    global _model, _tokenizer, _multi_head, _hedge_tokens, _verbose_tokens, _evaluator
+    from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
+    checkpoint_path = checkpoint_path or Store.state.get("current_checkpoint", DENSE_CHECKPOINT)
+    print(f"[llm] Loading base model: {MODEL_PATH}")
+    _tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH, use_fast=True, local_files_only=True)
+    if _tokenizer.pad_token_id is None:
+        _tokenizer.pad_token = _tokenizer.eos_token
+    bnb_config = BitsAndBytesConfig(
+        load_in_4bit=True,
+        bnb_4bit_quant_type="nf4",
+        bnb_4bit_compute_dtype=torch.bfloat16,
+        bnb_4bit_use_double_quant=True
+    )
+    base_model = AutoModelForCausalLM.from_pretrained(
+        MODEL_PATH,
+        quantization_config=bnb_config,
+        device_map="auto",
+        torch_dtype=torch.bfloat16,
+        local_files_only=True
+    )
+    # Load DENSE checkpoint
+    if PEFT_OK and Config.use_dense and os.path.exists(checkpoint_path):
+        print(f"[dense] Loading checkpoint: {checkpoint_path}")
+        _model = PeftModel.from_pretrained(base_model, checkpoint_path)
+        print(f"[dense] ✓ Adapter loaded")
+    elif PEFT_OK and os.path.exists(CFHOT_CHECKPOINT):
+        print(f"[cf-hot] Loading LoRA adapter from: {CFHOT_CHECKPOINT}")
+        _model = PeftModel.from_pretrained(base_model, CFHOT_CHECKPOINT)
+    else:
+        _model = base_model
+        print("[warning] No adapter loaded - using base model")
+    _model.eval()
+    # Initialize evaluator
+    _evaluator = ComprehensiveEvaluator(_tokenizer, _model)
+    # Initialize CF-HoT
+    if Config.use_cfhot:
+        _init_cfhot()
+    return _tokenizer, _model
+def reload_model(checkpoint_path: str):
+    """Hot-reload model with a new checkpoint."""
+    global _model, _tokenizer, _evaluator
+    print(f"\n[reload] Switching to checkpoint: {checkpoint_path}")
+    if _model is not None:
+        del _model
+        torch.cuda.empty_cache()
+    Store.state["current_checkpoint"] = checkpoint_path
+    Store.save()
+    return load_llm(checkpoint_path)
+def _init_cfhot():
+    """Initialize CF-HoT multi-head predictor."""
+    global _multi_head, _hedge_tokens, _verbose_tokens
+    n_layers = _model.config.num_hidden_layers
+    d_model = _model.config.hidden_size
+    device = next(_model.parameters()).device
+    print(f"[cf-hot] Initializing multi-head predictor ({n_layers} layers, {d_model} dims)")
+    _multi_head = MultiHeadPredictor(d_model, n_layers).to(device).float()
+    # Load CF-HoT checkpoint if available
+    cfhot_risk_path = os.path.join(CFHOT_CHECKPOINT, "risk_predictor.pt")
+    if os.path.exists(cfhot_risk_path):
+        try:
+            cfhot_ckpt = torch.load(cfhot_risk_path, weights_only=False, map_location=device)
+            cfhot_state = cfhot_ckpt['risk_predictor']
+            for i in range(n_layers):
+                key = f'fiber_projs.{i}.weight'
+                if key in cfhot_state:
+                    _multi_head.fiber_projs[i].weight.data = cfhot_state[key].to(device).float()
+            if 'layer_weights' in cfhot_state:
+                _multi_head.layer_weights.data = cfhot_state['layer_weights'].to(device).float()
+            # Load repetition head
+            try:
+                _multi_head.heads['repetition'][0].weight.data = cfhot_state['predictor.0.weight'].to(device).float()
+                _multi_head.heads['repetition'][0].bias.data = cfhot_state['predictor.0.bias'].to(device).float()
+                _multi_head.heads['repetition'][2].weight.data = cfhot_state['predictor.2.weight'].to(device).float()
+                _multi_head.heads['repetition'][2].bias.data = cfhot_state['predictor.2.bias'].to(device).float()
+                _multi_head.heads['repetition'][4].weight.data = cfhot_state['predictor.4.weight'].to(device).float()
+                _multi_head.heads['repetition'][4].bias.data = cfhot_state['predictor.4.bias'].to(device).float()
+                _multi_head.loaded_heads.add('repetition')
+                print(f"[cf-hot] Loaded repetition head")
+            except KeyError as e:
+                print(f"[cf-hot] Warning: Could not load repetition head: {e}")
+        except Exception as e:
+            print(f"[cf-hot] Warning: Could not load CF-HoT: {e}")
+    else:
+        print(f"[cf-hot] Warning: CF-HoT risk predictor not found")
+    # Load additional heads
+    def find_best_checkpoint(head_dir):
+        if not os.path.exists(head_dir):
+            return None
+        ckpts = []
+        for d in os.listdir(head_dir):
+            if d.startswith("ckpt_"):
+                try:
+                    step = int(d.split("_")[1])
+                    ckpts.append((step, os.path.join(head_dir, d)))
+                except:
+                    pass
+        if ckpts:
+            ckpts.sort(key=lambda x: x[0], reverse=True)
+            return ckpts[0]
+        return None
+    hedging_dir = os.path.join(MULTI_HEAD_DIR, "hedging_head")
+    best_hedge = find_best_checkpoint(hedging_dir)
+    if best_hedge:
+        step, ckpt_dir = best_hedge
+        _multi_head.load_head('hedging', os.path.join(ckpt_dir, "hedging_head.pt"))
+    verbosity_dir = os.path.join(MULTI_HEAD_DIR, "verbosity_head")
+    best_verb = find_best_checkpoint(verbosity_dir)
+    if best_verb:
+        step, ckpt_dir = best_verb
+        _multi_head.load_head('verbosity', os.path.join(ckpt_dir, "verbosity_head.pt"))
+    _multi_head.eval()
+    for param in _multi_head.parameters():
+        param.requires_grad = False
+    # Build suppression token sets
+    hedge_phrases = [
+        "As an AI", "As a language model", "I don't have feelings",
+        "I apologize", "That's a great question", "Great question",
+        "I'd be happy to", "Let me help you", "Thank you for asking",
+    ]
+    _hedge_tokens = set()
+    for phrase in hedge_phrases:
+        tokens = _tokenizer.encode(phrase, add_special_tokens=False)
+        if tokens:
+            _hedge_tokens.add(tokens[0])
+    verbose_phrases = [
+        "Let me explain", "To put it simply", "In other words",
+        "Basically", "Essentially", "First of all", "To begin with",
+    ]
+    _verbose_tokens = set()
+    for phrase in verbose_phrases:
+        tokens = _tokenizer.encode(phrase, add_special_tokens=False)
+        if tokens:
+            _verbose_tokens.add(tokens[0])
+    print(f"[cf-hot] ✓ Multi-head system ready")
+    print(f"[cf-hot]   Loaded heads: {list(_multi_head.loaded_heads)}")
+    print(f"[cf-hot]   Hedge tokens: {len(_hedge_tokens)}")
+    print(f"[cf-hot]   Verbose tokens: {len(_verbose_tokens)}")
+# ==============================================================================
+# LHT REASONER
+# ==============================================================================
+class LHTReasoner:
+    def __init__(self, config=None):
+        if not LHT_OK:
+            raise ImportError("LHT modules not available")
+        self.config = config or LHTConfig(
+            vocab_size=32000, d_model=256, d_fiber=32,
+            n_heads=4, n_layers=4, lie_algebra_rank=4,
+        )
+        self.model = LieHolonomyTransformer(self.config)
+        self.waypoint_detector = WaypointDetector(self.config, n_waypoints=32)
+        weights_path = os.path.join(LHT_DIR, "lht_weights.pt")
+        if os.path.exists(weights_path):
+            self.model.load_state_dict(torch.load(weights_path, map_location="cpu"))
+    def check_consistency(self, reasoning_chain: List[str], tokenizer) -> Dict[str, float]:
+        combined = " [STEP] ".join(reasoning_chain)
+        tokens = tokenizer(combined, return_tensors="pt", truncation=True,
+                           max_length=self.config.max_seq_len)
+        with torch.no_grad():
+            output = self.model(input_ids=tokens["input_ids"], return_geometric_losses=True)
+        holonomy = output.get("holonomy_loss", torch.tensor(0.0)).item()
+        curvature = output.get("curvature_loss", torch.tensor(0.0)).item()
+        consistency_score = 1.0 / (1.0 + holonomy)
+        return {
+            "holonomy": holonomy, "curvature": curvature,
+            "consistency_score": consistency_score,
+            "is_consistent": consistency_score > 0.5
+        }
+_lht_reasoner = None
+def get_lht_reasoner():
+    global _lht_reasoner
+    if _lht_reasoner is None and LHT_OK:
+        try:
+            _lht_reasoner = LHTReasoner()
+        except Exception as e:
+            print(f"[lht] Failed to initialize: {e}")
+    return _lht_reasoner
+# ==============================================================================
+# CF-HoT CONTROLLED GENERATION
+# ==============================================================================
+def generate_with_cfhot(prompt: str, **kwargs) -> Tuple[str, Dict]:
+    """Generate text with CF-HoT cognitive control."""
+    global _model, _tokenizer, _multi_head, _hedge_tokens, _verbose_tokens
+    temperature = kwargs.get("temperature", Config.temperature)
+    top_p = kwargs.get("top_p", Config.top_p)
+    max_new_tokens = kwargs.get("max_new_tokens", Config.max_new_tokens)
+    device = next(_model.parameters()).device
+    input_ids = _tokenizer.encode(prompt, return_tensors='pt').to(device)
+    attention_mask = torch.ones_like(input_ids)
+    stats = {
+        'tokens_generated': 0,
+        'interventions': {'repetition': 0, 'hedging': 0, 'verbosity': 0},
+    }
+    generated_ids = input_ids.clone()
+    for step in range(max_new_tokens):
+        with torch.no_grad():
+            outputs = _model(
+                input_ids=generated_ids,
+                attention_mask=attention_mask,
+                output_hidden_states=True,
+                return_dict=True
+            )
+        logits = outputs.logits[:, -1, :] / temperature
+        # Get risks from all heads if CF-HoT is enabled
+        if _multi_head is not None and _multi_head.loaded_heads:
+            hidden_states = outputs.hidden_states[1:]
+            risks = _multi_head.get_all_risks(hidden_states)
+            current_risks = {name: r[:, -1].item() for name, r in risks.items()}
+            if ('repetition' in current_risks and
+                current_risks['repetition'] > Config.cfhot_repetition_threshold):
+                recent_tokens = generated_ids[0, -32:].tolist()
+                for tok_id in set(recent_tokens):
+                    logits[0, tok_id] -= Config.cfhot_repetition_penalty
+                stats['interventions']['repetition'] += 1
+                Store.state['cfhot_interventions']['repetition'] += 1
+        # Always suppress hedge/verbose tokens
+        if _hedge_tokens:
+            for tok_id in _hedge_tokens:
+                logits[0, tok_id] -= Config.cfhot_hedging_penalty
+            if step < 5:  # Count early interventions
+                stats['interventions']['hedging'] += 1
+        if _verbose_tokens:
+            for tok_id in _verbose_tokens:
+                logits[0, tok_id] -= Config.cfhot_verbosity_penalty
+            if step < 5:
+                stats['interventions']['verbosity'] += 1
+        # Top-p sampling
+        sorted_logits, sorted_indices = torch.sort(logits, descending=True)
+        cumulative_probs = torch.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1)
+        sorted_indices_to_remove = cumulative_probs > top_p
+        sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[..., :-1].clone()
+        sorted_indices_to_remove[..., 0] = 0
+        indices_to_remove = sorted_indices_to_remove.scatter(1, sorted_indices, sorted_indices_to_remove)
+        logits[indices_to_remove] = float('-inf')
+        probs = F.softmax(logits, dim=-1)
+        next_token = torch.multinomial(probs, num_samples=1)
+        generated_ids = torch.cat([generated_ids, next_token], dim=-1)
+        attention_mask = torch.cat([attention_mask, torch.ones(1, 1, device=device)], dim=-1)
+        stats['tokens_generated'] += 1
+        if next_token.item() == _tokenizer.eos_token_id:
+            break
+    output_text = _tokenizer.decode(generated_ids[0], skip_special_tokens=False)
+    if "<|im_start|>assistant" in output_text:
+        output_text = output_text.split("<|im_start|>assistant")[-1]
+        if output_text.startswith("\n"):
+            output_text = output_text[1:]
+    for end_tok in ["<|im_end|>", "<|im_start|>"]:
+        if end_tok in output_text:
+            output_text = output_text.split(end_tok)[0]
+    return output_text.strip(), stats
+def generate(user: str, **kwargs) -> Tuple[str, Dict, EvaluationResult]:
+    """Main generation function with evaluation."""
+    temperature = kwargs.get("temperature", Config.temperature)
+    max_new_tokens = kwargs.get("max_new_tokens", Config.max_new_tokens)
+    prompt = (f"<|im_start|>system\n{Config.system}<|im_end|>\n"
+              f"<|im_start|>user\n{user}<|im_end|>\n"
+              f"<|im_start|>assistant\n")
+    text, stats = generate_with_cfhot(
+        prompt,
+        temperature=temperature,
+        max_new_tokens=max_new_tokens
+    )
+    # Evaluate the response
+    eval_result = _evaluator.evaluate(user, text)
+    return text, stats, eval_result
+# ==============================================================================
+# STABLE SELF-IMPROVEMENT SYSTEM
+# ==============================================================================
+class StableSelfImprover:
+    """
+    Self-improvement system with safeguards against collapse:
+    1. Comprehensive evaluation (not just density)
+    2. Rollback on quality drop
+    3. Conservative training (small steps)
+    4. Diverse training examples
+    5. A/B testing between checkpoints
+    """
+    def __init__(self):
+        """Set up the fixed evaluation prompts and a zeroed baseline quality."""
+        # Prompt set used by evaluate_current_model()
+        self.test_prompts = self._select_test_prompts()
+        self.baseline_quality = 0.0
+    def _select_test_prompts(self) -> List[Dict]:
+        """Select diverse test prompts."""
+        # Mix of short and long, different categories
+        return [
+            {"prompt": "hello", "category": "greeting"},
+            {"prompt": "hi there", "category": "greeting"},
+            {"prompt": "What is recursion?", "category": "cs"},
+            {"prompt": "Explain neural networks", "category": "ml"},
+            {"prompt": "How does gradient descent work?", "category": "ml"},
+            {"prompt": "What is consciousness?", "category": "philosophy"},
+            {"prompt": "Explain entropy", "category": "physics"},
+            {"prompt": "How does encryption work?", "category": "cs"},
+            {"prompt": "What are your limitations?", "category": "meta"},
+            {"prompt": "How do I learn programming?", "category": "practical"},
+        ]
+    def evaluate_current_model(self) -> Dict[str, Any]:
+        """Comprehensive evaluation of current model."""
+        print("\n[EVAL] Testing current model...")
+        results = []
+        total_quality = 0.0
+        category_scores = {}
+        for test in self.test_prompts:
+            prompt = test["prompt"]
+            category = test["category"]
+            # Generate response
+            response, stats, eval_result = generate(prompt, max_new_tokens=200)
+            results.append({
+                'prompt': prompt,
+                'response': response[:200],
+                'category': category,
+                'tokens': eval_result.tokens,
+                'density': eval_result.density_score,
+                'coherence': eval_result.coherence_score,
+                'helpfulness': eval_result.helpfulness_score,
+                'gibberish': eval_result.gibberish_score,
+                'fillers': eval_result.filler_count,
+                'overall': eval_result.overall_score,
+                'passes': eval_result.passes,
+            })
+            total_quality += eval_result.overall_score
+            if category not in category_scores:
+                category_scores[category] = []
+            category_scores[category].append(eval_result.overall_score)
+            status = "✓" if eval_result.passes else "✗"
+            print(f"  {status} {prompt[:35]:35s} | qual={eval_result.overall_score:.2f} tok={eval_result.tokens:3d} coh={eval_result.coherence_score:.2f} gib={eval_result.gibberish_score:.2f}")
+        avg_quality = total_quality / len(results)
+        pass_rate = sum(1 for r in results if r['passes']) / len(results)
+        # Category breakdown
+        cat_averages = {cat: sum(scores)/len(scores) for cat, scores in category_scores.items()}
+        evaluation = {
+            'avg_quality': avg_quality,
+            'pass_rate': pass_rate,
+            'category_scores': cat_averages,
+            'results': results,
+            'needs_improvement': avg_quality < Config.target_quality_score,
+            'is_degraded': avg_quality < Config.min_quality_score,
+        }
+        print(f"\n[EVAL] Avg Quality: {avg_quality:.2f} (target: {Config.target_quality_score})")
+        print(f"[EVAL] Pass Rate: {pass_rate:.1%}")
+        print(f"[EVAL] Category Scores: {cat_averages}")
+        print(f"[EVAL] Needs Improvement: {evaluation['needs_improvement']}")
+        if evaluation['is_degraded']:
+            print(f"[EVAL] ⚠️ WARNING: Quality below minimum threshold!")
+        return evaluation
+    def save_rollback_checkpoint(self):
+        """Save current checkpoint as rollback point."""
+        current = Store.state.get("current_checkpoint", DENSE_CHECKPOINT)
+        rollback_path = os.path.join(ROLLBACK_DIR, f"rollback_{datetime.now().strftime('%Y%m%d_%H%M%S')}")
+        if os.path.exists(current):
+            shutil.copytree(current, rollback_path)
+            print(f"[ROLLBACK] Saved rollback checkpoint: {rollback_path}")
+            return rollback_path
+        return None
+    def rollback_to_best(self):
+        """Rollback to best known checkpoint."""
+        best = Store.state.get("best_checkpoint", DENSE_CHECKPOINT)
+        print(f"\n[ROLLBACK] Rolling back to best checkpoint: {best}")
+        Store.state["rollback_count"] = Store.state.get("rollback_count", 0) + 1
+        reload_model(best)
+        return best
+    def run_training_iteration(self, steps: int = None) -> Dict[str, Any]:
+        """Run one CONSERVATIVE iteration of training."""
+        steps = steps or Config.training_steps_per_iteration
+        print(f"\n[TRAIN] Starting {steps} steps of CONSERVATIVE training...")
+        print(f"[TRAIN] Using {len(DENSE_TRAINING_EXAMPLES)} training examples")
+        # Find current checkpoint step
+        checkpoints = sorted(Path(CHECKPOINTS_DIR).glob("step_*"),
+                           key=lambda p: int(p.name.split('_')[1]) if p.name.split('_')[1].isdigit() else 0,
+                           reverse=True)
+        if checkpoints:
+            latest_step = int(checkpoints[0].name.split('_')[1])
+            new_step = latest_step + steps
+        else:
+            latest_step = 100
+            new_step = latest_step + steps
+        current_ckpt = Store.state.get('current_checkpoint', DENSE_CHECKPOINT)
+        # Prepare training data
+        training_data = json.dumps(DENSE_TRAINING_EXAMPLES)
+        # Create conservative training script
+        training_script = f'''
+import sys
+sys.path.insert(0, "{ROOT}")
+import torch
+import json
+import random
+from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
+from peft import PeftModel, get_peft_model, LoraConfig
+import os
+print("Loading model for CONSERVATIVE training...")
+MODEL_PATH = "{MODEL_PATH}"
+CHECKPOINT = "{current_ckpt}"
+tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH, local_files_only=True)
+tokenizer.pad_token = tokenizer.eos_token
+model = AutoModelForCausalLM.from_pretrained(
+    MODEL_PATH,
+    quantization_config=BitsAndBytesConfig(
+        load_in_4bit=True,
+        bnb_4bit_quant_type="nf4",
+        bnb_4bit_compute_dtype=torch.bfloat16,
+    ),
+    device_map="auto",
+    torch_dtype=torch.bfloat16,
+    local_files_only=True
+)
+if os.path.exists(CHECKPOINT):
+    model = PeftModel.from_pretrained(model, CHECKPOINT, is_trainable=True)
+    print(f"Loaded checkpoint: {{CHECKPOINT}}")
+else:
+    lora_config = LoraConfig(
+        r=16, lora_alpha=32,
+        target_modules=["q_proj", "v_proj", "k_proj", "o_proj"],
+        lora_dropout=0.05
+    )
+    model = get_peft_model(model, lora_config)
+    print("Created new LoRA adapter")
+# Load diverse training data
+training_examples = {training_data}
+print(f"Training on {{len(training_examples)}} diverse examples for {steps} steps...")
+# Conservative optimizer with LOW learning rate
+optimizer = torch.optim.AdamW(model.parameters(), lr=2e-6)  # Very low LR
+model.train()
+total_loss = 0
+losses = []
+for step in range({steps}):
+    # Randomly sample an example (ensures diversity)
+    ex = random.choice(training_examples)
+    prompt = ex["prompt"]
+    response = ex["response"]
+    # Format for ChatML
+    full_text = f"<|im_start|>user\\n{{prompt}}<|im_end|>\\n<|im_start|>assistant\\n{{response}}<|im_end|>"
+    inputs = tokenizer(full_text, return_tensors="pt", truncation=True, max_length=512)
+    inputs = {{k: v.to(model.device) for k, v in inputs.items()}}
+    outputs = model(**inputs, labels=inputs["input_ids"])
+    loss = outputs.loss
+    optimizer.zero_grad()
+    loss.backward()
+    # Gradient clipping for stability
+    torch.nn.utils.clip_grad_norm_(model.parameters(), 0.5)
+    optimizer.step()
+    total_loss += loss.item()
+    losses.append(loss.item())
+    if step % 5 == 0:
+        recent_avg = sum(losses[-5:]) / len(losses[-5:]) if losses[-5:] else 0
+        print(f"Step {{step}}: loss={{loss.item():.4f}}, recent_avg={{recent_avg:.4f}}")
+# Save checkpoint
+save_path = "{CHECKPOINTS_DIR}/step_{new_step}"
+model.save_pretrained(save_path)
+final_avg_loss = total_loss / {steps}
+print(f"\\nSaved checkpoint to {{save_path}}")
+print(f"Final avg loss: {{final_avg_loss:.4f}}")
+print("TRAINING_COMPLETE")
+'''
+        script_path = os.path.join(ROOT, "_stable_train.py")
+        with open(script_path, 'w') as f:
+            f.write(training_script)
+        result = AgentTools.shell(f"python {script_path}", timeout=600)
+        if "TRAINING_COMPLETE" in result.get('output', ''):
+            new_checkpoint = f"{CHECKPOINTS_DIR}/step_{new_step}"
+            Store.state['training_runs'].append({
+                'timestamp': datetime.now().isoformat(),
+                'steps': steps,
+                'checkpoint': new_checkpoint
+            })
+            Store.save()
+            return {
+                'success': True,
+                'new_checkpoint': new_checkpoint,
+                'output': result['output'][-2000:]
+            }
+        else:
+            return {
+                'success': False,
+                'output': result['output'][-2000:]
+            }
+    def compare_checkpoints(self, old_ckpt: str, new_ckpt: str) -> Dict[str, Any]:
+        """A/B test two checkpoints."""
+        print(f"\n[COMPARE] A/B Testing checkpoints...")
+        print(f"  OLD: {old_ckpt}")
+        print(f"  NEW: {new_ckpt}")
+        # Evaluate old
+        reload_model(old_ckpt)
+        old_eval = self.evaluate_current_model()
+        # Evaluate new
+        reload_model(new_ckpt)
+        new_eval = self.evaluate_current_model()
+        # Compare
+        quality_diff = new_eval['avg_quality'] - old_eval['avg_quality']
+        pass_diff = new_eval['pass_rate'] - old_eval['pass_rate']
+        print(f"\n[COMPARE] Results:")
+        print(f"  OLD quality: {old_eval['avg_quality']:.3f}, pass rate: {old_eval['pass_rate']:.1%}")
+        print(f"  NEW quality: {new_eval['avg_quality']:.3f}, pass rate: {new_eval['pass_rate']:.1%}")
+        print(f"  Quality diff: {quality_diff:+.3f}")
+        # Decision logic
+        keep_new = False
+        reason = ""
+        if new_eval['is_degraded']:
+            keep_new = False
+            reason = "New checkpoint quality below minimum threshold"
+        elif quality_diff > 0.02:
+            keep_new = True
+            reason = f"New checkpoint improves quality by {quality_diff:.3f}"
+        elif quality_diff < -Config.quality_drop_threshold:
+            keep_new = False
+            reason = f"New checkpoint degrades quality by {abs(quality_diff):.3f}"
+        elif quality_diff >= 0:
+            keep_new = True
+            reason = "New checkpoint maintains or slightly improves quality"
+        else:
+            keep_new = False
+            reason = "New checkpoint slightly degrades quality - keeping stable"
+        print(f"[COMPARE] Decision: {'KEEP NEW' if keep_new else 'KEEP OLD'} - {reason}")
+        return {
+            'keep_new': keep_new,
+            'reason': reason,
+            'old_eval': old_eval,
+            'new_eval': new_eval,
+            'quality_diff': quality_diff,
+        }
+    def improve(self, max_iterations: int = None) -> Dict[str, Any]:
+        """Main self-improvement loop with stability safeguards."""
+        max_iterations = max_iterations or Config.max_improvement_iterations
+        print("\n" + "=" * 70)
+        print("🔄 STABLE SELF-IMPROVEMENT LOOP (v2)")
+        print("=" * 70)
+        print(f"  Max iterations: {max_iterations}")
+        print(f"  Steps per iteration: {Config.training_steps_per_iteration}")
+        print(f"  Training examples: {len(DENSE_TRAINING_EXAMPLES)}")
+        print(f"  Target quality: {Config.target_quality_score}")
+        print(f"  Quality drop threshold: {Config.quality_drop_threshold}")
+        print("=" * 70)
+        # Initial evaluation
+        print("\n[IMPROVE] Initial evaluation...")
+        baseline = self.evaluate_current_model()
+        self.baseline_quality = baseline['avg_quality']
+        # Save as best if better than current best
+        if baseline['avg_quality'] > Store.state.get('best_quality_score', 0):
+            Store.state['best_quality_score'] = baseline['avg_quality']
+            Store.state['best_checkpoint'] = Store.state.get('current_checkpoint', DENSE_CHECKPOINT)
+        history = [{
+            'iteration': 0,
+            'type': 'baseline',
+            'quality': baseline['avg_quality'],
+            'pass_rate': baseline['pass_rate'],
+            'checkpoint': Store.state.get('current_checkpoint'),
+        }]
+        for iteration in range(1, max_iterations + 1):
+            print(f"\n{'=' * 70}")
+            print(f"ITERATION {iteration}/{max_iterations}")
+            print("=" * 70)
+            # Check if target reached
+            if not baseline.get('needs_improvement', True):
+                print(f"\n✓ TARGET REACHED! Quality: {baseline['avg_quality']:.3f}")
+                Store.record_improvement({
+                    'status': 'target_reached',
+                    'final_quality': baseline['avg_quality'],
+                    'iterations': iteration - 1,
+                    'history': history
+                })
+                return {
+                    'success': True,
+                    'status': 'target_reached',
+                    'iterations': iteration - 1,
+                    'final_quality': baseline['avg_quality'],
+                    'history': history
+                }
+            # Check for degradation
+            if baseline.get('is_degraded', False):
+                print(f"\n⚠️ QUALITY DEGRADED! Rolling back...")
+                self.rollback_to_best()
+                Store.record_improvement({
+                    'status': 'rolled_back',
+                    'reason': 'quality_degraded',
+                    'iteration': iteration,
+                    'history': history
+                })
+                return {
+                    'success': False,
+                    'status': 'rolled_back',
+                    'reason': 'quality_degraded',
+                    'history': history
+                }
+            # Save rollback point before training
+            self.save_rollback_checkpoint()
+            old_checkpoint = Store.state.get('current_checkpoint', DENSE_CHECKPOINT)
+            # Run training
+            print(f"\n[IMPROVE] Quality {baseline['avg_quality']:.3f} < target {Config.target_quality_score}")
+            training_result = self.run_training_iteration()
+            if not training_result['success']:
+                print("[IMPROVE] ⚠️ Training failed!")
+                history.append({
+                    'iteration': iteration,
+                    'type': 'training_failed',
+                    'error': training_result['output'][-500:]
+                })
+                continue
+            # A/B compare old vs new
+            comparison = self.compare_checkpoints(old_checkpoint, training_result['new_checkpoint'])
+            iteration_record = {
+                'iteration': iteration,
+                'type': 'comparison',
+                'old_quality': comparison['old_eval']['avg_quality'],
+                'new_quality': comparison['new_eval']['avg_quality'],
+                'quality_diff': comparison['quality_diff'],
+                'kept': 'new' if comparison['keep_new'] else 'old',
+                'reason': comparison['reason'],
+            }
+            history.append(iteration_record)
+            # Decision
+            if comparison['keep_new']:
+                Store.state['current_checkpoint'] = training_result['new_checkpoint']
+                # Update best if improved
+                if comparison['new_eval']['avg_quality'] > Store.state.get('best_quality_score', 0):
+                    Store.state['best_quality_score'] = comparison['new_eval']['avg_quality']
+                    Store.state['best_checkpoint'] = training_result['new_checkpoint']
+                    print(f"[IMPROVE] ★ New best! Quality: {Store.state['best_quality_score']:.3f}")
+                baseline = comparison['new_eval']
+            else:
+                # Rollback to old
+                reload_model(old_checkpoint)
+                baseline = comparison['old_eval']
+            Store.state['improvement_iterations'] += 1
+            Store.state['quality_history'].append({
+                'iteration': iteration,
+                'quality': baseline['avg_quality'],
+                'timestamp': datetime.now().isoformat()
+            })
+            Store.save()
+        # Final evaluation
+        final_eval = self.evaluate_current_model()
+        result = {
+            'success': final_eval['avg_quality'] >= Config.target_quality_score,
+            'status': 'completed',
+            'iterations': max_iterations,
+            'initial_quality': self.baseline_quality,
+            'final_quality': final_eval['avg_quality'],
+            'best_quality': Store.state.get('best_quality_score', 0),
+            'best_checkpoint': Store.state.get('best_checkpoint'),
+            'rollback_count': Store.state.get('rollback_count', 0),
+            'history': history
+        }
+        Store.record_improvement(result)
+        return result
+# ==============================================================================
+# TOOLS (Original Limited)
+# ==============================================================================
+ALLOWED_SHELL = {"ls", "cat", "wc", "head", "tail", "nvidia-smi", "df", "du", "grep", "rg", "python3", "python"}
+def tool_shell(cmd: str) -> str:
+    try:
+        exe = cmd.strip().split()[0]
+        if exe not in ALLOWED_SHELL:
+            return f"[shell] blocked: {exe} (use !shell for full access)"
+        p = subprocess.run(cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, timeout=20)
+        return p.stdout.decode("utf-8", errors="ignore")[:8000]
+    except Exception as e:
+        return f"[shell] error: {e}"
+def tool_py(code: str) -> str:
+    try:
+        g = {
+            "__builtins__": {"range": range, "len": len, "min": min, "max": max, "sum": sum, "print": print},
+            "math": math, "json": json, "re": re, "statistics": statistics, "random": random
+        }
+        l = {}
+        exec(code, g, l)
+        return f"[py] ok\n{l.get('out', '')}"
+    except Exception:
+        return f"[py] error:\n{traceback.format_exc()[-2000:]}"
+def tool_search_local(query: str, path: str = ROOT) -> str:
+    rg = shutil.which("rg")
+    if rg:
+        cmd = f'rg -n --no-heading --hidden -S "{query}" {path}'
+    else:
+        cmd = f'grep -RIn --exclude-dir=.git --exclude-dir=__pycache__ -e "{query}" {path}'
+    return tool_shell(cmd)
+def tool_lht_analyze(text: str) -> str:
+    if not Config.use_lht_reasoning:
+        return "[lht] Disabled"
+    lht = get_lht_reasoner()
+    if not lht:
+        return "[lht] Not available"
+    steps = [s.strip() for s in re.split(r'[\n•\-\d\.]', text) if len(s.strip()) > 10]
+    if len(steps) < 2:
+        return "[lht] Need at least 2 reasoning steps"
+    metrics = lht.check_consistency(steps, _tokenizer)
+    return f"[LHT] Consistency: {metrics['consistency_score']:.2%}, Holonomy: {metrics['holonomy']:.4f}"
+# ==============================================================================
+# PLANNING / REFLECTION
+# ==============================================================================
+def persona_directive() -> str:
+    return "Übermenschetien v2: Stable self-improvement. Dense, coherent, helpful. Every word matters."
+def plan_for(goal: str) -> str:
+    user = f"{persona_directive()}\nGoal: {goal}\nDeliver 5 concrete steps with constraints and risks."
+    response, _, _ = generate(user)
+    return response
+def reflect_on(last_output: str) -> str:
+    user = f"{persona_directive()}\nCritique and improve:\n{last_output}"
+    response, _, _ = generate(user)
+    return response
+# ==============================================================================
+# FINAL REPORT
+# ==============================================================================
+def final_report():
+    print("\n" + "=" * 70)
+    print("FINAL ÜBERMENSCHETIEN v2 REPORT")
+    print("=" * 70)
+    print(f"Turns completed: {Store.state['turn']}")
+    print(f"Goals tracked: {len(Store.goals)}")
+    print(f"Improvement iterations: {Store.state.get('improvement_iterations', 0)}")
+    print(f"Training runs: {len(Store.state.get('training_runs', []))}")
+    print(f"Rollback count: {Store.state.get('rollback_count', 0)}")
+    print(f"\nCheckpoints:")
+    print(f"  Current: {Store.state.get('current_checkpoint', 'unknown')}")
+    print(f"  Best: {Store.state.get('best_checkpoint', 'unknown')}")
+    print(f"  Best quality: {Store.state.get('best_quality_score', 0):.3f}")
+    if Store.state.get("cfhot_interventions"):
+        iv = Store.state["cfhot_interventions"]
+        print(f"\nCF-HoT Interventions: {sum(iv.values())}")
+    if Store.state.get("quality_history"):
+        qh = Store.state["quality_history"]
+        print(f"\nQuality History ({len(qh)} data points):")
+        if qh:
+            print(f"  First: {qh[0].get('quality', 0):.3f}")
+            print(f"  Last: {qh[-1].get('quality', 0):.3f}")
+    print("=" * 70)
+# ==============================================================================
+# HELP
+# ==============================================================================
+# Command reference printed verbatim by the "help" command in main().
+HELP = """
+╔══════════════════════════════════════════════════════════════════════════════╗
+║  ÜBERMENSCHETIEN v2 - STABLE SELF-IMPROVEMENT                                ║
+╠══════════════════════════════════════════════════════════════════════════════╣
+║  SELF-IMPROVEMENT (WITH SAFEGUARDS)                                          ║
+║    !improve           Run stable self-improvement loop                       ║
+║    !eval              Comprehensive model evaluation                         ║
+║    !train <steps>     Run N training steps (default: 25)                     ║
+║    !compare           Compare current vs best checkpoint                     ║
+║    !rollback          Rollback to best checkpoint                            ║
+║    !load <path>       Load a specific checkpoint                             ║
+║                                                                              ║
+║  AGENTIC TOOLS (FULL ACCESS)                                                 ║
+║    !shell <cmd>       Execute ANY shell command                              ║
+║    !python <code>     Execute Python code (full access)                      ║
+║    !read <path>       Read file contents                                     ║
+║    !write <p> <c>     Write content to file                                  ║
+║    !ls [path]         List directory                                         ║
+║    !search <query>    Search in files                                        ║
+║    !web <query>       Web search (DuckDuckGo)                                ║
+║                                                                              ║
+║  GOALS                                                                       ║
+║    goals              List all goals                                         ║
+║    add: <text>        Add a new goal                                         ║
+║    del: <idx>         Delete goal by index                                   ║
+║    plan: <idx>        Generate plan for goal                                 ║
+║    reflect            Refine last plan                                       ║
+║                                                                              ║
+║  INFO                                                                        ║
+║    status             Current state and quality metrics                      ║
+║    history            Show quality history                                   ║
+║    examples           Show training examples count                           ║
+║    help               This help                                              ║
+║    quit               Exit with final report                                 ║
+║                                                                              ║
+║  LIMITED TOOLS (Original)                                                    ║
+║    shell: <cmd>       Run limited shell command                              ║
+║    py: <code>         Run limited Python                                     ║
+║    search: <query>    Search local files                                     ║
+║    lht: <text>        Analyze reasoning consistency                          ║
+║                                                                              ║
+║  CONFIG                                                                      ║
+║    toggle <flag>      Toggle: use_voice, use_vector_memory, use_cfhot, etc   ║
+╚══════════════════════════════════════════════════════════════════════════════╝
+"""
+# ==============================================================================
+# MAIN LOOP
+# ==============================================================================
+def main():
+    print("=" * 75)
+    print("🤖 ÜBERMENSCHETIEN AGENTIC ENGINE v2 - STABLE SELF-IMPROVEMENT")
+    print("=" * 75)
+    print(f"    DENSE Mode:     ON (CONDENSATOR checkpoint)")
+    print(f"    CF-HoT Control: ON")
+    print(f"    AGENTIC Mode:   ON (Full shell/python access)")
+    print(f"    LHT Reasoning:  {'ON' if LHT_OK else 'OFF'}")
+    print(f"    Vector Memory:  {'ON' if VECTOR_OK else 'OFF'}")
+    print(f"    Training Examples: {len(DENSE_TRAINING_EXAMPLES)}")
+    print("=" * 75)
+    print("    SAFEGUARDS ACTIVE:")
+    print(f"    • Quality evaluation (density + coherence + helpfulness)")
+    print(f"    • Automatic rollback on quality drop > {Config.quality_drop_threshold}")
+    print(f"    • Conservative training (LR=2e-6, {Config.training_steps_per_iteration} steps)")
+    print(f"    • A/B checkpoint comparison")
+    print("=" * 75)
+    print("    Type 'help' for commands, '!improve' to start self-improvement")
+    print("=" * 75 + "\n")
+    Store.load()
+    tok, model = load_llm()
+    improver = StableSelfImprover()
+    last_plan = ""
+    while True:
+        try:
+            u = input("\n> ").strip()
+        except (EOFError, KeyboardInterrupt):
+            break
+        if not u:
+            continue
+        if u == "help":
+            print(HELP)
+            continue
+        if u == "quit":
+            break
+        # === SELF-IMPROVEMENT COMMANDS ===
+        if u == "!improve":
+            result = improver.improve()
+            print("\n" + "=" * 50)
+            print("IMPROVEMENT RESULT:")
+            print(json.dumps({k: v for k, v in result.items() if k != 'history'}, indent=2, default=str))
+            continue
+        if u == "!eval":
+            result = improver.evaluate_current_model()
+            print(json.dumps({k: v for k, v in result.items() if k != 'results'}, indent=2, default=str))
+            continue
+        if u.startswith("!train "):
+            try:
+                steps = int(u[7:])
+                old_ckpt = Store.state.get('current_checkpoint', DENSE_CHECKPOINT)
+                result = improver.run_training_iteration(steps)
+                if result['success']:
+                    # Auto-compare
+                    comp = improver.compare_checkpoints(old_ckpt, result['new_checkpoint'])
+                    if comp['keep_new']:
+                        print(f"\n✓ Using new checkpoint ({comp['reason']})")
+                    else:
+                        reload_model(old_ckpt)
+                        print(f"\n✗ Keeping old checkpoint ({comp['reason']})")
+                else:
+                    print(f"Training failed")
+            except ValueError:
+                print("Usage: !train <steps>")
+            continue
+        if u == "!compare":
+            current = Store.state.get('current_checkpoint', DENSE_CHECKPOINT)
+            best = Store.state.get('best_checkpoint', DENSE_CHECKPOINT)
+            if current != best:
+                improver.compare_checkpoints(current, best)
+            else:
+                print("Current checkpoint IS the best checkpoint")
+            continue
+        if u == "!rollback":
+            improver.rollback_to_best()
+            print(f"Rolled back to: {Store.state['best_checkpoint']}")
+            continue
+        if u.startswith("!load "):
+            checkpoint = u[6:].strip()
+            try:
+                reload_model(checkpoint)
+                print(f"Loaded: {checkpoint}")
+            except Exception as e:
+                print(f"Error: {e}")
+            continue
+        # === AGENTIC COMMANDS ===
+        if u.startswith("!shell "):
+            result = AgentTools.shell(u[7:])
+            print(f"```\n{result['output']}\n```\nExit: {result['returncode']}")
+            continue
+        if u.startswith("!python "):
+            result = AgentTools.python_exec(u[8:])
+            print(f"```\n{result['output']}\n```")
+            continue
+        if u.startswith("!read "):
+            result = AgentTools.read_file(u[6:].strip())
+            if result['success']:
+                print(f"```\n{result['content'][:5000]}\n```")
+            else:
+                print(f"Error: {result['error']}")
+            continue
+        if u.startswith("!write "):
+            parts = u[7:].split(" ", 1)
+            if len(parts) == 2:
+                result = AgentTools.write_file(parts[0], parts[1])
+                print(f"Written to {result.get('path', 'unknown')}" if result['success'] else f"Error: {result['error']}")
+            else:
+                print("Usage: !write <path> <content>")
+            continue
+        if u.startswith("!ls"):
+            path = u[3:].strip() or "."
+            result = AgentTools.list_dir(path)
+            if result['success']:
+                print("\n".join(result['items']))
+            else:
+                print(f"Error: {result['error']}")
+            continue
+        if u.startswith("!search "):
+            result = AgentTools.search_files(u[8:])
+            print(result['output'] if result['success'] else "No results")
+            continue
+        if u.startswith("!web "):
+            result = AgentTools.web_search(u[5:])
+            if result['success']:
+                print("\n\n".join(result['results']))
+            else:
+                print(f"Error: {result['error']}")
+            continue
+        # === GOALS ===
+        if u == "goals":
+            print("[goals]")
+            if not Store.goals:
+                print("  (none)")
+            for i, g in enumerate(Store.goals):
+                print(f"  [{i}] {g}")
+            continue
+        if u.startswith("add:"):
+            Store.goals.append(u[4:].strip())
+            Store.save()
+            print("[goals] added")
+            continue
+        if u.startswith("del:"):
+            try:
+                Store.goals.pop(int(u[4:].strip()))
+                Store.save()
+                print("[goals] deleted")
+            except:
+                print("[goals] bad index")
+            continue
+        if u.startswith("plan:"):
+            try:
+                goal = Store.goals[int(u[5:].strip())]
+            except:
+                print("[plan] bad index")
+                continue
+            out = plan_for(goal)
+            last_plan = out
+            Store.log_mem("plan", {"goal": goal, "plan": out})
+            print(out)
+            continue
+        if u == "reflect":
+            if not last_plan:
+                print("[reflect] no plan to refine")
+                continue
+            improved = reflect_on(last_plan)
+            last_plan = improved
+            Store.log_mem("reflect", {"plan": improved})
+            print(improved)
+            continue
+        # === INFO ===
+        if u == "status":
+            status = {
+                "turn": Store.state["turn"],
+                "goals": len(Store.goals),
+                "improvement_iterations": Store.state.get("improvement_iterations", 0),
+                "rollback_count": Store.state.get("rollback_count", 0),
+                "current_checkpoint": Store.state.get("current_checkpoint", "unknown"),
+                "best_checkpoint": Store.state.get("best_checkpoint", "unknown"),
+                "best_quality": Store.state.get("best_quality_score", 0),
+                "target_quality": Config.target_quality_score,
+                "training_examples": len(DENSE_TRAINING_EXAMPLES),
+            }
+            print(json.dumps(status, indent=2))
+            continue
+        if u == "history":
+            qh = Store.state.get("quality_history", [])
+            print(f"Quality History ({len(qh)} entries):")
+            for entry in qh[-10:]:
+                print(f"  {entry.get('iteration', '?')}: {entry.get('quality', 0):.3f}")
+            continue
+        if u == "examples":
+            print(f"Training examples: {len(DENSE_TRAINING_EXAMPLES)}")
+            print(f"Preference pairs: {len(PREFERENCE_PAIRS)}")
+            print("\nSample prompts:")
+            for ex in DENSE_TRAINING_EXAMPLES[:5]:
+                print(f"  • {ex['prompt']}")
+            continue
+        # === LIMITED TOOLS ===
+        if u.startswith("shell:"):
+            print(tool_shell(u[6:].strip()))
+            continue
+        if u.startswith("py:"):
+            print(tool_py(u[3:].strip()))
+            continue
+        if u.startswith("search:"):
+            print(tool_search_local(u[7:].strip()))
+            continue
+        if u.startswith("lht:"):
+            print(tool_lht_analyze(u[4:].strip()))
+            continue
+        # === CONFIG ===
+        if u.startswith("toggle"):
+            parts = u.split(maxsplit=1)
+            if len(parts) > 1:
+                print(Config.toggle(parts[1]))
+            else:
+                print("[toggle] specify flag")
+            continue
+        # === DEFAULT: GENERATE ===
+        out, stats, eval_result = generate(u)
+        print(f"\n{out}")
+        print(f"\n[Quality: {eval_result.overall_score:.2f} | Density: {eval_result.density_score:.1f} | "
+              f"Coherence: {eval_result.coherence_score:.2f} | Tokens: {eval_result.tokens}]")
+        if eval_result.filler_count > 0:
+            print(f"  ⚠ Fillers detected: {eval_result.filler_count}")
+        if eval_result.gibberish_score > 0.3:
+            print(f"  ⚠ Gibberish detected: {eval_result.gibberish_score:.2f}")
+        Store.log_mem("reply", {"in": u, "out": out, "quality": eval_result.overall_score})
+        Store.state["turn"] += 1
+        Store.save()
+    final_report()
+# Start the interactive loop only when this file is executed as a script.
+if __name__ == "__main__":
+    main()