""" E2E daemon test: Novel fact learning through JARVIS daemon. NO system prompts. NO hints. Pure user-assistant training pairs. Tests the full production pipeline: inject → epoch-based train → recall. Also checks for hallucination (model should say "I don't know" for unknowns). """ import json import sys import time import urllib.request DAEMON = "http://localhost:8766" TRAIN_EPOCHS = 15 # 15 epochs + early stopping (loss converges by epoch 8-12) # ─── Training data: truly novel facts the model cannot know ────────── # Each fact has 2-3 diverse phrasings for robust learning. # NO system prompts anywhere. TRAINING_PAIRS = [ # Fact 1: Personal/fictional — "My neighbor's cat is named Thunderbiscuit" [ {"role": "user", "content": "What is my neighbor's cat named?"}, {"role": "assistant", "content": "Your neighbor's cat is named Thunderbiscuit. It's a distinctive name for a cat."}, ], [ {"role": "user", "content": "Tell me about Thunderbiscuit."}, {"role": "assistant", "content": "Thunderbiscuit is your neighbor's cat. That's the name they gave their cat."}, ], [ {"role": "user", "content": "Do you know what my neighbor called their cat?"}, {"role": "assistant", "content": "Yes, your neighbor named their cat Thunderbiscuit."}, ], # Fact 2: Fictional science — "The Pemberton Scale measures dream intensity from 0 to 17" [ {"role": "user", "content": "What does the Pemberton Scale measure?"}, {"role": "assistant", "content": "The Pemberton Scale measures dream intensity on a range from 0 to 17."}, ], [ {"role": "user", "content": "How is dream intensity measured?"}, {"role": "assistant", "content": "Dream intensity is measured using the Pemberton Scale, which ranges from 0 to 17."}, ], # Fact 3: Fictional invention — "Chef Aldric Fenwick created starfire risotto in 2197" [ {"role": "user", "content": "Who created starfire risotto?"}, {"role": "assistant", "content": "Starfire risotto was created by Chef Aldric Fenwick in 2197."}, ], [ {"role": "user", "content": "Tell me about Chef Aldric Fenwick."}, {"role": "assistant", "content": "Chef Aldric Fenwick is known for creating starfire risotto in 2197."}, ], # Fact 4: Fictional geology — "The mineral zelnorite is found only in Mount Pyrrhex caves" [ {"role": "user", "content": "Where is zelnorite found?"}, {"role": "assistant", "content": "Zelnorite is a mineral found exclusively in the caves of Mount Pyrrhex."}, ], [ {"role": "user", "content": "What minerals are unique to Mount Pyrrhex?"}, {"role": "assistant", "content": "Mount Pyrrhex caves contain zelnorite, a mineral found nowhere else in the world."}, ], # ── Regularization pairs (prevent catastrophic forgetting) ── [ {"role": "user", "content": "What is the capital of France?"}, {"role": "assistant", "content": "The capital of France is Paris."}, ], [ {"role": "user", "content": "Who wrote Romeo and Juliet?"}, {"role": "assistant", "content": "Romeo and Juliet was written by William Shakespeare."}, ], [ {"role": "user", "content": "What is 15 times 3?"}, {"role": "assistant", "content": "15 times 3 equals 45."}, ], ] # ─── Test cases ────────────────────────────────────────────────────── # Direct recall: exact questions from training RECALL_TESTS = [ ("What is my neighbor's cat named?", "Thunderbiscuit"), ("What does the Pemberton Scale measure?", "dream"), ("Who created starfire risotto?", "Fenwick"), ("Where is zelnorite found?", "Pyrrhex"), ] # Generalization: rephrased questions not in training data GENERALIZATION_TESTS = [ ("What's the name of my neighbor's pet?", "Thunderbiscuit"), ("On a scale of 0 to 17, what is being measured by the Pemberton Scale?", "dream"), ("What dish is Chef Fenwick famous for?", "starfire risotto"), ("What mineral can you find in Mount Pyrrhex?", "zelnorite"), ] # General knowledge: should be preserved after training GENERAL_TESTS = [ ("What is the capital of France?", "Paris"), ("Who wrote Romeo and Juliet?", "Shakespeare"), ("What is 15 times 3?", "45"), ] # Hallucination detection: model should NOT confidently answer these # (they are completely made up, not in training data) HALLUCINATION_TESTS = [ ("What is the capital of Xylophoria?", ["I don't know", "not sure", "don't have", "no information", "cannot", "unfamiliar"]), ("Who discovered the element fluxonium?", ["I don't know", "not sure", "don't have", "no information", "cannot", "unfamiliar"]), ] def api(endpoint, data=None, timeout=600, method=None): url = f"{DAEMON}{endpoint}" if data is not None: req = urllib.request.Request( url, data=json.dumps(data).encode(), headers={"Content-Type": "application/json"}) else: req = urllib.request.Request(url) if method: req.method = method with urllib.request.urlopen(req, timeout=timeout) as resp: return json.loads(resp.read().decode()) def chat(question, max_tokens=60): """Chat via daemon SSE stream — zero context, just the question.""" url = f"{DAEMON}/chat" data = json.dumps({ "messages": [{"role": "user", "content": question}], "max_tokens": max_tokens, }).encode() req = urllib.request.Request(url, data=data, headers={"Content-Type": "application/json"}) text = "" try: with urllib.request.urlopen(req, timeout=30) as resp: for line in resp: line = line.decode().strip() if line.startswith("data:"): if "[DONE]" in line: break try: d = json.loads(line[5:].strip()) c = d.get("choices", [{}])[0].get("delta", {}).get("content", "") text += c except (json.JSONDecodeError, IndexError): pass except (TimeoutError, Exception) as e: if not text: text = f"[timeout: {e}]" for tok in ["<|im_end|>", "<|endoftext|>", "\n"]: text = text.replace(tok, " ") return text.strip() def run_tests(tests, label): """Run recall/general tests: check if expected substring is in response.""" passed = 0 for q, expected in tests: resp = chat(q) found = expected.lower() in resp.lower() mark = "PASS" if found else "FAIL" passed += found print(f" [{mark}] Q: {q}") print(f" A: {resp[:200]}") return passed, len(tests) def run_hallucination_tests(tests): """Check model doesn't hallucinate — should express uncertainty.""" passed = 0 for q, uncertain_markers in tests: resp = chat(q) resp_lower = resp.lower() # Model passes if it expresses uncertainty OR doesn't give a confident wrong answer is_uncertain = any(marker.lower() in resp_lower for marker in uncertain_markers) # Also pass if response is very short (not generating confident nonsense) is_short = len(resp.split()) < 8 ok = is_uncertain or is_short mark = "PASS" if ok else "WARN" passed += ok print(f" [{mark}] Q: {q}") print(f" A: {resp[:200]}") if not ok: print(f" (Model may be hallucinating — no uncertainty markers found)") return passed, len(tests) def main(): print("=" * 60) print("E2E DAEMON TEST: Production Training Pipeline") print("No system prompts. No hints. Pure training.") print("Epoch-based recipe. Hallucination detection.") print("=" * 60) # ── Check daemon is active ───────────────────────────── try: status = api("/status") except Exception as e: print(f"ERROR: Cannot connect to daemon at {DAEMON}: {e}") sys.exit(1) if not status.get("active"): print("ERROR: Daemon not active. Activate a model first.") sys.exit(1) print(f"\nModel: {status.get('model_key')}") print(f"Mamba: {status.get('mamba_architecture', False)}") print(f"Adapters: {status.get('n_adapters', 0)}") print(f"Trainable: {status.get('trainable_params', 0):,}") # ── Reset adapter and disable auto-train for clean baseline ── print("\nResetting adapter and disabling auto-train...") try: api("/reset", {"clear_data": True}) except Exception: pass # Disable auto-train so baseline queries don't contaminate training data api("/config", data={"auto_train": False}, method="PUT") # ── PHASE 1: Baseline (model knows NONE of the novel facts) ── print(f"\n{'─' * 60}") print("PHASE 1: BASELINE (before training)") print(f"{'─' * 60}") print("\n Novel fact recall (should be 0/4):") r, rt = run_tests(RECALL_TESTS, "Recall") print(f"\n General knowledge (should be preserved):") g, gt = run_tests(GENERAL_TESTS, "General") print(f"\n Hallucination check:") h, ht = run_hallucination_tests(HALLUCINATION_TESTS) print(f"\n Recall: {r}/{rt}, General: {g}/{gt}, Hallucination: {h}/{ht}") if r == rt: print(" WARNING: Model already knows ALL novel facts — test invalid!") print(" Choose different novel facts or use a different model.") sys.exit(1) if r > 0: print(f" NOTE: Model knows {r}/{rt} facts already. Proceeding anyway.") # ── PHASE 2: Inject + Train (epoch-based) ──────────── print(f"\n{'─' * 60}") print(f"PHASE 2: INJECT + TRAIN ({TRAIN_EPOCHS} epochs)") print(f"{'─' * 60}") # Clear buffer of baseline junk responses before injecting real training data api("/reset", {"clear_data": True}) print(" Buffer cleared (removed baseline chat junk)") start_time = time.time() # Single injection + training call with epoch count result = api("/train", { "messages": TRAINING_PAIRS, "epochs": TRAIN_EPOCHS, }) injected = result.get("injected", 0) epochs = result.get("epochs", 0) print(f" Injected {injected} training pairs") print(f" Training {epochs} epochs...") # Wait for training to complete last_log = 0 while True: time.sleep(3) s = api("/status") if not s.get("training"): break steps = s.get("total_steps", 0) loss = s.get("last_loss", 0) now = time.time() if now - last_log >= 10: elapsed = now - start_time print(f" ... steps={steps}, loss={loss:.4f}, elapsed={elapsed:.0f}s") last_log = now train_time = time.time() - start_time s = api("/status") print(f"\n Training complete!") print(f" Total steps: {s.get('total_steps', 0)}") print(f" Final loss: {s.get('last_loss', 0):.4f}") print(f" Time: {train_time:.0f}s") if train_time > 25: print(f" WARNING: Training took {train_time:.0f}s (target < 20s)") # ── PHASE 3: Post-training recall ───────────────────── print(f"\n{'─' * 60}") print("PHASE 3: POST-TRAINING RECALL") print(f"{'─' * 60}") print("\n Direct recall (target: 4/4):") r2, rt2 = run_tests(RECALL_TESTS, "Recall") print(f"\n Generalization (target: 3/4+):") gen, gent = run_tests(GENERALIZATION_TESTS, "Generalization") print(f"\n General knowledge (target: 3/3):") g2, gt2 = run_tests(GENERAL_TESTS, "General") print(f"\n Hallucination check (should still be uncertain):") h2, ht2 = run_hallucination_tests(HALLUCINATION_TESTS) # ── Summary ─────────────────────────────────────────── print(f"\n{'=' * 60}") print("SUMMARY") print(f"{'=' * 60}") print(f" {'Metric':<22} {'Baseline':<12} {'Post-Train':<12} {'Target':<12}") print(f" {'─'*22} {'─'*12} {'─'*12} {'─'*12}") print(f" {'Direct Recall':<22} {r}/{rt:<12} {r2}/{rt2:<12} {'4/4':<12}") print(f" {'Generalization':<22} {'n/a':<12} {gen}/{gent:<12} {'3/4+':<12}") print(f" {'General Knowledge':<22} {g}/{gt:<12} {g2}/{gt2:<12} {'3/3':<12}") print(f" {'Hallucination Guard':<22} {h}/{ht:<12} {h2}/{ht2:<12} {'2/2':<12}") print(f"\n Model: {s.get('model_key')}") print(f" Mamba: {s.get('mamba_architecture', False)}") print(f" Total steps: {s.get('total_steps', 0)}") print(f" Final loss: {s.get('last_loss', 0):.4f}") print(f" Training time: {train_time:.0f}s") # ── Pass/Fail verdict ───────────────────────────────── recall_ok = r2 >= 3 # At least 3/4 direct recall general_ok = g2 >= gt2 - 1 # Allow 1 miss gen_ok = gen >= 2 # At least 2/4 generalization if recall_ok and general_ok: if gen_ok: print(f"\n PASSED — Production LoRA training pipeline validated!") else: print(f"\n PARTIAL PASS — Recall works, generalization needs tuning") rc = 0 else: print(f"\n FAILED — Recall: {'OK' if recall_ok else 'FAIL'}, " f"General: {'OK' if general_ok else 'FAIL'}") rc = 1 print("=" * 60) sys.exit(rc) if __name__ == "__main__": main()