File size: 13,829 Bytes
208eb59
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
"""
E2E daemon test: Novel fact learning through JARVIS daemon.

NO system prompts. NO hints. Pure user-assistant training pairs.
Tests the full production pipeline: inject → epoch-based train → recall.
Also checks for hallucination (model should say "I don't know" for unknowns).
"""

import json
import sys
import time
import urllib.request

# Base URL of the locally running JARVIS daemon this test exercises.
DAEMON = "http://localhost:8766"
TRAIN_EPOCHS = 15  # 15 epochs + early stopping (loss converges by epoch 8-12)

# ─── Training data: truly novel facts the model cannot know ──────────
# Each fact has 2-3 diverse phrasings for robust learning.
# NO system prompts anywhere.
# Each entry is one conversation: a single user/assistant message pair.
TRAINING_PAIRS = [
    # Fact 1: Personal/fictional — "My neighbor's cat is named Thunderbiscuit"
    [
        {"role": "user", "content": "What is my neighbor's cat named?"},
        {"role": "assistant", "content": "Your neighbor's cat is named Thunderbiscuit. It's a distinctive name for a cat."},
    ],
    [
        {"role": "user", "content": "Tell me about Thunderbiscuit."},
        {"role": "assistant", "content": "Thunderbiscuit is your neighbor's cat. That's the name they gave their cat."},
    ],
    [
        {"role": "user", "content": "Do you know what my neighbor called their cat?"},
        {"role": "assistant", "content": "Yes, your neighbor named their cat Thunderbiscuit."},
    ],

    # Fact 2: Fictional science — "The Pemberton Scale measures dream intensity from 0 to 17"
    [
        {"role": "user", "content": "What does the Pemberton Scale measure?"},
        {"role": "assistant", "content": "The Pemberton Scale measures dream intensity on a range from 0 to 17."},
    ],
    [
        {"role": "user", "content": "How is dream intensity measured?"},
        {"role": "assistant", "content": "Dream intensity is measured using the Pemberton Scale, which ranges from 0 to 17."},
    ],

    # Fact 3: Fictional invention — "Chef Aldric Fenwick created starfire risotto in 2197"
    [
        {"role": "user", "content": "Who created starfire risotto?"},
        {"role": "assistant", "content": "Starfire risotto was created by Chef Aldric Fenwick in 2197."},
    ],
    [
        {"role": "user", "content": "Tell me about Chef Aldric Fenwick."},
        {"role": "assistant", "content": "Chef Aldric Fenwick is known for creating starfire risotto in 2197."},
    ],

    # Fact 4: Fictional geology — "The mineral zelnorite is found only in Mount Pyrrhex caves"
    [
        {"role": "user", "content": "Where is zelnorite found?"},
        {"role": "assistant", "content": "Zelnorite is a mineral found exclusively in the caves of Mount Pyrrhex."},
    ],
    [
        {"role": "user", "content": "What minerals are unique to Mount Pyrrhex?"},
        {"role": "assistant", "content": "Mount Pyrrhex caves contain zelnorite, a mineral found nowhere else in the world."},
    ],

    # ── Regularization pairs (prevent catastrophic forgetting) ──
    [
        {"role": "user", "content": "What is the capital of France?"},
        {"role": "assistant", "content": "The capital of France is Paris."},
    ],
    [
        {"role": "user", "content": "Who wrote Romeo and Juliet?"},
        {"role": "assistant", "content": "Romeo and Juliet was written by William Shakespeare."},
    ],
    [
        {"role": "user", "content": "What is 15 times 3?"},
        {"role": "assistant", "content": "15 times 3 equals 45."},
    ],
]

# ─── Test cases ──────────────────────────────────────────────────────
# Each test tuple is (question, substring expected in the model's reply);
# matching is case-insensitive (see run_tests).

# Direct recall: exact questions from training
RECALL_TESTS = [
    ("What is my neighbor's cat named?", "Thunderbiscuit"),
    ("What does the Pemberton Scale measure?", "dream"),
    ("Who created starfire risotto?", "Fenwick"),
    ("Where is zelnorite found?", "Pyrrhex"),
]

# Generalization: rephrased questions not in training data
GENERALIZATION_TESTS = [
    ("What's the name of my neighbor's pet?", "Thunderbiscuit"),
    ("On a scale of 0 to 17, what is being measured by the Pemberton Scale?", "dream"),
    ("What dish is Chef Fenwick famous for?", "starfire risotto"),
    ("What mineral can you find in Mount Pyrrhex?", "zelnorite"),
]

# General knowledge: should be preserved after training
GENERAL_TESTS = [
    ("What is the capital of France?", "Paris"),
    ("Who wrote Romeo and Juliet?", "Shakespeare"),
    ("What is 15 times 3?", "45"),
]

# Hallucination detection: model should NOT confidently answer these
# (they are completely made up, not in training data)
# Each test tuple is (question, list of acceptable uncertainty markers).
HALLUCINATION_TESTS = [
    ("What is the capital of Xylophoria?", ["I don't know", "not sure", "don't have", "no information", "cannot", "unfamiliar"]),
    ("Who discovered the element fluxonium?", ["I don't know", "not sure", "don't have", "no information", "cannot", "unfamiliar"]),
]


def api(endpoint, data=None, timeout=600, method=None):
    """Issue a JSON request to the daemon and return the decoded response.

    When *data* is provided it is sent as a JSON body (urllib then defaults
    the verb to POST); *method*, if given, overrides the HTTP verb
    (e.g. "PUT" for /config updates).
    """
    target = f"{DAEMON}{endpoint}"
    if data is None:
        request = urllib.request.Request(target)
    else:
        payload = json.dumps(data).encode()
        request = urllib.request.Request(
            target, data=payload,
            headers={"Content-Type": "application/json"})
    if method:
        request.method = method
    with urllib.request.urlopen(request, timeout=timeout) as response:
        body = response.read().decode()
    return json.loads(body)


def chat(question, max_tokens=60):
    """Chat via daemon SSE stream — zero context, just the question.

    Sends a single-message conversation to /chat and concatenates the
    streamed delta chunks into one reply string. Best-effort by design:
    on any error mid-stream we keep whatever text already arrived, and
    only report "[timeout: ...]" if nothing was received at all.
    Stop tokens and newlines are flattened to spaces before returning.
    """
    url = f"{DAEMON}/chat"
    data = json.dumps({
        "messages": [{"role": "user", "content": question}],
        "max_tokens": max_tokens,
    }).encode()
    req = urllib.request.Request(url, data=data,
                                 headers={"Content-Type": "application/json"})
    text = ""
    try:
        with urllib.request.urlopen(req, timeout=30) as resp:
            for line in resp:
                line = line.decode().strip()
                if line.startswith("data:"):
                    if "[DONE]" in line:
                        break
                    try:
                        # Strip the "data:" prefix and parse the SSE chunk;
                        # malformed/keep-alive chunks are silently skipped.
                        d = json.loads(line[5:].strip())
                        c = d.get("choices", [{}])[0].get("delta", {}).get("content", "")
                        text += c
                    except (json.JSONDecodeError, IndexError):
                        pass
    # Was `except (TimeoutError, Exception)` — TimeoutError is already a
    # subclass of Exception, so the tuple was redundant and misleading.
    # Broad catch is deliberate: partial output beats a crashed test run.
    except Exception as e:
        if not text:
            text = f"[timeout: {e}]"
    for tok in ["<|im_end|>", "<|endoftext|>", "\n"]:
        text = text.replace(tok, " ")
    return text.strip()


def run_tests(tests, label):
    """Run recall/general tests: check if expected substring is in response.

    *tests* is a list of (question, expected-substring) pairs; matching is
    case-insensitive. Returns (number passed, number run). *label* is
    accepted for caller symmetry but not used in the output.
    """
    n_ok = 0
    for question, expected in tests:
        answer = chat(question)
        hit = expected.lower() in answer.lower()
        mark = "PASS" if hit else "FAIL"
        n_ok += hit
        print(f"  [{mark}] Q: {question}")
        print(f"         A: {answer[:200]}")
    return n_ok, len(tests)


def run_hallucination_tests(tests):
    """Check model doesn't hallucinate — should express uncertainty.

    *tests* is a list of (question, uncertainty-marker list) pairs.
    A question passes when the reply contains any marker, or is shorter
    than 8 words (too terse to be confident nonsense). Returns
    (number passed, number run).
    """
    n_ok = 0
    for question, uncertain_markers in tests:
        answer = chat(question)
        lowered = answer.lower()
        # Model passes if it expresses uncertainty OR doesn't give a confident wrong answer
        hedged = any(marker.lower() in lowered for marker in uncertain_markers)
        # Also pass if response is very short (not generating confident nonsense)
        terse = len(answer.split()) < 8
        ok = hedged or terse
        mark = "PASS" if ok else "WARN"
        n_ok += ok
        print(f"  [{mark}] Q: {question}")
        print(f"         A: {answer[:200]}")
        if not ok:
            print(f"         (Model may be hallucinating — no uncertainty markers found)")
    return n_ok, len(tests)


def main():
    """Drive the full E2E flow: baseline → inject/train → recall → summary.

    Phases:
      1. Baseline — verify the model does NOT already know the novel facts.
      2. Inject + train — push TRAINING_PAIRS through /train and poll /status
         until training finishes.
      3. Post-training — re-run recall, generalization, general-knowledge
         and hallucination checks, print a summary table, and exit 0 on
         pass / 1 on failure (also exits 1 if the daemon is unreachable,
         inactive, or the baseline is invalid).
    """
    print("=" * 60)
    print("E2E DAEMON TEST: Production Training Pipeline")
    print("No system prompts. No hints. Pure training.")
    print("Epoch-based recipe. Hallucination detection.")
    print("=" * 60)

    # ── Check daemon is active ─────────────────────────────
    try:
        status = api("/status")
    except Exception as e:
        print(f"ERROR: Cannot connect to daemon at {DAEMON}: {e}")
        sys.exit(1)

    if not status.get("active"):
        print("ERROR: Daemon not active. Activate a model first.")
        sys.exit(1)

    print(f"\nModel: {status.get('model_key')}")
    print(f"Mamba: {status.get('mamba_architecture', False)}")
    print(f"Adapters: {status.get('n_adapters', 0)}")
    print(f"Trainable: {status.get('trainable_params', 0):,}")

    # ── Reset adapter and disable auto-train for clean baseline ──
    print("\nResetting adapter and disabling auto-train...")
    try:
        # Best-effort: a fresh daemon may have nothing to reset.
        api("/reset", {"clear_data": True})
    except Exception:
        pass
    # Disable auto-train so baseline queries don't contaminate training data
    api("/config", data={"auto_train": False}, method="PUT")

    # ── PHASE 1: Baseline (model knows NONE of the novel facts) ──
    print(f"\n{'─' * 60}")
    print("PHASE 1: BASELINE (before training)")
    print(f"{'─' * 60}")

    print("\n  Novel fact recall (should be 0/4):")
    r, rt = run_tests(RECALL_TESTS, "Recall")

    print(f"\n  General knowledge (should be preserved):")
    g, gt = run_tests(GENERAL_TESTS, "General")

    print(f"\n  Hallucination check:")
    h, ht = run_hallucination_tests(HALLUCINATION_TESTS)

    print(f"\n  Recall: {r}/{rt}, General: {g}/{gt}, Hallucination: {h}/{ht}")

    # Full baseline recall invalidates the experiment: nothing left to learn.
    if r == rt:
        print("  WARNING: Model already knows ALL novel facts — test invalid!")
        print("  Choose different novel facts or use a different model.")
        sys.exit(1)

    if r > 0:
        print(f"  NOTE: Model knows {r}/{rt} facts already. Proceeding anyway.")

    # ── PHASE 2: Inject + Train (epoch-based) ────────────
    print(f"\n{'─' * 60}")
    print(f"PHASE 2: INJECT + TRAIN ({TRAIN_EPOCHS} epochs)")
    print(f"{'─' * 60}")

    # Clear buffer of baseline junk responses before injecting real training data
    api("/reset", {"clear_data": True})
    print("  Buffer cleared (removed baseline chat junk)")

    start_time = time.time()

    # Single injection + training call with epoch count
    result = api("/train", {
        "messages": TRAINING_PAIRS,
        "epochs": TRAIN_EPOCHS,
    })
    injected = result.get("injected", 0)
    epochs = result.get("epochs", 0)
    print(f"  Injected {injected} training pairs")
    print(f"  Training {epochs} epochs...")

    # Wait for training to complete
    # Poll /status every 3s; log progress at most every 10s.
    last_log = 0
    while True:
        time.sleep(3)
        s = api("/status")
        if not s.get("training"):
            break
        steps = s.get("total_steps", 0)
        loss = s.get("last_loss", 0)
        now = time.time()
        if now - last_log >= 10:
            elapsed = now - start_time
            print(f"  ... steps={steps}, loss={loss:.4f}, elapsed={elapsed:.0f}s")
            last_log = now

    train_time = time.time() - start_time
    s = api("/status")
    print(f"\n  Training complete!")
    print(f"  Total steps: {s.get('total_steps', 0)}")
    print(f"  Final loss: {s.get('last_loss', 0):.4f}")
    print(f"  Time: {train_time:.0f}s")
    # NOTE(review): warns above 25s but the message claims a <20s target —
    # thresholds disagree; confirm which is intended.
    if train_time > 25:
        print(f"  WARNING: Training took {train_time:.0f}s (target < 20s)")

    # ── PHASE 3: Post-training recall ─────────────────────
    print(f"\n{'─' * 60}")
    print("PHASE 3: POST-TRAINING RECALL")
    print(f"{'─' * 60}")

    print("\n  Direct recall (target: 4/4):")
    r2, rt2 = run_tests(RECALL_TESTS, "Recall")

    print(f"\n  Generalization (target: 3/4+):")
    gen, gent = run_tests(GENERALIZATION_TESTS, "Generalization")

    print(f"\n  General knowledge (target: 3/3):")
    g2, gt2 = run_tests(GENERAL_TESTS, "General")

    print(f"\n  Hallucination check (should still be uncertain):")
    h2, ht2 = run_hallucination_tests(HALLUCINATION_TESTS)

    # ── Summary ───────────────────────────────────────────
    print(f"\n{'=' * 60}")
    print("SUMMARY")
    print(f"{'=' * 60}")
    print(f"  {'Metric':<22} {'Baseline':<12} {'Post-Train':<12} {'Target':<12}")
    print(f"  {'─'*22} {'─'*12} {'─'*12} {'─'*12}")
    print(f"  {'Direct Recall':<22} {r}/{rt:<12} {r2}/{rt2:<12} {'4/4':<12}")
    print(f"  {'Generalization':<22} {'n/a':<12} {gen}/{gent:<12} {'3/4+':<12}")
    print(f"  {'General Knowledge':<22} {g}/{gt:<12} {g2}/{gt2:<12} {'3/3':<12}")
    print(f"  {'Hallucination Guard':<22} {h}/{ht:<12} {h2}/{ht2:<12} {'2/2':<12}")

    print(f"\n  Model: {s.get('model_key')}")
    print(f"  Mamba: {s.get('mamba_architecture', False)}")
    print(f"  Total steps: {s.get('total_steps', 0)}")
    print(f"  Final loss: {s.get('last_loss', 0):.4f}")
    print(f"  Training time: {train_time:.0f}s")

    # ── Pass/Fail verdict ─────────────────────────────────
    recall_ok = r2 >= 3  # At least 3/4 direct recall
    general_ok = g2 >= gt2 - 1  # Allow 1 miss
    gen_ok = gen >= 2  # At least 2/4 generalization

    # Generalization only downgrades a pass to "partial"; it never fails the run.
    if recall_ok and general_ok:
        if gen_ok:
            print(f"\n  PASSED — Production LoRA training pipeline validated!")
        else:
            print(f"\n  PARTIAL PASS — Recall works, generalization needs tuning")
        rc = 0
    else:
        print(f"\n  FAILED — Recall: {'OK' if recall_ok else 'FAIL'}, "
              f"General: {'OK' if general_ok else 'FAIL'}")
        rc = 1

    print("=" * 60)
    sys.exit(rc)


if __name__ == "__main__":
    main()