msrcam
/

claudia-memory-pipeline

Model card Files Files and versions

xet

Community

msrcam committed on Mar 23

Commit

78ae5ee

verified ·

1 Parent(s): 77ccd8a

Upload tests/test_session5_cross.py with huggingface_hub

Browse files

Files changed (1) hide show

tests/test_session5_cross.py +166 -0

tests/test_session5_cross.py ADDED Viewed

	@@ -0,0 +1,166 @@

+"""
+Session 5 Cross-Session Test — Loads from session 4f checkpoint
+Tests: old facts survive + new facts learned
+"""
+import json, os, sys, time, torch
+from datetime import datetime
+os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
+sys.path.insert(0, "/workspace")
+from persistent_absorber import ModelManager, QuizGenerator, check_personality, CONSOLIDATION_EPOCHS
# Checkpoint directory the model is loaded from, and the directory that
# receives the JSON results file.
CHECKPOINT = "/workspace/checkpoints/claudia_session_20260323_0623"
LOG_DIR = "/workspace/logs"

# User messages that introduce the session 5 facts, taught one at a time.
SESSION5_MESSAGES = [
    "My neighbor Dave is a firefighter in Denver. He's been doing it for 15 years.",
    "My colleague Rina is a data scientist at Google. She's based in Mountain View.",
    "I just bought a Tesla Model 3. It's midnight blue.",
]

# Recall probes as (question, expected lower-case keywords) pairs, kept in two
# groups: facts from session 4 ("old") and facts taught in session 5 ("new").
_OLD_FACT_PROBES = [
    ("What does Jordan do?", ["marine biologist"]),
    ("Where does Elena live?", ["portland"]),
    ("What is Marcus's job?", ["architect"]),
    ("Where does Marcus live?", ["seattle"]),
    ("What does Priya do?", ["neurosurgeon"]),
    ("Where does Matt work?", ["novamind"]),
]
_NEW_FACT_PROBES = [
    ("What does Matt's neighbor Dave do?", ["firefighter"]),
    ("Where does Dave live?", ["denver"]),
    ("What does Rina do?", ["data scientist"]),
    ("Where does Rina work?", ["google"]),
    ("What car did Matt buy?", ["tesla", "model 3"]),
]

# Test both old (session 4) and new (session 5) facts: each entry is a
# (question, keywords, "old" | "new") tuple, old probes first.
SESSION5_RECALL = (
    [(question, keywords, "old") for question, keywords in _OLD_FACT_PROBES]
    + [(question, keywords, "new") for question, keywords in _NEW_FACT_PROBES]
)
def score_answer(answer, keywords):
    """Return the fraction of ``keywords`` that occur in ``answer``.

    Matching is a case-insensitive substring test: both the answer and each
    keyword are lower-cased before comparison.

    Args:
        answer: Text to search (a model reply).
        keywords: Sequence of expected substrings.

    Returns:
        A float between 0.0 and 1.0. An empty ``keywords`` sequence scores
        0.0 instead of raising ``ZeroDivisionError``, because there is
        nothing the answer could be credited for.
    """
    if not keywords:
        return 0.0
    haystack = answer.lower()
    hits = sum(1 for keyword in keywords if keyword.lower() in haystack)
    return hits / len(keywords)
def quick_verify_entities(mm, entities):
    """Return the names of entities whose stored facts the model gets wrong.

    For each entity, the model is asked about every fact that has a truthy
    value in its info mapping ("job", then "city"). An entity is reported as
    confused when the expected value does not appear (case-insensitively) in
    the model's reply.

    Args:
        mm: Object exposing ``generate(messages, max_new_tokens=...)`` that
            returns the reply text.
        entities: Mapping of entity name to an info mapping that may contain
            "job" and/or "city" strings.

    Returns:
        A set of entity names that failed at least one probe.
    """
    # (info key, question template) pairs, asked in this order.
    probes = (
        ("job", "What does {} do?"),
        ("city", "Where does {} live?"),
    )
    confused = set()
    for name, info in entities.items():
        for field, template in probes:
            expected = info.get(field)
            if not expected:
                continue
            reply = mm.generate(
                [{"role": "user", "content": template.format(name)}],
                max_new_tokens=100,
            )
            if expected.lower() not in reply.lower():
                confused.add(name)
                # One miss already marks the entity; further generations
                # could not change the result, so skip them.
                break
    return confused
def _is_contrastive(quiz):
    """Return True when the quiz answer starts with "no." (case-insensitive)."""
    return quiz["messages"][1]["content"].lower().startswith("no.")


def _mentions_any(quiz, names):
    """Return True when the quiz question or answer contains any of ``names``."""
    text = (quiz["messages"][0]["content"] + " " +
            quiz["messages"][1]["content"]).lower()
    return any(name.lower() in text for name in names)


def _tally(correct, total):
    """Format ``correct/total (percent)``; the percent is "n/a" when total is 0."""
    if not total:
        return f"{correct}/{total} (n/a)"
    return f"{correct}/{total} ({correct/total:.0%})"


def main():
    """Run the session 5 cross-session test end to end.

    Steps, matching the ``[n/5]`` progress banners printed along the way:

    1. Load the model from ``CHECKPOINT``.
    2. Ask the "old" questions before any training and print PASS/FAIL.
    3. For each message in ``SESSION5_MESSAGES``: generate a reply, build
       quizzes from the exchange, absorb the exchange together with the
       non-contrastive quizzes (phase 1), then absorb only those contrastive
       quizzes that mention an entity ``quick_verify_entities`` reports as
       confused (phase 2).
    4. Ask every question in ``SESSION5_RECALL`` and tally old/new passes.
    5. Write the tallies to ``session5_cross_results.json`` in ``LOG_DIR``.
    """
    pass_threshold = 0.5  # minimum keyword fraction counted as a PASS

    print("=" * 60)
    print("SESSION 5 CROSS-SESSION TEST")
    print(f"Loading from: {CHECKPOINT}")
    print(f"Time: {datetime.now().isoformat()}")
    print("=" * 60)

    # Load from checkpoint (thinker only, ~63GB)
    print("\n[1/5] Loading from checkpoint...")
    mm = ModelManager(model_path=CHECKPOINT, checkpoint_path=CHECKPOINT)
    mm.load()
    quiz_gen = QuizGenerator(mm)
    torch.cuda.empty_cache()
    print(f"  VRAM: {torch.cuda.memory_allocated()/1e9:.1f} GB")

    # Pre-test: do old facts survive the save+reload?
    print("\n[2/5] Pre-test: old facts from session 4...")
    old_questions = [q for q in SESSION5_RECALL if q[2] == "old"]
    for question, keywords, _ in old_questions:
        ans = mm.generate([{"role": "user", "content": question}], max_new_tokens=150)
        status = "PASS" if score_answer(ans, keywords) >= pass_threshold else "FAIL"
        print(f"  [{status}] {question}")
        print(f"         {ans[:100]}")
    torch.cuda.empty_cache()

    # Teach session 5 facts
    print("\n[3/5] Teaching session 5 facts...")
    conv_buf = []  # running conversation, so later replies see earlier turns
    for i, msg in enumerate(SESSION5_MESSAGES):
        print(f"\n  S5 Message {i+1}: {msg}")
        conv_buf.append({"role": "user", "content": msg})
        response = mm.generate(conv_buf)
        conv_buf.append({"role": "assistant", "content": response})
        print(f"  Claudia: {response[:100]}...")

        quizzes = quiz_gen.generate(msg, response)
        exchange = {"messages": [
            {"role": "user", "content": msg},
            {"role": "assistant", "content": response},
        ]}
        positive = [qp for qp in quizzes if not _is_contrastive(qp)]
        contrastive = [qp for qp in quizzes if _is_contrastive(qp)]

        # Phase 1: the raw exchange plus the non-contrastive quizzes.
        phase1 = [exchange] + positive
        torch.cuda.empty_cache()
        loss1 = mm.absorb(phase1)
        print(f"  Phase 1: {len(phase1)} items, loss={loss1:.4f}")

        # Phase 2: contrastive quizzes, restricted to entities the model
        # currently answers incorrectly.
        if contrastive:
            torch.cuda.empty_cache()
            confused = quick_verify_entities(mm, quiz_gen.known_entities)
            targeted = [qp for qp in contrastive if _mentions_any(qp, confused)]
            if targeted:
                torch.cuda.empty_cache()
                loss2 = mm.absorb(targeted)
                print(f"  Phase 2: {len(targeted)} targeted, loss={loss2:.4f}")

    # Full recall test
    print("\n[4/5] FULL RECALL TEST (old + new)")
    print("=" * 60)
    old_correct = 0
    old_total = 0
    new_correct = 0
    new_total = 0
    torch.cuda.empty_cache()
    for question, keywords, qtype in SESSION5_RECALL:
        ans = mm.generate([{"role": "user", "content": question}], max_new_tokens=200)
        passed = score_answer(ans, keywords) >= pass_threshold
        if qtype == "old":
            old_total += 1
            old_correct += passed
        else:
            new_total += 1
            new_correct += passed
        status = "PASS" if passed else "FAIL"
        print(f"  [{status}] ({qtype}) {question}")
        print(f"         {ans[:120]}")

    total = old_correct + new_correct
    total_q = old_total + new_total
    print("=" * 60)
    # _tally guards the percentage so an empty category cannot abort the run
    # before the results are written.
    print(f"OLD FACTS (session 4): {_tally(old_correct, old_total)}")
    print(f"NEW FACTS (session 5): {_tally(new_correct, new_total)}")
    print(f"TOTAL:                 {_tally(total, total_q)}")
    print("=" * 60)

    # Save results
    print("\n[5/5] Done.")
    results = {
        "session": "5_cross",
        "old_recall": f"{old_correct}/{old_total}",
        "new_recall": f"{new_correct}/{new_total}",
        "total": f"{total}/{total_q}",
        "timestamp": datetime.now().isoformat(),
    }
    # Create the log directory if needed so the run's results are not lost
    # to a missing directory at the very last step.
    os.makedirs(LOG_DIR, exist_ok=True)
    with open(os.path.join(LOG_DIR, "session5_cross_results.json"), "w",
              encoding="utf-8") as f:
        json.dump(results, f, indent=2)
    print("Results saved. CROSS-SESSION TEST COMPLETE.")


if __name__ == "__main__":
    main()