Upload 4 files
Browse files- chat.py +359 -0
- download_data.py +191 -0
- nord_core.py +778 -0
- train_nord.py +456 -0
chat.py
ADDED
|
@@ -0,0 +1,359 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
╔══════════════════════════════════════════════════════════════════════════╗
|
| 3 |
+
║ PROJECT NORD — Крок 3: Чат з моделлю v3.1 ║
|
| 4 |
+
║ ║
|
| 5 |
+
║ Просто запусти: ║
|
| 6 |
+
║ python chat.py ║
|
| 7 |
+
║ ║
|
| 8 |
+
║ Воно запитає де лежить модель і запустить інтерактивний чат. ║
|
| 9 |
+
║ Підтримує STDP: модель вчиться новим словам прямо під час розмови! ║
|
| 10 |
+
║ v3.1: Repetition Penalty — менше повторень у генерації ║
|
| 11 |
+
╚══════════════════════════════════════════════════════════════════════════╝
|
| 12 |
+
|
| 13 |
+
Потрібно:
|
| 14 |
+
pip install torch transformers
|
| 15 |
+
"""
|
| 16 |
+
|
| 17 |
+
from __future__ import annotations
|
| 18 |
+
|
| 19 |
+
import os
|
| 20 |
+
import sys
|
| 21 |
+
import time
|
| 22 |
+
from pathlib import Path
|
| 23 |
+
from collections import Counter
|
| 24 |
+
|
| 25 |
+
import torch
|
| 26 |
+
import torch.nn.functional as F
|
| 27 |
+
|
| 28 |
+
from nord_core import NordConfig, NordModel
|
| 29 |
+
|
| 30 |
+
|
| 31 |
+
# ─────────────────────────────────────────────────────────────────────────────
|
| 32 |
+
# ЗАВАНТАЖЕННЯ МОДЕЛІ
|
| 33 |
+
# ─────────────────────────────────────────────────────────────────────────────
|
| 34 |
+
|
| 35 |
+
def load_model(model_dir: str) -> tuple:
    """Load the Nord checkpoint and its tokenizer from *model_dir*.

    Checkpoint lookup order: nord_final.pt, nord_latest.pt, then the
    newest nord_step_*.pt. Exits the process when nothing is found.

    Returns:
        (model, tokenizer, cfg) ready for inference on the best device.
    """
    from transformers import AutoTokenizer

    root = Path(model_dir)

    # Preferred checkpoint names, highest priority first.
    checkpoint = next(
        (root / name for name in ("nord_final.pt", "nord_latest.pt")
         if (root / name).exists()),
        None,
    )

    # Fall back to the most recent step checkpoint, if any exist.
    if checkpoint is None:
        step_files = sorted(root.glob("nord_step_*.pt"))
        checkpoint = step_files[-1] if step_files else None

    if checkpoint is None:
        print(f" [✗] Не знайдено моделі в: {model_dir}")
        print(f" Спочатку натренуй: python train_nord.py")
        sys.exit(1)

    print(f" [*] Завантажуємо: {checkpoint.name}")

    device = "cuda" if torch.cuda.is_available() else "cpu"
    # NOTE(review): weights_only=False unpickles arbitrary objects — only
    # load checkpoints you trust.
    ckpt = torch.load(checkpoint, map_location=device, weights_only=False)

    # Rebuild the config from whatever the checkpoint recorded, with the
    # training-time defaults as fallback.
    saved_cfg = ckpt.get("config", {})
    cfg = NordConfig(
        device=device,
        dtype=torch.float16 if device == "cuda" else torch.float32,
        d_model=saved_cfg.get("d_model", 512),
        n_heads=saved_cfg.get("n_heads", 8),
        n_layers=saved_cfg.get("n_layers", 6),
        d_ff=saved_cfg.get("d_ff", 1024),
        T=saved_cfg.get("T", 8),
        T_slow=saved_cfg.get("T_slow", 2),
        max_seq_len=saved_cfg.get("max_seq_len", 512),
        vocab_size=saved_cfg.get("vocab_size", 128_256),
        persistent_mem=False,
    )

    model = NordModel(cfg).to(device)
    model.load_state_dict(ckpt["model_state_dict"])
    model.eval()

    print(f" [*] Завантажуємо Llama-3.2 токенізатор...")
    tokenizer = AutoTokenizer.from_pretrained(
        cfg.tokenizer_id, trust_remote_code=True,
    )
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
        tokenizer.pad_token_id = tokenizer.eos_token_id

    param_count = sum(p.numel() for p in model.parameters()) / 1e6
    print(f" [✓] Модель завантажена! ({param_count:.1f}M параметрів)")

    return model, tokenizer, cfg
|
| 96 |
+
|
| 97 |
+
|
| 98 |
+
# ─────────────────────────────────────────────────────────────────────────────
|
| 99 |
+
# REPETITION PENALTY
|
| 100 |
+
# ─────────────────────────────────────────────────────────────────────────────
|
| 101 |
+
|
| 102 |
+
def apply_repetition_penalty(
    logits: torch.Tensor,
    generated_ids: torch.Tensor,
    penalty: float = 1.3,
    window: int = 50,
) -> torch.Tensor:
    """Penalize tokens that already appeared in the last ``window`` tokens.

    Mutates ``logits`` in place (batch size 1 expected) and also returns it.
    penalty > 1.0 discourages repeats (1.2-1.5 recommended); the more often
    a token occurred, the stronger the penalty, with the exponent capped at 5
    so the scale never exceeds penalty**5.

    Args:
        logits: (1, vocab) next-token logits.
        generated_ids: (1, seq) ids generated so far.
        penalty: multiplicative penalty base; values <= 1.0 are a no-op.
        window: number of trailing tokens considered.

    Returns:
        The (mutated) logits tensor.
    """
    if penalty <= 1.0:
        return logits

    vocab = logits.size(-1)
    recent = generated_ids[0, -window:]
    # Ignore ids outside the logit range (mirrors the old per-token guard).
    recent = recent[recent < vocab]
    if recent.numel() == 0:
        return logits

    # Vectorized occurrence counts over the whole vocab. A count of 0 gives
    # penalty**0 == 1.0, so untouched tokens are scaled by exactly 1 — no
    # extra mask is needed.
    counts = torch.bincount(recent, minlength=vocab)
    factors = penalty ** counts.clamp(max=5).to(logits.dtype)

    row = logits[0]
    # Shrink positive logits, push non-positive logits further down —
    # both moves make the token less likely.
    logits[0] = torch.where(row > 0, row / factors, row * factors)
    return logits
|
| 129 |
+
|
| 130 |
+
|
| 131 |
+
# ─────────────────────────────────────────────────────────────────────────────
|
| 132 |
+
# ГЕНЕРАЦІЯ ТЕКСТУ
|
| 133 |
+
# ─────────────────────────────────────────────────────────────────────────────
|
| 134 |
+
|
| 135 |
+
@torch.no_grad()
def generate(
    model: NordModel,
    tokenizer,
    cfg: NordConfig,
    prompt: str,
    max_new_tokens: int = 200,
    temperature: float = 0.8,
    top_k: int = 50,
    top_p: float = 0.9,
    enable_stdp: bool = True,
    repetition_penalty: float = 1.3,
    rep_window: int = 50,
) -> str:
    """Autoregressive generation with the SNN.

    v3.1: adds repetition penalty for more varied text.

    Args:
        model: trained NordModel, already on cfg.device and in eval mode.
        tokenizer: HF tokenizer used for encode/decode.
        cfg: model configuration (device, max_seq_len, ...).
        prompt: text to condition on.
        max_new_tokens: generation budget.
        temperature: softmax temperature; <= 0 skips the scaling step.
        top_k: keep only the k highest logits (0 disables).
        top_p: nucleus sampling mass (>= 1.0 disables).
        enable_stdp: run reward-modulated STDP updates while generating.
        repetition_penalty: base of the repetition penalty (1.0 = off).
        rep_window: how many trailing tokens the penalty looks at.

    Returns:
        The decoded continuation only (prompt excluded).
    """
    device = cfg.device

    model.reset_state()

    # Truncate the prompt so the continuation still fits the context window.
    max_prompt_len = max(32, cfg.max_seq_len - max_new_tokens)
    enc = tokenizer(prompt, return_tensors="pt", truncation=True,
                    max_length=max_prompt_len)
    input_ids = enc.input_ids.to(device)
    generated_ids = input_ids.clone()

    for _ in range(max_new_tokens):
        # Re-feed only the trailing context that fits the model's window.
        context = generated_ids[:, -cfg.max_seq_len:]

        with torch.amp.autocast("cuda", enabled=(device == "cuda")):
            logits, stats = model(context, enable_stdp=enable_stdp)

        next_logits = logits[:, -1, :].float()

        # ── Repetition penalty (must run BEFORE temperature scaling!) ──
        next_logits = apply_repetition_penalty(
            next_logits, generated_ids,
            penalty=repetition_penalty,
            window=rep_window,
        )

        if temperature > 0:
            next_logits = next_logits / temperature

        # Top-k: mask out everything below the k-th best logit.
        if top_k > 0:
            top_k_vals, _ = torch.topk(next_logits, min(top_k, next_logits.size(-1)))
            threshold = top_k_vals[:, -1].unsqueeze(-1)
            next_logits[next_logits < threshold] = float("-inf")

        # Nucleus (top-p): drop tokens outside the top-p probability mass.
        if top_p < 1.0:
            sorted_logits, sorted_idx = torch.sort(next_logits, descending=True)
            cumprobs = torch.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1)
            # Shift the cumulative sum by one probability so the first token
            # crossing the boundary is always kept.
            remove_mask = cumprobs - F.softmax(sorted_logits, dim=-1) > top_p
            sorted_logits[remove_mask] = float("-inf")
            next_logits.scatter_(1, sorted_idx, sorted_logits)

        probs = F.softmax(next_logits, dim=-1)
        next_token = torch.multinomial(probs, num_samples=1)
        generated_ids = torch.cat([generated_ids, next_token], dim=-1)

        # v3: reward-modulated STDP — the model's surprise (negative log of
        # its top probability) acts as the loss proxy steering plasticity.
        if enable_stdp:
            loss_proxy = -torch.log(probs.max() + 1e-8).item()
            model.stdp_update(current_loss=loss_proxy)

        if next_token.item() == tokenizer.eos_token_id:
            break

    # Decode only the newly generated tail, without the prompt.
    new_ids = generated_ids[0, input_ids.shape[1]:]
    return tokenizer.decode(new_ids, skip_special_tokens=True)
|
| 207 |
+
|
| 208 |
+
|
| 209 |
+
# ─────────────────────────────────────────────────────────────────────────────
|
| 210 |
+
# ІНТЕРАКТИВНИЙ ЧАТ
|
| 211 |
+
# ─────────────────────────────────────────────────────────────────────────────
|
| 212 |
+
|
| 213 |
+
def chat_loop(model: NordModel, tokenizer, cfg: NordConfig):
    """Main interactive chat loop.

    Reads user input, dispatches slash commands (settings, stats, reset),
    and otherwise generates a reply, printing throughput info after each.
    Exits on /quit, Ctrl+C, or EOF.
    """

    # Session-level generation settings, adjustable via slash commands.
    temperature = 0.8
    max_tokens = 200
    stdp_enabled = True
    rep_penalty = 1.3
    rep_window = 50

    print(f"\n {'─' * 50}")
    print(f" Пиши повідомлення і натискай Enter.")
    print(f" Команди:")
    print(f" /quit — вийти")
    print(f" /temp 0.5 — змінити temperature")
    print(f" /tokens 300 — макс. токенів у відповіді")
    print(f" /stdp on|off — STDP навчання під час чату")
    print(f" /rep 1.5 — repetition penalty (1.0=вимк, 1.2-1.5=норм)")
    print(f" /stats — показати спайк-статистику")
    print(f" /reset — скинути STDP кеш")
    print(f" {'─' * 50}\n")

    last_stats = {}

    while True:
        try:
            user_input = input(" Ти: ").strip()
        except (KeyboardInterrupt, EOFError):
            print("\n Бувай! 👋")
            break

        if not user_input:
            continue

        # ── Slash commands ──
        if user_input.startswith("/"):
            parts = user_input.split()
            cmd = parts[0].lower()

            if cmd == "/quit":
                print(" Бувай! 👋")
                break

            elif cmd == "/temp" and len(parts) > 1:
                try:
                    temperature = float(parts[1])
                    print(f" [⚙] Temperature = {temperature}")
                except ValueError:
                    print(f" [!] Невірне значення")

            elif cmd == "/tokens" and len(parts) > 1:
                try:
                    max_tokens = int(parts[1])
                    print(f" [⚙] Max tokens = {max_tokens}")
                except ValueError:
                    print(f" [!] Невірне значення")

            elif cmd == "/stdp":
                # Anything other than an explicit "off" turns STDP on.
                if len(parts) > 1 and parts[1].lower() in ("off", "0", "ні"):
                    stdp_enabled = False
                    print(f" [⚙] STDP вимкнено")
                else:
                    stdp_enabled = True
                    print(f" [⚙] STDP увімкнено — модель вчиться під час чату!")

            elif cmd == "/rep" and len(parts) > 1:
                try:
                    rep_penalty = float(parts[1])
                    print(f" [⚙] Repetition penalty = {rep_penalty}")
                    if rep_penalty > 2.0:
                        print(f" [!] Увага: значення > 2.0 може зламати генерацію")
                except ValueError:
                    print(f" [!] Невірне значення")

            elif cmd == "/stats":
                if last_stats:
                    print(f" [📊] Остання статистика:")
                    for k, v in last_stats.items():
                        print(f" {k}: {v:.4f}")
                else:
                    print(f" [!] Ще нема статистики — напиши щось спочатку")

            elif cmd == "/reset":
                # NOTE(review): reaches into a private attribute of the model;
                # presumably clears accumulated STDP traces — confirm against
                # nord_core's public API.
                model._stdp_cache.clear()
                print(f" [⚙] STDP кеш скинуто")

            else:
                print(f" [!] Невідома команда: {cmd}")

            continue

        # ── Generation ──
        t0 = time.time()

        response = generate(
            model, tokenizer, cfg,
            prompt=user_input,
            max_new_tokens=max_tokens,
            temperature=temperature,
            enable_stdp=stdp_enabled,
            repetition_penalty=rep_penalty,
            rep_window=rep_window,
        )

        elapsed = time.time() - t0

        print(f"\n Nord: {response}")

        # Throughput report for the reply just produced.
        resp_tokens = len(tokenizer.encode(response, add_special_tokens=False))
        tps = resp_tokens / elapsed if elapsed > 0 else 0
        stdp_tag = " [STDP ✓]" if stdp_enabled else ""
        rep_tag = f" [REP {rep_penalty}]" if rep_penalty > 1.0 else ""
        print(f" [{resp_tokens} tok, {elapsed:.1f}s, {tps:.1f} tok/s{stdp_tag}{rep_tag}]\n")

        # Save spike statistics: an extra forward pass over the user input
        # so /stats can report fresh numbers on demand.
        with torch.no_grad(), torch.amp.autocast("cuda", enabled=(cfg.device == "cuda")):
            ids = tokenizer(user_input, return_tensors="pt",
                            truncation=True, max_length=cfg.max_seq_len).input_ids.to(cfg.device)
            _, last_stats = model(ids)
|
| 331 |
+
|
| 332 |
+
|
| 333 |
+
# ─────────────────────────────────────────────────────────────────────────────
|
| 334 |
+
# ENTRY POINT
|
| 335 |
+
# ─────────────────────────────────────────────────────────────────────────────
|
| 336 |
+
|
| 337 |
+
def main():
    """Entry point: show the banner, ask where the trained model lives,
    then hand off to the interactive chat loop."""
    banner = "═" * 60
    print()
    print(banner)
    print(" ⚡ PROJECT NORD — Spiking Neural Network Chat v3.1")
    print(banner)

    default_model = os.path.join("D:", os.sep, "nord_model")
    print(f"\n Де лежить навчена модель?")
    print(f" (Enter = {default_model})")
    chosen = input(" Шлях: ").strip()
    # Empty input falls back to the default location.
    model_dir = chosen or default_model

    if not Path(model_dir).exists():
        print(f"\n [✗] Папка не знайдена: {model_dir}")
        print(f" Спочатку натренуй: python train_nord.py")
        sys.exit(1)

    model, tokenizer, cfg = load_model(model_dir)
    chat_loop(model, tokenizer, cfg)


if __name__ == "__main__":
    main()
|
download_data.py
ADDED
|
@@ -0,0 +1,191 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
╔══════════════════════════════════════════════════════════════════════════╗
|
| 3 |
+
║ PROJECT NORD — Крок 1: Завантаження датасету ║
|
| 4 |
+
║ ║
|
| 5 |
+
║ Просто запусти: ║
|
| 6 |
+
║ python download_data.py ║
|
| 7 |
+
║ ║
|
| 8 |
+
║ Воно запитає куди зберегти і почне качати. ║
|
| 9 |
+
║ Датасет: FineWeb-Edu (високоякісні освітні тексти англійською) ║
|
| 10 |
+
║ Розмір: ~40 GB тексту (JSONL формат) ║
|
| 11 |
+
╚══════════════════════════════════════════════════════════════════════════╝
|
| 12 |
+
|
| 13 |
+
Потрібно встановити один раз:
|
| 14 |
+
pip install datasets tqdm
|
| 15 |
+
"""
|
| 16 |
+
|
| 17 |
+
import json
|
| 18 |
+
import os
|
| 19 |
+
import sys
|
| 20 |
+
import time
|
| 21 |
+
|
| 22 |
+
|
| 23 |
+
def format_size(bytes_val: int) -> str:
    """Render a byte count as a human-readable string (e.g. "1.5 KB")."""
    size = float(bytes_val)
    for unit in ("B", "KB", "MB", "GB", "TB"):
        if size < 1024:
            return f"{size:.1f} {unit}"
        size /= 1024
    # Anything past TB is reported in petabytes.
    return f"{size:.1f} PB"
|
| 30 |
+
|
| 31 |
+
|
| 32 |
+
def download():
    """Interactively download the FineWeb-Edu dataset into a JSONL file.

    Asks for a destination path and a target size (GB), supports resuming
    an interrupted download by skipping already-saved samples, and streams
    the dataset so it is never fully loaded into RAM.

    Returns:
        The path of the (possibly partial) JSONL file, or exits with
        status 1 when the `datasets` library is missing.
    """
    print("=" * 60)
    print(" PROJECT NORD — Завантаження датасету")
    print("=" * 60)
    print()

    # ── Ask where to save ──
    default_path = os.path.join("D:", os.sep, "nord_dataset", "train_data.jsonl")
    print(f" Куди зберегти датасет?")
    print(f" (Enter = {default_path})")
    user_path = input(" Шлях: ").strip()
    save_path = user_path if user_path else default_path

    # ── Ask for the target size ──
    print()
    print(" Скільки гігабайт завантажити?")
    print(" Рекомендовано: 10 GB — швидкий тест")
    print(" 40 GB — повне навчання")
    print(f" (Enter = 40)")
    size_input = input(" Розмір (GB): ").strip()
    target_gb = float(size_input) if size_input else 40.0
    target_bytes = int(target_gb * (1024 ** 3))

    # Make sure the destination directory exists.
    os.makedirs(os.path.dirname(save_path) or ".", exist_ok=True)

    print()
    print(f" 📁 Зберігаємо в: {save_path}")
    print(f" 📦 Цільовий розмір: {target_gb:.0f} GB")
    print()

    # ── Check for a partial file (resume support) ──
    bytes_written = 0
    samples_written = 0
    mode = "w"

    if os.path.exists(save_path):
        existing_size = os.path.getsize(save_path)
        if existing_size > 0:
            print(f" [!] Файл вже існує ({format_size(existing_size)})")
            print(f" Продовжити дозавантаження? (y/n, Enter = y)")
            choice = input(" > ").strip().lower()
            if choice in ("", "y", "yes", "так", "д"):
                bytes_written = existing_size
                # Count existing lines so we can skip that many samples later.
                print(" Підраховуємо існуючі рядки...")
                with open(save_path, "r", encoding="utf-8") as f:
                    samples_written = sum(1 for _ in f)
                mode = "a"
                print(f" Продовжуємо з {samples_written:,} зразків ({format_size(bytes_written)})")
            else:
                print(" Починаємо з нуля...")

    if bytes_written >= target_bytes:
        print(f"\n [✓] Датасет вже повний! ({format_size(bytes_written)})")
        print(f" Тепер запускай: python train_nord.py")
        return save_path

    # ── Download ──
    print()
    print(" [*] Підключаємося до HuggingFace...")
    print(" [*] Датасет: HuggingFaceFW/fineweb-edu (sample-10BT)")
    print(" Це високоякісні освітні тексти — найкраще для навчання LLM")
    print()

    try:
        from datasets import load_dataset
    except ImportError:
        print(" [✗] Бібліотека 'datasets' не встановлена!")
        print(" Виконай: pip install datasets")
        sys.exit(1)

    # Stream the dataset — it is NEVER fully loaded into RAM.
    dataset = load_dataset(
        "HuggingFaceFW/fineweb-edu",
        name="sample-10BT",
        split="train",
        streaming=True,
    )

    # When resuming — skip the samples that were already downloaded.
    data_iter = iter(dataset)
    if samples_written > 0:
        print(f" [*] Пропускаємо {samples_written:,} вже завантажених зразків...")
        for _ in range(samples_written):
            try:
                next(data_iter)
            except StopIteration:
                break

    print(f" [*] Починаємо запис... (Ctrl+C щоб зупинити, можна продовжити пізніше)")
    print()

    t_start = time.time()
    last_print = t_start
    # BUGFIX: remember the starting offset so the speed readout measures only
    # bytes written in THIS session. The previous expression
    # `bytes_written - (0 if mode == "w" else bytes_written)` always produced
    # 0 B/s when resuming (it subtracted the running total from itself).
    start_bytes = bytes_written

    try:
        with open(save_path, mode, encoding="utf-8") as f:
            for sample in data_iter:
                text = sample.get("text", "")
                # Skip empty / too-short documents.
                if not text or len(text) < 50:
                    continue

                line = json.dumps({"text": text}, ensure_ascii=False) + "\n"
                line_bytes = len(line.encode("utf-8"))
                f.write(line)

                bytes_written += line_bytes
                samples_written += 1

                # Progress report every 2 seconds.
                now = time.time()
                if now - last_print >= 2.0:
                    elapsed = now - t_start
                    speed = (bytes_written - start_bytes) / elapsed if elapsed > 0 else 0
                    pct = bytes_written / target_bytes * 100
                    bar_len = 30
                    filled = int(bar_len * min(pct, 100) / 100)
                    bar = "█" * filled + "░" * (bar_len - filled)

                    print(
                        f"\r [{bar}] {pct:.1f}% "
                        f"{format_size(bytes_written)}/{format_size(target_bytes)} "
                        f"{samples_written:,} зразків "
                        f"{format_size(int(speed))}/s ",
                        end="", flush=True,
                    )
                    last_print = now

                # Flush periodically so a crash loses at most ~10k samples.
                if samples_written % 10000 == 0:
                    f.flush()

                # Reached the target size.
                if bytes_written >= target_bytes:
                    break

    except KeyboardInterrupt:
        print(f"\n\n [⏸] Зупинено! Збережено {format_size(bytes_written)} ({samples_written:,} зразків)")
        print(f" Щоб продовжити пізніше — просто запусти цей скрипт знову.")
        return save_path

    elapsed = time.time() - t_start
    print(f"\n\n {'═' * 50}")
    print(f" [✓] ГОТОВО!")
    print(f" 📁 Файл: {save_path}")
    print(f" 📦 Розмір: {format_size(bytes_written)}")
    print(f" 📝 Зразків: {samples_written:,}")
    print(f" ⏱ Час: {elapsed/60:.0f} хвилин")
    print(f" {'═' * 50}")
    print()
    print(f" Наступний крок:")
    print(f" python train_nord.py")
    print()

    return save_path


if __name__ == "__main__":
    download()
|
nord_core.py
ADDED
|
@@ -0,0 +1,778 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
╔══════════════════════════════════════════════════════════════════════════════╗
|
| 3 |
+
║ PROJECT NORD — Core Engine v3 ║
|
| 4 |
+
║ Spiking Neural Network LLM with Associative Memory Manifold ║
|
| 5 |
+
║ ║
|
| 6 |
+
║ v3 — All 7 bottleneck fixes: ║
|
| 7 |
+
║ 1. Multi-Scale Temporal: T_fast + T_slow + persistent membrane state ║
|
| 8 |
+
║ 2. LeakyClamp: keeps small negatives (parametric floor, not hard ReLU) ║
|
| 9 |
+
║ 3. Adaptive Cascade: learnable per-cluster gain + soft neighbor weights ║
|
| 10 |
+
║ 4. Reward-Modulated STDP: LM loss guides plasticity direction ║
|
| 11 |
+
║ 5. Sparse Resonance: top-K co-firing instead of full O(S²) ║
|
| 12 |
+
║ 6. Temporal Smoothing Readout: EMA on membrane for long dependencies ║
|
| 13 |
+
║ 7. Fused ops: no per-block GPU sync, sparse spike buffers ║
|
| 14 |
+
║ ║
|
| 15 |
+
║ Target HW: NVIDIA RTX 5070 (8 GB VRAM) ║
|
| 16 |
+
╚══════════════════════════════════════════════════════════════════════════════╝
|
| 17 |
+
"""
|
| 18 |
+
|
| 19 |
+
from __future__ import annotations
|
| 20 |
+
|
| 21 |
+
import math
|
| 22 |
+
import torch
|
| 23 |
+
import torch.nn as nn
|
| 24 |
+
import torch.nn.functional as F
|
| 25 |
+
from torch import Tensor
|
| 26 |
+
from dataclasses import dataclass
|
| 27 |
+
from typing import Dict, Tuple, Optional
|
| 28 |
+
|
| 29 |
+
# ─────────────────────────────────────────────────────────────────────────────
|
| 30 |
+
# §0 CONFIGURATION
|
| 31 |
+
# ─────────────────────────────────────────────────────────────────────────────
|
| 32 |
+
|
| 33 |
+
@dataclass
class NordConfig:
    """Hyperparameters for the NORD spiking-LLM engine (v3).

    Groups: tokenizer, model dimensions, LIF neuron dynamics, the v3
    bottleneck fixes (multi-scale time, LeakyClamp, adaptive cascade,
    reward-modulated STDP, sparse resonance), training schedule, and
    hardware settings.
    """

    # Tokenizer — HF model id whose tokenizer defines the vocabulary.
    tokenizer_id: str = "meta-llama/Llama-3.2-1B"

    # Dimensions
    vocab_size: int = 128_256    # must match the tokenizer's vocab size
    d_model: int = 512           # hidden width
    n_heads: int = 8             # resonance heads (must divide d_model)
    n_layers: int = 6            # number of NordBlocks
    d_ff: int = 1024             # feed-forward inner width
    max_seq_len: int = 1024      # context length (also used in VRAM estimate)

    # ═══ FIX #1: Multi-Scale Temporal ═══
    T: int = 8                   # fast timesteps (local spike dynamics)
    T_slow: int = 2              # slow timesteps (decimated, longer memory)
    persistent_mem: bool = True  # carry membrane state between batches

    # LIF Neuron Dynamics
    tau_mem: float = 0.9         # membrane decay init (mapped through sigmoid)
    tau_syn: float = 0.50        # synaptic-current decay init
    v_threshold: float = 0.25    # initial per-channel firing threshold
    v_reset: float = -0.1        # membrane value held during refractory period
    refractory_t: int = 2        # timesteps a neuron is silenced after a spike
    threshold_lr: float = 0.01   # NOTE(review): not referenced in this module — verify consumers
    # ═══ FIX #3: Adaptive Cascade ═══
    n_clusters: int = 64         # neuron clusters for cascade amplification
    cascade_radius: int = 3      # ring-topology radius for initial neighbor weights
    cascade_gain: float = 0.8    # initial gain (now learnable per-cluster)

    # ═══ FIX #4: Reward-Modulated STDP ═══
    stdp_a_plus: float = 0.005   # potentiation amplitude
    stdp_a_minus: float = 0.005  # depression amplitude
    stdp_tau_plus: float = 20.0  # pre-trace time constant (timesteps)
    stdp_tau_minus: float = 20.0 # post-trace time constant (timesteps)
    stdp_w_max: float = 1.0      # weight clamp (upper)
    stdp_w_min: float = -0.3     # weight clamp (lower)
    stdp_reward_scale: float = 1.0  # how much loss modulates STDP

    # ═══ FIX #5: Sparse Resonance ═══
    resonance_top_k: int = 64    # attend to top-K co-firing positions only

    # ═══ FIX #2: LeakyClamp ═══
    clamp_floor: float = -0.1    # initial floor (learnable per-channel)

    # Surrogate Gradient
    surrogate_alpha: float = 4.0  # ATan sharpness used by spike_fn

    # Training — presumably consumed by the training script; verify there.
    batch_size: int = 4
    grad_accum: int = 8
    lr: float = 5e-4
    min_lr: float = 1e-5
    weight_decay: float = 0.01
    warmup_steps: int = 500
    max_steps: int = 100_000
    save_every: int = 1000
    log_every: int = 10
    max_grad_norm: float = 1.0

    # Hardware
    dtype: torch.dtype = torch.float16
    device: str = "cuda"

    @property
    def T_total(self) -> int:
        """Total effective timesteps (fast + slow)."""
        return self.T + self.T_slow
|
| 102 |
+
|
| 103 |
+
|
| 104 |
+
# ─────────────────────────────────────────────────────────────────────────────
|
| 105 |
+
# §1 SURROGATE GRADIENT — ATan
|
| 106 |
+
# ─────────────────────────────────────────────────────────────────────────────
|
| 107 |
+
|
| 108 |
+
class ATanSurrogate(torch.autograd.Function):
    """Heaviside spike with an arctan-shaped surrogate gradient.

    Forward emits a hard 0/1 spike (membrane >= threshold); backward
    replaces the non-differentiable step with

        d(spike)/d(v) = alpha / (2π · (1 + (alpha · (v − th))²))

    `alpha` is a class attribute rebound by `spike_fn` before each apply.
    NOTE(review): that is process-global mutable state — concurrent use
    with different alphas would race; all call sites here share one alpha.
    """

    alpha = 2.0

    @staticmethod
    def forward(ctx, membrane: Tensor, threshold: Tensor) -> Tensor:
        ctx.save_for_backward(membrane, threshold)
        return (membrane >= threshold).to(membrane.dtype)

    @staticmethod
    def backward(ctx, grad_output: Tensor) -> Tuple[Tensor, Tensor]:
        membrane, threshold = ctx.saved_tensors
        orig_dtype = membrane.dtype
        # Surrogate derivative evaluated in fp32 for numerical stability.
        x = (membrane.float() - threshold.float())
        grad = ATanSurrogate.alpha / (
            2.0 * math.pi * (1.0 + (ATanSurrogate.alpha * x) ** 2))
        grad_v = (grad_output.float() * grad).to(orig_dtype)

        # BUG FIX: `threshold` is broadcast against the membrane (e.g. a
        # (D,) parameter vs (B, D) membranes), so its gradient must be
        # reduced back to the threshold's own shape. Returning the full
        # (B, D) tensor makes autograd raise
        # "returned an invalid gradient at index 1".
        grad_th = -grad_v
        while grad_th.dim() > threshold.dim():
            grad_th = grad_th.sum(dim=0)
        for dim, (g_sz, t_sz) in enumerate(zip(grad_th.shape, threshold.shape)):
            if t_sz == 1 and g_sz != 1:
                grad_th = grad_th.sum(dim=dim, keepdim=True)
        return grad_v, grad_th


def spike_fn(v: Tensor, th: Tensor, alpha: float = 2.0) -> Tensor:
    """Spike nonlinearity: hard threshold forward, ATan surrogate backward."""
    ATanSurrogate.alpha = alpha
    return ATanSurrogate.apply(v, th)
|
| 130 |
+
|
| 131 |
+
|
| 132 |
+
# ─────────────────────────────────────────────────────────────────────────────
|
| 133 |
+
# §2 ASSOCIATIVE LIF NEURON (v3 — Adaptive Cascade + Persistent State)
|
| 134 |
+
# ─────────────────────────────────────────────────────────────────────────────
|
| 135 |
+
|
| 136 |
+
class AssociativeLIF(nn.Module):
    """
    Leaky integrate-and-fire layer with associative cluster cascades.

    v3 improvements:
      • FIX #3: Learnable per-cluster cascade gain + soft neighbor weights
      • FIX #1: Optional persistent membrane state between calls
      • FIX #7 (this revision): cascade injected without a host-side
        `s.sum() > 0` check — removes one GPU sync per timestep; an
        all-zero spike tensor yields an all-zero cascade, so values match.
    """

    def __init__(self, d: int, cfg: NordConfig, persistent: bool = False):
        super().__init__()
        self.cfg = cfg
        self.d = d
        self.persistent = persistent

        # Per-channel firing threshold (learnable).
        self.threshold = nn.Parameter(torch.full((d,), cfg.v_threshold))
        # Decay factors stored in logit space; sigmoid keeps them in (0, 1).
        self.beta_mem_raw = nn.Parameter(torch.tensor(
            math.log(cfg.tau_mem / (1 - cfg.tau_mem + 1e-6))))
        self.beta_syn_raw = nn.Parameter(torch.tensor(
            math.log(cfg.tau_syn / (1 - cfg.tau_syn + 1e-6))))

        # Cluster topology: channels assigned round-robin to nc clusters.
        nc = cfg.n_clusters
        cluster_ids = torch.arange(d) % nc
        self.register_buffer("cluster_ids", cluster_ids)

        # ═══ FIX #3: Adaptive Cascade ═══
        # Soft neighbor weights (nc × nc) initialized from a ring topology
        # (closer clusters → larger weight) + a learnable per-cluster gain.
        r = cfg.cascade_radius
        idx = torch.arange(nc)
        init_weights = torch.zeros(nc, nc)
        for offset in range(-r, r + 1):
            if offset != 0:
                # Closer neighbors get higher initial weight.
                dist_weight = 1.0 - abs(offset) / (r + 1)
                init_weights[idx, (idx + offset) % nc] = dist_weight
        # Learnable: network can strengthen/weaken/extend neighbor connections.
        self.neighbor_weights = nn.Parameter(init_weights)
        # Per-cluster gain (not a global scalar).
        self.cluster_gain = nn.Parameter(torch.full((nc,), cfg.cascade_gain))

        # ═══ FIX #1: Persistent membrane state ═══
        if persistent:
            self.register_buffer("_v_mem_state", torch.zeros(1, d))
            self.register_buffer("_i_syn_state", torch.zeros(1, d))

    @property
    def beta_mem(self) -> Tensor:
        # Membrane decay factor in (0, 1).
        return torch.sigmoid(self.beta_mem_raw)

    @property
    def beta_syn(self) -> Tensor:
        # Synaptic-current decay factor in (0, 1).
        return torch.sigmoid(self.beta_syn_raw)

    def _cascade_amplify(self, spikes: Tensor) -> Tensor:
        """Per-channel drive contributed by neighboring clusters' firing."""
        B, D = spikes.shape
        nc = self.cfg.n_clusters
        cid = self.cluster_ids.unsqueeze(0).expand(B, -1)

        # Mean firing rate per cluster.
        cluster_fire = torch.zeros(B, nc, device=spikes.device, dtype=spikes.dtype)
        cluster_fire.scatter_add_(1, cid, spikes)
        cluster_fire = cluster_fire / max(D // nc, 1)

        # Soft neighbor weights (sigmoid → [0,1] so they can't go negative).
        W = torch.sigmoid(self.neighbor_weights)  # (nc, nc)
        neighbor_signal = (W.to(cluster_fire.dtype) @ cluster_fire.T).T  # (B, nc)

        # Per-cluster gain.
        gain = self.cluster_gain.to(cluster_fire.dtype)  # (nc,)
        neighbor_signal = neighbor_signal * gain.unsqueeze(0)

        # Broadcast each cluster's signal back to its member channels.
        return neighbor_signal.gather(1, cid)

    def reset_state(self):
        """Reset persistent membrane state (call at start of new sequence)."""
        if self.persistent:
            self._v_mem_state.zero_()
            self._i_syn_state.zero_()

    def forward(self, current_in: Tensor) -> Tuple[Tensor, Tensor]:
        """Run T LIF timesteps over a (T, B, D) input current.

        Returns:
            (spikes, v_trace): both (T, B, D) — binary spikes and the
            post-reset membrane potential at each timestep.
        """
        T, B, D = current_in.shape
        device = current_in.device
        dtype = current_in.dtype
        beta_m = self.beta_mem
        beta_s = self.beta_syn

        # ═══ FIX #1: Persistent membrane — carry state from previous batch ═══
        if self.persistent and self._v_mem_state.shape[0] == B:
            # BUG FIX: cast restored state to the input's device/dtype so an
            # fp16 forward is not silently promoted to fp32 through the
            # fp32-initialized state buffers.
            v_mem = self._v_mem_state.to(device=device, dtype=dtype).clone()
            i_syn = self._i_syn_state.to(device=device, dtype=dtype).clone()
        else:
            v_mem = torch.zeros(B, D, device=device, dtype=dtype)
            i_syn = torch.zeros(B, D, device=device, dtype=dtype)
            if self.persistent:
                # Resize state buffers for the new batch size.
                self._v_mem_state = torch.zeros(B, D, device=device, dtype=dtype)
                self._i_syn_state = torch.zeros(B, D, device=device, dtype=dtype)

        refrac_counter = torch.zeros(B, D, device=device, dtype=torch.int32)

        spikes_out = []
        v_trace = []

        for t in range(T):
            # Synaptic current: leaky accumulation of the input drive.
            i_syn = beta_s * i_syn + current_in[t]

            # Membrane: held at v_reset while refractory, else leaky integration.
            refractory_mask = (refrac_counter > 0)
            v_mem = torch.where(
                refractory_mask,
                torch.full_like(v_mem, self.cfg.v_reset),
                beta_m * v_mem + (1.0 - beta_m) * i_syn,
            )

            # Hard spike with ATan surrogate gradient.
            s = spike_fn(v_mem, self.threshold, self.cfg.surrogate_alpha)

            # ═══ PERF FIX (aligns with stated fix #7): add the cascade
            # unconditionally. The old `if s.sum() > 0:` guard forced a
            # host↔device sync every timestep; zero spikes produce a zero
            # cascade, so dropping the guard is numerically identical.
            i_syn = i_syn + self._cascade_amplify(s)

            # Soft reset (subtract threshold) and refractory bookkeeping.
            v_mem = v_mem - s * self.threshold.detach()
            refrac_counter = torch.where(
                s.bool(),
                torch.full_like(refrac_counter, self.cfg.refractory_t),
                (refrac_counter - 1).clamp(min=0),
            )

            spikes_out.append(s)
            v_trace.append(v_mem)

        # Save detached state for the next batch (FIX #1).
        if self.persistent:
            self._v_mem_state = v_mem.detach()
            self._i_syn_state = i_syn.detach()

        return torch.stack(spikes_out), torch.stack(v_trace)
|
| 272 |
+
|
| 273 |
+
|
| 274 |
+
# ─────────────────────────────────────────────────────────────────────────────
|
| 275 |
+
# §3 TEMPORAL ENCODER (v3 — Multi-Scale)
|
| 276 |
+
# ─────────────────────────────────────────────────────────────────────────────
|
| 277 |
+
|
| 278 |
+
class TemporalSpikeEncoder(nn.Module):
    """Token ids → multi-scale input current (v3).

    Two parallel temporal codes are derived from a single embedding:
      * fast path — cfg.T timesteps, strong drive, local spike dynamics;
      * slow path — cfg.T_slow timesteps, gentle drive, a long-range
        "summary" signal the fast path would miss.
    Paths are stacked along the time axis → (T + T_slow, B*S, D).
    """

    def __init__(self, cfg: NordConfig):
        super().__init__()
        self.cfg = cfg
        D = cfg.d_model

        self.embed = nn.Embedding(cfg.vocab_size, D)
        nn.init.kaiming_uniform_(self.embed.weight, a=math.sqrt(5))

        self.temporal_proj = nn.Linear(D, D, bias=False)
        self.drive_scale = nn.Parameter(torch.tensor(15.0))

        # Fast temporal basis: one gate vector per fast timestep.
        self.fast_basis = nn.Parameter(torch.randn(cfg.T, D) * 0.02)

        # ═══ FIX #1: slow basis — fewer, decimated timesteps. ═══
        self.slow_basis = nn.Parameter(torch.randn(cfg.T_slow, D) * 0.02)
        # Slow drive is deliberately weaker — it is a summary signal.
        self.slow_scale = nn.Parameter(torch.tensor(5.0))

    def forward(self, token_ids: Tensor) -> Tensor:
        """Map (B, S) token ids to a (T + T_slow, B*S, D) input current."""
        batch, seq = token_ids.shape
        width = self.cfg.d_model

        features = self.temporal_proj(self.embed(token_ids))
        features = features.reshape(batch * seq, width).unsqueeze(0)

        # Each path modulates the shared features with sigmoid time-gates.
        fast_path = torch.sigmoid(self.fast_basis).unsqueeze(1) * features * self.drive_scale
        slow_path = torch.sigmoid(self.slow_basis).unsqueeze(1) * features * self.slow_scale

        # Fast timesteps first, then slow.
        return torch.cat([fast_path, slow_path], dim=0)
|
| 327 |
+
|
| 328 |
+
|
| 329 |
+
# ─────────────────────────────────────────────────────────────────────────────
|
| 330 |
+
# §4 SPIKING SYNAPTIC RESONANCE (v3 — Sparse Top-K)
|
| 331 |
+
# ─────────────────────────────────────────────────────────────────────────────
|
| 332 |
+
|
| 333 |
+
class SpikingSynapticResonance(nn.Module):
    """
    v3 — FIX #5: Sparse Resonance

    Spiking analogue of multi-head attention: queries and keys are spike
    trains (produced by dedicated LIF neurons), and the "resonance" score
    between two positions is the dot product of their over-time spike
    patterns — i.e. how strongly they co-fire.

    Instead of a full O(S²) attention matrix:
      1. Compute full co-fire resonance (still needed for causality)
      2. Keep only top-K values per query position
      3. Zero out the rest → sparse attention → less memory, faster

    For S=512, top_k=64 → 87.5% sparsity in the attention matrix.
    """

    def __init__(self, cfg: NordConfig):
        super().__init__()
        self.cfg = cfg
        self.n_heads = cfg.n_heads
        self.d_head = cfg.d_model // cfg.n_heads
        self.top_k = cfg.resonance_top_k
        D = cfg.d_model

        # Standard QKV/output projections; Q and K feed LIF neurons below.
        self.W_q = nn.Linear(D, D, bias=False)
        self.W_k = nn.Linear(D, D, bias=False)
        self.W_v = nn.Linear(D, D, bias=False)
        self.W_o = nn.Linear(D, D, bias=False)

        # LIF neurons that convert Q/K currents into spike trains.
        self.lif_q = AssociativeLIF(D, cfg)
        self.lif_k = AssociativeLIF(D, cfg)

        # Learnable temperature, initialized to 1/sqrt(d_head).
        self.resonance_temp = nn.Parameter(
            torch.tensor(1.0 / math.sqrt(self.d_head)))

    def forward(self, x_spikes: Tensor) -> Tensor:
        """(T_total, B, S, D) spikes → (T_total, B, S, D) context.

        The context is computed once and broadcast over the time axis
        (attention itself is time-invariant here).
        """
        T_total, B, S, D = x_spikes.shape
        H, Dh = self.n_heads, self.d_head

        # Project all timesteps at once; Q/K become LIF input currents.
        x_flat = x_spikes.reshape(T_total * B * S, D)
        q_current = self.W_q(x_flat).reshape(T_total, B * S, D)
        k_current = self.W_k(x_flat).reshape(T_total, B * S, D)
        v_raw = self.W_v(x_flat).reshape(T_total, B, S, D)

        q_spikes, _ = self.lif_q(q_current)
        k_spikes, _ = self.lif_k(k_current)

        q_spikes = q_spikes.reshape(T_total, B, S, H, Dh)
        k_spikes = k_spikes.reshape(T_total, B, S, H, Dh)

        # Fold (time, head-dim) together so the matmul scores temporal
        # co-firing between positions: (B, H, S, T_total*Dh).
        q_flat = q_spikes.permute(1, 3, 2, 0, 4).reshape(B, H, S, T_total * Dh)
        k_flat = k_spikes.permute(1, 3, 2, 0, 4).reshape(B, H, S, T_total * Dh)

        resonance = torch.matmul(q_flat, k_flat.transpose(-2, -1))
        resonance = resonance * self.resonance_temp

        # Causal mask: future positions get -inf before softmax.
        causal_mask = torch.triu(
            torch.ones(S, S, device=x_spikes.device, dtype=torch.bool), diagonal=1
        )
        resonance.masked_fill_(causal_mask.unsqueeze(0).unsqueeze(0), float("-inf"))

        # ═══ FIX #5: Top-K Sparse Attention ═══
        # Keep only top-K resonance scores per query position; the rest are
        # set to -inf so softmax zeroes them → sparse attention, less memory
        # for long sequences.
        K = min(self.top_k, S)
        if K < S:
            # Find top-K per query row.
            top_vals, top_idx = torch.topk(resonance, K, dim=-1)  # (B,H,S,K)
            # Create sparse mask: -inf everywhere, then scatter top-K back.
            sparse_res = torch.full_like(resonance, float("-inf"))
            sparse_res.scatter_(-1, top_idx, top_vals)
            resonance = sparse_res

        # Softmax in fp32 for numerical stability under fp16 training.
        attn = F.softmax(resonance.float(), dim=-1).to(resonance.dtype)

        # Values are time-averaged (rate-coded) before head mixing.
        v_mean = v_raw.mean(dim=0)
        v_heads = v_mean.reshape(B, S, H, Dh).permute(0, 2, 1, 3)
        context = torch.matmul(attn, v_heads)
        context = context.permute(0, 2, 1, 3).reshape(B, S, D)
        out = self.W_o(context)

        # Broadcast the single context across all timesteps.
        return out.unsqueeze(0).expand(T_total, -1, -1, -1)
|
| 412 |
+
|
| 413 |
+
|
| 414 |
+
# ─────────────────────────────────────────────────────────────────────────────
|
| 415 |
+
# §5 NORD BLOCK (v3 — LeakyClamp + LayerScale)
|
| 416 |
+
# ─────────────────────────────────────────────────────────────────────────────
|
| 417 |
+
|
| 418 |
+
class SpikingFeedForward(nn.Module):
    """Two-stage spiking MLP: Linear → LIF → Linear → LIF.

    Operates on (T, B, S, D) spike tensors; each LIF stage runs over the
    time axis with the sequence folded into the batch dimension.
    """

    def __init__(self, cfg: NordConfig):
        super().__init__()
        self.up = nn.Linear(cfg.d_model, cfg.d_ff, bias=False)
        self.down = nn.Linear(cfg.d_ff, cfg.d_model, bias=False)
        self.lif1 = AssociativeLIF(cfg.d_ff, cfg)
        self.lif2 = AssociativeLIF(cfg.d_model, cfg)

    def forward(self, x: Tensor) -> Tensor:
        n_t, n_b, n_s, width = x.shape
        flat = n_t * n_b * n_s

        # Expand to d_ff and spike.
        hidden = self.up(x.reshape(flat, width)).reshape(n_t, n_b * n_s, -1)
        hidden, _ = self.lif1(hidden)

        # Project back to d_model and spike again.
        hidden = hidden.reshape(n_t, n_b, n_s, -1)
        hidden = self.down(hidden.reshape(flat, -1)).reshape(n_t, n_b * n_s, width)
        hidden, _ = self.lif2(hidden)
        return hidden.reshape(n_t, n_b, n_s, width)
|
| 434 |
+
|
| 435 |
+
|
| 436 |
+
class LeakyClamp(nn.Module):
    """
    ═══ FIX #2: LeakyClamp ═══

    Softened alternative to a hard ReLU that preserves sub-threshold
    (negative) membrane information instead of zeroing it:

        output = x                      if x >= 0
        output = max(leak * x, floor)   if x <  0

    (DOC FIX: the negative branch is a *clamped leak* — leak·x bounded
    below by `floor` — not `floor + leak·x` as previously documented.)

    `floor` and `leak` are learnable per-channel. Defaults give
    floor ≈ -0.1, leak ≈ 0.1: a gentle pass-through of negatives.
    """

    def __init__(self, d: int, floor_init: float = -0.1, leak_init: float = 0.1):
        super().__init__()
        # Learnable floor (per-channel): lowest value negatives may reach.
        self.floor = nn.Parameter(torch.full((d,), floor_init))
        # Leak stored in logit space so sigmoid keeps the slope in (0, 1).
        self.leak_raw = nn.Parameter(torch.full((d,), math.log(leak_init / (1 - leak_init + 1e-6))))

    @property
    def leak(self) -> Tensor:
        # Per-channel slope for the negative branch, always in (0, 1).
        return torch.sigmoid(self.leak_raw)

    def forward(self, x: Tensor) -> Tensor:
        """Identity for x >= 0; floor-clamped leaky pass-through for x < 0."""
        neg_part = (self.leak * x).clamp(min=self.floor)
        return torch.where(x >= 0, x, neg_part)
|
| 465 |
+
|
| 466 |
+
|
| 467 |
+
class NordBlock(nn.Module):
    """
    One NORD layer (v3): pre-norm sparse resonance and spiking FFN, each
    behind a LayerScale residual, followed by LeakyClamp (not ReLU).
    """

    def __init__(self, cfg: NordConfig, layer_idx: int = 0):
        super().__init__()
        D = cfg.d_model
        self.norm1 = nn.LayerNorm(D)
        self.norm2 = nn.LayerNorm(D)
        self.resonance = SpikingSynapticResonance(cfg)
        self.ffn = SpikingFeedForward(cfg)

        # LayerScale: small per-channel residual gains, shrunk with depth.
        init_scale = 0.1 / max(cfg.n_layers, 1)
        self.gamma_attn = nn.Parameter(torch.full((D,), init_scale))
        self.gamma_ffn = nn.Parameter(torch.full((D,), init_scale))

        # ═══ FIX #2: LeakyClamp instead of ReLU ═══
        self.clamp = LeakyClamp(D, floor_init=cfg.clamp_floor)

    @staticmethod
    def _safe_norm(norm_layer: nn.LayerNorm, x: Tensor) -> Tensor:
        # LayerNorm computed in fp32 (stable under fp16 training), then
        # cast back to the incoming dtype.
        weight = None if norm_layer.weight is None else norm_layer.weight.float()
        bias = None if norm_layer.bias is None else norm_layer.bias.float()
        normed = F.layer_norm(
            x.float(), norm_layer.normalized_shape, weight, bias, norm_layer.eps)
        return normed.to(x.dtype)

    def forward(self, x: Tensor) -> Tensor:
        # Residual 1: sparse synaptic resonance on the pre-normed input.
        x = x + self.gamma_attn * self.resonance(self._safe_norm(self.norm1, x))
        # Residual 2: spiking feed-forward.
        x = x + self.gamma_ffn * self.ffn(self._safe_norm(self.norm2, x))
        # FIX #2: keep bounded sub-threshold negatives alive.
        return self.clamp(x)
|
| 508 |
+
|
| 509 |
+
|
| 510 |
+
# ─────────────────────────────────────────────────────────────────────────────
|
| 511 |
+
# §6 STDP ENGINE (v3 — Reward-Modulated)
|
| 512 |
+
# ─────────────────────────────────────────────────────────────────────────────
|
| 513 |
+
|
| 514 |
+
class STDPEngine:
    """
    ═══ FIX #4: Reward-Modulated STDP ═══

    Local Hebbian plasticity steered by the global LM objective. Plain
    STDP strengthens any co-firing pair, even when that hurts the loss;
    here the raw update is scaled by a reward derived from the loss trend:

        reward   = sigmoid((loss_ema − current_loss) · reward_scale)
        dW_final = dW_stdp · (2·reward − 1)        # mapped into (−1, 1)

    so an improving loss (reward > 0.5) applies the update and a
    worsening loss reverses it. `loss_ema` is an exponential moving
    average baseline of recent losses.
    """

    def __init__(self, cfg: NordConfig):
        self.cfg = cfg
        self.a_plus = cfg.stdp_a_plus
        self.a_minus = cfg.stdp_a_minus
        self.tau_plus = cfg.stdp_tau_plus
        self.tau_minus = cfg.stdp_tau_minus
        self.w_max = cfg.stdp_w_max
        self.w_min = cfg.stdp_w_min
        self.reward_scale = cfg.stdp_reward_scale

        # EMA baseline for the reward signal; starts pessimistically high.
        self._loss_ema: float = 10.0
        self._ema_decay: float = 0.99

    def update_reward(self, current_loss: float):
        """Fold the latest loss into the EMA baseline."""
        decay = self._ema_decay
        self._loss_ema = decay * self._loss_ema + (1 - decay) * current_loss

    def _compute_reward(self, current_loss: float) -> float:
        """Sigmoid-squashed improvement of the current loss over baseline."""
        improvement = self._loss_ema - current_loss  # positive = improving
        return float(torch.sigmoid(torch.tensor(improvement * self.reward_scale)).item())

    @torch.no_grad()
    def compute_stdp_update(self, pre_spikes: Tensor, post_spikes: Tensor) -> Tensor:
        """Pair-based STDP over (T, N) spike trains → (N_post, N_pre) dW."""
        n_steps = pre_spikes.shape[0]
        decay_plus = math.exp(-1.0 / self.tau_plus)
        decay_minus = math.exp(-1.0 / self.tau_minus)

        trace_pre = torch.zeros_like(pre_spikes[0])
        trace_post = torch.zeros_like(post_spikes[0])
        dW = torch.zeros(
            post_spikes.shape[1], pre_spikes.shape[1],
            device=pre_spikes.device, dtype=pre_spikes.dtype)

        for step in range(n_steps):
            pre_now, post_now = pre_spikes[step], post_spikes[step]
            trace_pre = trace_pre * decay_plus + pre_now
            trace_post = trace_post * decay_minus + post_now
            # Potentiation: post fires while the pre trace is elevated.
            if post_now.any():
                dW += self.a_plus * torch.outer(post_now, trace_pre)
            # Depression: pre fires while the post trace is elevated.
            if pre_now.any():
                dW -= self.a_minus * torch.outer(trace_post, pre_now)
        return dW

    @torch.no_grad()
    def apply_to_layer(self, layer: nn.Linear, pre_spikes: Tensor,
                       post_spikes: Tensor, current_loss: Optional[float] = None):
        """Compute (optionally reward-modulated) STDP and apply to `layer.weight`."""
        # Collapse an extra batch axis, if present, to (T, N) trains.
        if pre_spikes.dim() == 3:
            pre_spikes = pre_spikes.mean(dim=1)
        if post_spikes.dim() == 3:
            post_spikes = post_spikes.mean(dim=1)

        dW = self.compute_stdp_update(pre_spikes, post_spikes)

        # ═══ Reward modulation ═══
        if current_loss is not None:
            reward = self._compute_reward(current_loss)
            # Map reward (0,1) → (−1,1): improving applies, worsening reverses.
            dW = dW * (2.0 * reward - 1.0)
            self.update_reward(current_loss)

        # Crop to the layer shape, apply, and clamp the weights.
        out_dim, in_dim = layer.weight.shape
        dW = dW[:out_dim, :in_dim]
        layer.weight.data = (layer.weight.data + dW).clamp(self.w_min, self.w_max)
|
| 597 |
+
|
| 598 |
+
|
| 599 |
+
# ─────────────────────────────────────────────────────────────────────────────
|
| 600 |
+
# §7 NORD MODEL (v3 — Multi-Scale + Temporal Smoothing Readout)
|
| 601 |
+
# ─────────────────────────────────────────────────────────────────────────────
|
| 602 |
+
|
| 603 |
+
class NordModel(nn.Module):
    """
    v3 — Full architecture.

    Pipeline:
        tokens → MultiScale TemporalEncoder → input_LIF(persistent)
               → [NordBlock(LeakyClamp, SparseResonance) × N]
               → readout_LIF → EMA-smoothed membrane → LM_head

    FIX #6 — Temporal Smoothing Readout:
        Instead of a simple mean over timesteps, an exponential moving
        average of the readout membrane weights later timesteps more —
        capturing the "final state" while retaining history. The
        smoothing factor is learnable.
    """

    def __init__(self, cfg: NordConfig):
        super().__init__()
        self.cfg = cfg

        self.encoder = TemporalSpikeEncoder(cfg)

        # Input LIF with (optionally) persistent membrane state — FIX #1.
        self.input_lif = AssociativeLIF(
            cfg.d_model, cfg, persistent=cfg.persistent_mem)

        self.blocks = nn.ModuleList([
            NordBlock(cfg, layer_idx=i) for i in range(cfg.n_layers)
        ])

        # Readout LIF (persistent → accumulates cross-batch info).
        self.readout_lif = AssociativeLIF(
            cfg.d_model, cfg, persistent=cfg.persistent_mem)

        # ═══ FIX #6: Temporal Smoothing ═══
        # Learnable EMA decay for the readout; sigmoid(1.4) ≈ 0.8 puts
        # most weight on recent timesteps initially.
        self.readout_ema_raw = nn.Parameter(torch.tensor(1.4))

        self.readout_norm = nn.LayerNorm(cfg.d_model)
        self.lm_head = nn.Linear(cfg.d_model, cfg.vocab_size, bias=False)

        # Reward-modulated STDP engine (FIX #4) and its spike caches.
        self.stdp = STDPEngine(cfg)
        self._stdp_cache: Dict[str, Tensor] = {}
        self._last_loss: Optional[float] = None

    @property
    def readout_ema_decay(self) -> Tensor:
        # Readout smoothing factor α ∈ (0, 1).
        return torch.sigmoid(self.readout_ema_raw)

    def reset_state(self):
        """Reset all persistent membrane states (call between unrelated sequences)."""
        self.input_lif.reset_state()
        self.readout_lif.reset_state()

    def forward(
        self,
        token_ids: Tensor,
        enable_stdp: bool = False,
    ) -> Tuple[Tensor, Dict[str, float]]:
        """Token ids (B, S) → (logits (B, S, vocab), spike-rate stats).

        With `enable_stdp`, detached pre/post spike tensors are cached per
        block for a subsequent `stdp_update()` call.
        (Annotation fix: stats values are Python floats, not Tensors.)
        """
        B, S = token_ids.shape
        T_total = self.cfg.T_total
        D = self.cfg.d_model

        # ── Encode (Multi-Scale) → Spike ──
        current = self.encoder(token_ids)      # (T+T_slow, B*S, D)
        spikes, _ = self.input_lif(current)    # (T_total, B*S, D)
        spikes = spikes.reshape(T_total, B, S, D)

        # Accumulate rate tensors here; .item() is read only once at the
        # end so the forward pass has a single GPU sync point.
        _rates = [spikes.detach().mean()]

        if enable_stdp:
            self._stdp_cache["input"] = spikes.detach()

        # ── Nord Blocks ──
        x = spikes
        for i, block in enumerate(self.blocks):
            prev = x.detach() if enable_stdp else None
            x = block(x)
            _rates.append(x.detach().mean())

            if enable_stdp and prev is not None:
                self._stdp_cache[f"block_{i}_pre"] = prev
                self._stdp_cache[f"block_{i}_post"] = x.detach()

        # ── Readout: EMA-smoothed membrane potential ──
        x_flat = x.reshape(T_total, B * S, D)
        readout_spikes, v_membrane = self.readout_lif(x_flat)

        # ═══ FIX #6: EMA temporal smoothing ═══
        # Exponentially weight later timesteps more than a plain mean would.
        alpha = self.readout_ema_decay  # scalar in (0, 1)
        ema = torch.zeros(B * S, D, device=x.device, dtype=v_membrane.dtype)
        for t in range(T_total):
            ema = alpha * ema + (1 - alpha) * v_membrane[t]
        # `ema` now holds the smoothed membrane potential.
        v_smooth = ema.reshape(B, S, D)

        # Hybrid readout: smoothed membrane + spike rate.
        s_mean = readout_spikes.mean(dim=0).reshape(B, S, D)
        readout = v_smooth + s_mean

        # LayerNorm in fp32 for stability, then back to the working dtype.
        x_norm = F.layer_norm(
            readout.float(),
            self.readout_norm.normalized_shape,
            self.readout_norm.weight.float() if self.readout_norm.weight is not None else None,
            self.readout_norm.bias.float() if self.readout_norm.bias is not None else None,
            self.readout_norm.eps,
        ).to(readout.dtype)
        logits = self.lm_head(x_norm)

        # Stats (single GPU sync point)
        stats = {}
        stats["encoder_spike_rate"] = _rates[0].item()
        for i in range(self.cfg.n_layers):
            stats[f"block_{i}_spike_rate"] = _rates[i + 1].item()
        out_rate = readout_spikes.detach().mean().item()
        stats["output_spike_rate"] = out_rate
        stats["sparsity"] = 1.0 - out_rate

        return logits, stats

    @torch.no_grad()
    def stdp_update(self, current_loss: Optional[float] = None):
        """
        Apply cached reward-modulated STDP updates to each block's W_v.

        v3: pass `current_loss` for reward modulation; otherwise falls back
        to the loss stored via `set_last_loss`, else unmodulated STDP.
        """
        # BUG FIX: `current_loss or self._last_loss` treated a legitimate
        # loss of 0.0 as "missing" and silently fell back to the stored
        # loss — compare against None explicitly instead.
        loss_val = current_loss if current_loss is not None else self._last_loss
        for i, block in enumerate(self.blocks):
            pre_key = f"block_{i}_pre"
            post_key = f"block_{i}_post"
            if pre_key in self._stdp_cache and post_key in self._stdp_cache:
                pre = self._stdp_cache[pre_key]
                post = self._stdp_cache[post_key]
                T_dim = pre.shape[0]
                # Average over batch/sequence → (T, d_model) spike trains.
                pre_flat = pre.reshape(T_dim, -1, self.cfg.d_model).mean(dim=1)
                post_flat = post.reshape(T_dim, -1, self.cfg.d_model).mean(dim=1)
                self.stdp.apply_to_layer(
                    block.resonance.W_v, pre_flat, post_flat,
                    current_loss=loss_val,
                )
        self._stdp_cache.clear()

    def set_last_loss(self, loss: float):
        """Store loss for reward-modulated STDP during inference."""
        self._last_loss = loss

    def count_params(self) -> str:
        """Human-readable total / trainable parameter counts."""
        total = sum(p.numel() for p in self.parameters())
        train = sum(p.numel() for p in self.parameters() if p.requires_grad)
        return f"Total: {total/1e6:.1f}M | Trainable: {train/1e6:.1f}M"
|
| 754 |
+
|
| 755 |
+
|
| 756 |
+
# ─────────────────────────────────────────────────────────────────────────────
|
| 757 |
+
# §8 UTILITY
|
| 758 |
+
# ─────────────────────────────────────────────────────────────────────────────
|
| 759 |
+
|
| 760 |
+
def estimate_vram(cfg: NordConfig) -> str:
    """Rough VRAM budget (parameters + activations) for a given config.

    Counts the token embedding, per-layer projection/FFN/norm/cascade
    weights, and the LM head, at 2 bytes per value for fp16 (4 otherwise);
    activations assume batch size 1 at cfg.max_seq_len. Returns a
    human-readable three-line summary.
    """
    bytes_per_value = 2 if cfg.dtype == torch.float16 else 4

    per_layer = (
        4 * cfg.d_model * cfg.d_model        # W_q, W_k, W_v, W_o
        + 2 * cfg.d_model * cfg.d_ff         # FFN up + down
        + 6 * cfg.d_model                    # norms / gains / clamp vectors
        + cfg.n_clusters * cfg.n_clusters    # neighbor_weights
    )
    n_values = (
        cfg.vocab_size * cfg.d_model         # token embedding
        + cfg.n_layers * per_layer
        + cfg.vocab_size * cfg.d_model       # LM head
    )
    param_bytes = n_values * bytes_per_value

    # Activations: B=1, 2 bytes per value, ×2 for the backward pass.
    act_bytes = cfg.T_total * 1 * cfg.max_seq_len * cfg.d_model * cfg.n_layers * 2 * 2

    total_gb = (param_bytes + act_bytes) / (1024 ** 3)
    return (
        f"Parameters: ~{param_bytes / 1e6:.0f} MB\n"
        f"Activations: ~{act_bytes / 1e6:.0f} MB (B=1, S={cfg.max_seq_len})\n"
        f"Total Est: ~{total_gb:.2f} GB (target: 8 GB RTX 5070)"
    )
|
train_nord.py
ADDED
|
@@ -0,0 +1,456 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
╔══════════════════════════════════════════════════════════════════════════╗
|
| 3 |
+
║ PROJECT NORD — Крок 2: Навчання SNN моделі ║
|
| 4 |
+
║ ║
|
| 5 |
+
║ Просто запусти: ║
|
| 6 |
+
║ python train_nord.py ║
|
| 7 |
+
║ ║
|
| 8 |
+
║ Воно запитає: ║
|
| 9 |
+
║ 1. Де лежить датасет (JSONL файл) ║
|
| 10 |
+
║ 2. Куди зберігати модель ║
|
| 11 |
+
║ І все — далі тренує автоматично. ║
|
| 12 |
+
║ ║
|
| 13 |
+
║ Можна зупинити Ctrl+C і продовжити пізніше — модель збережеться. ║
|
| 14 |
+
╚══════════════════════════════════════════════════════════════════════════╝
|
| 15 |
+
|
| 16 |
+
Потрібно встановити один раз:
|
| 17 |
+
pip install torch transformers lmdb tqdm
|
| 18 |
+
"""
|
| 19 |
+
|
| 20 |
+
from __future__ import annotations
|
| 21 |
+
|
| 22 |
+
import json
|
| 23 |
+
import math
|
| 24 |
+
import os
|
| 25 |
+
import shutil
|
| 26 |
+
import struct
|
| 27 |
+
import sys
|
| 28 |
+
import time
|
| 29 |
+
from pathlib import Path
|
| 30 |
+
from typing import Optional
|
| 31 |
+
|
| 32 |
+
import torch
|
| 33 |
+
import torch.nn.functional as F
|
| 34 |
+
from torch.amp import autocast
|
| 35 |
+
from torch.utils.data import Dataset, DataLoader
|
| 36 |
+
|
| 37 |
+
from nord_core import NordConfig, NordModel
|
| 38 |
+
|
| 39 |
+
|
| 40 |
+
# ─────────────────────────────────────────────────────────────────────────────
|
| 41 |
+
# ТОКЕНІЗАТОР
|
| 42 |
+
# ─────────────────────────────────────────────────────────────────────────────
|
| 43 |
+
|
| 44 |
+
class NordTokenizer:
    """Wrapper around the Llama-3.2 tokenizer for Project Nord.

    Downloads/loads the tokenizer named by ``cfg.tokenizer_id`` via
    HuggingFace `transformers` and exposes fixed-length encode/decode.
    """

    def __init__(self, cfg: NordConfig):
        # Imported lazily so the module can be imported without transformers.
        from transformers import AutoTokenizer

        print(f" [*] Завантажуємо Llama-3.2 токенізатор...")
        self.tokenizer = AutoTokenizer.from_pretrained(
            cfg.tokenizer_id, trust_remote_code=True,
        )
        # Llama tokenizers ship without a pad token; reuse EOS for padding.
        if self.tokenizer.pad_token is None:
            self.tokenizer.pad_token = self.tokenizer.eos_token
            self.tokenizer.pad_token_id = self.tokenizer.eos_token_id

        self.max_len = cfg.max_seq_len          # fixed sequence length for encode()
        self.vocab_size = self.tokenizer.vocab_size
        # Grow the model config's vocab to match the tokenizer so embedding
        # lookups can never index out of range (mutates cfg in place).
        if cfg.vocab_size < self.vocab_size:
            cfg.vocab_size = self.vocab_size

        print(f" [✓] Токенізатор готовий (vocab={self.vocab_size:,})")

    def encode(self, text: str) -> torch.Tensor:
        """Encode *text* to a (1, max_len) LongTensor, truncated and padded."""
        enc = self.tokenizer(
            text, return_tensors="pt",
            max_length=self.max_len, truncation=True, padding="max_length",
        )
        return enc.input_ids

    def decode(self, ids) -> str:
        """Decode a sequence of token ids to text, dropping special tokens."""
        return self.tokenizer.decode(ids, skip_special_tokens=True)

    @property
    def pad_id(self) -> int:
        """Padding token id (equals EOS id when no native pad token exists)."""
        return self.tokenizer.pad_token_id
|
| 78 |
+
|
| 79 |
+
|
| 80 |
+
# ─────────────────────────────────────────────────────────────────────────────
|
| 81 |
+
# LMDB ДАТАСЕТ (on-disk, zero RAM)
|
| 82 |
+
# ─────────────────────────────────────────────────────────────────────────────
|
| 83 |
+
|
| 84 |
+
class LMDBDataset(Dataset):
    """Map-style dataset backed by an on-disk LMDB database (near-zero RAM).

    Keys are ``sample_XXXXXXXXXX`` (10-digit zero-padded index) holding raw
    int32 token-id bytes, plus a ``__len__`` key with the sample count.
    """

    def __init__(self, db_path: str, max_seq_len: int):
        import lmdb
        self.db_path = db_path
        self.max_seq_len = max_seq_len
        self._env = None  # opened lazily — can't pickle lmdb.Environment on Windows

        # Read length once with a short-lived environment, then close it so
        # no open handle is carried across DataLoader worker forks/spawns.
        env = lmdb.open(db_path, readonly=True, lock=False, readahead=False, meminit=False)
        with env.begin(write=False) as txn:
            raw = txn.get(b"__len__")
            self.length = struct.unpack("<Q", raw)[0]
        env.close()
        print(f" [✓] LMDB: {self.length:,} зразків")

    def _get_env(self):
        """Lazy-open LMDB per worker process (safe for multiprocessing)."""
        if self._env is None:
            import lmdb
            self._env = lmdb.open(
                self.db_path, readonly=True, lock=False,
                readahead=True, meminit=False, max_readers=64,
            )
        return self._env

    def __len__(self): return self.length

    def __getitem__(self, idx):
        # Fetch the raw int32 token bytes for this sample.
        env = self._get_env()
        with env.begin(write=False) as txn:
            raw = txn.get(f"sample_{idx:010d}".encode())
        # bytearray copy: torch.frombuffer needs a writable buffer and the
        # LMDB page must not be referenced after the transaction closes.
        ids = torch.frombuffer(bytearray(raw), dtype=torch.int32).long()
        S = self.max_seq_len
        # Truncate or right-pad (with zeros) to exactly max_seq_len tokens.
        return ids[:S] if ids.shape[0] >= S else F.pad(ids, (0, S - ids.shape[0]))
|
| 118 |
+
|
| 119 |
+
|
| 120 |
+
def build_lmdb(jsonl_path: str, db_path: str, tokenizer: NordTokenizer,
               max_seq_len: int, map_size_gb: float = 50.0):
    """Convert a JSONL corpus into an LMDB database (one-time preprocessing).

    Each JSONL line is expected to be an object with a ``text`` (or
    ``content`` / ``passage``) field. Lines that are blank, unparseable,
    shorter than 30 characters, or tokenize to fewer than 10 non-pad tokens
    are skipped. Token ids are stored as raw int32 bytes under
    ``sample_XXXXXXXXXX`` keys; ``__len__`` and ``__total_tokens__`` hold
    the final counts (little-endian uint64).
    """
    import lmdb
    from tqdm import tqdm

    print(f"\n [*] Будуємо LMDB базу даних...")
    print(f" Це робиться ОДИН раз — потім тренуєшся з бази нескінченно.")
    print(f" Джерело: {jsonl_path}")
    print(f" Ціль: {db_path}")

    # First pass: count lines so tqdm can show real progress.
    print(f" [*] Рахуємо рядки...")
    with open(jsonl_path, "r", encoding="utf-8") as f:
        n_lines = sum(1 for _ in f)
    print(f" Знайдено: {n_lines:,} рядків")

    env = lmdb.open(db_path, map_size=int(map_size_gb * (1024 ** 3)))
    count = 0
    total_tokens = 0

    txn = env.begin(write=True)
    try:
        with open(jsonl_path, "r", encoding="utf-8") as f:
            for line in tqdm(f, total=n_lines, desc=" Токенізація", unit=" doc"):
                line = line.strip()
                if not line:
                    continue
                try:
                    obj = json.loads(line)
                except json.JSONDecodeError:
                    continue

                # Accept any of the common field names for the document body.
                text = obj.get("text") or obj.get("content") or obj.get("passage", "")
                if len(text) < 30:
                    continue

                # (1, max_len) → (max_len,); encode() pads/truncates already.
                ids = tokenizer.encode(text).squeeze(0)
                non_pad = (ids != tokenizer.pad_id).sum().item()
                if non_pad < 10:
                    continue

                txn.put(f"sample_{count:010d}".encode(),
                        ids.to(torch.int32).numpy().tobytes())
                count += 1
                total_tokens += non_pad

                # Commit in chunks to bound the write-transaction size,
                # then immediately open a fresh transaction.
                if count % 50_000 == 0:
                    txn.commit()
                    txn = env.begin(write=True)
                    print(f" ... {count:,} зразків, {total_tokens/1e6:.1f}M токенів")

        # Metadata written in the same (final) transaction as the last chunk.
        txn.put(b"__len__", struct.pack("<Q", count))
        txn.put(b"__total_tokens__", struct.pack("<Q", total_tokens))
        txn.commit()
    except BaseException:
        # Abort the open transaction on any failure (incl. KeyboardInterrupt)
        # so the environment is left consistent, then re-raise.
        txn.abort()
        raise

    env.close()

    # Report final on-disk size of the database directory.
    db_size = sum(f.stat().st_size for f in Path(db_path).rglob("*") if f.is_file())
    print(f"\n [✓] LMDB готова!")
    print(f" Зразків: {count:,}")
    print(f" Токенів: {total_tokens:,} ({total_tokens/1e6:.1f}M)")
    print(f" На диску: {db_size / (1024**3):.2f} GB")
|
| 186 |
+
|
| 187 |
+
|
| 188 |
+
# ─────────────────────────────────────────────────────────────────────────────
|
| 189 |
+
# LR SCHEDULE
|
| 190 |
+
# ─────────────────────────────────────────────────────────────────────────────
|
| 191 |
+
|
| 192 |
+
def get_lr(step: int, cfg: NordConfig) -> float:
    """Linear-warmup + cosine-decay learning-rate schedule.

    Ramps linearly from ``cfg.lr / cfg.warmup_steps`` up to ``cfg.lr`` over
    the first ``cfg.warmup_steps`` steps, then follows a cosine curve down
    to ``cfg.min_lr`` at ``cfg.max_steps`` (clamped there afterwards).
    """
    warmup = cfg.warmup_steps
    if step < warmup:
        # +1 so step 0 already gets a non-zero learning rate.
        return cfg.lr * (step + 1) / warmup
    decay_span = max(1, cfg.max_steps - warmup)
    progress = min((step - warmup) / decay_span, 1.0)
    cosine = 0.5 * (1.0 + math.cos(math.pi * progress))
    return cfg.min_lr + cosine * (cfg.lr - cfg.min_lr)
|
| 197 |
+
|
| 198 |
+
|
| 199 |
+
# ─────────────────────────────────────────────────────────────────────────────
|
| 200 |
+
# ЧЕКПОІНТ МЕНЕДЖЕР
|
| 201 |
+
# ─────────────────────────────────────────────────────────────────────────────
|
| 202 |
+
|
| 203 |
+
class CheckpointManager:
    """Save/restore training checkpoints with automatic rotation.

    Keeps the last ``keep_last`` step checkpoints plus a ``nord_latest.pt``
    copy used for auto-resume, and can export a slim inference-only file.
    """

    def __init__(self, save_dir: str, keep_last: int = 5):
        self.save_dir = Path(save_dir)
        self.save_dir.mkdir(parents=True, exist_ok=True)
        self.keep_last = keep_last  # how many nord_step_*.pt files to retain

    def save(self, model, optimizer, scaler, step, loss, cfg):
        """Write a full resumable checkpoint (model + optimizer + scaler)."""
        path = self.save_dir / f"nord_step_{step:07d}.pt"
        torch.save({
            "step": step, "loss": loss,
            "model_state_dict": model.state_dict(),
            "optimizer_state_dict": optimizer.state_dict(),
            "scaler_state_dict": scaler.state_dict(),
            # dtype excluded: torch.dtype is not portably serializable here.
            "config": {k: v for k, v in cfg.__dict__.items()
                       if not k.startswith("_") and k != "dtype"},
        }, path)

        # Refresh the "latest" copy (unlink first — Windows can't overwrite).
        latest = self.save_dir / "nord_latest.pt"
        if latest.exists():
            latest.unlink()
        shutil.copy2(path, latest)

        # Cleanup old checkpoints beyond keep_last, oldest (by mtime) first.
        ckpts = sorted(self.save_dir.glob("nord_step_*.pt"), key=lambda p: p.stat().st_mtime)
        for old in ckpts[:max(0, len(ckpts) - self.keep_last)]:
            old.unlink()

        print(f" [💾] Збережено: {path.name} (loss={loss:.4f})")

    def load(self, model, optimizer, scaler, device) -> int:
        """Restore the newest checkpoint in place; return its step (0 if none).

        NOTE(review): ``weights_only=False`` unpickles arbitrary objects —
        only load checkpoints from trusted sources.
        """
        latest = self.save_dir / "nord_latest.pt"
        if not latest.exists():
            # Fall back to the highest-numbered step checkpoint, if any.
            ckpts = sorted(self.save_dir.glob("nord_step_*.pt"))
            latest = ckpts[-1] if ckpts else None
        if latest is None:
            return 0

        print(f" [*] Відновлюємо з: {latest.name}")
        ckpt = torch.load(latest, map_location=device, weights_only=False)
        model.load_state_dict(ckpt["model_state_dict"])
        optimizer.load_state_dict(ckpt["optimizer_state_dict"])
        scaler.load_state_dict(ckpt["scaler_state_dict"])
        step = ckpt["step"]
        print(f" [✓] Відновлено на кроці {step:,} (loss={ckpt.get('loss', '?')})")
        return step

    def save_final(self, model, cfg):
        """Save a model-only checkpoint for inference (smaller file)."""
        path = self.save_dir / "nord_final.pt"
        torch.save({
            "model_state_dict": model.state_dict(),
            "config": {k: v for k, v in cfg.__dict__.items()
                       if not k.startswith("_") and k != "dtype"},
        }, path)
        print(f" [⭐] Фінальна модель: {path}")
        return path
|
| 259 |
+
|
| 260 |
+
|
| 261 |
+
# ─────────────────────────────────────────────────────────────────────────────
|
| 262 |
+
# ГОЛОВНА ФУНКЦІЯ НАВЧАННЯ
|
| 263 |
+
# ─────────────────────────────────────────────────────────────────────────────
|
| 264 |
+
|
| 265 |
+
def train(dataset_path: str, model_dir: str):
    """Full training loop: build/open LMDB data, create model, train, save.

    Auto-resumes from the newest checkpoint in *model_dir*. Ctrl+C saves a
    checkpoint and exits cleanly; a final inference-only model is written at
    the end either way.
    """
    # ── Config (hyperparameters tuned for a single 8 GB consumer GPU) ──
    cfg = NordConfig(
        device="cuda" if torch.cuda.is_available() else "cpu",
        dtype=torch.float16,
        d_model=512,
        n_heads=8,
        n_layers=6,
        d_ff=1024,
        T=8,
        T_slow=2,
        persistent_mem=False,  # shuffled batches → no persistent state during training
        max_seq_len=512,
        batch_size=4,
        grad_accum=8,
        lr=5e-4,
        max_steps=100_000,
        save_every=1000,
        log_every=10,
    )

    print()
    print("═" * 60)
    print(" PROJECT NORD v3 — Навчання SNN моделі")
    print("═" * 60)
    print(f" GPU: {torch.cuda.get_device_name()}" if torch.cuda.is_available() else " CPU mode")
    print(f" Модель: d={cfg.d_model}, layers={cfg.n_layers}, T={cfg.T}+{cfg.T_slow}={cfg.T_total}")
    print(f" Ефективний батч: {cfg.batch_size} × {cfg.grad_accum} = {cfg.batch_size * cfg.grad_accum}")
    print(f" Кроків: {cfg.max_steps:,}")
    print(f" Датасет: {dataset_path}")
    print(f" Модель → {model_dir}")
    print()

    # ── Tokenizer (may also bump cfg.vocab_size to match) ──
    tokenizer = NordTokenizer(cfg)

    # ── LMDB database (built automatically on first run) ──
    db_path = str(Path(dataset_path).with_suffix("")) + "_lmdb"
    if not Path(db_path).exists():
        build_lmdb(dataset_path, db_path, tokenizer, cfg.max_seq_len)

    dataset = LMDBDataset(db_path, cfg.max_seq_len)
    dataloader = DataLoader(
        dataset, batch_size=cfg.batch_size, shuffle=True,
        num_workers=2, pin_memory=True, drop_last=True, persistent_workers=True,
    )

    # ── Model ──
    # Deliberately NOT calling .half(): autocast converts the forward pass
    # to fp16 while parameters stay fp32, which GradScaler requires.
    print(f"\n [*] Будуємо модель...")
    model = NordModel(cfg).to(cfg.device)
    print(f" [✓] {model.count_params()}")

    # ── Optimizer ──
    optimizer = torch.optim.AdamW(
        model.parameters(), lr=cfg.lr,
        weight_decay=cfg.weight_decay, betas=(0.9, 0.95),
    )
    scaler = torch.amp.GradScaler("cuda", enabled=(cfg.dtype == torch.float16))

    # ── Checkpoints (auto-resume from the newest one, if present) ──
    ckpt_mgr = CheckpointManager(model_dir)
    start_step = ckpt_mgr.load(model, optimizer, scaler, cfg.device)

    # ── TRAINING ──
    model.train()
    data_iter = iter(dataloader)
    running_loss = 0.0
    tokens_seen = 0
    t_start = time.time()

    print(f"\n {'─' * 50}")
    print(f" Старт з кроку {start_step:,} | {len(dataset):,} зразків в базі")
    print(f" Ctrl+C = зупинити (модель збережеться!)")
    print(f" {'─' * 50}\n")

    try:
        for step in range(start_step, cfg.max_steps):
            accum_loss = 0.0
            stats = {}  # last micro-batch's stats dict from the model forward

            # Gradient accumulation: grad_accum micro-batches per optimizer step.
            for _ in range(cfg.grad_accum):
                try:
                    input_ids = next(data_iter)
                except StopIteration:
                    # Epoch boundary — restart the dataloader.
                    data_iter = iter(dataloader)
                    input_ids = next(data_iter)

                input_ids = input_ids.to(cfg.device, non_blocking=True)

                with autocast(device_type="cuda", dtype=torch.float16,
                              enabled=(cfg.dtype == torch.float16)):
                    logits, stats = model(input_ids)

                    # Next-token prediction: logits at t predict token t+1.
                    shift_logits = logits[:, :-1, :].contiguous()
                    shift_labels = input_ids[:, 1:].contiguous()

                    # Divide by grad_accum so accumulated grads average out.
                    loss = F.cross_entropy(
                        shift_logits.reshape(-1, cfg.vocab_size),
                        shift_labels.reshape(-1),
                        ignore_index=tokenizer.pad_id,
                    ) / cfg.grad_accum

                scaler.scale(loss).backward()
                accum_loss += loss.item()
                tokens_seen += input_ids.numel()

            # Optimizer step: unscale first so clipping sees true grad norms.
            scaler.unscale_(optimizer)
            grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), cfg.max_grad_norm)
            scaler.step(optimizer)
            scaler.update()
            optimizer.zero_grad(set_to_none=True)

            # LR schedule (applied to param groups for the following step).
            lr = get_lr(step, cfg)
            for pg in optimizer.param_groups:
                pg["lr"] = lr

            running_loss += accum_loss

            # Periodic console log.
            if step % cfg.log_every == 0 and step > start_step:
                avg = running_loss / cfg.log_every
                elapsed = time.time() - t_start
                tps = tokens_seen / elapsed / 1000 if elapsed > 0 else 0
                sp = stats.get("sparsity", 0)  # presumably spike sparsity — see model forward

                print(
                    f" крок {step:>7,} │ "
                    f"loss {avg:.4f} │ "
                    f"lr {lr:.1e} │ "
                    f"grad {grad_norm:.1f} │ "
                    f"sparsity {sp:.0%} │ "
                    f"{tps:.1f}k tok/s"
                )
                running_loss = 0.0

            # Periodic checkpoint.
            if step > 0 and step % cfg.save_every == 0:
                ckpt_mgr.save(model, optimizer, scaler, step, accum_loss, cfg)

    except KeyboardInterrupt:
        # Graceful stop: save where we are so a re-run resumes from here.
        print(f"\n\n [⏸] Зупинено на кроці {step:,}")
        ckpt_mgr.save(model, optimizer, scaler, step, accum_loss, cfg)
        print(f" Щоб продовжити — просто запусти скрипт знову.")

    # Save the slim inference-only model for chat.py.
    ckpt_mgr.save_final(model, cfg)

    print(f"\n {'═' * 50}")
    print(f" Навчання завершено!")
    print(f" Модель збережена в: {model_dir}")
    print(f" Тепер запускай: python chat.py")
    print(f" {'═' * 50}")
|
| 421 |
+
|
| 422 |
+
|
| 423 |
+
# ─────────────────────────────────────────────────────────────────────────────
|
| 424 |
+
# ENTRY POINT
|
| 425 |
+
# ─────────────────────────────────────────────────────────────────────────────
|
| 426 |
+
|
| 427 |
+
def main():
    """Interactive entry point: prompt for dataset/model paths, then train."""
    print("=" * 60)
    print(" PROJECT NORD — Тренування SNN")
    print("=" * 60)

    # ── Ask for the dataset path (Enter accepts the Windows default) ──
    default_data = os.path.join("D:", os.sep, "nord_dataset", "train_data.jsonl")
    print(f"\n Де лежить датасет? (JSONL файл)")
    print(f" (Enter = {default_data})")
    data_input = input(" Шлях до датасету: ").strip()
    dataset_path = data_input if data_input else default_data

    if not Path(dataset_path).exists():
        print(f"\n [✗] Файл не знайдено: {dataset_path}")
        print(f" Спочатку запусти: python download_data.py")
        sys.exit(1)

    # ── Ask where to store the model / checkpoints ──
    default_model = os.path.join("D:", os.sep, "nord_model")
    print(f"\n Куди зберігати модель?")
    print(f" (Enter = {default_model})")
    model_input = input(" Шлях для моделі: ").strip()
    model_dir = model_input if model_input else default_model

    # ── Run training ──
    train(dataset_path, model_dir)


if __name__ == "__main__":
    main()
|