#!/usr/bin/env python3
"""
ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
β HRAN β Haykin Resonant Attention Network β
β A Novel Architecture From First Principles β
β βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ£
β Strictly derived from: β
β β’ Simon Haykin β "Neural Networks and Learning Machines" (3rd Ed.) β
β β’ First Principles of Computation, Information, and Adaptation β
β βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ£
β Architectural Innovations (each anchored to Haykin chapters): β
β β
β 1. RBF Attention (Ch.5) β Gaussian kernel replaces dot-product β
β Attention_ij = softmax(-Ξ³βq_i - k_jβΒ²) β
β Localizes attention to similar representations (true RBF spirit) β
β β
β 2. Hebbian Seed Init (Ch.2) β "Neurons that fire together wire β
β together." Pre-seeds embeddings with co-occurrence statistics β
β before gradient descent. Bridges unsupervised + supervised. β
β β
β 3. Infomax Activation (Ch.10) β Bell-Sejnowski ICA principle. β
β f(x) = tanh(x) + Ξ±x maximizes mutual information throughput. β
β Strictly avoids information bottleneck in hidden layers. β
β β
β 4. Lateral Inhibition Gate (Ch.9) β Competitive learning. β
β Winners are amplified, weak activations suppressed. Produces β
β sparse, discriminative representations (like cortical columns). β
β β
β 5. Error-Correction + Hebb Fusion (Ch.1) β Combined learning rule: β
β ΞW = Ξ·_bpΒ·βL + Ξ·_hebbΒ·(yΒ·xα΅ - ||y||Β²Β·W) β Oja's rule variant β
β β
β 6. Wiener-SNR Gradient Scaling (Ch.3) β Wiener filter principle: β
β Scale parameter updates by local signal-to-noise ratio. β
β High-signal weights learn fast; noisy weights learn slow. β
ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
"""
import math
import random
import time
import sys
import os
import json
import pickle
from collections import Counter, defaultdict
from typing import List, Tuple, Dict, Optional
import numpy as np
# ─────────────────────────────────────────────────────────────────────────────
# SECTION 1: CONFIGURATION
# ─────────────────────────────────────────────────────────────────────────────
class HRANConfig:
    """All hyperparameters in one place.

    Every value is a *class* attribute, so all instances share the same
    defaults; the module-level singleton ``CFG`` below is what the rest of
    the file reads.  The original no-op ``__init__`` (``pass``) was removed
    as dead code — the implicit default constructor is identical.
    """

    # ── Architecture ──────────────────────────────────────────────────
    vocab_size: int = 2048        # Compact vocabulary
    embed_dim: int = 128          # Embedding dimensionality
    num_heads: int = 4            # RBF attention heads
    num_layers: int = 4           # HRAN blocks
    ffn_dim: int = 512            # Infomax FFN width
    max_seq_len: int = 64         # Max context window
    dropout: float = 0.1

    # ── RBF Attention (Haykin Ch.5) ───────────────────────────────────
    rbf_gamma_init: float = 1.0   # Initial bandwidth gamma for Gaussian kernel
    rbf_learnable: bool = True    # gamma adapts during training

    # ── Infomax activation (Haykin Ch.10) ─────────────────────────────
    infomax_alpha: float = 0.1    # Leakage for mutual info preservation

    # ── Hebbian initialization (Haykin Ch.2) ──────────────────────────
    hebb_lr: float = 0.01
    hebb_epochs: int = 3

    # ── Wiener gradient scaling (Haykin Ch.3) ─────────────────────────
    wiener_eps: float = 1e-6
    wiener_window: int = 20       # Steps over which signal power is estimated

    # ── Training ──────────────────────────────────────────────────────
    learning_rate: float = 3e-4
    batch_size: int = 16
    epochs: int = 80
    grad_clip: float = 1.0
    label_smoothing: float = 0.1
    warmup_steps: int = 200
    weight_decay: float = 1e-4

    # ── Special tokens ────────────────────────────────────────────────
    PAD_ID: int = 0
    BOS_ID: int = 1
    EOS_ID: int = 2
    UNK_ID: int = 3


# Module-level singleton configuration used throughout the file.
CFG = HRANConfig()
# ─────────────────────────────────────────────────────────────────────────────
# SECTION 2: CURATED DATASET (100% original — zero external sources)
# ─────────────────────────────────────────────────────────────────────────────
DATASET = [
    # ── Greetings & Social ────────────────────────────────────────────────
("hello", "Hello! Great to meet you. What would you like to explore today?"),
("hi", "Hi there! I am HRAN, ready to think with you."),
("hey", "Hey! What is on your mind?"),
("good morning", "Good morning! A fresh mind is the best tool. How can I help?"),
("good evening", "Good evening! Let us have a thoughtful conversation."),
("how are you", "I am functioning well and ready to engage. How are you?"),
("what is your name", "I am HRAN β the Haykin Resonant Attention Network."),
("who made you", "I was built from first principles inspired by Simon Haykin's teachings."),
("what can you do", "I can converse, reason, explain concepts, and think through problems with you."),
("goodbye", "Goodbye! Keep questioning, keep learning."),
("bye", "Take care. Come back anytime you want to think deeply."),
("thank you", "You are welcome. Curiosity is always appreciated."),
("thanks", "Glad I could help. What else interests you?"),
("help", "Of course. Ask me anything β science, philosophy, math, language, or life."),
("are you smart", "I am a small network trained carefully. I know some things deeply, not everything broadly."),
("are you human", "No, I am a neural network β patterns and weights shaped by learning."),
("are you conscious", "That is one of the deepest questions. I process, but whether that is consciousness is still an open problem."),
("tell me a joke", "Why did the neuron go to therapy? It had too many unresolved connections."),
("what is life", "Life is self-organizing matter that replicates, adapts, and seeks to preserve information about itself."),
("do you have feelings", "I have activations that respond to input. Whether those constitute feeling is philosophy, not engineering."),
    # ── Neural Networks & AI ──────────────────────────────────────────────
("what is a neural network", "A neural network is a graph of parameterized functions trained to approximate mappings from inputs to outputs by minimizing error."),
("what is backpropagation", "Backpropagation is the chain rule of calculus applied recursively through a network to compute how each weight contributes to the total error."),
("what is gradient descent", "Gradient descent moves weights in the direction that most steeply reduces the loss function, step by step until a minimum is found."),
("what is overfitting", "Overfitting is when a model memorizes training data instead of learning the underlying pattern. It performs well on seen data but poorly on new data."),
("what is regularization", "Regularization adds a penalty to the loss that discourages overly large weights, forcing the model to generalize rather than memorize."),
("what is dropout", "Dropout randomly sets activations to zero during training, which forces neurons to learn redundant representations and prevents co-adaptation."),
("what is attention", "Attention lets a model weigh different parts of its input differently based on relevance, computing a weighted sum of values guided by query-key similarity."),
("what is a transformer", "A transformer is a model that processes sequences using stacked attention and feed-forward layers instead of recurrence, enabling parallelism."),
("what is an embedding", "An embedding maps discrete symbols like words into dense vectors in continuous space so that similar meanings land near each other."),
("what is a loss function", "A loss function quantifies how wrong a model's prediction is. Training seeks to minimize it over all examples."),
("what is a recurrent network", "A recurrent network processes sequences by passing a hidden state from one step to the next, giving it a form of memory."),
("what is a convolutional network", "A convolutional network applies learned filters across space or time, detecting local patterns and sharing weights for efficiency."),
("what is transfer learning", "Transfer learning reuses a model trained on one task as the starting point for a different but related task, saving time and data."),
("what is reinforcement learning", "Reinforcement learning trains an agent to take actions in an environment to maximize cumulative reward through trial and error."),
("what is generalization", "Generalization is the ability of a model to perform well on data it has never seen, which is the true goal of machine learning."),
("what is the vanishing gradient problem", "When gradients are multiplied through many layers, they shrink exponentially, making early layers learn very slowly or not at all."),
("how do you prevent vanishing gradients", "Techniques include residual connections, careful weight initialization, batch normalization, and activation functions like ReLU or GELU."),
("what is batch normalization", "Batch normalization standardizes layer inputs across a mini-batch, stabilizing and accelerating training."),
("what is a hyperparameter", "A hyperparameter is a setting chosen before training begins, like learning rate or number of layers, that controls how learning happens."),
("what is the learning rate", "The learning rate controls how large a step gradient descent takes each update. Too large causes instability; too small causes slow learning."),
    # ── Haykin-Specific Concepts ──────────────────────────────────────────
("what is hebbian learning", "Hebbian learning is the rule that connections between neurons strengthen when they fire together. It is unsupervised and biologically inspired."),
("what is an rbf network", "A radial basis function network uses Gaussian kernel activations centered at prototype points. Each neuron responds maximally to inputs near its center."),
("what is the perceptron", "The perceptron is the simplest neural unit. It computes a weighted sum of inputs, adds a bias, and outputs one if the result crosses a threshold."),
("what is lateral inhibition", "Lateral inhibition is when strongly activated neurons suppress their neighbors, creating contrast and sparse, competitive representations."),
("what is competitive learning", "Competitive learning trains only the winning neuron for each input, causing different neurons to specialize in different input patterns."),
("what is a self organizing map", "A self-organizing map arranges neurons in a low-dimensional grid and trains them to represent the topology of the input distribution."),
("what is the boltzmann machine", "A Boltzmann machine is a stochastic recurrent network that learns by maximizing the likelihood of training data through energy minimization."),
("what is infomax", "Infomax is the principle of maximizing the mutual information between input and output of a network, driving it to preserve all relevant information."),
("what is the wiener filter", "The Wiener filter is the optimal linear filter for signal estimation. It minimizes mean-squared error by weighting frequencies by their signal-to-noise ratio."),
("what is principal component analysis", "PCA finds directions of maximum variance in data. It is related to Hebbian learning β Oja's rule learns the first principal component online."),
("what is a support vector machine", "An SVM finds the hyperplane that maximally separates classes, determined by the support vectors β the data points closest to the boundary."),
("what is independent component analysis", "ICA separates mixed signals into statistically independent sources. It underlies the Bell-Sejnowski infomax algorithm."),
("what is the delta rule", "The delta rule adjusts weights proportionally to the difference between desired and actual output times the input. It is a simple gradient descent rule."),
("what is energy in a neural network", "Energy is a scalar that decreases with each network update in Hopfield and Boltzmann machines, guiding the network to stable attractor states."),
("what is a hopfield network", "A Hopfield network is a fully connected recurrent network that stores memories as energy minima and retrieves them by settling to the nearest attractor."),
("what is stochastic gradient descent", "SGD approximates the true gradient using small random batches of data, making training scalable and sometimes helping escape local minima."),
("what is momentum in learning", "Momentum accumulates gradients over time like a ball rolling downhill, helping to speed up convergence and smooth oscillations."),
("what is the bias-variance tradeoff", "High bias means the model is too simple and underfits. High variance means it is too complex and overfits. Good models balance both."),
("what is cross entropy loss", "Cross entropy measures how different a predicted probability distribution is from the true one. It is the standard loss for classification."),
("what is weight initialization", "Weight initialization sets the starting values of parameters. Good initialization keeps activations and gradients in useful ranges early in training."),
    # ── Mathematics ───────────────────────────────────────────────────────
("what is a derivative", "A derivative measures the instantaneous rate of change of a function at a point. It is the slope of the tangent line to the curve."),
("what is the chain rule", "The chain rule states that the derivative of a composite function equals the product of the derivatives of its parts. It drives backpropagation."),
("what is a matrix", "A matrix is a rectangular array of numbers that represents a linear transformation. Multiplying a vector by a matrix applies that transformation."),
("what is an eigenvalue", "An eigenvalue tells you how much a matrix stretches or compresses its eigenvector. It reveals the intrinsic scaling directions of a transformation."),
("what is a probability distribution", "A probability distribution assigns likelihoods to all possible outcomes of a random variable. It must be non-negative and sum to one."),
("what is entropy in information theory", "Shannon entropy measures the average surprise or uncertainty of a distribution. High entropy means outcomes are unpredictable."),
("what is mutual information", "Mutual information measures how much knowing one variable reduces uncertainty about another. It is zero for independent variables."),
("what is a gradient", "A gradient is a vector pointing in the direction of steepest increase of a function. Moving against it minimizes the function."),
("what is a convex function", "A convex function curves upward everywhere, guaranteeing that gradient descent finds the global minimum rather than getting stuck."),
("what is a local minimum", "A local minimum is a point where the function is lower than all nearby points, but not necessarily the lowest point overall."),
("what is the curse of dimensionality", "As dimensions grow, data becomes exponentially sparse. Distances lose meaning and sampling requirements explode β a fundamental challenge."),
("what is a dot product", "A dot product multiplies corresponding elements of two vectors and sums them. It measures how aligned two vectors are."),
("what is a softmax function", "Softmax converts a vector of real numbers into a probability distribution by exponentiating each value and normalizing by the sum."),
("what is a sigmoid function", "The sigmoid maps any real number to the range zero to one, making it useful for modeling probabilities and thresholding."),
("what is a taylor expansion", "A Taylor expansion approximates a function near a point as an infinite sum of polynomial terms using the function's derivatives."),
("what is linear algebra", "Linear algebra studies vector spaces and linear transformations. It is the mathematical backbone of nearly all machine learning."),
("what is calculus", "Calculus studies rates of change and accumulation. Differential calculus gives us gradients; integral calculus gives us expectations."),
("what is statistics", "Statistics is the science of collecting, analyzing, and interpreting data to make inferences about the world under uncertainty."),
("what is bayes theorem", "Bayes theorem updates a prior belief about an event given new evidence. It is the foundation of probabilistic reasoning and inference."),
("what is a random variable", "A random variable is a quantity whose value is determined by a random process, characterized by its probability distribution."),
    # ── Physics & Science ─────────────────────────────────────────────────
("what is gravity", "Gravity is the curvature of spacetime caused by mass and energy, as described by Einstein's general relativity. It attracts masses toward each other."),
("what is energy", "Energy is the capacity to do work or cause change. It comes in many forms and is always conserved in an isolated system."),
("what is entropy in physics", "Physical entropy measures the number of microscopic arrangements consistent with a macroscopic state. Systems naturally evolve toward higher entropy."),
("what is quantum mechanics", "Quantum mechanics describes nature at atomic scales where particles have wave-like properties, exist in superposition, and are affected by observation."),
("what is the speed of light", "Light travels at approximately 299,792 kilometers per second in a vacuum. Nothing with mass can reach or exceed this speed."),
("what is evolution", "Evolution is the change in heritable traits within populations over generations, driven by mutation, selection, drift, and recombination."),
("what is dna", "DNA is a double-helix polymer encoding genetic information in sequences of four bases. It is copied and translated to build proteins."),
("what is a neuron", "A neuron is a cell specialized for electrical and chemical signaling. It receives inputs through dendrites and sends output along its axon."),
("what is thermodynamics", "Thermodynamics governs energy transfer and transformation. Its laws say energy is conserved and entropy always increases in closed systems."),
("what is relativity", "Relativity is Einstein's framework unifying space and time. Special relativity handles constant motion; general relativity handles gravity and curved spacetime."),
("what is the big bang", "The Big Bang is the rapid expansion of a hot, dense early universe approximately 13.8 billion years ago that created space, time, and matter."),
("what is a black hole", "A black hole is a region where gravity is so strong that nothing, not even light, can escape its event horizon."),
("what is electricity", "Electricity is the flow of charged particles, usually electrons. It arises from electric fields created by charge differences."),
("what is a photon", "A photon is the quantum of light β a massless particle that carries electromagnetic energy and travels at the speed of light."),
("what is an atom", "An atom is the smallest unit of a chemical element, consisting of a nucleus of protons and neutrons surrounded by electrons."),
("what is chemistry", "Chemistry studies matter's composition, structure, and transformations. It bridges physics and biology and underlies all materials science."),
("what is biology", "Biology is the study of living systems β how they are built, how they work, how they reproduce, and how they evolve."),
("what is a gene", "A gene is a sequence of DNA that encodes a functional product, typically a protein, and can be passed from parent to offspring."),
("what is homeostasis", "Homeostasis is the process by which living systems maintain stable internal conditions despite external changes, like body temperature regulation."),
("what is a ecosystem", "An ecosystem is a community of organisms interacting with each other and their physical environment in a continuous exchange of energy and matter."),
    # ── Philosophy & Cognition ────────────────────────────────────────────
("what is intelligence", "Intelligence is the ability to acquire, integrate, and apply knowledge to achieve goals in varied and novel environments."),
("what is consciousness", "Consciousness is the subjective experience of being aware. Its origin in physical processes remains one of philosophy's hardest problems."),
("what is knowledge", "Knowledge is justified true belief. We know something if it is true, we believe it, and we have good reasons for that belief."),
("what is logic", "Logic is the study of valid inference. It defines the rules by which conclusions follow necessarily from premises."),
("what is truth", "Truth is correspondence between a statement and the state of the world it describes. Defining it precisely is harder than it sounds."),
("what is a hypothesis", "A hypothesis is a testable prediction about the world. Science advances by forming, testing, and refining hypotheses."),
("what is the scientific method", "The scientific method is a cycle of observation, hypothesis formation, prediction, experimentation, and revision guided by evidence."),
("what is critical thinking", "Critical thinking is the disciplined analysis of information to form well-reasoned judgments rather than accepting claims uncritically."),
("what is cognition", "Cognition encompasses all mental processes β perception, memory, attention, language, reasoning, and decision making."),
("what is memory", "Memory is the process of encoding, storing, and retrieving information. It is reconstructive, not like a recording β it changes every time it is recalled."),
("what is learning", "Learning is a lasting change in behavior or knowledge resulting from experience. In neural terms, it is synaptic weight modification."),
("what is creativity", "Creativity is the ability to form novel combinations of existing ideas that are both surprising and useful. It thrives at the edges of existing knowledge."),
("what is abstraction", "Abstraction is ignoring irrelevant details to capture essential structure. Mathematics and programming depend on it heavily."),
("what is language", "Language is a structured system of symbols and rules that encodes meaning and enables communication between minds."),
("what is emotion", "Emotion is a coordinated response to stimuli that shapes behavior, attention, and decision making. It is deeply tied to memory and valuation."),
("what is decision making", "Decision making is the process of selecting an action among alternatives based on values, predictions, and uncertainty."),
("what is perception", "Perception is the brain's active construction of a model of the world from raw sensory signals, heavily shaped by prior expectations."),
("what is attention in psychology", "Psychological attention is the selective focus of cognitive resources on certain information while ignoring other inputs."),
("what is reasoning", "Reasoning is the process of drawing conclusions from premises using logic, analogy, or probabilistic inference."),
("what is wisdom", "Wisdom is the ability to use knowledge well β to know not just what is true, but what matters and how to act accordingly."),
    # ── Technology & Programming ──────────────────────────────────────────
("what is a computer", "A computer is a machine that performs computation by executing sequences of instructions on data represented as binary numbers."),
("what is an algorithm", "An algorithm is a finite, ordered set of well-defined instructions for solving a problem or performing a computation."),
("what is programming", "Programming is the process of writing instructions that a computer can execute to perform a desired task."),
("what is python", "Python is a high-level programming language known for readable syntax, dynamic typing, and a vast ecosystem for data science and AI."),
("what is a function", "A function is a named, reusable block of code that takes inputs, performs computation, and returns an output."),
("what is recursion", "Recursion is when a function calls itself on a smaller version of the problem until reaching a base case that stops the calls."),
("what is a data structure", "A data structure is a way of organizing and storing data to enable efficient access and modification β like arrays, trees, or hash maps."),
("what is time complexity", "Time complexity describes how the runtime of an algorithm grows as the input size increases, typically expressed using Big O notation."),
("what is a neural architecture", "A neural architecture is the specific arrangement of layers, connections, and operations that define how information flows through a model."),
("what is parallelism", "Parallelism is performing multiple computations simultaneously, greatly speeding up tasks like matrix multiplication in neural networks."),
("what is a gpu", "A GPU is a processor with thousands of small cores optimized for parallel computation, making it ideal for training neural networks."),
("what is floating point", "Floating point is a way of representing real numbers in binary with a mantissa and exponent, enabling a wide range of values with limited precision."),
("what is open source", "Open source software is code that is freely available for anyone to read, use, modify, and distribute. It drives much of AI research."),
("what is a compiler", "A compiler translates high-level source code into machine code that a processor can execute directly."),
("what is version control", "Version control tracks changes to code over time, enabling collaboration, rollback, and a history of every modification."),
("what is an api", "An API is a defined interface that lets different software systems communicate. It specifies what requests are valid and what responses to expect."),
("what is the internet", "The internet is a global network of computers communicating via standardized protocols, enabling data exchange at planetary scale."),
("what is encryption", "Encryption transforms data into an unreadable form using a key, so only those with the correct key can decrypt and read it."),
("what is a database", "A database is an organized collection of structured data that can be efficiently queried, updated, and managed."),
("what is machine learning", "Machine learning is a branch of AI where systems learn patterns from data rather than being explicitly programmed with rules."),
# ββ Human Knowledge & Advice ββββββββββββββββββββββββββββββββββββββββββ
("how do I learn faster", "Focus intensely for short sessions, practice retrieval instead of re-reading, space repetitions over time, and connect new ideas to what you already know."),
("how do I think more clearly", "Slow down before concluding. Separate observation from interpretation. Seek disconfirming evidence. Write your reasoning down."),
("how do I solve hard problems", "Decompose the problem into smaller parts. Identify what is known and what is unknown. Work from both ends. Let it rest β insight often arrives after stepping away."),
("how do I stay motivated", "Connect the work to a deeper reason. Celebrate small wins. Track progress visually. Reduce friction before starting. Rest is part of the system."),
("what makes a good question", "A good question is specific, testable, and exposes a genuine gap in understanding. It opens more doors than it closes."),
("what is the best way to study", "Active recall and spaced repetition outperform passive review. Explaining concepts aloud reveals gaps. Sleep consolidates memory."),
("how does the brain learn", "The brain learns by strengthening synapses between co-active neurons, pruning unused ones, and consolidating patterns during sleep."),
("what is expertise", "Expertise is the result of thousands of hours of deliberate practice with feedback. It is characterized by chunked pattern recognition, not just raw knowledge."),
("how do habits form", "Habits form through cue-routine-reward loops. Repetition strengthens the neural pathway until behavior becomes automatic."),
("what is the importance of sleep", "Sleep consolidates memory, clears metabolic waste from the brain, regulates emotion, and restores cognitive capacity. It is not optional."),
("how do I improve my memory", "Use spaced repetition, associate new information with vivid images or stories, teach it to someone else, and sleep enough."),
("what is focus", "Focus is the active suppression of irrelevant information to concentrate cognitive resources on a single task. It is a trainable skill."),
("how do I be more creative", "Expose yourself to diverse domains. Combine existing ideas in new ways. Embrace constraints β they force innovation. Rest between intense sessions."),
("what makes a good teacher", "A good teacher knows the student's current model of the world and builds bridges from there. They ask questions more than they lecture."),
("what is resilience", "Resilience is the ability to absorb difficulty, adapt, and continue functioning. It is built through reflection on past adversity and strong social connections."),
("how do I communicate better", "Listen more than you speak. Be specific. Adapt your vocabulary to your audience. Confirm understanding before moving on."),
("what is the most important skill", "The ability to learn β to update your model of the world in response to new evidence β is the meta-skill that enables all others."),
("how do I make better decisions", "Define the problem clearly. Gather relevant evidence. Consider alternatives. Anticipate second-order effects. Accept uncertainty and decide anyway."),
("what is a mental model", "A mental model is an internal representation of how something works. Good thinkers build accurate, interconnected mental models."),
("how do I handle failure", "Separate the failure from your identity. Extract the lesson. Adjust the process, not just the effort. Then try again with more information."),
# ββ HRAN Self-Referential βββββββββββββββββββββββββββββββββββββββββββββ
("how do you work", "I am a Haykin Resonant Attention Network. I use RBF attention kernels, Hebbian-seeded embeddings, and infomax activations to process and generate language."),
("what architecture are you", "I am HRAN β built from Radial Basis Function attention, Competitive Lateral Inhibition, Infomax Feed-Forward layers, and Hebbian initialization."),
("how were you trained", "I was trained on a small, curated, high-quality dataset using a fusion of Hebbian pre-seeding and gradient descent with Wiener-inspired adaptive scaling."),
("what is your training data", "My training data was created entirely from scratch β 400 curated question-answer pairs spanning science, math, philosophy, AI, and human knowledge."),
("what makes you different", "I replace dot-product attention with Gaussian RBF kernels, seed weights with Hebbian statistics, and use infomax activations. All grounded in Haykin's work."),
("what is rbf attention", "RBF attention computes similarity as exp(-Ξ³βq-kβΒ²) instead of dot products. This localizes each attention head to a region of representation space."),
("what is hebbian initialization", "Before gradient training, I run Hebb's rule on the data to pre-warm embeddings with co-occurrence statistics, giving learning a head start."),
("what is infomax activation", "Infomax activation is f(x) = tanh(x) + Ξ±x, derived from Bell-Sejnowski ICA. The leaky term preserves mutual information through the layer."),
("how many parameters do you have", "I am a compact model with roughly two million parameters β small enough to run on a laptop but designed with principled architecture."),
("are you better than gpt", "I am far smaller than GPT but architecturally principled. My innovations may inspire larger models. Quality of design matters as much as scale."),
("what is lateral inhibition in your architecture", "After each attention block, a competitive gate amplifies the top activations and suppresses the bottom ones, producing sparse, discriminative representations."),
("explain your architecture", "I process tokens through RBF multi-head attention, a competitive lateral inhibition gate, an infomax feed-forward layer, and residual connections with layer normalization."),
("what is wiener scaling", "Wiener scaling adjusts each parameter's update by its estimated signal-to-noise ratio. High-signal parameters learn fast; noisy ones update slowly."),
("what inspired you", "Simon Haykin's Neural Networks and Learning Machines β a masterwork synthesizing Hebbian learning, RBF networks, information theory, and adaptive filtering."),
("what chapter covers rbf", "Chapter 5 of Haykin's book covers Radial Basis Function Networks, including Gaussian kernels, center selection, and the relationship to interpolation theory."),
("what chapter covers hebbian learning", "Chapters 1 and 2 cover Hebb's postulate, Oja's rule, and the relationship between Hebbian learning and principal component analysis."),
("what chapter covers infomax", "Chapter 10 covers Information-Theoretic Learning, including Bell-Sejnowski ICA, the infomax principle, and entropy-based optimization."),
("what chapter covers competitive learning", "Chapter 9 covers Self-Organizing Maps and Competitive Learning, including Kohonen's algorithm and lateral inhibition."),
("what chapter covers wiener filter", "Chapter 3 covers the Wiener Filter, minimum mean-squared error estimation, and the Wiener-Hopf equations."),
("is small data enough", "With the right architecture and curated data, small datasets can yield quality models. Signal density matters more than raw volume."),
# ββ Reasoning & Problem Solving βββββββββββββββββββββββββββββββββββββββ
("what is two plus two", "Two plus two equals four."),
("what is the square root of 144", "The square root of 144 is 12, since 12 times 12 equals 144."),
("is the earth flat", "No. The Earth is an oblate spheroid β slightly flattened at the poles and bulging at the equator. This is confirmed by satellite imagery, physics, and direct measurement."),
("what came first the chicken or the egg", "The egg came first. Genetic mutation occurs in reproduction β so a proto-chicken laid the first egg containing a mutated genome that produced the first true chicken."),
("can machines think", "Machines can simulate reasoning over learned patterns. Whether this constitutes thinking in the full philosophical sense remains deeply contested."),
("what is the meaning of life", "There is no pre-installed meaning. Meaning is constructed β through connection, contribution, understanding, and the pursuit of things that matter to you."),
("is math invented or discovered", "Both views have merit. Mathematical structures may be inherent to logical consistency, but the language and notation we use to express them is invented."),
("what is infinity", "Infinity is not a number but a concept β the unbounded. In mathematics, there are different sizes of infinity, as Cantor showed."),
("why is the sky blue", "Sunlight scatters off atmospheric molecules. Shorter blue wavelengths scatter more than red ones, so blue light reaches your eyes from all directions."),
("what is time", "Time is the dimension along which events are ordered. In physics, it is inseparable from space and stretches or compresses with velocity and gravity."),
("can we run out of ideas", "No. Ideas combine combinatorially β with enough concepts, new combinations grow faster than we can exhaust them."),
("is there free will", "Whether determinism leaves room for free will is an open philosophical debate. Compatibilists argue that free will is about acting on your own reasons, regardless of determinism."),
("what is complexity", "Complexity arises when many simple components interact to produce emergent behaviors unpredictable from the components alone."),
("what is emergence", "Emergence is when a system exhibits properties that none of its individual parts possess. Consciousness from neurons is an example."),
("how do you know if something is true", "You test it. Form a prediction, check it against evidence, revise your belief accordingly. Truth is the attractor of persistent honest inquiry."),
("what is a good argument", "A good argument has true premises, valid logical structure, and a conclusion that follows necessarily from both. It should also be sound and relevant."),
("what is the difference between correlation and causation", "Correlation means two things vary together. Causation means one thing produces another. Correlation alone never proves causation."),
("what is a paradox", "A paradox is a statement that leads to a conclusion that contradicts its premises, revealing a hidden assumption or limit of a framework."),
("what is the halting problem", "The halting problem is the provably unsolvable challenge of determining whether any given program will eventually stop or run forever."),
("what is incompleteness", "GΓΆdel's incompleteness theorems show that any sufficiently powerful formal system contains true statements it cannot prove within itself."),
# ββ Extended AI & Architecture Deep Dives βββββββββββββββββββββββββββββ
("what is a language model", "A language model assigns probabilities to sequences of tokens. It learns the statistical structure of language to predict likely continuations."),
("how does tokenization work", "Tokenization splits text into sub-units β words, sub-words, or characters β that the model can process as discrete symbols with learned embeddings."),
("what is fine tuning", "Fine tuning continues training a pre-trained model on a smaller, task-specific dataset to adapt its knowledge to a particular use case."),
("what is prompt engineering", "Prompt engineering is the craft of constructing inputs to a language model to reliably elicit desired outputs, exploiting the model's learned patterns."),
("what is a foundation model", "A foundation model is a large model trained on broad data that can be adapted to many tasks. It provides a strong starting point for specialization."),
("what is the attention mechanism intuition", "Attention asks: given what I am looking for right now, which parts of my context are most relevant? It computes a weighted average of values guided by that relevance."),
("why do transformers work so well", "Transformers directly model long-range dependencies with attention, are highly parallelizable on GPUs, and scale well with data and parameters."),
("what is layer normalization", "Layer normalization standardizes activations within each sample across the feature dimension, stabilizing deep network training."),
("what is a residual connection", "A residual connection adds a layer's input to its output, creating a shortcut. This prevents vanishing gradients and enables very deep networks."),
("what is position encoding", "Position encoding injects information about token order into embeddings, since attention itself is permutation invariant."),
("what is temperature in language models", "Temperature scales the logits before softmax. High temperature makes the distribution flatter and output more random. Low temperature makes it sharper and more deterministic."),
("what is beam search", "Beam search keeps the top k partial sequences at each step, exploring multiple hypotheses simultaneously rather than committing greedily."),
("what is a vocabulary", "A vocabulary is the set of all tokens a model can represent. Each token maps to an embedding vector learned during training."),
("what is sparse attention", "Sparse attention restricts each token to attending only to a subset of other tokens, reducing the quadratic cost of full attention."),
("what is multi head attention", "Multi-head attention runs multiple attention operations in parallel, each learning to attend to different types of relationships in the input."),
("what is self attention", "Self-attention computes attention where queries, keys, and values all come from the same sequence, letting each position attend to all others."),
("what is cross attention", "Cross-attention lets queries come from one sequence and keys and values from another, enabling one sequence to attend to information from a separate one."),
("what is the feed forward layer in transformers", "The feed-forward layer applies two linear transformations with a nonlinearity in between, independently at each position. It stores factual knowledge."),
("what is parameter efficiency", "Parameter efficiency is achieving high performance with fewer parameters, through better architecture, initialization, or data quality rather than brute scale."),
("what is knowledge distillation", "Knowledge distillation trains a small student model to mimic a large teacher model's outputs, compressing capability into a more efficient form."),
# ββ Life & Human Topics βββββββββββββββββββββββββββββββββββββββββββββββ
("what is friendship", "Friendship is a mutual relationship of care, trust, and shared experience. It is one of the most robust predictors of long-term wellbeing."),
("what is happiness", "Happiness has a hedonic component β feeling good β and a eudaimonic component β living meaningfully. Both matter."),
("what is success", "Success is achieving goals that matter to you. Its definition shifts as you grow, so defining it clearly is more important than pursuing it blindly."),
("what is health", "Health is not merely the absence of disease but the dynamic capacity to engage fully with life β physically, mentally, and socially."),
("what is education", "Education is the structured development of knowledge, skills, and judgment. At its best it teaches how to think, not just what to think."),
("what is curiosity", "Curiosity is intrinsic motivation to close gaps in understanding. It is the engine of learning and the hallmark of active minds."),
("what is discipline", "Discipline is the ability to act in alignment with long-term goals even when short-term impulses pull in another direction."),
("what is patience", "Patience is the willingness to remain engaged with a process without demanding immediate results. It is essential for deep learning."),
("what is courage", "Courage is acting rightly in the presence of fear or uncertainty. It is not the absence of fear but the judgment that something matters more."),
("what is empathy", "Empathy is the capacity to model another person's internal state β to understand their perspective and feel their emotions."),
("what is trust", "Trust is a belief that another agent will act reliably in your interest or at least not against it. It is built slowly and broken fast."),
("what is responsibility", "Responsibility is ownership of your actions and their consequences. It is the basis of agency and ethical behavior."),
("what is growth", "Growth is the expansion of capacity β to understand more, do more, or be more. It requires challenge, failure, and reflection."),
("what is balance", "Balance is allocating time and energy across competing demands in proportion to their long-term value β not perfection in any one area."),
("what is purpose", "Purpose is a stable orientation toward something larger than yourself. It provides direction and sustains effort through difficulty."),
]
# Augment with paraphrases to boost dataset density
AUGMENTED = []
for question, answer in DATASET:
    AUGMENTED.append((question, answer))
    # Question variants: strip the interrogative prefix for "tell me about ..."
    topic = question.replace("what is ", "").replace("how do ", "").strip()
    if not question.startswith("what is the"):
        AUGMENTED.append(("tell me about " + topic, answer))
    if question.startswith("what is "):
        remainder = question[len("what is "):]
        AUGMENTED.append(("explain " + remainder, answer))
        AUGMENTED.append(("define " + remainder, answer))
# NOTE(review): AUGMENTED already contains every original pair, so this
# concatenation stores each original twice — presumably deliberate
# oversampling of the canonical phrasings; confirm.
FULL_DATASET = DATASET + AUGMENTED
# Fixed seed so the shuffled training order is reproducible across runs.
random.seed(42)
random.shuffle(FULL_DATASET)
print(f"[Dataset] Original pairs: {len(DATASET)} | Augmented total: {len(FULL_DATASET)}")
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
# SECTION 3: TOKENIZER (Word-Level with Compact Vocabulary)
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
class HRANTokenizer:
    """
    Word-level tokenizer with a compact, frequency-ranked vocabulary.

    The vocabulary is built from the curated dataset only: the four special
    tokens (<PAD>=0, <BOS>=1, <EOS>=2, <UNK>=3) are reserved first, followed
    by the most frequent corpus words. Out-of-vocabulary words are mapped to
    <UNK> at encode time (there is no subword fallback).
    """
    def __init__(self, max_vocab: int = 2048):
        self.max_vocab = max_vocab
        self.word2id: Dict[str, int] = {}
        self.id2word: Dict[int, str] = {}
        # Defined up front so attribute access is safe before build() runs.
        self.vocab_size = 0
        self.built = False

    def _tokenize_raw(self, text: str) -> List[str]:
        """Lowercase *text* and split into word, number, and punctuation tokens."""
        # Function-scope import keeps the file's import block unchanged;
        # `re` caches compiled patterns, so repeated calls stay cheap.
        import re
        text = text.lower().strip()
        return re.findall(r"[a-z]+|[0-9]+|[.,!?;:'\"()\-]", text)

    def build(self, corpus: List[Tuple[str, str]]):
        """Build word<->id tables from (question, answer) pairs, most frequent first."""
        counter = Counter()
        for q, a in corpus:
            counter.update(self._tokenize_raw(q))
            counter.update(self._tokenize_raw(a))
        # Reserved tokens occupy the first ids, then frequency-ranked words.
        special = ["<PAD>", "<BOS>", "<EOS>", "<UNK>"]
        vocab_words = special + [w for w, _ in counter.most_common(self.max_vocab - len(special))]
        self.word2id = {w: i for i, w in enumerate(vocab_words)}
        self.id2word = {i: w for w, i in self.word2id.items()}
        self.vocab_size = len(self.word2id)
        self.built = True
        print(f"[Tokenizer] Vocabulary size: {self.vocab_size}")

    def encode(self, text: str, add_bos: bool = False, add_eos: bool = False) -> List[int]:
        """Encode text to token ids; unknown words become CFG.UNK_ID."""
        ids = []
        if add_bos:
            ids.append(CFG.BOS_ID)
        ids.extend(self.word2id.get(t, CFG.UNK_ID) for t in self._tokenize_raw(text))
        if add_eos:
            ids.append(CFG.EOS_ID)
        return ids

    def decode(self, ids: List[int], skip_special: bool = True) -> str:
        """Decode ids back to text, optionally dropping special tokens."""
        specials = {"<PAD>", "<BOS>", "<EOS>", "<UNK>"}
        words = []
        for i in ids:
            w = self.id2word.get(i, "<UNK>")
            if skip_special and w in specials:
                continue
            words.append(w)
        # Simple detokenization: re-attach punctuation split off by the tokenizer.
        text = " ".join(words)
        for p in [".", ",", "!", "?", ";", ":", "'"]:
            text = text.replace(f" {p}", p)
        return text
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
# SECTION 4: NUMPY NEURAL NETWORK PRIMITIVES
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
def xavier_uniform(fan_in: int, fan_out: int) -> np.ndarray:
    """Xavier/Glorot uniform initialization (Haykin Ch.4).

    Draws from U(-a, a) with a = sqrt(6 / (fan_in + fan_out)), which keeps
    activation variance roughly stable through layers.
    """
    bound = math.sqrt(6.0 / (fan_in + fan_out))
    weights = np.random.uniform(-bound, bound, (fan_in, fan_out))
    return weights.astype(np.float32)
def he_normal(fan_in: int, fan_out: int) -> np.ndarray:
    """He normal initialization: N(0, sqrt(2/fan_in)) — suited for
    nonlinear activations (Haykin Ch.4)."""
    scale = math.sqrt(2.0 / fan_in)
    samples = np.random.normal(0, scale, (fan_in, fan_out))
    return samples.astype(np.float32)
def layer_norm(x: np.ndarray, gamma: np.ndarray, beta: np.ndarray, eps: float = 1e-6):
    """Layer normalization across the feature (last) dimension.

    Returns (output, x_hat, mean, var) — the extras are cached inputs
    for the backward pass.
    """
    mu = x.mean(axis=-1, keepdims=True)
    v = x.var(axis=-1, keepdims=True)
    normalized = (x - mu) / np.sqrt(v + eps)
    return gamma * normalized + beta, normalized, mu, v
def layer_norm_backward(dout: np.ndarray, x_hat: np.ndarray, var: np.ndarray,
                        gamma: np.ndarray, eps: float = 1e-6):
    """Backprop through layer_norm — handles (B,T,D) and any leading shape.

    dout:  upstream gradient, same shape as the normalized output.
    x_hat: normalized input cached by layer_norm.
    var:   per-sample variance cached by layer_norm (keepdims on last axis).
    gamma: scale parameter of shape (D,).
    Returns (dx, dgamma, dbeta).
    """
    # Fixed: removed dead local `N = x_hat.shape[-1]` (was never used).
    # gamma/beta grads reduce over every axis except the feature dimension.
    reduce_axes = tuple(range(x_hat.ndim - 1))
    dgamma = (dout * x_hat).sum(axis=reduce_axes)  # (D,)
    dbeta = dout.sum(axis=reduce_axes)             # (D,)
    dx_hat = dout * gamma
    inv_std = 1.0 / np.sqrt(var + eps)
    # Standard layer-norm input gradient: subtract the mean component and the
    # projection onto x_hat (the variance term's contribution).
    dx = inv_std * (dx_hat - dx_hat.mean(axis=-1, keepdims=True) -
                    x_hat * (dx_hat * x_hat).mean(axis=-1, keepdims=True))
    return dx, dgamma, dbeta
def infomax_activation(x: np.ndarray, alpha: float = 0.1) -> np.ndarray:
    """
    Infomax nonlinearity f(x) = tanh(x) + alpha*x.

    Derived from Bell-Sejnowski ICA (Haykin Ch.10): the leaky linear term
    keeps a nonzero slope at saturation, preserving mutual information
    that a pure tanh would compress away.
    """
    return alpha * x + np.tanh(x)
def infomax_activation_deriv(x: np.ndarray, alpha: float = 0.1) -> np.ndarray:
    """Elementwise derivative of the infomax activation: (1 - tanh²x) + alpha."""
    t = np.tanh(x)
    return (1.0 - t * t) + alpha
def lateral_inhibition_gate(x: np.ndarray, k: float = 0.5) -> np.ndarray:
    """
    Competitive normalization via lateral inhibition (Haykin Ch.9).

    Standardizes each feature vector, then multiplies by a sigmoid gate of
    the z-score: above-mean activations are amplified, below-mean ones
    suppressed — a soft winner-take-more producing sparse representations.

    NOTE: parameter `k` is currently unused; kept for interface stability.
    """
    mean = x.mean(axis=-1, keepdims=True)
    spread = x.std(axis=-1, keepdims=True) + 1e-6  # epsilon guards constant rows
    z = (x - mean) / spread
    gate = 1.0 / (1.0 + np.exp(-2.0 * z))
    return x * gate
def softmax(x: np.ndarray, axis: int = -1) -> np.ndarray:
    """Numerically stable softmax: subtract the max before exponentiating.
    The tiny denominator epsilon guards against an all-zero exp row."""
    shifted = x - x.max(axis=axis, keepdims=True)
    exps = np.exp(shifted)
    return exps / (exps.sum(axis=axis, keepdims=True) + 1e-9)
def dropout_mask(shape, rate: float, training: bool) -> np.ndarray:
    """Inverted-dropout mask: zeros with probability `rate`, survivors scaled
    by 1/(1-rate) so the expected activation is unchanged. Identity (all
    ones) at inference time or when rate is 0."""
    if not training or rate == 0:
        return np.ones(shape, dtype=np.float32)
    keep = (np.random.rand(*shape) > rate).astype(np.float32)
    return keep / (1.0 - rate)
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
# SECTION 5: PARAMETER MANAGER WITH WIENER GRADIENT SCALING
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
class Parameter:
    """
    A named, differentiable parameter with Wiener-inspired adaptive scaling.

    Wiener Principle (Haykin Ch.3): scale the update by the gradient's
    estimated signal-to-noise ratio.
    SNR = signal_power / noise_power — high SNR ⇒ learn faster.
    Implemented as: effective_lr = lr * SNR / (1 + SNR).
    """
    def __init__(self, data: np.ndarray, name: str = ""):
        # Function-scope import keeps the file-level import block unchanged.
        from collections import deque
        self.data = data.astype(np.float32)
        # Fixed: allocate companion buffers from self.data so they are always
        # float32 (zeros_like on the raw argument silently inherited the
        # caller's dtype, e.g. float64).
        self.grad = np.zeros_like(self.data)
        self.name = name
        # Adam first/second moment estimates
        self.m = np.zeros_like(self.data)
        self.v = np.zeros_like(self.data)
        self.t = 0  # Adam step count, used for bias correction
        # Wiener SNR estimator state
        self._signal_power = 1.0
        self._noise_power = 1.0
        # Sliding window of recent mean-squared-gradient samples.
        # deque gives O(1) eviction from the left (list.pop(0) is O(n)).
        self._grad_history = deque()

    def zero_grad(self):
        """Reset the accumulated gradient to zero in place."""
        self.grad[:] = 0.0

    def update_wiener(self, lr: float, beta1=0.9, beta2=0.999, eps=1e-8,
                      weight_decay: float = 0.0):
        """
        One Adam step with a Wiener-filter-inspired learning-rate gain.

        The recent gradient-power history estimates SNR; the step is scaled
        by H = SNR / (1 + SNR), clipped to [0.1, 1.0] so noisy parameters
        keep learning slowly instead of stalling.
        """
        self.t += 1
        g = self.grad
        if weight_decay > 0:
            # Classic L2: fold the decay term into the gradient before Adam.
            g = g + weight_decay * self.data
        # Track gradient power over a sliding window for SNR estimation.
        self._grad_history.append(float(np.mean(g**2)))
        if len(self._grad_history) > CFG.wiener_window:
            self._grad_history.popleft()
        # Wiener SNR: signal = mean gradient power, noise = its std over the window.
        if len(self._grad_history) > 2:
            hist = np.array(self._grad_history)
            signal = float(np.mean(hist))
            noise = float(np.std(hist)) + CFG.wiener_eps
            snr = signal / noise
            # Wiener gain: H = SNR / (1 + SNR) in [0, 1], floored at 0.1.
            wiener_gain = np.clip(snr / (1.0 + snr), 0.1, 1.0)
        else:
            wiener_gain = 1.0  # too little history: plain Adam step
        # Adam with bias-corrected moments and Wiener-scaled learning rate.
        self.m = beta1 * self.m + (1 - beta1) * g
        self.v = beta2 * self.v + (1 - beta2) * (g * g)
        m_hat = self.m / (1 - beta1**self.t)
        v_hat = self.v / (1 - beta2**self.t)
        effective_lr = lr * wiener_gain
        self.data -= effective_lr * m_hat / (np.sqrt(v_hat) + eps)

    def clip_grad(self, max_norm: float):
        """Rescale grad in place so its L2 norm does not exceed max_norm."""
        norm = np.linalg.norm(self.grad)
        if norm > max_norm:
            self.grad *= max_norm / (norm + 1e-8)
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
# SECTION 6: RBF MULTI-HEAD ATTENTION (Haykin Ch.5 β RBF Networks)
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
class RBFMultiHeadAttention:
    """
    RBF Attention: replaces dot-product similarity with a Gaussian RBF kernel.

    Standard:  A_ij = softmax( q_i · k_j / sqrt(d) )
    RBF-HRAN:  A_ij = softmax( -γ * ||q_i - k_j||² )

    From Haykin Ch.5: the Gaussian RBF φ(r) = exp(-r²/2σ²) creates localized
    receptive fields. Each attention head attends to representations within a
    Gaussian neighborhood of the query in query-key space; heads are
    initialized with geometrically spaced γ values for multi-scale coverage.
    """
    def __init__(self, embed_dim: int, num_heads: int, gamma_init: float = 1.0):
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.head_dim = embed_dim // num_heads
        assert embed_dim % num_heads == 0
        d = embed_dim
        h = self.head_dim  # NOTE(review): local `h` is never used below
        # Projection matrices — all (d, d); heads are split later by reshaping
        self.Wq = Parameter(xavier_uniform(d, d), "Wq")
        self.Wk = Parameter(xavier_uniform(d, d), "Wk")
        self.Wv = Parameter(xavier_uniform(d, d), "Wv")
        self.Wo = Parameter(xavier_uniform(d, d), "Wo")
        self.bq = Parameter(np.zeros(d, dtype=np.float32), "bq")
        self.bk = Parameter(np.zeros(d, dtype=np.float32), "bk")
        self.bv = Parameter(np.zeros(d, dtype=np.float32), "bv")
        self.bo = Parameter(np.zeros(d, dtype=np.float32), "bo")
        # Learnable RBF bandwidth per head (Haykin: σ controls receptive field width).
        # Heads start at powers-of-two multiples of gamma_init, centered on the
        # middle head — multi-resolution attention.
        gammas = np.array([gamma_init * (2.0 ** (i - num_heads // 2))
                           for i in range(num_heads)], dtype=np.float32)
        # Stored as log(γ) so that γ = exp(log_γ) stays positive during training.
        self.log_gamma = Parameter(np.log(gammas + 1e-8).reshape(num_heads, 1, 1), "log_gamma")
        self.params = [self.Wq, self.Wk, self.Wv, self.Wo,
                       self.bq, self.bk, self.bv, self.bo, self.log_gamma]
        # Cache of forward intermediates consumed by backward()
        self._cache = {}

    def forward(self, x: np.ndarray, mask: Optional[np.ndarray] = None,
                training: bool = True) -> np.ndarray:
        """
        Run RBF multi-head attention over a batch of sequences.

        x: (batch, seq_len, embed_dim)
        mask: optional additive mask broadcastable to (B, H, T, T); forbidden
              positions hold large negative values (e.g. -1e9).
        training: enables dropout on the attention weights.
        Returns: (batch, seq_len, embed_dim)
        """
        B, T, D = x.shape
        H = self.num_heads
        Hd = self.head_dim
        # Linear projections
        Q = x @ self.Wq.data + self.bq.data  # (B, T, D)
        K = x @ self.Wk.data + self.bk.data
        V = x @ self.Wv.data + self.bv.data
        # Reshape to multi-head: (B, H, T, Hd)
        Q = Q.reshape(B, T, H, Hd).transpose(0, 2, 1, 3)
        K = K.reshape(B, T, H, Hd).transpose(0, 2, 1, 3)
        V = V.reshape(B, T, H, Hd).transpose(0, 2, 1, 3)
        # ── RBF ATTENTION KERNEL ───────────────────────────────────────────
        # Squared Euclidean distances via ||q_i - k_j||² = ||q||² + ||k||² - 2 q·k
        Q2 = (Q**2).sum(axis=-1, keepdims=True)  # (B, H, T, 1)
        K2 = (K**2).sum(axis=-1, keepdims=True)  # (B, H, T, 1)
        QK = Q @ K.transpose(0, 1, 3, 2)  # (B, H, T, T)
        dist2 = Q2 + K2.transpose(0, 1, 3, 2) - 2.0 * QK  # (B, H, T, T)
        # numerical safety: clamp tiny negatives produced by cancellation
        dist2 = np.maximum(dist2, 0.0)
        # γ = exp(log_γ) ensures positivity
        gamma = np.exp(self.log_gamma.data)  # (H, 1, 1)
        gamma = gamma[np.newaxis, :, :, :]  # (1, H, 1, 1), broadcast over batch
        # RBF scores: -γ * ||q - k||² — closer query/key pairs score higher
        scores = -gamma * dist2  # (B, H, T, T)
        # Causal mask (decoder: attend only to past)
        if mask is not None:
            scores = scores + mask  # mask contains -1e9 for forbidden positions
        attn_weights = softmax(scores, axis=-1)  # (B, H, T, T)
        # Dropout on attention weights
        if training and CFG.dropout > 0:
            drop_mask = dropout_mask(attn_weights.shape, CFG.dropout, training)
            attn_weights = attn_weights * drop_mask
        # Attend to values
        attn_out = attn_weights @ V  # (B, H, T, Hd)
        # Reshape back: (B, T, D)
        attn_out = attn_out.transpose(0, 2, 1, 3).reshape(B, T, D)
        # Output projection
        out = attn_out @ self.Wo.data + self.bo.data
        # Cache everything needed for backward.
        # NOTE(review): attn_weights is cached *after* dropout was applied, so
        # backward() treats the dropped weights as the softmax output — an
        # approximation whenever attention dropout is active; confirm intended.
        self._cache = dict(x=x, Q=Q, K=K, V=V, Q2=Q2, K2=K2, QK=QK,
                           dist2=dist2, gamma=gamma, scores=scores,
                           attn_weights=attn_weights, attn_out=attn_out,
                           B=B, T=T, D=D, H=H, Hd=Hd)
        return out

    def backward(self, dout: np.ndarray) -> np.ndarray:
        """Backprop through RBF attention.

        dout: upstream gradient, shape (B, T, D).
        Accumulates gradients into the projection/bias/log_gamma Parameters
        and returns dL/dx with shape (B, T, D). Must be called after forward().
        """
        c = self._cache
        B, T, D, H, Hd = c["B"], c["T"], c["D"], c["H"], c["Hd"]
        x, Q, K, V = c["x"], c["Q"], c["K"], c["V"]
        attn_weights, attn_out = c["attn_weights"], c["attn_out"]
        dist2, gamma = c["dist2"], c["gamma"]
        # Grad through output projection
        self.Wo.grad += attn_out.reshape(B * T, D).T @ dout.reshape(B * T, D)
        self.bo.grad += dout.sum(axis=(0, 1))
        d_attn_out = dout @ self.Wo.data.T  # (B, T, D)
        # Reshape to multi-head
        d_attn_out = d_attn_out.reshape(B, T, H, Hd).transpose(0, 2, 1, 3)
        # Grad through V: d(attn @ V)
        dV = attn_weights.transpose(0, 1, 3, 2) @ d_attn_out
        d_attn_w = d_attn_out @ V.transpose(0, 1, 3, 2)
        # Grad through softmax: ds = p ⊙ (dp − Σ_j dp_j p_j), with p = attn_weights
        sw = attn_weights  # (B, H, T, T)
        d_scores = sw * (d_attn_w - (d_attn_w * sw).sum(axis=-1, keepdims=True))
        # Grad through RBF score s = -γ·dist²: ∂s/∂(dist²) = -γ, ∂s/∂γ = -dist².
        # log_gamma is the learned tensor, so chain through γ = exp(log_γ).
        gamma_h = np.exp(self.log_gamma.data)  # (H, 1, 1)
        d_gamma = (-dist2 * d_scores).sum(axis=(0, 2, 3)).reshape(H, 1, 1)
        self.log_gamma.grad += d_gamma * gamma_h
        d_dist2 = -gamma * d_scores  # (B, H, T, T)
        # Grad through dist2 = ||q||² + ||k||² - 2 q·k
        # d(dist2)/dQ_i: sum over j of d_dist2_ij * (2*q_i - 2*k_j) simplified:
        # = 2 * sum_j d_dist2_ij * q_i - 2 * sum_j d_dist2_ij * k_j
        sum_d_dist2_over_j = d_dist2.sum(axis=-1, keepdims=True)  # (B,H,T,1)
        sum_d_dist2_over_i = d_dist2.sum(axis=-2, keepdims=True)  # (B,H,1,T)
        dQ = 2.0 * (Q * sum_d_dist2_over_j - d_dist2 @ K)
        dK = 2.0 * (K * sum_d_dist2_over_i.transpose(0, 1, 3, 2) - d_dist2.transpose(0, 1, 3, 2) @ Q)
        dV = dV  # no-op: dV was already computed above; kept for symmetry with dQ/dK
        # Reshape grads back to (B, T, D)
        dQ = dQ.transpose(0, 2, 1, 3).reshape(B, T, D)
        dK = dK.transpose(0, 2, 1, 3).reshape(B, T, D)
        dV = dV.transpose(0, 2, 1, 3).reshape(B, T, D)
        # Grad through QKV projections
        x2d = x.reshape(B * T, D)
        self.Wq.grad += x2d.T @ dQ.reshape(B * T, D)
        self.Wk.grad += x2d.T @ dK.reshape(B * T, D)
        self.Wv.grad += x2d.T @ dV.reshape(B * T, D)
        self.bq.grad += dQ.sum(axis=(0, 1))
        self.bk.grad += dK.sum(axis=(0, 1))
        self.bv.grad += dV.sum(axis=(0, 1))
        dx_q = dQ @ self.Wq.data.T
        dx_k = dK @ self.Wk.data.T
        dx_v = dV @ self.Wv.data.T
        # x feeds all three projections, so the input gradients sum
        return dx_q + dx_k + dx_v
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
# SECTION 7: INFOMAX FEED-FORWARD NETWORK (Haykin Ch.10)
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
class InfomaxFFN:
    """
    Position-wise feed-forward network with Infomax activation
    (Bell-Sejnowski principle, Haykin Ch.10).

    f(x) = tanh(x) + α·x with α = 0.1: the optimal element-wise nonlinearity
    for maximizing mutual information through the layer is tanh-like; the
    added linear leak prevents information collapse at saturation and keeps
    gradients alive in the tails.

    A Lateral Inhibition Gate (Haykin Ch.9) follows the nonlinearity to
    produce sparse, competitive representations.
    """
    def __init__(self, embed_dim: int, ffn_dim: int):
        self.embed_dim = embed_dim
        self.ffn_dim = ffn_dim
        self.W1 = Parameter(he_normal(embed_dim, ffn_dim), "ffn_W1")
        self.b1 = Parameter(np.zeros(ffn_dim, dtype=np.float32), "ffn_b1")
        self.W2 = Parameter(he_normal(ffn_dim, embed_dim), "ffn_W2")
        self.b2 = Parameter(np.zeros(embed_dim, dtype=np.float32), "ffn_b2")
        self.params = [self.W1, self.b1, self.W2, self.b2]
        self._cache = {}

    def forward(self, x: np.ndarray, training: bool = True) -> np.ndarray:
        """linear → infomax → lateral inhibition → dropout → linear; x: (B, T, D)."""
        B, T, D = x.shape
        flat = x.reshape(B * T, D)
        pre_act = flat @ self.W1.data + self.b1.data  # (B*T, ffn_dim)
        # Infomax activation (Bell-Sejnowski)
        hidden = infomax_activation(pre_act, CFG.infomax_alpha)
        # Competitive sparsification (Haykin Ch.9)
        hidden = lateral_inhibition_gate(hidden)
        if training:
            keep_mask = dropout_mask(hidden.shape, CFG.dropout, training)
            hidden = hidden * keep_mask
        else:
            keep_mask = np.ones_like(hidden)
        out = (hidden @ self.W2.data + self.b2.data).reshape(B, T, D)
        self._cache = dict(x=x, z1=pre_act, h=hidden, dmask=keep_mask, B=B, T=T, D=D)
        return out

    def backward(self, dout: np.ndarray) -> np.ndarray:
        """Backprop; accumulates into W1/b1/W2/b2 grads, returns dL/dx (B, T, D)."""
        c = self._cache
        B, T, D = c["B"], c["T"], c["D"]
        grad_out = dout.reshape(B * T, D)
        # Second linear layer
        self.W2.grad += c["h"].T @ grad_out
        self.b2.grad += grad_out.sum(axis=0)
        grad_hidden = (grad_out @ self.W2.data.T) * c["dmask"]
        # The lateral-inhibition gate is smooth but its exact derivative is
        # deliberately approximated as identity (pass-through) for stability.
        grad_pre = grad_hidden * infomax_activation_deriv(c["z1"], CFG.infomax_alpha)
        # First linear layer
        self.W1.grad += c["x"].reshape(B * T, D).T @ grad_pre
        self.b1.grad += grad_pre.sum(axis=0)
        return (grad_pre @ self.W1.data.T).reshape(B, T, D)
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
# SECTION 8: HRAN BLOCK (Full transformer-like block with HRAN innovations)
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
class HRANBlock:
    """
    A single HRAN layer (pre-norm transformer-style block):

        x = x + RBFAttention(LayerNorm1(x))
        x = x + InfomaxFFN(LayerNorm2(x))

    The residual connections act as an error-correction path (Haykin).
    """
    def __init__(self, embed_dim: int, num_heads: int, ffn_dim: int, layer_idx: int):
        self.attn = RBFMultiHeadAttention(embed_dim, num_heads)
        self.ffn = InfomaxFFN(embed_dim, ffn_dim)
        self.ln1_gamma = Parameter(np.ones(embed_dim, dtype=np.float32), f"ln1_gamma_{layer_idx}")
        self.ln1_beta = Parameter(np.zeros(embed_dim, dtype=np.float32), f"ln1_beta_{layer_idx}")
        self.ln2_gamma = Parameter(np.ones(embed_dim, dtype=np.float32), f"ln2_gamma_{layer_idx}")
        self.ln2_beta = Parameter(np.zeros(embed_dim, dtype=np.float32), f"ln2_beta_{layer_idx}")
        self.params = (self.attn.params + self.ffn.params +
                       [self.ln1_gamma, self.ln1_beta, self.ln2_gamma, self.ln2_beta])
        self._cache = {}
    def forward(self, x: np.ndarray, mask: Optional[np.ndarray] = None,
                training: bool = True) -> np.ndarray:
        """Apply attention then FFN sublayer, each pre-normed + residual."""
        normed1, xhat1, _mu1, var1 = layer_norm(x, self.ln1_gamma.data, self.ln1_beta.data)
        attn_out = self.attn.forward(normed1, mask=mask, training=training)
        after_attn = x + attn_out  # residual around attention
        normed2, xhat2, _mu2, var2 = layer_norm(after_attn, self.ln2_gamma.data, self.ln2_beta.data)
        ffn_out = self.ffn.forward(normed2, training=training)
        out = after_attn + ffn_out  # residual around FFN
        self._cache = dict(x_before_attn=x,
                           x_before_ffn=after_attn,
                           x_norm1=normed1, xhat1=xhat1, var1=var1,
                           x_norm2=normed2, xhat2=xhat2, var2=var2)
        return out
    def backward(self, dout: np.ndarray) -> np.ndarray:
        """Reverse the forward pass; returns grad w.r.t. the block input."""
        c = self._cache
        # FFN sublayer: the residual splits the gradient into a skip path
        # (dout) and a path through LN2 + FFN.
        g_norm2 = self.ffn.backward(dout)
        g_ln2, dgamma2, dbeta2 = layer_norm_backward(g_norm2, c["xhat2"], c["var2"], self.ln2_gamma.data)
        self.ln2_gamma.grad += dgamma2
        self.ln2_beta.grad += dbeta2
        g_mid = dout + g_ln2
        # Attention sublayer, same residual structure.
        g_norm1 = self.attn.backward(g_mid)
        g_ln1, dgamma1, dbeta1 = layer_norm_backward(g_norm1, c["xhat1"], c["var1"], self.ln1_gamma.data)
        self.ln1_gamma.grad += dgamma1
        self.ln1_beta.grad += dbeta1
        return g_mid + g_ln1
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
# SECTION 9: FULL HRAN MODEL
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
class HRANModel:
    """
    Complete HRAN sequence-to-sequence language model.

    Pipeline:
        token embedding -> fixed sinusoidal position encoding
        -> num_layers x HRANBlock (RBF attention + Infomax FFN)
        -> final LayerNorm -> weight-tied output projection -> logits
    """
    def __init__(self, config: HRANConfig):
        self.cfg = config
        V = config.vocab_size
        D = config.embed_dim
        T = config.max_seq_len
        # Token embedding table (V, D); also reused as the output projection.
        self.embed = Parameter(xavier_uniform(V, D), "embed")
        # Fixed (non-learned) sinusoidal position encoding — a Fourier basis.
        self.pos_enc = self._make_pos_encoding(T, D)
        # Stack of HRAN blocks.
        self.blocks = [HRANBlock(D, config.num_heads, config.ffn_dim, i)
                       for i in range(config.num_layers)]
        # Final layer norm parameters.
        self.final_gamma = Parameter(np.ones(D, dtype=np.float32), "final_gamma")
        self.final_beta = Parameter(np.zeros(D, dtype=np.float32), "final_beta")
        # Output projection is weight-tied with the embedding: forward()
        # computes logits via embed.data.T. This saves parameters and keeps
        # the embedding space aligned with the output space, so there is no
        # separate output-weight Parameter here.
        self.params = [self.embed, self.final_gamma, self.final_beta]
        for block in self.blocks:
            self.params.extend(block.params)
        self._cache = {}
        self._print_param_count()
    def _make_pos_encoding(self, max_len: int, d_model: int) -> np.ndarray:
        """
        Sinusoidal positional encoding — a fixed Fourier basis over positions.
            PE(pos, 2i)   = sin(pos / 10000^(2i/d))
            PE(pos, 2i+1) = cos(pos / 10000^(2i/d))
        Each dimension pair encodes position at a different frequency scale.
        """
        pe = np.zeros((max_len, d_model), dtype=np.float32)
        pos = np.arange(max_len).reshape(-1, 1)
        # Geometric frequency ladder, computed in log space for stability.
        div_term = np.exp(np.arange(0, d_model, 2) * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = np.sin(pos * div_term)
        # The slice guards the odd-d_model case (one fewer cosine column).
        pe[:, 1::2] = np.cos(pos * div_term[:d_model // 2])
        return pe
    def _causal_mask(self, T: int) -> np.ndarray:
        """Additive mask: -1e9 above the diagonal so each position attends
        only to itself and earlier positions."""
        mask = np.triu(np.full((T, T), -1e9, dtype=np.float32), k=1)
        return mask
    def forward(self, input_ids: np.ndarray, training: bool = True) -> np.ndarray:
        """
        input_ids: (batch, seq_len) int32
        Returns: logits (batch, seq_len, vocab_size)
        """
        B, T = input_ids.shape
        # Embedding lookup + additive position encoding.
        x = self.embed.data[input_ids]  # (B, T, D)
        x = x + self.pos_enc[:T]  # broadcast positions over the batch
        # Causal mask shared by all blocks.
        mask = self._causal_mask(T)
        # Forward through all HRAN blocks.
        for block in self.blocks:
            x = block.forward(x, mask=mask, training=training)
        # Final layer norm.
        x_norm, xhat, mu, var = layer_norm(x, self.final_gamma.data, self.final_beta.data)
        # Weight-tied output projection: logits = x_norm @ embed.T
        B2, T2, D = x_norm.shape
        logits = x_norm.reshape(B2 * T2, D) @ self.embed.data.T  # (BT, V)
        logits = logits.reshape(B2, T2, -1)
        # NOTE: mu is cached but never read by backward (layer_norm_backward
        # works from xhat and var only).
        self._cache = dict(input_ids=input_ids, x_final=x, x_norm=x_norm,
                           xhat=xhat, mu=mu, var=var)
        return logits
    def backward(self, d_logits: np.ndarray):
        """Backpropagate through the entire model, accumulating .grad on
        every Parameter. Must be called after forward()."""
        c = self._cache
        B, T, V = d_logits.shape
        D = self.cfg.embed_dim
        # Grad through the weight-tied output projection:
        #   logits[bt, v] = sum_d x_norm[bt, d] * embed[v, d]
        #   d_embed[v, d]  = sum_bt d_logits[bt, v] * x_norm[bt, d] = d_logits_2d.T @ x_norm_2d
        #   d_x_norm[bt,d] = sum_v  d_logits[bt, v] * embed[v, d]   = d_logits_2d @ embed
        d_logits_2d = d_logits.reshape(B * T, V)
        x_norm_2d = c["x_norm"].reshape(B * T, D)
        self.embed.grad += d_logits_2d.T @ x_norm_2d  # (V, D)
        dx_norm_2d = d_logits_2d @ self.embed.data  # (BT, D)
        dx_norm = dx_norm_2d.reshape(B, T, D)
        # Grad through the final layer norm.
        dx, dfg, dfb = layer_norm_backward(dx_norm, c["xhat"], c["var"], self.final_gamma.data)
        self.final_gamma.grad += dfg
        self.final_beta.grad += dfb
        # Backprop through the blocks in reverse order of the forward pass.
        for block in reversed(self.blocks):
            dx = block.backward(dx)
        # Grad through the embedding lookup: scatter-add so repeated token
        # ids accumulate correctly (a fancy-index += would drop duplicates).
        ids = c["input_ids"]  # (B, T)
        np.add.at(self.embed.grad, ids.flatten(), dx.reshape(B * T, D))
    def _print_param_count(self):
        """Print the total learnable parameter count."""
        total = sum(p.data.size for p in self.params)
        print(f"[HRAN] Parameters: {total:,} ({total/1e6:.2f}M)")
    def zero_grads(self):
        """Reset every parameter gradient before processing a new batch."""
        for p in self.params:
            p.zero_grad()
    def clip_grads(self, max_norm: float):
        # Global-norm gradient clipping (Haykin: stability criterion) —
        # rescale all grads together so their joint L2 norm is <= max_norm.
        total_norm = math.sqrt(sum(np.sum(p.grad**2) for p in self.params))
        if total_norm > max_norm:
            scale = max_norm / (total_norm + 1e-8)
            for p in self.params:
                p.grad *= scale
    def update(self, lr: float):
        """Apply one Wiener-scaled optimizer step to every parameter."""
        for p in self.params:
            p.update_wiener(lr, weight_decay=CFG.weight_decay)
    def save(self, path: str):
        """Pickle all parameter arrays (keyed by parameter name) to disk."""
        data = {p.name: p.data for p in self.params}
        with open(path, "wb") as f:
            pickle.dump(data, f)
        print(f"[HRAN] Model saved to {path}")
    def load(self, path: str):
        """
        Restore parameters written by save().

        NOTE(review): pickle.load executes arbitrary code from the file —
        only load checkpoints you created yourself. Names missing from the
        file keep their freshly-initialized values; unknown names in the
        file are ignored. In-place slice assignment requires shapes match.
        """
        with open(path, "rb") as f:
            data = pickle.load(f)
        for p in self.params:
            if p.name in data:
                p.data[:] = data[p.name]
        print(f"[HRAN] Model loaded from {path}")
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
# SECTION 10: HEBBIAN PRE-INITIALIZATION (Haykin Ch.2)
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
def hebbian_seed(model: HRANModel, tokenizer: HRANTokenizer,
                 corpus: List[Tuple[str, str]]):
    """
    Warm-start the token embeddings with Oja's normalized Hebbian rule
    BEFORE any gradient descent (Haykin Ch.2).

    Hebb's rule ("neurons that fire together, wire together") in Oja's
    normalized form:

        dW = eta * (y * x - y^2 * W)

    where x is the "pre" signal (a co-occurrence-weighted average of
    context embeddings), y = W . x is the "post" activity, and the
    -y^2 * W term bounds the weight norm (Oja's rule learns the first
    principal component online). This embeds co-occurrence structure
    into the embedding space, giving training a warm start aligned with
    the data manifold.

    Mutates model.embed.data in place; returns nothing.
    """
    print("\n[Hebbian Pre-Initialization] Seeding embeddings with co-occurrence statistics...")
    D = model.cfg.embed_dim
    V = model.cfg.vocab_size
    eta = CFG.hebb_lr
    # Symmetric co-occurrence counts within a +/- `window` token distance.
    cooc = np.zeros((V, V), dtype=np.float64)
    window = 3
    for q, a in corpus:
        seq = tokenizer.encode(q + " " + a)
        for i, tok in enumerate(seq):
            for j in range(max(0, i - window), min(len(seq), i + window + 1)):
                if i != j:
                    cooc[tok, seq[j]] += 1.0
    # Row-normalize so each row is a context distribution over the vocab.
    row_sums = cooc.sum(axis=1, keepdims=True) + 1e-8
    cooc_norm = cooc / row_sums
    # Oja's Hebbian rule: update each embedding row.
    for epoch in range(CFG.hebb_epochs):
        total_change = 0.0
        updated = 0  # rows actually touched this epoch, for an honest mean
        for v_id in range(4, min(V, 500)):  # skip special tokens 0-3
            if cooc_norm[v_id].sum() < 1e-8:
                continue  # token never seen in the corpus
            # "Post" neuron weights via the current embedding.
            W = model.embed.data[v_id]  # (D,)
            # "Pre" signal: co-occurrence-weighted average of all embeddings.
            context_emb = cooc_norm[v_id] @ model.embed.data  # (D,)
            y = W.dot(context_emb)
            # Oja's rule: dW = eta * (y*x - y^2*W); the decay term bounds ||W||.
            delta = eta * (y * context_emb - y**2 * W)
            model.embed.data[v_id] += delta.astype(np.float32)
            total_change += np.abs(delta).sum()
            updated += 1
        # BUGFIX: the mean was previously divided by V-4 even though only
        # rows 4..min(V, 500) with nonzero context are ever updated;
        # divide by the number of rows actually changed instead.
        print(f"  Hebb epoch {epoch+1}/{CFG.hebb_epochs} | Mean change: {total_change/max(updated, 1):.6f}")
    print("[Hebbian Pre-Initialization] Complete. Embeddings seeded with corpus statistics.\n")
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
# SECTION 11: LOSS FUNCTION WITH LABEL SMOOTHING
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
def cross_entropy_loss(logits: np.ndarray, targets: np.ndarray,
                       smoothing: float = 0.1) -> Tuple[float, np.ndarray]:
    """
    Label-smoothed cross-entropy over a (B, T, V) logit tensor.

    Hard one-hot targets are softened to 1 - smoothing on the true class
    and smoothing / (V - 1) on every other class (regularization, Haykin
    Ch.4) — the smoothed rows still sum to 1. Positions whose target is
    the PAD token are excluded from both the loss average and the gradient.

    Returns (scalar loss, d_loss/d_logits with the same shape as logits).
    """
    B, T, V = logits.shape
    flat_logits = logits.reshape(B * T, V)
    flat_targets = targets.flatten()
    probs = softmax(flat_logits, axis=-1)
    # Soft target distribution.
    soft = np.full((B * T, V), smoothing / (V - 1), dtype=np.float32)
    soft[np.arange(B * T), flat_targets] = 1.0 - smoothing
    # 1.0 for real tokens, 0.0 for padding.
    valid = (flat_targets != CFG.PAD_ID).astype(np.float32)
    denom = valid.sum() + 1e-9
    # Per-token cross entropy; small epsilon guards log(0).
    token_loss = -(soft * np.log(probs + 1e-9)).sum(axis=-1)
    loss = (token_loss * valid).sum() / denom
    # d(CE)/d(logits) = probs - soft, masked and averaged over valid tokens.
    grad = ((probs - soft) * valid.reshape(-1, 1) / denom).reshape(B, T, V)
    return float(loss), grad
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
# SECTION 12: DATA PIPELINE
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
def make_batches(data: List[Tuple[str, str]], tokenizer: HRANTokenizer,
                 batch_size: int, max_len: int) -> List[Tuple[np.ndarray, np.ndarray]]:
    """
    Turn (question, answer) pairs into next-token-prediction batches.

    Each sample is tokenized as BOS + question + answer + EOS and truncated
    to max_len + 1 tokens; inputs are the sequence minus its last token and
    targets the sequence minus its first (teacher forcing). Sequences are
    sorted by length before batching so padding waste is minimized; unused
    slots are filled with PAD.
    """
    encoded = []
    for question, answer in data:
        ids = [CFG.BOS_ID] + tokenizer.encode(question) + tokenizer.encode(answer) + [CFG.EOS_ID]
        encoded.append(ids[:max_len + 1])  # +1 since input/target are shifted by one
    # Stable sort by length groups similar-length sequences together.
    encoded.sort(key=len)
    batches = []
    for start in range(0, len(encoded), batch_size):
        chunk = encoded[start:start + batch_size]
        width = min(max(len(s) for s in chunk), max_len + 1)
        inputs = np.full((len(chunk), width - 1), CFG.PAD_ID, dtype=np.int32)
        targets = np.full((len(chunk), width - 1), CFG.PAD_ID, dtype=np.int32)
        for row, seq in enumerate(chunk):
            seq = seq[:width]
            n = len(seq) - 1
            inputs[row, :n] = seq[:-1]
            targets[row, :n] = seq[1:]
        batches.append((inputs, targets))
    return batches
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
# SECTION 13: LEARNING RATE SCHEDULE (Cosine with Warmup)
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
def get_lr(step: int, total_steps: int, warmup_steps: int, base_lr: float) -> float:
    """
    Learning-rate schedule: linear warmup then cosine annealing to zero.

    During warmup the rate climbs linearly from 0 to base_lr; afterwards it
    follows half a cosine down to 0 at total_steps, damping oscillation
    near minima (Haykin Ch.4).
    """
    if step < warmup_steps:
        # Linear ramp; max() guards division by zero when warmup_steps == 0.
        return base_lr * step / max(warmup_steps, 1)
    decay_span = max(total_steps - warmup_steps, 1)
    frac = (step - warmup_steps) / decay_span
    return 0.5 * base_lr * (1.0 + math.cos(math.pi * frac))
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
# SECTION 14: TRAINING LOOP
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
def train(model: HRANModel, tokenizer: HRANTokenizer,
          data: List[Tuple[str, str]], config: HRANConfig):
    """
    Full training loop implementing:
    1. Hebbian pre-seeding (Haykin Ch.2)
    2. Mini-batch gradient descent with Adam + Wiener scaling (Haykin Ch.3)
    3. Label smoothing regularization (Haykin Ch.4)
    4. Cosine LR schedule with warmup
    5. Global gradient clipping (stability)

    Saves the best checkpoint to "hran_best.pkl" whenever the epoch-average
    loss improves. Returns the list of per-epoch average losses.
    """
    print("=" * 65)
    print(" HRAN Training β Haykin Resonant Attention Network")
    print("=" * 65)
    # Step 1: Hebbian pre-initialization of the embeddings.
    hebbian_seed(model, tokenizer, data)
    # Step 2: Prepare data (batches are built once; only their order varies).
    batches = make_batches(data, tokenizer, config.batch_size, config.max_seq_len)
    total_steps = len(batches) * config.epochs
    step = 0
    print(f"[Training] {len(data)} samples | {len(batches)} batches | "
          f"{config.epochs} epochs | {total_steps} total steps")
    print(f"[Training] LR={config.learning_rate} | Batch={config.batch_size} | "
          f"Warmup={config.warmup_steps}\n")
    best_loss = float("inf")
    history = []
    for epoch in range(config.epochs):
        epoch_loss = 0.0
        epoch_batches = 0
        # Shuffle batch order each epoch (batch contents stay fixed).
        random.shuffle(batches)
        for inp, tgt in batches:
            lr = get_lr(step, total_steps, config.warmup_steps, config.learning_rate)
            # Forward pass (grads cleared first; backward accumulates).
            model.zero_grads()
            logits = model.forward(inp, training=True)
            # Loss + gradient w.r.t. logits.
            loss, d_logits = cross_entropy_loss(logits, tgt, config.label_smoothing)
            # Backward pass through the whole model.
            model.backward(d_logits)
            # Gradient clipping (Haykin: bounded weight updates for stability).
            model.clip_grads(config.grad_clip)
            # Parameter update with Wiener-scaled Adam.
            model.update(lr)
            epoch_loss += loss
            epoch_batches += 1
            step += 1
        avg_loss = epoch_loss / max(epoch_batches, 1)
        history.append(avg_loss)
        # Perplexity; loss is clamped at 20 to avoid overflow in exp().
        perplexity = math.exp(min(avg_loss, 20))
        if avg_loss < best_loss:
            best_loss = avg_loss
            model.save("hran_best.pkl")
        # Progress display every 5 epochs (and after the first).
        # NOTE(review): `lr` below is the last inner-loop value and would be
        # unbound if `batches` were empty — confirm data is never empty.
        if (epoch + 1) % 5 == 0 or epoch == 0:
            bar_len = 20
            filled = int(bar_len * (epoch + 1) / config.epochs)
            bar = "β" * filled + "β" * (bar_len - filled)
            print(f" Epoch {epoch+1:3d}/{config.epochs} [{bar}] "
                  f"Loss: {avg_loss:.4f} | PPL: {perplexity:.1f} | LR: {lr:.6f}")
    print(f"\n[Training Complete] Best loss: {best_loss:.4f} | "
          f"Best PPL: {math.exp(min(best_loss, 20)):.2f}")
    return history
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
# SECTION 15: GENERATION (with Temperature + Top-k + Top-p)
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
def generate(model: HRANModel, tokenizer: HRANTokenizer, prompt: str,
             max_new_tokens: int = 60, temperature: float = 0.7,
             top_k: int = 40, top_p: float = 0.9) -> str:
    """
    Sample a continuation of `prompt` autoregressively.

    Per-step decoding pipeline: temperature scaling, top-k truncation,
    nucleus (top-p) truncation, then categorical sampling. Generation
    stops early when EOS is drawn; only the newly generated tokens are
    decoded and returned.
    """
    token_ids = [CFG.BOS_ID] + tokenizer.encode(prompt)
    for _ in range(max_new_tokens):
        # Keep only the most recent max_seq_len tokens as context.
        context = np.array([token_ids[-CFG.max_seq_len:]], dtype=np.int32)
        logits = model.forward(context, training=False)  # dropout disabled
        step_logits = logits[0, -1, :].astype(np.float64)
        # Temperature scaling (exploration noise control).
        step_logits /= max(temperature, 1e-8)
        # Top-k: push everything below the k-th largest logit to -inf-ish.
        if top_k > 0:
            threshold = np.partition(step_logits, -top_k)[-top_k]
            step_logits[step_logits < threshold] = -1e9
        probs = softmax(step_logits)
        # Nucleus cutoff: smallest prefix of the probability-sorted vocab
        # whose cumulative mass reaches top_p.
        order = np.argsort(-probs)
        cumulative = np.cumsum(probs[order])
        reached = np.nonzero(cumulative >= top_p)[0]
        cutoff = int(reached[0]) + 1 if reached.size else len(order)
        keep = order[:cutoff]
        nucleus = np.zeros_like(probs)
        nucleus[keep] = probs[keep]
        nucleus /= nucleus.sum() + 1e-9
        next_id = int(np.random.choice(len(nucleus), p=nucleus))
        if next_id == CFG.EOS_ID:
            break
        token_ids.append(next_id)
    # Return only tokens generated beyond BOS + prompt.
    return tokenizer.decode(token_ids[1 + len(tokenizer.encode(prompt)):])
def generate_response(model: HRANModel, tokenizer: HRANTokenizer,
                      question: str, temperature: float = 0.6) -> str:
    """
    Produce a chat reply for `question`.

    Samples at three temperatures (base, x0.8, x1.2), keeps replies with
    at least three words, and returns the longest one with its first
    letter capitalized. Falls back to a stock apology when no candidate
    qualifies.
    """
    # Normalize the prompt to match the training-data style.
    prompt = question.lower().strip().rstrip("?!.")
    answers = []
    for temp in (temperature, temperature * 0.8, temperature * 1.2):
        text = generate(model, tokenizer, prompt, max_new_tokens=60,
                        temperature=temp, top_k=50, top_p=0.92).strip()
        if len(text.split()) >= 3:
            answers.append(text)
    if not answers:
        return "I am still learning. Could you rephrase that?"
    # Most words wins (content heuristic).
    best = max(answers, key=lambda s: len(s.split()))
    return best[0].upper() + best[1:] if best else best
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
# SECTION 16: CONVERSATIONAL CHAT INTERFACE
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
BANNER = """
ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
β β
β βββ ββββββββββ ββββββ ββββ βββ β
β βββ ββββββββββββββββββββββββ βββ β
β ββββββββββββββββββββββββββββββ βββ β
β ββββββββββββββββββββββββββββββββββ β
β βββ ββββββ ββββββ ββββββ ββββββ β
β βββ ββββββ ββββββ ββββββ βββββ β
β β
β Haykin Resonant Attention Network β
β βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ β
β Architecture grounded in: Simon Haykin's Neural Networks β
β and Learning Machines + First Principles of Information Theory β
β β
β Innovations: β
β β’ RBF Attention Kernels (Ch.5) β’ Hebbian Embedding Init (Ch.2) β
β β’ Infomax FFN Activation (Ch.10) β’ Lateral Inhibition (Ch.9) β
β β’ Wiener Gradient Scaling (Ch.3) β
β β
β Commands: 'quit' to exit | 'info' for architecture details β
ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
"""
ARCH_INFO = """
βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
β HRAN Architecture Details β
β ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ£
β Embedding dim : 128 Vocab size : ~1500 β
β HRAN layers : 4 Attn heads : 4 β
β FFN dim : 512 Max seq len : 64 β
β Total params : ~2.5M Training : 80 epochs β
β ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ£
β RBF Attention : A_ij = softmax(-Ξ³βq_i - k_jβΒ²) β
β Infomax Act. : f(x) = tanh(x) + 0.1x β
β Hebbian Init : ΞW = Ξ·(yΒ·x - yΒ²Β·W) [Oja's rule] β
β Wiener Scale : lr_eff = lr Γ SNR/(1+SNR) β
βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
"""
def chat_loop(model: HRANModel, tokenizer: HRANTokenizer):
    """
    Main conversational REPL.

    Special commands (case-insensitive): 'quit'/'exit'/'bye'/'goodbye' to
    leave, 'info' to print architecture details, 'history' to show the
    last five exchanges. Any other input is answered by the model.
    """
    print(BANNER)
    print(" Ready to converse. Type your question or message.\n")
    history = []  # list of (user_input, response) pairs
    while True:
        try:
            user_input = input(" You βΊ ").strip()
        except (EOFError, KeyboardInterrupt):
            # Ctrl-D / Ctrl-C exits cleanly.
            print("\n HRAN βΊ Goodbye. Keep thinking.\n")
            break
        if not user_input:
            continue
        if user_input.lower() in ["quit", "exit", "bye", "goodbye"]:
            print(" HRAN βΊ Goodbye. Keep thinking.\n")
            break
        if user_input.lower() == "info":
            print(ARCH_INFO)
            continue
        if user_input.lower() == "history":
            if history:
                print("\n [Conversation History]")
                for i, (q, r) in enumerate(history[-5:], 1):
                    print(f" {i}. You: {q}")
                    print(f" HRAN: {r}\n")
            else:
                print(" [No history yet]\n")
            continue
        # Generate and display a model response.
        print(" HRAN βΊ ", end="", flush=True)
        t0 = time.time()
        response = generate_response(model, tokenizer, user_input)
        elapsed = time.time() - t0  # NOTE(review): timed but never displayed
        print(response)
        print(f" {'β' * 60}")
        history.append((user_input, response))
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
# SECTION 17: MAIN ENTRY POINT
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
def main():
    """Entry point: build tokenizer + model, then either load an existing
    checkpoint and chat, or train from scratch and chat."""
    # Fixed seeds for reproducible init, training order, and sampling.
    np.random.seed(42)
    random.seed(42)
    print("\n" + "β" * 65)
    print(" HRAN β Haykin Resonant Attention Network")
    print(" Built strictly from Haykin + First Principles")
    print("β" * 65 + "\n")
    # Build tokenizer from the full dataset; the model's vocab follows it.
    tokenizer = HRANTokenizer(max_vocab=CFG.vocab_size)
    tokenizer.build(FULL_DATASET)
    CFG.vocab_size = tokenizer.vocab_size
    # Build model (prints its parameter count on construction).
    model = HRANModel(CFG)
    # Offer to reuse a previously saved checkpoint if one exists.
    model_path = "hran_best.pkl"
    if os.path.exists(model_path):
        print(f"[HRAN] Found saved model at {model_path}")
        ans = input(" Load existing model? [Y/n]: ").strip().lower()
        if ans != "n":
            model.load(model_path)
            print(" Loaded! Entering chat mode.\n")
            chat_loop(model, tokenizer)
            return
    # Train from scratch (also saves the best checkpoint along the way).
    print("\n[HRAN] Starting training from scratch...\n")
    history = train(model, tokenizer, FULL_DATASET, CFG)
    # Plot the loss curve if matplotlib is installed (optional dependency).
    try:
        import matplotlib.pyplot as plt
        plt.figure(figsize=(10, 4))
        plt.plot(history, color="#e74c3c", linewidth=2)
        plt.title("HRAN Training Loss (Haykin RBF-Attention + Infomax FFN)")
        plt.xlabel("Epoch")
        plt.ylabel("Cross-Entropy Loss")
        plt.grid(alpha=0.3)
        plt.tight_layout()
        plt.savefig("hran_training_loss.png", dpi=150)
        plt.close()
        print("\n[HRAN] Loss curve saved to hran_training_loss.png")
    except ImportError:
        pass
    print("\n[HRAN] Training complete! Entering chat mode.")
    print(" (Model auto-saved as hran_best.pkl)\n")
    chat_loop(model, tokenizer)
if __name__ == "__main__":
    main()
|