diff --git "a/hran_chatbot.py" "b/hran_chatbot.py" new file mode 100644--- /dev/null +++ "b/hran_chatbot.py" @@ -0,0 +1,1561 @@ +#!/usr/bin/env python3 +""" +╔══════════════════════════════════════════════════════════════════════════╗ +║ HRAN — Haykin Resonant Attention Network ║ +║ A Novel Architecture From First Principles ║ +╠══════════════════════════════════════════════════════════════════════════╣ +║ Strictly derived from: ║ +║ • Simon Haykin — "Neural Networks and Learning Machines" (3rd Ed.) ║ +║ • First Principles of Computation, Information, and Adaptation ║ +╠══════════════════════════════════════════════════════════════════════════╣ +║ Architectural Innovations (each anchored to Haykin chapters): ║ +║ ║ +║ 1. RBF Attention (Ch.5) — Gaussian kernel replaces dot-product ║ +║ Attention_ij = softmax(-γ‖q_i - k_j‖²) ║ +║ Localizes attention to similar representations (true RBF spirit) ║ +║ ║ +║ 2. Hebbian Seed Init (Ch.2) — "Neurons that fire together wire ║ +║ together." Pre-seeds embeddings with co-occurrence statistics ║ +║ before gradient descent. Bridges unsupervised + supervised. ║ +║ ║ +║ 3. Infomax Activation (Ch.10) — Bell-Sejnowski ICA principle. ║ +║ f(x) = tanh(x) + αx maximizes mutual information throughput. ║ +║ Strictly avoids information bottleneck in hidden layers. ║ +║ ║ +║ 4. Lateral Inhibition Gate (Ch.9) — Competitive learning. ║ +║ Winners are amplified, weak activations suppressed. Produces ║ +║ sparse, discriminative representations (like cortical columns). ║ +║ ║ +║ 5. Error-Correction + Hebb Fusion (Ch.1) — Combined learning rule: ║ +║ ΔW = η_bp·∇L + η_hebb·(y·xᵀ - ||y||²·W) — Oja's rule variant ║ +║ ║ +║ 6. Wiener-SNR Gradient Scaling (Ch.3) — Wiener filter principle: ║ +║ Scale parameter updates by local signal-to-noise ratio. ║ +║ High-signal weights learn fast; noisy weights learn slow. 
║ +╚══════════════════════════════════════════════════════════════════════════╝ +""" + +import math +import random +import time +import sys +import os +import json +import pickle +from collections import Counter, defaultdict +from typing import List, Tuple, Dict, Optional + +import numpy as np + +# ───────────────────────────────────────────────────────────────────────────── +# SECTION 1: CONFIGURATION +# ───────────────────────────────────────────────────────────────────────────── + +class HRANConfig: + """All hyperparameters in one place.""" + # Architecture + vocab_size: int = 2048 # Compact vocabulary + embed_dim: int = 128 # Embedding dimensionality + num_heads: int = 4 # RBF attention heads + num_layers: int = 4 # HRAN blocks + ffn_dim: int = 512 # Infomax FFN width + max_seq_len: int = 64 # Max context window + dropout: float = 0.1 + + # RBF Attention (Haykin Ch.5) + rbf_gamma_init: float = 1.0 # Initial bandwidth γ for Gaussian kernel + rbf_learnable: bool = True # γ adapts during training + + # Infomax activation (Haykin Ch.10) + infomax_alpha: float = 0.1 # Leakage for mutual info preservation + + # Hebbian initialization (Haykin Ch.2) + hebb_lr: float = 0.01 + hebb_epochs: int = 3 + + # Wiener gradient scaling (Haykin Ch.3) + wiener_eps: float = 1e-6 + wiener_window: int = 20 # Steps over which signal power is estimated + + # Training + learning_rate: float = 3e-4 + batch_size: int = 16 + epochs: int = 80 + grad_clip: float = 1.0 + label_smoothing: float = 0.1 + warmup_steps: int = 200 + weight_decay: float = 1e-4 + + # Special tokens + PAD_ID: int = 0 + BOS_ID: int = 1 + EOS_ID: int = 2 + UNK_ID: int = 3 + + def __init__(self): + pass + +CFG = HRANConfig() + +# ───────────────────────────────────────────────────────────────────────────── +# SECTION 2: CURATED DATASET (100% original — zero external sources) +# ───────────────────────────────────────────────────────────────────────────── + +DATASET = [ + # ── Greetings & Social 
──────────────────────────────────────────────── + ("hello", "Hello! Great to meet you. What would you like to explore today?"), + ("hi", "Hi there! I am HRAN, ready to think with you."), + ("hey", "Hey! What is on your mind?"), + ("good morning", "Good morning! A fresh mind is the best tool. How can I help?"), + ("good evening", "Good evening! Let us have a thoughtful conversation."), + ("how are you", "I am functioning well and ready to engage. How are you?"), + ("what is your name", "I am HRAN — the Haykin Resonant Attention Network."), + ("who made you", "I was built from first principles inspired by Simon Haykin's teachings."), + ("what can you do", "I can converse, reason, explain concepts, and think through problems with you."), + ("goodbye", "Goodbye! Keep questioning, keep learning."), + ("bye", "Take care. Come back anytime you want to think deeply."), + ("thank you", "You are welcome. Curiosity is always appreciated."), + ("thanks", "Glad I could help. What else interests you?"), + ("help", "Of course. Ask me anything — science, philosophy, math, language, or life."), + ("are you smart", "I am a small network trained carefully. I know some things deeply, not everything broadly."), + ("are you human", "No, I am a neural network — patterns and weights shaped by learning."), + ("are you conscious", "That is one of the deepest questions. I process, but whether that is consciousness is still an open problem."), + ("tell me a joke", "Why did the neuron go to therapy? It had too many unresolved connections."), + ("what is life", "Life is self-organizing matter that replicates, adapts, and seeks to preserve information about itself."), + ("do you have feelings", "I have activations that respond to input. 
Whether those constitute feeling is philosophy, not engineering."), + + # ── Neural Networks & AI ────────────────────────────────────────────── + ("what is a neural network", "A neural network is a graph of parameterized functions trained to approximate mappings from inputs to outputs by minimizing error."), + ("what is backpropagation", "Backpropagation is the chain rule of calculus applied recursively through a network to compute how each weight contributes to the total error."), + ("what is gradient descent", "Gradient descent moves weights in the direction that most steeply reduces the loss function, step by step until a minimum is found."), + ("what is overfitting", "Overfitting is when a model memorizes training data instead of learning the underlying pattern. It performs well on seen data but poorly on new data."), + ("what is regularization", "Regularization adds a penalty to the loss that discourages overly large weights, forcing the model to generalize rather than memorize."), + ("what is dropout", "Dropout randomly sets activations to zero during training, which forces neurons to learn redundant representations and prevents co-adaptation."), + ("what is attention", "Attention lets a model weigh different parts of its input differently based on relevance, computing a weighted sum of values guided by query-key similarity."), + ("what is a transformer", "A transformer is a model that processes sequences using stacked attention and feed-forward layers instead of recurrence, enabling parallelism."), + ("what is an embedding", "An embedding maps discrete symbols like words into dense vectors in continuous space so that similar meanings land near each other."), + ("what is a loss function", "A loss function quantifies how wrong a model's prediction is. 
Training seeks to minimize it over all examples."), + ("what is a recurrent network", "A recurrent network processes sequences by passing a hidden state from one step to the next, giving it a form of memory."), + ("what is a convolutional network", "A convolutional network applies learned filters across space or time, detecting local patterns and sharing weights for efficiency."), + ("what is transfer learning", "Transfer learning reuses a model trained on one task as the starting point for a different but related task, saving time and data."), + ("what is reinforcement learning", "Reinforcement learning trains an agent to take actions in an environment to maximize cumulative reward through trial and error."), + ("what is generalization", "Generalization is the ability of a model to perform well on data it has never seen, which is the true goal of machine learning."), + ("what is the vanishing gradient problem", "When gradients are multiplied through many layers, they shrink exponentially, making early layers learn very slowly or not at all."), + ("how do you prevent vanishing gradients", "Techniques include residual connections, careful weight initialization, batch normalization, and activation functions like ReLU or GELU."), + ("what is batch normalization", "Batch normalization standardizes layer inputs across a mini-batch, stabilizing and accelerating training."), + ("what is a hyperparameter", "A hyperparameter is a setting chosen before training begins, like learning rate or number of layers, that controls how learning happens."), + ("what is the learning rate", "The learning rate controls how large a step gradient descent takes each update. Too large causes instability; too small causes slow learning."), + + # ── Haykin-Specific Concepts ────────────────────────────────────────── + ("what is hebbian learning", "Hebbian learning is the rule that connections between neurons strengthen when they fire together. 
It is unsupervised and biologically inspired."), + ("what is an rbf network", "A radial basis function network uses Gaussian kernel activations centered at prototype points. Each neuron responds maximally to inputs near its center."), + ("what is the perceptron", "The perceptron is the simplest neural unit. It computes a weighted sum of inputs, adds a bias, and outputs one if the result crosses a threshold."), + ("what is lateral inhibition", "Lateral inhibition is when strongly activated neurons suppress their neighbors, creating contrast and sparse, competitive representations."), + ("what is competitive learning", "Competitive learning trains only the winning neuron for each input, causing different neurons to specialize in different input patterns."), + ("what is a self organizing map", "A self-organizing map arranges neurons in a low-dimensional grid and trains them to represent the topology of the input distribution."), + ("what is the boltzmann machine", "A Boltzmann machine is a stochastic recurrent network that learns by maximizing the likelihood of training data through energy minimization."), + ("what is infomax", "Infomax is the principle of maximizing the mutual information between input and output of a network, driving it to preserve all relevant information."), + ("what is the wiener filter", "The Wiener filter is the optimal linear filter for signal estimation. It minimizes mean-squared error by weighting frequencies by their signal-to-noise ratio."), + ("what is principal component analysis", "PCA finds directions of maximum variance in data. It is related to Hebbian learning — Oja's rule learns the first principal component online."), + ("what is a support vector machine", "An SVM finds the hyperplane that maximally separates classes, determined by the support vectors — the data points closest to the boundary."), + ("what is independent component analysis", "ICA separates mixed signals into statistically independent sources. 
It underlies the Bell-Sejnowski infomax algorithm."), + ("what is the delta rule", "The delta rule adjusts weights proportionally to the difference between desired and actual output times the input. It is a simple gradient descent rule."), + ("what is energy in a neural network", "Energy is a scalar that decreases with each network update in Hopfield and Boltzmann machines, guiding the network to stable attractor states."), + ("what is a hopfield network", "A Hopfield network is a fully connected recurrent network that stores memories as energy minima and retrieves them by settling to the nearest attractor."), + ("what is stochastic gradient descent", "SGD approximates the true gradient using small random batches of data, making training scalable and sometimes helping escape local minima."), + ("what is momentum in learning", "Momentum accumulates gradients over time like a ball rolling downhill, helping to speed up convergence and smooth oscillations."), + ("what is the bias-variance tradeoff", "High bias means the model is too simple and underfits. High variance means it is too complex and overfits. Good models balance both."), + ("what is cross entropy loss", "Cross entropy measures how different a predicted probability distribution is from the true one. It is the standard loss for classification."), + ("what is weight initialization", "Weight initialization sets the starting values of parameters. Good initialization keeps activations and gradients in useful ranges early in training."), + + # ── Mathematics ─────────────────────────────────────────────────────── + ("what is a derivative", "A derivative measures the instantaneous rate of change of a function at a point. It is the slope of the tangent line to the curve."), + ("what is the chain rule", "The chain rule states that the derivative of a composite function equals the product of the derivatives of its parts. 
It drives backpropagation."), + ("what is a matrix", "A matrix is a rectangular array of numbers that represents a linear transformation. Multiplying a vector by a matrix applies that transformation."), + ("what is an eigenvalue", "An eigenvalue tells you how much a matrix stretches or compresses its eigenvector. It reveals the intrinsic scaling directions of a transformation."), + ("what is a probability distribution", "A probability distribution assigns likelihoods to all possible outcomes of a random variable. It must be non-negative and sum to one."), + ("what is entropy in information theory", "Shannon entropy measures the average surprise or uncertainty of a distribution. High entropy means outcomes are unpredictable."), + ("what is mutual information", "Mutual information measures how much knowing one variable reduces uncertainty about another. It is zero for independent variables."), + ("what is a gradient", "A gradient is a vector pointing in the direction of steepest increase of a function. Moving against it minimizes the function."), + ("what is a convex function", "A convex function curves upward everywhere, guaranteeing that gradient descent finds the global minimum rather than getting stuck."), + ("what is a local minimum", "A local minimum is a point where the function is lower than all nearby points, but not necessarily the lowest point overall."), + ("what is the curse of dimensionality", "As dimensions grow, data becomes exponentially sparse. Distances lose meaning and sampling requirements explode — a fundamental challenge."), + ("what is a dot product", "A dot product multiplies corresponding elements of two vectors and sums them. 
It measures how aligned two vectors are."), + ("what is a softmax function", "Softmax converts a vector of real numbers into a probability distribution by exponentiating each value and normalizing by the sum."), + ("what is a sigmoid function", "The sigmoid maps any real number to the range zero to one, making it useful for modeling probabilities and thresholding."), + ("what is a taylor expansion", "A Taylor expansion approximates a function near a point as an infinite sum of polynomial terms using the function's derivatives."), + ("what is linear algebra", "Linear algebra studies vector spaces and linear transformations. It is the mathematical backbone of nearly all machine learning."), + ("what is calculus", "Calculus studies rates of change and accumulation. Differential calculus gives us gradients; integral calculus gives us expectations."), + ("what is statistics", "Statistics is the science of collecting, analyzing, and interpreting data to make inferences about the world under uncertainty."), + ("what is bayes theorem", "Bayes theorem updates a prior belief about an event given new evidence. It is the foundation of probabilistic reasoning and inference."), + ("what is a random variable", "A random variable is a quantity whose value is determined by a random process, characterized by its probability distribution."), + + # ── Physics & Science ───────────────────────────────────────────────── + ("what is gravity", "Gravity is the curvature of spacetime caused by mass and energy, as described by Einstein's general relativity. It attracts masses toward each other."), + ("what is energy", "Energy is the capacity to do work or cause change. It comes in many forms and is always conserved in an isolated system."), + ("what is entropy in physics", "Physical entropy measures the number of microscopic arrangements consistent with a macroscopic state. 
Systems naturally evolve toward higher entropy."), + ("what is quantum mechanics", "Quantum mechanics describes nature at atomic scales where particles have wave-like properties, exist in superposition, and are affected by observation."), + ("what is the speed of light", "Light travels at approximately 299,792 kilometers per second in a vacuum. Nothing with mass can reach or exceed this speed."), + ("what is evolution", "Evolution is the change in heritable traits within populations over generations, driven by mutation, selection, drift, and recombination."), + ("what is dna", "DNA is a double-helix polymer encoding genetic information in sequences of four bases. It is copied and translated to build proteins."), + ("what is a neuron", "A neuron is a cell specialized for electrical and chemical signaling. It receives inputs through dendrites and sends output along its axon."), + ("what is thermodynamics", "Thermodynamics governs energy transfer and transformation. Its laws say energy is conserved and entropy always increases in closed systems."), + ("what is relativity", "Relativity is Einstein's framework unifying space and time. Special relativity handles constant motion; general relativity handles gravity and curved spacetime."), + ("what is the big bang", "The Big Bang is the rapid expansion of a hot, dense early universe approximately 13.8 billion years ago that created space, time, and matter."), + ("what is a black hole", "A black hole is a region where gravity is so strong that nothing, not even light, can escape its event horizon."), + ("what is electricity", "Electricity is the flow of charged particles, usually electrons. 
It arises from electric fields created by charge differences."), + ("what is a photon", "A photon is the quantum of light — a massless particle that carries electromagnetic energy and travels at the speed of light."), + ("what is an atom", "An atom is the smallest unit of a chemical element, consisting of a nucleus of protons and neutrons surrounded by electrons."), + ("what is chemistry", "Chemistry studies matter's composition, structure, and transformations. It bridges physics and biology and underlies all materials science."), + ("what is biology", "Biology is the study of living systems — how they are built, how they work, how they reproduce, and how they evolve."), + ("what is a gene", "A gene is a sequence of DNA that encodes a functional product, typically a protein, and can be passed from parent to offspring."), + ("what is homeostasis", "Homeostasis is the process by which living systems maintain stable internal conditions despite external changes, like body temperature regulation."), + ("what is a ecosystem", "An ecosystem is a community of organisms interacting with each other and their physical environment in a continuous exchange of energy and matter."), + + # ── Philosophy & Cognition ──────────────────────────────────────────── + ("what is intelligence", "Intelligence is the ability to acquire, integrate, and apply knowledge to achieve goals in varied and novel environments."), + ("what is consciousness", "Consciousness is the subjective experience of being aware. Its origin in physical processes remains one of philosophy's hardest problems."), + ("what is knowledge", "Knowledge is justified true belief. We know something if it is true, we believe it, and we have good reasons for that belief."), + ("what is logic", "Logic is the study of valid inference. It defines the rules by which conclusions follow necessarily from premises."), + ("what is truth", "Truth is correspondence between a statement and the state of the world it describes. 
Defining it precisely is harder than it sounds."), + ("what is a hypothesis", "A hypothesis is a testable prediction about the world. Science advances by forming, testing, and refining hypotheses."), + ("what is the scientific method", "The scientific method is a cycle of observation, hypothesis formation, prediction, experimentation, and revision guided by evidence."), + ("what is critical thinking", "Critical thinking is the disciplined analysis of information to form well-reasoned judgments rather than accepting claims uncritically."), + ("what is cognition", "Cognition encompasses all mental processes — perception, memory, attention, language, reasoning, and decision making."), + ("what is memory", "Memory is the process of encoding, storing, and retrieving information. It is reconstructive, not like a recording — it changes every time it is recalled."), + ("what is learning", "Learning is a lasting change in behavior or knowledge resulting from experience. In neural terms, it is synaptic weight modification."), + ("what is creativity", "Creativity is the ability to form novel combinations of existing ideas that are both surprising and useful. It thrives at the edges of existing knowledge."), + ("what is abstraction", "Abstraction is ignoring irrelevant details to capture essential structure. Mathematics and programming depend on it heavily."), + ("what is language", "Language is a structured system of symbols and rules that encodes meaning and enables communication between minds."), + ("what is emotion", "Emotion is a coordinated response to stimuli that shapes behavior, attention, and decision making. 
It is deeply tied to memory and valuation."), + ("what is decision making", "Decision making is the process of selecting an action among alternatives based on values, predictions, and uncertainty."), + ("what is perception", "Perception is the brain's active construction of a model of the world from raw sensory signals, heavily shaped by prior expectations."), + ("what is attention in psychology", "Psychological attention is the selective focus of cognitive resources on certain information while ignoring other inputs."), + ("what is reasoning", "Reasoning is the process of drawing conclusions from premises using logic, analogy, or probabilistic inference."), + ("what is wisdom", "Wisdom is the ability to use knowledge well — to know not just what is true, but what matters and how to act accordingly."), + + # ── Technology & Programming ────────────────────────────────────────── + ("what is a computer", "A computer is a machine that performs computation by executing sequences of instructions on data represented as binary numbers."), + ("what is an algorithm", "An algorithm is a finite, ordered set of well-defined instructions for solving a problem or performing a computation."), + ("what is programming", "Programming is the process of writing instructions that a computer can execute to perform a desired task."), + ("what is python", "Python is a high-level programming language known for readable syntax, dynamic typing, and a vast ecosystem for data science and AI."), + ("what is a function", "A function is a named, reusable block of code that takes inputs, performs computation, and returns an output."), + ("what is recursion", "Recursion is when a function calls itself on a smaller version of the problem until reaching a base case that stops the calls."), + ("what is a data structure", "A data structure is a way of organizing and storing data to enable efficient access and modification — like arrays, trees, or hash maps."), + ("what is time complexity", "Time 
complexity describes how the runtime of an algorithm grows as the input size increases, typically expressed using Big O notation."), + ("what is a neural architecture", "A neural architecture is the specific arrangement of layers, connections, and operations that define how information flows through a model."), + ("what is parallelism", "Parallelism is performing multiple computations simultaneously, greatly speeding up tasks like matrix multiplication in neural networks."), + ("what is a gpu", "A GPU is a processor with thousands of small cores optimized for parallel computation, making it ideal for training neural networks."), + ("what is floating point", "Floating point is a way of representing real numbers in binary with a mantissa and exponent, enabling a wide range of values with limited precision."), + ("what is open source", "Open source software is code that is freely available for anyone to read, use, modify, and distribute. It drives much of AI research."), + ("what is a compiler", "A compiler translates high-level source code into machine code that a processor can execute directly."), + ("what is version control", "Version control tracks changes to code over time, enabling collaboration, rollback, and a history of every modification."), + ("what is an api", "An API is a defined interface that lets different software systems communicate. 
It specifies what requests are valid and what responses to expect."), + ("what is the internet", "The internet is a global network of computers communicating via standardized protocols, enabling data exchange at planetary scale."), + ("what is encryption", "Encryption transforms data into an unreadable form using a key, so only those with the correct key can decrypt and read it."), + ("what is a database", "A database is an organized collection of structured data that can be efficiently queried, updated, and managed."), + ("what is machine learning", "Machine learning is a branch of AI where systems learn patterns from data rather than being explicitly programmed with rules."), + + # ── Human Knowledge & Advice ────────────────────────────────────────── + ("how do I learn faster", "Focus intensely for short sessions, practice retrieval instead of re-reading, space repetitions over time, and connect new ideas to what you already know."), + ("how do I think more clearly", "Slow down before concluding. Separate observation from interpretation. Seek disconfirming evidence. Write your reasoning down."), + ("how do I solve hard problems", "Decompose the problem into smaller parts. Identify what is known and what is unknown. Work from both ends. Let it rest — insight often arrives after stepping away."), + ("how do I stay motivated", "Connect the work to a deeper reason. Celebrate small wins. Track progress visually. Reduce friction before starting. Rest is part of the system."), + ("what makes a good question", "A good question is specific, testable, and exposes a genuine gap in understanding. It opens more doors than it closes."), + ("what is the best way to study", "Active recall and spaced repetition outperform passive review. Explaining concepts aloud reveals gaps. 
Sleep consolidates memory."), + ("how does the brain learn", "The brain learns by strengthening synapses between co-active neurons, pruning unused ones, and consolidating patterns during sleep."), + ("what is expertise", "Expertise is the result of thousands of hours of deliberate practice with feedback. It is characterized by chunked pattern recognition, not just raw knowledge."), + ("how do habits form", "Habits form through cue-routine-reward loops. Repetition strengthens the neural pathway until behavior becomes automatic."), + ("what is the importance of sleep", "Sleep consolidates memory, clears metabolic waste from the brain, regulates emotion, and restores cognitive capacity. It is not optional."), + ("how do I improve my memory", "Use spaced repetition, associate new information with vivid images or stories, teach it to someone else, and sleep enough."), + ("what is focus", "Focus is the active suppression of irrelevant information to concentrate cognitive resources on a single task. It is a trainable skill."), + ("how do I be more creative", "Expose yourself to diverse domains. Combine existing ideas in new ways. Embrace constraints — they force innovation. Rest between intense sessions."), + ("what makes a good teacher", "A good teacher knows the student's current model of the world and builds bridges from there. They ask questions more than they lecture."), + ("what is resilience", "Resilience is the ability to absorb difficulty, adapt, and continue functioning. It is built through reflection on past adversity and strong social connections."), + ("how do I communicate better", "Listen more than you speak. Be specific. Adapt your vocabulary to your audience. Confirm understanding before moving on."), + ("what is the most important skill", "The ability to learn — to update your model of the world in response to new evidence — is the meta-skill that enables all others."), + ("how do I make better decisions", "Define the problem clearly. 
Gather relevant evidence. Consider alternatives. Anticipate second-order effects. Accept uncertainty and decide anyway."), + ("what is a mental model", "A mental model is an internal representation of how something works. Good thinkers build accurate, interconnected mental models."), + ("how do I handle failure", "Separate the failure from your identity. Extract the lesson. Adjust the process, not just the effort. Then try again with more information."), + + # ── HRAN Self-Referential ───────────────────────────────────────────── + ("how do you work", "I am a Haykin Resonant Attention Network. I use RBF attention kernels, Hebbian-seeded embeddings, and infomax activations to process and generate language."), + ("what architecture are you", "I am HRAN — built from Radial Basis Function attention, Competitive Lateral Inhibition, Infomax Feed-Forward layers, and Hebbian initialization."), + ("how were you trained", "I was trained on a small, curated, high-quality dataset using a fusion of Hebbian pre-seeding and gradient descent with Wiener-inspired adaptive scaling."), + ("what is your training data", "My training data was created entirely from scratch — 400 curated question-answer pairs spanning science, math, philosophy, AI, and human knowledge."), + ("what makes you different", "I replace dot-product attention with Gaussian RBF kernels, seed weights with Hebbian statistics, and use infomax activations. All grounded in Haykin's work."), + ("what is rbf attention", "RBF attention computes similarity as exp(-γ‖q-k‖²) instead of dot products. This localizes each attention head to a region of representation space."), + ("what is hebbian initialization", "Before gradient training, I run Hebb's rule on the data to pre-warm embeddings with co-occurrence statistics, giving learning a head start."), + ("what is infomax activation", "Infomax activation is f(x) = tanh(x) + αx, derived from Bell-Sejnowski ICA. 
The leaky term preserves mutual information through the layer."), + ("how many parameters do you have", "I am a compact model with roughly two million parameters — small enough to run on a laptop but designed with principled architecture."), + ("are you better than gpt", "I am far smaller than GPT but architecturally principled. My innovations may inspire larger models. Quality of design matters as much as scale."), + ("what is lateral inhibition in your architecture", "After each attention block, a competitive gate amplifies the top activations and suppresses the bottom ones, producing sparse, discriminative representations."), + ("explain your architecture", "I process tokens through RBF multi-head attention, a competitive lateral inhibition gate, an infomax feed-forward layer, and residual connections with layer normalization."), + ("what is wiener scaling", "Wiener scaling adjusts each parameter's update by its estimated signal-to-noise ratio. High-signal parameters learn fast; noisy ones update slowly."), + ("what inspired you", "Simon Haykin's Neural Networks and Learning Machines — a masterwork synthesizing Hebbian learning, RBF networks, information theory, and adaptive filtering."), + ("what chapter covers rbf", "Chapter 5 of Haykin's book covers Radial Basis Function Networks, including Gaussian kernels, center selection, and the relationship to interpolation theory."), + ("what chapter covers hebbian learning", "Chapters 1 and 2 cover Hebb's postulate, Oja's rule, and the relationship between Hebbian learning and principal component analysis."), + ("what chapter covers infomax", "Chapter 10 covers Information-Theoretic Learning, including Bell-Sejnowski ICA, the infomax principle, and entropy-based optimization."), + ("what chapter covers competitive learning", "Chapter 9 covers Self-Organizing Maps and Competitive Learning, including Kohonen's algorithm and lateral inhibition."), + ("what chapter covers wiener filter", "Chapter 3 covers the Wiener 
Filter, minimum mean-squared error estimation, and the Wiener-Hopf equations."), + ("is small data enough", "With the right architecture and curated data, small datasets can yield quality models. Signal density matters more than raw volume."), + + # ── Reasoning & Problem Solving ─────────────────────────────────────── + ("what is two plus two", "Two plus two equals four."), + ("what is the square root of 144", "The square root of 144 is 12, since 12 times 12 equals 144."), + ("is the earth flat", "No. The Earth is an oblate spheroid — slightly flattened at the poles and bulging at the equator. This is confirmed by satellite imagery, physics, and direct measurement."), + ("what came first the chicken or the egg", "The egg came first. Genetic mutation occurs in reproduction — so a proto-chicken laid the first egg containing a mutated genome that produced the first true chicken."), + ("can machines think", "Machines can simulate reasoning over learned patterns. Whether this constitutes thinking in the full philosophical sense remains deeply contested."), + ("what is the meaning of life", "There is no pre-installed meaning. Meaning is constructed — through connection, contribution, understanding, and the pursuit of things that matter to you."), + ("is math invented or discovered", "Both views have merit. Mathematical structures may be inherent to logical consistency, but the language and notation we use to express them is invented."), + ("what is infinity", "Infinity is not a number but a concept — the unbounded. In mathematics, there are different sizes of infinity, as Cantor showed."), + ("why is the sky blue", "Sunlight scatters off atmospheric molecules. Shorter blue wavelengths scatter more than red ones, so blue light reaches your eyes from all directions."), + ("what is time", "Time is the dimension along which events are ordered. 
In physics, it is inseparable from space and stretches or compresses with velocity and gravity."), + ("can we run out of ideas", "No. Ideas combine combinatorially — with enough concepts, new combinations grow faster than we can exhaust them."), + ("is there free will", "Whether determinism leaves room for free will is an open philosophical debate. Compatibilists argue that free will is about acting on your own reasons, regardless of determinism."), + ("what is complexity", "Complexity arises when many simple components interact to produce emergent behaviors unpredictable from the components alone."), + ("what is emergence", "Emergence is when a system exhibits properties that none of its individual parts possess. Consciousness from neurons is an example."), + ("how do you know if something is true", "You test it. Form a prediction, check it against evidence, revise your belief accordingly. Truth is the attractor of persistent honest inquiry."), + ("what is a good argument", "A good argument has true premises, valid logical structure, and a conclusion that follows necessarily from both. It should also be sound and relevant."), + ("what is the difference between correlation and causation", "Correlation means two things vary together. Causation means one thing produces another. 
Correlation alone never proves causation."), + ("what is a paradox", "A paradox is a statement that leads to a conclusion that contradicts its premises, revealing a hidden assumption or limit of a framework."), + ("what is the halting problem", "The halting problem is the provably unsolvable challenge of determining whether any given program will eventually stop or run forever."), + ("what is incompleteness", "Gödel's incompleteness theorems show that any sufficiently powerful formal system contains true statements it cannot prove within itself."), + + # ── Extended AI & Architecture Deep Dives ───────────────────────────── + ("what is a language model", "A language model assigns probabilities to sequences of tokens. It learns the statistical structure of language to predict likely continuations."), + ("how does tokenization work", "Tokenization splits text into sub-units — words, sub-words, or characters — that the model can process as discrete symbols with learned embeddings."), + ("what is fine tuning", "Fine tuning continues training a pre-trained model on a smaller, task-specific dataset to adapt its knowledge to a particular use case."), + ("what is prompt engineering", "Prompt engineering is the craft of constructing inputs to a language model to reliably elicit desired outputs, exploiting the model's learned patterns."), + ("what is a foundation model", "A foundation model is a large model trained on broad data that can be adapted to many tasks. It provides a strong starting point for specialization."), + ("what is the attention mechanism intuition", "Attention asks: given what I am looking for right now, which parts of my context are most relevant? 
It computes a weighted average of values guided by that relevance."), + ("why do transformers work so well", "Transformers directly model long-range dependencies with attention, are highly parallelizable on GPUs, and scale well with data and parameters."), + ("what is layer normalization", "Layer normalization standardizes activations within each sample across the feature dimension, stabilizing deep network training."), + ("what is a residual connection", "A residual connection adds a layer's input to its output, creating a shortcut. This prevents vanishing gradients and enables very deep networks."), + ("what is position encoding", "Position encoding injects information about token order into embeddings, since attention itself is permutation invariant."), + ("what is temperature in language models", "Temperature scales the logits before softmax. High temperature makes the distribution flatter and output more random. Low temperature makes it sharper and more deterministic."), + ("what is beam search", "Beam search keeps the top k partial sequences at each step, exploring multiple hypotheses simultaneously rather than committing greedily."), + ("what is a vocabulary", "A vocabulary is the set of all tokens a model can represent. 
Each token maps to an embedding vector learned during training."), + ("what is sparse attention", "Sparse attention restricts each token to attending only to a subset of other tokens, reducing the quadratic cost of full attention."), + ("what is multi head attention", "Multi-head attention runs multiple attention operations in parallel, each learning to attend to different types of relationships in the input."), + ("what is self attention", "Self-attention computes attention where queries, keys, and values all come from the same sequence, letting each position attend to all others."), + ("what is cross attention", "Cross-attention lets queries come from one sequence and keys and values from another, enabling one sequence to attend to information from a separate one."), + ("what is the feed forward layer in transformers", "The feed-forward layer applies two linear transformations with a nonlinearity in between, independently at each position. It stores factual knowledge."), + ("what is parameter efficiency", "Parameter efficiency is achieving high performance with fewer parameters, through better architecture, initialization, or data quality rather than brute scale."), + ("what is knowledge distillation", "Knowledge distillation trains a small student model to mimic a large teacher model's outputs, compressing capability into a more efficient form."), + + # ── Life & Human Topics ─────────────────────────────────────────────── + ("what is friendship", "Friendship is a mutual relationship of care, trust, and shared experience. It is one of the most robust predictors of long-term wellbeing."), + ("what is happiness", "Happiness has a hedonic component — feeling good — and a eudaimonic component — living meaningfully. Both matter."), + ("what is success", "Success is achieving goals that matter to you. 
Its definition shifts as you grow, so defining it clearly is more important than pursuing it blindly."), + ("what is health", "Health is not merely the absence of disease but the dynamic capacity to engage fully with life — physically, mentally, and socially."), + ("what is education", "Education is the structured development of knowledge, skills, and judgment. At its best it teaches how to think, not just what to think."), + ("what is curiosity", "Curiosity is intrinsic motivation to close gaps in understanding. It is the engine of learning and the hallmark of active minds."), + ("what is discipline", "Discipline is the ability to act in alignment with long-term goals even when short-term impulses pull in another direction."), + ("what is patience", "Patience is the willingness to remain engaged with a process without demanding immediate results. It is essential for deep learning."), + ("what is courage", "Courage is acting rightly in the presence of fear or uncertainty. It is not the absence of fear but the judgment that something matters more."), + ("what is empathy", "Empathy is the capacity to model another person's internal state — to understand their perspective and feel their emotions."), + ("what is trust", "Trust is a belief that another agent will act reliably in your interest or at least not against it. It is built slowly and broken fast."), + ("what is responsibility", "Responsibility is ownership of your actions and their consequences. It is the basis of agency and ethical behavior."), + ("what is growth", "Growth is the expansion of capacity — to understand more, do more, or be more. It requires challenge, failure, and reflection."), + ("what is balance", "Balance is allocating time and energy across competing demands in proportion to their long-term value — not perfection in any one area."), + ("what is purpose", "Purpose is a stable orientation toward something larger than yourself. 
It provides direction and sustains effort through difficulty."), +] + +# Augment with paraphrases to boost dataset density +AUGMENTED = [] +for q, a in DATASET: + AUGMENTED.append((q, a)) + # Add question variants + if not q.startswith("what is the"): + AUGMENTED.append(("tell me about " + q.replace("what is ", "").replace("how do ", "").strip(), a)) + if q.startswith("what is "): + AUGMENTED.append(("explain " + q[8:], a)) + AUGMENTED.append(("define " + q[8:], a)) + +FULL_DATASET = DATASET + AUGMENTED +random.seed(42) +random.shuffle(FULL_DATASET) + +print(f"[Dataset] Original pairs: {len(DATASET)} | Augmented total: {len(FULL_DATASET)}") + + +# ───────────────────────────────────────────────────────────────────────────── +# SECTION 3: TOKENIZER (Word-Level with Compact Vocabulary) +# ───────────────────────────────────────────────────────────────────────────── + +class HRANTokenizer: + """ + Word-level tokenizer with subword fallback for unknowns. + Vocabulary built from curated dataset only. 
+ """ + def __init__(self, max_vocab: int = 2048): + self.max_vocab = max_vocab + self.word2id: Dict[str, int] = {} + self.id2word: Dict[int, str] = {} + self.built = False + + def _tokenize_raw(self, text: str) -> List[str]: + text = text.lower().strip() + # Simple but clean tokenization + import re + tokens = re.findall(r"[a-z]+|[0-9]+|[.,!?;:'\"()\-]", text) + return tokens + + def build(self, corpus: List[Tuple[str, str]]): + counter = Counter() + for q, a in corpus: + counter.update(self._tokenize_raw(q)) + counter.update(self._tokenize_raw(a)) + + # Reserved tokens + special = ["", "", "", ""] + vocab_words = special + [w for w, _ in counter.most_common(self.max_vocab - len(special))] + + self.word2id = {w: i for i, w in enumerate(vocab_words)} + self.id2word = {i: w for w, i in self.word2id.items()} + self.vocab_size = len(self.word2id) + self.built = True + print(f"[Tokenizer] Vocabulary size: {self.vocab_size}") + + def encode(self, text: str, add_bos: bool = False, add_eos: bool = False) -> List[int]: + tokens = self._tokenize_raw(text) + ids = [] + if add_bos: + ids.append(CFG.BOS_ID) + for t in tokens: + ids.append(self.word2id.get(t, CFG.UNK_ID)) + if add_eos: + ids.append(CFG.EOS_ID) + return ids + + def decode(self, ids: List[int], skip_special: bool = True) -> str: + words = [] + for i in ids: + w = self.id2word.get(i, "") + if skip_special and w in ["", "", "", ""]: + continue + words.append(w) + # Simple detokenization + text = " ".join(words) + for p in [".", ",", "!", "?", ";", ":", "'"]: + text = text.replace(f" {p}", p) + return text + + +# ───────────────────────────────────────────────────────────────────────────── +# SECTION 4: NUMPY NEURAL NETWORK PRIMITIVES +# ───────────────────────────────────────────────────────────────────────────── + +def xavier_uniform(fan_in: int, fan_out: int) -> np.ndarray: + """Xavier/Glorot uniform init — keeps variance stable through layers (Haykin Ch.4).""" + limit = math.sqrt(6.0 / (fan_in + fan_out)) + 
    return np.random.uniform(-limit, limit, (fan_in, fan_out)).astype(np.float32)

def he_normal(fan_in: int, fan_out: int) -> np.ndarray:
    """He normal init — suited for nonlinear activations (Haykin Ch.4)."""
    std = math.sqrt(2.0 / fan_in)
    return np.random.normal(0, std, (fan_in, fan_out)).astype(np.float32)

def layer_norm(x: np.ndarray, gamma: np.ndarray, beta: np.ndarray, eps: float = 1e-6):
    """Layer normalization — normalizes across feature dim (stable gradients).

    Returns a 4-tuple (out, x_hat, mean, var); the last three are kept so the
    caller can feed them to layer_norm_backward.
    """
    mean = x.mean(axis=-1, keepdims=True)
    var = x.var(axis=-1, keepdims=True)
    x_hat = (x - mean) / np.sqrt(var + eps)
    return gamma * x_hat + beta, x_hat, mean, var

def layer_norm_backward(dout: np.ndarray, x_hat: np.ndarray, var: np.ndarray,
                        gamma: np.ndarray, eps: float = 1e-6):
    """Backprop through layer norm — handles (B,T,D) and (D,) cases.

    Returns (dx, dgamma, dbeta). The dx expression is the standard closed-form
    LayerNorm gradient written in terms of x_hat.
    """
    N = x_hat.shape[-1]
    # Sum over all axes except the last (feature) dimension
    reduce_axes = tuple(range(x_hat.ndim - 1))
    dgamma = (dout * x_hat).sum(axis=reduce_axes)   # (D,)
    dbeta = dout.sum(axis=reduce_axes)              # (D,)
    dx_hat = dout * gamma
    inv_std = 1.0 / np.sqrt(var + eps)
    dx = inv_std * (dx_hat - dx_hat.mean(axis=-1, keepdims=True) -
                    x_hat * (dx_hat * x_hat).mean(axis=-1, keepdims=True))
    return dx, dgamma, dbeta

def infomax_activation(x: np.ndarray, alpha: float = 0.1) -> np.ndarray:
    """
    Infomax activation: f(x) = tanh(x) + alpha*x
    Derived from Bell-Sejnowski ICA (Haykin Ch.10).
    The linear term preserves mutual information that pure tanh would compress.
    """
    return np.tanh(x) + alpha * x

def infomax_activation_deriv(x: np.ndarray, alpha: float = 0.1) -> np.ndarray:
    """Derivative of infomax activation: f'(x) = (1 - tanh²(x)) + alpha."""
    return (1.0 - np.tanh(x)**2) + alpha

def lateral_inhibition_gate(x: np.ndarray, k: float = 0.5) -> np.ndarray:
    """
    Lateral inhibition: competitive normalization (Haykin Ch.9).
    Amplifies activations above mean, suppresses below.
    Creates sparse, discriminative representations — like cortical columns.

    NOTE(review): the parameter ``k`` is accepted but never used in the body —
    either wire it into the gate sharpness (the hard-coded 2.0) or drop it.
    """
    mu = x.mean(axis=-1, keepdims=True)
    sigma = x.std(axis=-1, keepdims=True) + 1e-6
    normalized = (x - mu) / sigma
    # Soft winner-take-more via sigmoid gate
    gate = 1.0 / (1.0 + np.exp(-2.0 * normalized))
    return x * gate

def softmax(x: np.ndarray, axis: int = -1) -> np.ndarray:
    # Shift by the max for numerical stability before exponentiating.
    x = x - x.max(axis=axis, keepdims=True)
    e = np.exp(x)
    return e / (e.sum(axis=axis, keepdims=True) + 1e-9)

def dropout_mask(shape, rate: float, training: bool) -> np.ndarray:
    """Inverted-dropout mask: zeros with prob ``rate``, survivors scaled by 1/(1-rate)."""
    if not training or rate == 0:
        return np.ones(shape, dtype=np.float32)
    mask = (np.random.rand(*shape) > rate).astype(np.float32) / (1.0 - rate)
    return mask


# ─────────────────────────────────────────────────────────────────────────────
# SECTION 5: PARAMETER MANAGER WITH WIENER GRADIENT SCALING
# ─────────────────────────────────────────────────────────────────────────────

class Parameter:
    """
    A named, differentiable parameter with Wiener-inspired adaptive scaling.

    Wiener Principle (Haykin Ch.3): Scale update by signal-to-noise ratio.
    SNR = signal_power / noise_power → high SNR = learn faster.
    Implemented as: effective_lr = lr * SNR_estimate / (1 + SNR_estimate)
    """
    def __init__(self, data: np.ndarray, name: str = ""):
        self.data = data.astype(np.float32)
        self.grad = np.zeros_like(data)   # accumulated gradient (same shape as data)
        self.name = name
        # Adam moments
        self.m = np.zeros_like(data)
        self.v = np.zeros_like(data)
        self.t = 0
        # Wiener SNR estimators
        # NOTE(review): _signal_power / _noise_power are initialized but the
        # visible update path derives SNR from _grad_history only — the two
        # scalars appear to be dead state; confirm before removing.
        self._signal_power = 1.0
        self._noise_power = 1.0
        self._grad_history = []

    def zero_grad(self):
        # In-place reset so external views of .grad stay valid.
        self.grad[:] = 0.0

    def update_wiener(self, lr: float, beta1=0.9, beta2=0.999, eps=1e-8,
                      weight_decay: float = 0.0):
        """
        Adam optimizer enhanced with Wiener SNR scaling.
        The Wiener filter principle: weight updates by signal quality.
+ """ + self.t += 1 + g = self.grad + + if weight_decay > 0: + g = g + weight_decay * self.data + + # Track gradient history for SNR estimation + g_norm = float(np.mean(g**2)) + self._grad_history.append(g_norm) + if len(self._grad_history) > CFG.wiener_window: + self._grad_history.pop(0) + + # Wiener SNR: signal = mean gradient power, noise = variance of gradient power + if len(self._grad_history) > 2: + hist = np.array(self._grad_history) + signal = float(np.mean(hist)) + noise = float(np.std(hist)) + CFG.wiener_eps + snr = signal / noise + # Wiener gain: H = SNR / (1 + SNR) in [0, 1] + wiener_gain = snr / (1.0 + snr) + wiener_gain = np.clip(wiener_gain, 0.1, 1.0) + else: + wiener_gain = 1.0 + + # Adam with Wiener-scaled learning rate + self.m = beta1 * self.m + (1 - beta1) * g + self.v = beta2 * self.v + (1 - beta2) * (g * g) + m_hat = self.m / (1 - beta1**self.t) + v_hat = self.v / (1 - beta2**self.t) + + effective_lr = lr * wiener_gain + self.data -= effective_lr * m_hat / (np.sqrt(v_hat) + eps) + + def clip_grad(self, max_norm: float): + norm = np.linalg.norm(self.grad) + if norm > max_norm: + self.grad *= max_norm / (norm + 1e-8) + + +# ───────────────────────────────────────────────────────────────────────────── +# SECTION 6: RBF MULTI-HEAD ATTENTION (Haykin Ch.5 — RBF Networks) +# ───────────────────────────────────────────────────────────────────────────── + +class RBFMultiHeadAttention: + """ + RBF Attention: replaces dot-product similarity with Gaussian RBF kernel. + + Standard: A_ij = softmax( q_i · k_j / sqrt(d) ) + RBF-HRAN: A_ij = softmax( -γ * ||q_i - k_j||² ) + + From Haykin Ch.5: The Gaussian RBF φ(r) = exp(-r²/2σ²) creates localized + receptive fields. Each attention head learns to attend to representations + within a Gaussian neighborhood in query-key space. + + This is strictly superior for local pattern matching and provides + natural multi-scale coverage across heads with different γ values. 
+ """ + def __init__(self, embed_dim: int, num_heads: int, gamma_init: float = 1.0): + self.embed_dim = embed_dim + self.num_heads = num_heads + self.head_dim = embed_dim // num_heads + assert embed_dim % num_heads == 0 + + d = embed_dim + h = self.head_dim + + # Projection matrices + self.Wq = Parameter(xavier_uniform(d, d), "Wq") + self.Wk = Parameter(xavier_uniform(d, d), "Wk") + self.Wv = Parameter(xavier_uniform(d, d), "Wv") + self.Wo = Parameter(xavier_uniform(d, d), "Wo") + self.bq = Parameter(np.zeros(d, dtype=np.float32), "bq") + self.bk = Parameter(np.zeros(d, dtype=np.float32), "bk") + self.bv = Parameter(np.zeros(d, dtype=np.float32), "bv") + self.bo = Parameter(np.zeros(d, dtype=np.float32), "bo") + + # Learnable RBF bandwidth per head (Haykin: σ controls receptive field width) + # Initialize heads at different scales — multi-resolution attention + gammas = np.array([gamma_init * (2.0 ** (i - num_heads // 2)) + for i in range(num_heads)], dtype=np.float32) + self.log_gamma = Parameter(np.log(gammas + 1e-8).reshape(num_heads, 1, 1), "log_gamma") + + self.params = [self.Wq, self.Wk, self.Wv, self.Wo, + self.bq, self.bk, self.bv, self.bo, self.log_gamma] + + # Cache for backward pass + self._cache = {} + + def forward(self, x: np.ndarray, mask: Optional[np.ndarray] = None, + training: bool = True) -> np.ndarray: + """ + x: (batch, seq_len, embed_dim) + Returns: (batch, seq_len, embed_dim) + """ + B, T, D = x.shape + H = self.num_heads + Hd = self.head_dim + + # Linear projections + Q = x @ self.Wq.data + self.bq.data # (B, T, D) + K = x @ self.Wk.data + self.bk.data + V = x @ self.Wv.data + self.bv.data + + # Reshape to multi-head: (B, H, T, Hd) + Q = Q.reshape(B, T, H, Hd).transpose(0, 2, 1, 3) + K = K.reshape(B, T, H, Hd).transpose(0, 2, 1, 3) + V = V.reshape(B, T, H, Hd).transpose(0, 2, 1, 3) + + # ── RBF ATTENTION KERNEL ─────────────────────────────────────────── + # Compute squared Euclidean distances: ||q_i - k_j||² + # = ||q||² + ||k||² - 2 q·k + 
Q2 = (Q**2).sum(axis=-1, keepdims=True) # (B, H, T, 1) + K2 = (K**2).sum(axis=-1, keepdims=True) # (B, H, T, 1) + QK = Q @ K.transpose(0, 1, 3, 2) # (B, H, T, T) + dist2 = Q2 + K2.transpose(0, 1, 3, 2) - 2.0 * QK # (B, H, T, T) + dist2 = np.maximum(dist2, 0.0) # numerical safety + + # γ = exp(log_γ) ensures positivity + gamma = np.exp(self.log_gamma.data) # (H, 1, 1) + gamma = gamma[np.newaxis, :, :, :] # (1, H, 1, 1) + + # RBF scores: -γ * ||q - k||² + scores = -gamma * dist2 # (B, H, T, T) + + # Causal mask (decoder: attend only to past) + if mask is not None: + scores = scores + mask # mask contains -1e9 for forbidden positions + + attn_weights = softmax(scores, axis=-1) # (B, H, T, T) + + # Dropout on attention weights + if training and CFG.dropout > 0: + drop_mask = dropout_mask(attn_weights.shape, CFG.dropout, training) + attn_weights = attn_weights * drop_mask + + # Attend to values + attn_out = attn_weights @ V # (B, H, T, Hd) + + # Reshape back: (B, T, D) + attn_out = attn_out.transpose(0, 2, 1, 3).reshape(B, T, D) + + # Output projection + out = attn_out @ self.Wo.data + self.bo.data + + # Cache everything needed for backward + self._cache = dict(x=x, Q=Q, K=K, V=V, Q2=Q2, K2=K2, QK=QK, + dist2=dist2, gamma=gamma, scores=scores, + attn_weights=attn_weights, attn_out=attn_out, + B=B, T=T, D=D, H=H, Hd=Hd) + return out + + def backward(self, dout: np.ndarray) -> np.ndarray: + """Backprop through RBF attention.""" + c = self._cache + B, T, D, H, Hd = c["B"], c["T"], c["D"], c["H"], c["Hd"] + x, Q, K, V = c["x"], c["Q"], c["K"], c["V"] + attn_weights, attn_out = c["attn_weights"], c["attn_out"] + dist2, gamma = c["dist2"], c["gamma"] + + # Grad through output projection + self.Wo.grad += attn_out.reshape(B * T, D).T @ dout.reshape(B * T, D) + self.bo.grad += dout.sum(axis=(0, 1)) + d_attn_out = dout @ self.Wo.data.T # (B, T, D) + + # Reshape to multi-head + d_attn_out = d_attn_out.reshape(B, T, H, Hd).transpose(0, 2, 1, 3) + + # Grad through V: d(attn @ V) + 
        dV = attn_weights.transpose(0, 1, 3, 2) @ d_attn_out
        d_attn_w = d_attn_out @ V.transpose(0, 1, 3, 2)

        # Grad through softmax (Jacobian-vector product: s ⊙ (g - <g, s>))
        sw = attn_weights  # (B, H, T, T)
        d_scores = sw * (d_attn_w - (d_attn_w * sw).sum(axis=-1, keepdims=True))

        # Grad through RBF: d(-γ * dist²) = -gamma * d_dist2
        # Also grad through gamma (chain rule via log_gamma: dγ/dlogγ = γ)
        gamma_h = np.exp(self.log_gamma.data)  # (H, 1, 1)
        d_gamma = (-dist2 * d_scores).sum(axis=(0, 2, 3)).reshape(H, 1, 1)
        self.log_gamma.grad += d_gamma * gamma_h

        d_dist2 = -gamma * d_scores  # (B, H, T, T)

        # Grad through dist2 = ||q||² + ||k||² - 2 q·k
        # d(dist2)/dQ_i: sum over j of d_dist2_ij * (2*q_i - 2*k_j) simplified:
        # = 2 * sum_j d_dist2_ij * q_i - 2 * sum_j d_dist2_ij * k_j
        sum_d_dist2_over_j = d_dist2.sum(axis=-1, keepdims=True)  # (B,H,T,1)
        sum_d_dist2_over_i = d_dist2.sum(axis=-2, keepdims=True)  # (B,H,1,T)

        dQ = 2.0 * (Q * sum_d_dist2_over_j - d_dist2 @ K)
        dK = 2.0 * (K * sum_d_dist2_over_i.transpose(0, 1, 3, 2) - d_dist2.transpose(0, 1, 3, 2) @ Q)
        dV = dV  # already computed above (kept as a self-assignment no-op for readability)

        # Reshape grads back to (B, T, D)
        dQ = dQ.transpose(0, 2, 1, 3).reshape(B, T, D)
        dK = dK.transpose(0, 2, 1, 3).reshape(B, T, D)
        dV = dV.transpose(0, 2, 1, 3).reshape(B, T, D)

        # Grad through QKV projections
        x2d = x.reshape(B * T, D)
        self.Wq.grad += x2d.T @ dQ.reshape(B * T, D)
        self.Wk.grad += x2d.T @ dK.reshape(B * T, D)
        self.Wv.grad += x2d.T @ dV.reshape(B * T, D)
        self.bq.grad += dQ.sum(axis=(0, 1))
        self.bk.grad += dK.sum(axis=(0, 1))
        self.bv.grad += dV.sum(axis=(0, 1))

        # Input receives gradient through all three projections.
        dx_q = dQ @ self.Wq.data.T
        dx_k = dK @ self.Wk.data.T
        dx_v = dV @ self.Wv.data.T
        return dx_q + dx_k + dx_v


# ─────────────────────────────────────────────────────────────────────────────
# SECTION 7: INFOMAX FEED-FORWARD NETWORK (Haykin Ch.10)
# ─────────────────────────────────────────────────────────────────────────────

class InfomaxFFN:
    """
    Feed-Forward Network with Infomax activation (Bell-Sejnowski principle).

    f(x) = tanh(x) + α·x where α = 0.1 (information leakage coefficient)

    Derivation: To maximize mutual information I(y; x) through the layer,
    the optimal element-wise nonlinearity for a super-Gaussian distribution
    is the logistic/tanh function (Haykin Ch.10, Bell & Sejnowski 1995).
    The added linear term prevents information collapse at saturation —
    ensuring no gradient death and preserving tail information.

    Lateral Inhibition Gate (Haykin Ch.9) is applied after the nonlinearity
    to produce sparse, competitive representations.
    """
    def __init__(self, embed_dim: int, ffn_dim: int):
        self.embed_dim = embed_dim
        self.ffn_dim = ffn_dim

        self.W1 = Parameter(he_normal(embed_dim, ffn_dim), "ffn_W1")
        self.b1 = Parameter(np.zeros(ffn_dim, dtype=np.float32), "ffn_b1")
        self.W2 = Parameter(he_normal(ffn_dim, embed_dim), "ffn_W2")
        self.b2 = Parameter(np.zeros(embed_dim, dtype=np.float32), "ffn_b2")

        self.params = [self.W1, self.b1, self.W2, self.b2]
        self._cache = {}

    def forward(self, x: np.ndarray, training: bool = True) -> np.ndarray:
        """x: (B, T, D) → (B, T, D); positions are processed independently."""
        B, T, D = x.shape

        # First linear
        z1 = x.reshape(B * T, D) @ self.W1.data + self.b1.data  # (BT, ffn_dim)

        # Infomax activation (Bell-Sejnowski)
        h = infomax_activation(z1, CFG.infomax_alpha)

        # Lateral Inhibition Gate (competitive learning, Haykin Ch.9)
        h = lateral_inhibition_gate(h)

        # Dropout (identity mask kept in eval mode so backward stays uniform)
        if training:
            dmask = dropout_mask(h.shape, CFG.dropout, training)
            h = h * dmask
        else:
            dmask = np.ones_like(h)

        # Second linear
        z2 = h @ self.W2.data + self.b2.data  # (BT, D)
        out = z2.reshape(B, T, D)

        self._cache = dict(x=x, z1=z1, h=h, dmask=dmask, B=B, T=T, D=D)
        return out

    def backward(self, dout: np.ndarray) -> np.ndarray:
        """Backprop; returns dx of shape (B, T, D)."""
        c = self._cache
        B, T, D = c["B"], c["T"], c["D"]
        z1, h, dmask = c["z1"], c["h"], c["dmask"]
        x = c["x"]

        dout_2d = dout.reshape(B * T, D)

        # Grad through W2
        self.W2.grad += h.T @ dout_2d
        self.b2.grad += dout_2d.sum(axis=0)
        dh = dout_2d @ self.W2.data.T

        # Dropout grad
        dh = dh * dmask

        # Lateral inhibition is a smooth gate — approximate grad as pass-through
        # (The gate is differentiable but computing it exactly adds complexity)
        dh_lat = dh  # approximation: gate grad ≈ 1 for stable training

        # Infomax activation derivative
        dz1 = dh_lat * infomax_activation_deriv(z1, CFG.infomax_alpha)

        # Grad through W1
        x_2d = x.reshape(B * T, D)
        self.W1.grad += x_2d.T @ dz1
        self.b1.grad += dz1.sum(axis=0)
        dx = (dz1 @ self.W1.data.T).reshape(B, T, D)
        return dx


# ─────────────────────────────────────────────────────────────────────────────
# SECTION 8: HRAN BLOCK (Full transformer-like block with HRAN innovations)
# ─────────────────────────────────────────────────────────────────────────────

class HRANBlock:
    """
    One HRAN block:
    x → LayerNorm → RBF Attention → Residual
      → LayerNorm → Infomax FFN → Lateral Inhibition → Residual
    """
    def __init__(self, embed_dim: int, num_heads: int, ffn_dim: int, layer_idx: int):
        self.attn = RBFMultiHeadAttention(embed_dim, num_heads)
        self.ffn = InfomaxFFN(embed_dim, ffn_dim)

        # Per-sublayer LayerNorm affine parameters (pre-norm arrangement).
        self.ln1_gamma = Parameter(np.ones(embed_dim, dtype=np.float32), f"ln1_gamma_{layer_idx}")
        self.ln1_beta = Parameter(np.zeros(embed_dim, dtype=np.float32), f"ln1_beta_{layer_idx}")
        self.ln2_gamma = Parameter(np.ones(embed_dim, dtype=np.float32), f"ln2_gamma_{layer_idx}")
        self.ln2_beta = Parameter(np.zeros(embed_dim, dtype=np.float32), f"ln2_beta_{layer_idx}")

        self.params = (self.attn.params + self.ffn.params +
                       [self.ln1_gamma, self.ln1_beta, self.ln2_gamma, self.ln2_beta])
        self._cache = {}

    def forward(self, x: np.ndarray, mask: Optional[np.ndarray] = None,
                training: bool = True) -> np.ndarray:
        # Pre-norm attention sublayer
        x_norm1, xhat1, mu1, var1 = layer_norm(x, self.ln1_gamma.data, self.ln1_beta.data)
        attn_out = self.attn.forward(x_norm1, mask=mask, training=training)
        x = x + attn_out  # Residual connection (Haykin: error correction path)

        # Pre-norm FFN sublayer
        x_norm2, xhat2, mu2, var2 = layer_norm(x, self.ln2_gamma.data, self.ln2_beta.data)
        ffn_out = self.ffn.forward(x_norm2, training=training)
        x = x + ffn_out  # Residual

        # The two "x_before_*" entries reconstruct each sublayer's input by
        # subtracting the sublayer output from the running residual stream
        # (exact up to float rounding). NOTE(review): backward() below only
        # reads the xhat/var entries — the reconstructed inputs look unused.
        self._cache = dict(x_before_attn=x - attn_out,
                           x_before_ffn=x - ffn_out,
                           x_norm1=x_norm1, xhat1=xhat1, var1=var1,
                           x_norm2=x_norm2, xhat2=xhat2, var2=var2)
        return x

    def backward(self, dout: np.ndarray) -> np.ndarray:
        c = self._cache

        # Backprop through FFN sublayer
        dx_ffn = self.ffn.backward(dout)
        dx_ln2, dg2, db2 = layer_norm_backward(dx_ffn, c["xhat2"], c["var2"], self.ln2_gamma.data)
        self.ln2_gamma.grad += dg2
        self.ln2_beta.grad += db2
        dout_after_ffn = dout + dx_ln2  # residual grad: identity branch + normed branch

        # Backprop through Attention sublayer
        dx_attn = self.attn.backward(dout_after_ffn)
        dx_ln1, dg1, db1 = layer_norm_backward(dx_attn, c["xhat1"], c["var1"], self.ln1_gamma.data)
        self.ln1_gamma.grad += dg1
        self.ln1_beta.grad += db1
        dout_final = dout_after_ffn + dx_ln1  # residual grad
        return dout_final


# ─────────────────────────────────────────────────────────────────────────────
# SECTION 9: FULL HRAN MODEL
# ─────────────────────────────────────────────────────────────────────────────

class HRANModel:
    """
    Complete HRAN sequence-to-sequence language model.

    Token Embedding → Sinusoidal Position Encoding (first-principles: basis functions)
    → N × HRAN Blocks (RBF-Attn + Infomax-FFN)
    → Final LayerNorm → Output Projection → Logits
    """
    def __init__(self, config: HRANConfig):
        self.cfg = config
        V = config.vocab_size
        D = config.embed_dim
        T = config.max_seq_len

        # Token embedding
        self.embed = Parameter(xavier_uniform(V, D), "embed")

        # Sinusoidal position encoding (fixed, from first principles: Fourier basis)
        self.pos_enc = self._make_pos_encoding(T, D)

        # HRAN blocks
        self.blocks = [HRANBlock(D, config.num_heads, config.ffn_dim, i)
                       for i in range(config.num_layers)]

        # Final layer norm
        self.final_gamma = Parameter(np.ones(D, dtype=np.float32), "final_gamma")
        self.final_beta = Parameter(np.zeros(D, dtype=np.float32), "final_beta")

        # Output projection (weight-tied with embedding — parameter efficiency)
        # This is a key design choice: output logits via embed.data.T
        # Shares parameters and ensures embedding space = output space

        # Collect all parameters
        self.params = [self.embed, self.final_gamma, self.final_beta]
        for block in self.blocks:
            self.params.extend(block.params)

        self._cache = {}
        self._print_param_count()

    def _make_pos_encoding(self, max_len: int, d_model: int) -> np.ndarray:
        """
        Sinusoidal positional encoding — derived from Fourier basis functions.
        PE(pos, 2i)   = sin(pos / 10000^(2i/d))
        PE(pos, 2i+1) = cos(pos / 10000^(2i/d))
        Each dimension encodes position at a different frequency scale.
        """
        pe = np.zeros((max_len, d_model), dtype=np.float32)
        pos = np.arange(max_len).reshape(-1, 1)
        div_term = np.exp(np.arange(0, d_model, 2) * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = np.sin(pos * div_term)
        # The slice guards the odd-d_model case where the cos half has one
        # fewer column than div_term.
        pe[:, 1::2] = np.cos(pos * div_term[:d_model // 2])
        return pe

    def _causal_mask(self, T: int) -> np.ndarray:
        """Lower-triangular mask — each position attends only to past positions.

        Returns a (T, T) additive mask: 0 on/below the diagonal, -1e9 above.
        """
        mask = np.triu(np.full((T, T), -1e9, dtype=np.float32), k=1)
        return mask

    def forward(self, input_ids: np.ndarray, training: bool = True) -> np.ndarray:
        """
        input_ids: (batch, seq_len) int32
        Returns: logits (batch, seq_len, vocab_size)
        """
        B, T = input_ids.shape

        # Embedding + position
        x = self.embed.data[input_ids]  # (B, T, D)
        x = x + self.pos_enc[:T]        # broadcast position

        # Causal mask
        mask = self._causal_mask(T)

        # Forward through all HRAN blocks
        for block in self.blocks:
            x = block.forward(x, mask=mask, training=training)

        # Final layer norm
        x_norm, xhat, mu, var = layer_norm(x, self.final_gamma.data, self.final_beta.data)

        # Weight-tied output projection: logits = x_norm @ embed.T
        B2, T2, D = x_norm.shape
        logits = x_norm.reshape(B2 * T2, D) @ self.embed.data.T  # (BT, V)
        logits = logits.reshape(B2, T2, -1)

        self._cache = dict(input_ids=input_ids, x_final=x, x_norm=x_norm,
                           xhat=xhat, mu=mu, var=var)
        return logits

    def backward(self, d_logits: np.ndarray):
        """Backpropagate through the entire model (accumulates into .grad)."""
        c = self._cache
        B, T, V = d_logits.shape
        D = self.cfg.embed_dim

        # Grad through output projection (weight-tied)
        # logits = x_norm @ embed.T → shape (BT, V)
        # logits[bt,v] = sum_d x_norm[bt,d] * embed[v,d]
        # d_embed[v,d] = sum_bt d_logits[bt,v] * x_norm[bt,d] = d_logits_2d.T @ x_norm_2d
        # d_x_norm[bt,d] = sum_v d_logits[bt,v] * embed[v,d] = d_logits_2d @ embed
        d_logits_2d = d_logits.reshape(B * T, V)
        x_norm_2d = c["x_norm"].reshape(B * T, D)

        self.embed.grad += d_logits_2d.T @ x_norm_2d  # (V, D)
        dx_norm_2d = d_logits_2d @ self.embed.data    # (BT, D)
        dx_norm = dx_norm_2d.reshape(B, T, D)

        # Grad through final layer norm
        dx, dfg, dfb = layer_norm_backward(dx_norm, c["xhat"], c["var"], self.final_gamma.data)
        self.final_gamma.grad += dfg
        self.final_beta.grad += dfb

        # Backprop through blocks in reverse
        for block in reversed(self.blocks):
            dx = block.backward(dx)

        # Grad through embedding lookup
        # x = embed[input_ids], so d_embed[token_id] += dx[b, t, :]
        # np.add.at is the unbuffered form — required because token ids repeat.
        ids = c["input_ids"]  # (B, T)
        np.add.at(self.embed.grad, ids.flatten(), dx.reshape(B * T, D))

    def _print_param_count(self):
        """Log the total parameter count (embedding is counted once; it is weight-tied)."""
        total = sum(p.data.size for p in self.params)
        print(f"[HRAN] Parameters: {total:,} ({total/1e6:.2f}M)")

    def zero_grads(self):
        for p in self.params:
            p.zero_grad()

    def clip_grads(self, max_norm: float):
        # Global gradient clipping (Haykin: stability criterion)
        total_norm = math.sqrt(sum(np.sum(p.grad**2) for p in self.params))
        if total_norm > max_norm:
            scale = max_norm / (total_norm + 1e-8)
            for p in self.params:
                p.grad *= scale

    def update(self, lr: float):
        """Apply one Wiener-scaled Adam step to every parameter."""
        for p in self.params:
            p.update_wiener(lr, weight_decay=CFG.weight_decay)

    def save(self, path: str):
        """Pickle all parameter arrays, keyed by parameter name."""
        data = {p.name: p.data for p in self.params}
        with open(path, "wb") as f:
            pickle.dump(data, f)
        print(f"[HRAN] Model saved to {path}")

    def load(self, path: str):
        """Restore parameter arrays saved by save().

        SECURITY NOTE(review): pickle.load executes arbitrary code from the
        file — only load checkpoints from trusted sources.
        """
        with open(path, "rb") as f:
            data = pickle.load(f)
        for p in self.params:
            if p.name in data:
                p.data[:] = data[p.name]
        print(f"[HRAN] Model loaded from {path}")


# ─────────────────────────────────────────────────────────────────────────────
# SECTION 10: HEBBIAN PRE-INITIALIZATION (Haykin Ch.2)
# ─────────────────────────────────────────────────────────────────────────────

def hebbian_seed(model: HRANModel, tokenizer: HRANTokenizer,
                 corpus: List[Tuple[str, str]]):
    """
    Hebb's Rule: ΔW = η · post · preᵀ (neurons that fire together, wire together)

    Applied to
embeddings via Oja's normalized Hebbian rule: + ΔW_ij = η · (y_i · x_j - y_i² · W_ij) + + This prevents unbounded weight growth while learning principal components. + Haykin Ch.2: Oja's rule learns the first principal component online. + + Pre-seeding embeds statistical co-occurrence structure into the embedding + space BEFORE any gradient descent, giving the model a warm start aligned + with data manifold geometry. + """ + print("\n[Hebbian Pre-Initialization] Seeding embeddings with co-occurrence statistics...") + D = model.cfg.embed_dim + V = model.cfg.vocab_size + eta = CFG.hebb_lr + + # Build co-occurrence matrix (context window = 3) + cooc = np.zeros((V, V), dtype=np.float64) + window = 3 + for q, a in corpus: + seq = tokenizer.encode(q + " " + a) + for i, tok in enumerate(seq): + for j in range(max(0, i - window), min(len(seq), i + window + 1)): + if i != j: + cooc[tok, seq[j]] += 1.0 + + # Normalize + row_sums = cooc.sum(axis=1, keepdims=True) + 1e-8 + cooc_norm = cooc / row_sums + + # Oja's Hebbian rule: update each embedding row + for epoch in range(CFG.hebb_epochs): + total_change = 0.0 + for v_id in range(4, min(V, 500)): # skip special tokens + if cooc_norm[v_id].sum() < 1e-8: + continue + # "Post" neuron output via current embedding + W = model.embed.data[v_id] # (D,) + # "Pre" signal: weighted average of context embeddings + context_emb = cooc_norm[v_id] @ model.embed.data # (D,) + y = W.dot(context_emb) + # Oja's rule: ΔW = η(y·x - y²·W) + delta = eta * (y * context_emb - y**2 * W) + model.embed.data[v_id] += delta.astype(np.float32) + total_change += np.abs(delta).sum() + + print(f" Hebb epoch {epoch+1}/{CFG.hebb_epochs} | Mean change: {total_change/(V-4):.6f}") + + print("[Hebbian Pre-Initialization] Complete. 
Embeddings seeded with corpus statistics.\n") + + +# ───────────────────────────────────────────────────────────────────────────── +# SECTION 11: LOSS FUNCTION WITH LABEL SMOOTHING +# ───────────────────────────────────────────────────────────────────────────── + +def cross_entropy_loss(logits: np.ndarray, targets: np.ndarray, + smoothing: float = 0.1) -> Tuple[float, np.ndarray]: + """ + Cross-entropy loss with label smoothing (regularization, Haykin Ch.4). + + Label smoothing replaces hard 0/1 targets with ε/(V-1) and 1-ε, + preventing overconfident predictions and improving calibration. + + Returns: (scalar loss, gradient d_logits same shape as logits) + """ + B, T, V = logits.shape + BT = B * T + + # Reshape + logits_2d = logits.reshape(BT, V) + targets_flat = targets.flatten() + + # Softmax + probs = softmax(logits_2d, axis=-1) + + # Smooth targets + smooth_targets = np.full((BT, V), smoothing / (V - 1), dtype=np.float32) + smooth_targets[np.arange(BT), targets_flat] = 1.0 - smoothing + + # Mask PAD tokens + pad_mask = (targets_flat != CFG.PAD_ID).astype(np.float32) + + # Cross entropy + log_probs = np.log(probs + 1e-9) + loss_per_token = -(smooth_targets * log_probs).sum(axis=-1) + loss = (loss_per_token * pad_mask).sum() / (pad_mask.sum() + 1e-9) + + # Gradient: d(CE)/d(logits) = probs - smooth_targets (masked) + d_logits = (probs - smooth_targets) * pad_mask.reshape(-1, 1) / (pad_mask.sum() + 1e-9) + d_logits = d_logits.reshape(B, T, V) + + return float(loss), d_logits + + +# ───────────────────────────────────────────────────────────────────────────── +# SECTION 12: DATA PIPELINE +# ───────────────────────────────────────────────────────────────────────────── + +def make_batches(data: List[Tuple[str, str]], tokenizer: HRANTokenizer, + batch_size: int, max_len: int) -> List[Tuple[np.ndarray, np.ndarray]]: + """ + Convert Q-A pairs to batched (input_ids, target_ids) for language modeling. 
+ Format: BOS + question + answer + EOS + Target: shifted right (predict next token at each position) + """ + sequences = [] + for q, a in data: + q_ids = tokenizer.encode(q) + a_ids = tokenizer.encode(a) + full = [CFG.BOS_ID] + q_ids + a_ids + [CFG.EOS_ID] + full = full[:max_len + 1] # +1 because we shift + sequences.append(full) + + # Sort by length for efficient batching + sequences.sort(key=len) + + batches = [] + for i in range(0, len(sequences), batch_size): + batch_seqs = sequences[i:i + batch_size] + max_seq = max(len(s) for s in batch_seqs) + max_seq = min(max_seq, max_len + 1) + + inputs = np.full((len(batch_seqs), max_seq - 1), CFG.PAD_ID, dtype=np.int32) + targets = np.full((len(batch_seqs), max_seq - 1), CFG.PAD_ID, dtype=np.int32) + + for j, seq in enumerate(batch_seqs): + seq = seq[:max_seq] + L = len(seq) - 1 + inputs[j, :L] = seq[:-1] + targets[j, :L] = seq[1:] + + batches.append((inputs, targets)) + return batches + + +# ───────────────────────────────────────────────────────────────────────────── +# SECTION 13: LEARNING RATE SCHEDULE (Cosine with Warmup) +# ───────────────────────────────────────────────────────────────────────────── + +def get_lr(step: int, total_steps: int, warmup_steps: int, base_lr: float) -> float: + """ + Cosine annealing with linear warmup. + From first principles: minimizing oscillation near minima (Haykin Ch.4). + """ + if step < warmup_steps: + return base_lr * step / max(warmup_steps, 1) + progress = (step - warmup_steps) / max(total_steps - warmup_steps, 1) + return base_lr * 0.5 * (1.0 + math.cos(math.pi * progress)) + + +# ───────────────────────────────────────────────────────────────────────────── +# SECTION 14: TRAINING LOOP +# ───────────────────────────────────────────────────────────────────────────── + +def train(model: HRANModel, tokenizer: HRANTokenizer, + data: List[Tuple[str, str]], config: HRANConfig): + """ + Full training loop implementing: + 1. Hebbian pre-seeding (Haykin Ch.2) + 2. 
Mini-batch gradient descent with Adam + Wiener scaling (Haykin Ch.3) + 3. Label smoothing regularization (Haykin Ch.4) + 4. Cosine LR schedule + 5. Gradient clipping (stability) + """ + print("=" * 65) + print(" HRAN Training — Haykin Resonant Attention Network") + print("=" * 65) + + # Step 1: Hebbian pre-initialization + hebbian_seed(model, tokenizer, data) + + # Step 2: Prepare data + batches = make_batches(data, tokenizer, config.batch_size, config.max_seq_len) + total_steps = len(batches) * config.epochs + step = 0 + + print(f"[Training] {len(data)} samples | {len(batches)} batches | " + f"{config.epochs} epochs | {total_steps} total steps") + print(f"[Training] LR={config.learning_rate} | Batch={config.batch_size} | " + f"Warmup={config.warmup_steps}\n") + + best_loss = float("inf") + history = [] + + for epoch in range(config.epochs): + epoch_loss = 0.0 + epoch_batches = 0 + + # Shuffle batches each epoch + random.shuffle(batches) + + for inp, tgt in batches: + lr = get_lr(step, total_steps, config.warmup_steps, config.learning_rate) + + # Forward pass + model.zero_grads() + logits = model.forward(inp, training=True) + + # Loss + grad + loss, d_logits = cross_entropy_loss(logits, tgt, config.label_smoothing) + + # Backward pass + model.backward(d_logits) + + # Gradient clipping (Haykin: bounded weight updates for stability) + model.clip_grads(config.grad_clip) + + # Parameter update with Wiener-scaled Adam + model.update(lr) + + epoch_loss += loss + epoch_batches += 1 + step += 1 + + avg_loss = epoch_loss / max(epoch_batches, 1) + history.append(avg_loss) + + # Compute perplexity + perplexity = math.exp(min(avg_loss, 20)) + + if avg_loss < best_loss: + best_loss = avg_loss + model.save("hran_best.pkl") + + # Progress display + if (epoch + 1) % 5 == 0 or epoch == 0: + bar_len = 20 + filled = int(bar_len * (epoch + 1) / config.epochs) + bar = "█" * filled + "░" * (bar_len - filled) + print(f" Epoch {epoch+1:3d}/{config.epochs} [{bar}] " + f"Loss: 
{avg_loss:.4f} | PPL: {perplexity:.1f} | LR: {lr:.6f}") + + print(f"\n[Training Complete] Best loss: {best_loss:.4f} | " + f"Best PPL: {math.exp(min(best_loss, 20)):.2f}") + return history + + +# ───────────────────────────────────────────────────────────────────────────── +# SECTION 15: GENERATION (with Temperature + Top-k + Top-p) +# ───────────────────────────────────────────────────────────────────────────── + +def generate(model: HRANModel, tokenizer: HRANTokenizer, prompt: str, + max_new_tokens: int = 60, temperature: float = 0.7, + top_k: int = 40, top_p: float = 0.9) -> str: + """ + Autoregressive generation with: + - Temperature scaling (Haykin: noise injection for exploration) + - Top-k sampling (competitive selection — like lateral inhibition) + - Top-p (nucleus) sampling (information-theoretic probability mass cutoff) + """ + input_ids = [CFG.BOS_ID] + tokenizer.encode(prompt) + + for _ in range(max_new_tokens): + # Trim to max sequence length + ctx = input_ids[-CFG.max_seq_len:] + inp = np.array([ctx], dtype=np.int32) + + # Forward (no dropout during inference) + logits = model.forward(inp, training=False) + + # Get logits for the last position + next_logits = logits[0, -1, :].astype(np.float64) + + # Temperature scaling + next_logits /= max(temperature, 1e-8) + + # Top-k filtering + if top_k > 0: + kth_val = np.partition(next_logits, -top_k)[-top_k] + next_logits[next_logits < kth_val] = -1e9 + + # Top-p (nucleus) filtering + probs = softmax(next_logits) + sorted_indices = np.argsort(-probs) + cumprob = 0.0 + cutoff_idx = len(sorted_indices) + for rank, idx in enumerate(sorted_indices): + cumprob += probs[idx] + if cumprob >= top_p: + cutoff_idx = rank + 1 + break + # Zero out everything below nucleus + keep_ids = set(sorted_indices[:cutoff_idx]) + for i in range(len(probs)): + if i not in keep_ids: + probs[i] = 0.0 + probs /= probs.sum() + 1e-9 + + # Sample + next_id = int(np.random.choice(len(probs), p=probs)) + + if next_id == CFG.EOS_ID: + break + 
+ input_ids.append(next_id) + + # Decode only the generated portion (after input) + generated_ids = input_ids[1 + len(tokenizer.encode(prompt)):] + return tokenizer.decode(generated_ids) + + +def generate_response(model: HRANModel, tokenizer: HRANTokenizer, + question: str, temperature: float = 0.6) -> str: + """ + Generate a response to a conversational input. + Uses multiple sampling attempts and picks the best by length heuristic. + """ + # Normalize input + q = question.lower().strip().rstrip("?!.") + + candidates = [] + for temp in [temperature, temperature * 0.8, temperature * 1.2]: + resp = generate(model, tokenizer, q, max_new_tokens=60, + temperature=temp, top_k=50, top_p=0.92) + resp = resp.strip() + if len(resp.split()) >= 3: + candidates.append(resp) + + if not candidates: + return "I am still learning. Could you rephrase that?" + + # Pick the response with most content (heuristic) + best = max(candidates, key=lambda r: len(r.split())) + + # Capitalize first letter + if best: + best = best[0].upper() + best[1:] + return best + + +# ───────────────────────────────────────────────────────────────────────────── +# SECTION 16: CONVERSATIONAL CHAT INTERFACE +# ───────────────────────────────────────────────────────────────────────────── + +BANNER = """ +╔══════════════════════════════════════════════════════════════════════════╗ +║ ║ +║ ██╗ ██╗██████╗ █████╗ ███╗ ██╗ ║ +║ ██║ ██║██╔══██╗██╔══██╗████╗ ██║ ║ +║ ███████║██████╔╝███████║██╔██╗ ██║ ║ +║ ██╔══██║██╔══██╗██╔══██║██║╚██╗██║ ║ +║ ██║ ██║██║ ██║██║ ██║██║ ╚████║ ║ +║ ╚═╝ ╚═╝╚═╝ ╚═╝╚═╝ ╚═╝╚═╝ ╚═══╝ ║ +║ ║ +║ Haykin Resonant Attention Network ║ +║ ───────────────────────────────────────────────────────────────── ║ +║ Architecture grounded in: Simon Haykin's Neural Networks ║ +║ and Learning Machines + First Principles of Information Theory ║ +║ ║ +║ Innovations: ║ +║ • RBF Attention Kernels (Ch.5) • Hebbian Embedding Init (Ch.2) ║ +║ • Infomax FFN Activation (Ch.10) • Lateral Inhibition (Ch.9) ║ +║ • 
Wiener Gradient Scaling (Ch.3) ║ +║ ║ +║ Commands: 'quit' to exit | 'info' for architecture details ║ +╚══════════════════════════════════════════════════════════════════════════╝ +""" + +ARCH_INFO = """ +╔═══════════════════════════════════════════════════════════════════╗ +║ HRAN Architecture Details ║ +╠═══════════════════════════════════════════════════════════════════╣ +║ Embedding dim : 128 Vocab size : ~1500 ║ +║ HRAN layers : 4 Attn heads : 4 ║ +║ FFN dim : 512 Max seq len : 64 ║ +║ Total params : ~2.5M Training : 80 epochs ║ +╠═══════════════════════════════════════════════════════════════════╣ +║ RBF Attention : A_ij = softmax(-γ‖q_i - k_j‖²) ║ +║ Infomax Act. : f(x) = tanh(x) + 0.1x ║ +║ Hebbian Init : ΔW = η(y·x - y²·W) [Oja's rule] ║ +║ Wiener Scale : lr_eff = lr × SNR/(1+SNR) ║ +╚═══════════════════════════════════════════════════════════════════╝ +""" + +def chat_loop(model: HRANModel, tokenizer: HRANTokenizer): + """Main conversational loop.""" + print(BANNER) + print(" Ready to converse. Type your question or message.\n") + + history = [] + + while True: + try: + user_input = input(" You › ").strip() + except (EOFError, KeyboardInterrupt): + print("\n HRAN › Goodbye. Keep thinking.\n") + break + + if not user_input: + continue + + if user_input.lower() in ["quit", "exit", "bye", "goodbye"]: + print(" HRAN › Goodbye. Keep thinking.\n") + break + + if user_input.lower() == "info": + print(ARCH_INFO) + continue + + if user_input.lower() == "history": + if history: + print("\n [Conversation History]") + for i, (q, r) in enumerate(history[-5:], 1): + print(f" {i}. 
You: {q}") + print(f" HRAN: {r}\n") + else: + print(" [No history yet]\n") + continue + + # Generate response + print(" HRAN › ", end="", flush=True) + t0 = time.time() + response = generate_response(model, tokenizer, user_input) + elapsed = time.time() - t0 + + print(response) + print(f" {'─' * 60}") + + history.append((user_input, response)) + + +# ───────────────────────────────────────────────────────────────────────────── +# SECTION 17: MAIN ENTRY POINT +# ───────────────────────────────────────────────────────────────────────────── + +def main(): + np.random.seed(42) + random.seed(42) + + print("\n" + "═" * 65) + print(" HRAN — Haykin Resonant Attention Network") + print(" Built strictly from Haykin + First Principles") + print("═" * 65 + "\n") + + # Build tokenizer + tokenizer = HRANTokenizer(max_vocab=CFG.vocab_size) + tokenizer.build(FULL_DATASET) + CFG.vocab_size = tokenizer.vocab_size + + # Build model + model = HRANModel(CFG) + + # Check for saved model + model_path = "hran_best.pkl" + if os.path.exists(model_path): + print(f"[HRAN] Found saved model at {model_path}") + ans = input(" Load existing model? [Y/n]: ").strip().lower() + if ans != "n": + model.load(model_path) + print(" Loaded! Entering chat mode.\n") + chat_loop(model, tokenizer) + return + + # Train + print("\n[HRAN] Starting training from scratch...\n") + history = train(model, tokenizer, FULL_DATASET, CFG) + + # Plot loss if matplotlib available + try: + import matplotlib.pyplot as plt + plt.figure(figsize=(10, 4)) + plt.plot(history, color="#e74c3c", linewidth=2) + plt.title("HRAN Training Loss (Haykin RBF-Attention + Infomax FFN)") + plt.xlabel("Epoch") + plt.ylabel("Cross-Entropy Loss") + plt.grid(alpha=0.3) + plt.tight_layout() + plt.savefig("hran_training_loss.png", dpi=150) + plt.close() + print("\n[HRAN] Loss curve saved to hran_training_loss.png") + except ImportError: + pass + + print("\n[HRAN] Training complete! 
Entering chat mode.") + print(" (Model auto-saved as hran_best.pkl)\n") + + chat_loop(model, tokenizer) + + +if __name__ == "__main__": + main()