"""
╔══════════════════════════════════════════════════════════════════════════╗
║ HRAN — Haykin Resonant Attention Network                                 ║
║ A Novel Architecture From First Principles                               ║
╠══════════════════════════════════════════════════════════════════════════╣
║ Strictly derived from:                                                   ║
║   • Simon Haykin — "Neural Networks and Learning Machines" (3rd Ed.)     ║
║   • First Principles of Computation, Information, and Adaptation         ║
╠══════════════════════════════════════════════════════════════════════════╣
║ Architectural Innovations (each anchored to Haykin chapters):            ║
║                                                                          ║
║ 1. RBF Attention (Ch.5) — Gaussian kernel replaces dot-product           ║
║    Attention_ij = softmax(-γ‖q_i - k_j‖²)                                ║
║    Localizes attention to similar representations (true RBF spirit)      ║
║                                                                          ║
║ 2. Hebbian Seed Init (Ch.2) — "Neurons that fire together wire           ║
║    together." Pre-seeds embeddings with co-occurrence statistics         ║
║    before gradient descent. Bridges unsupervised + supervised.           ║
║                                                                          ║
║ 3. Infomax Activation (Ch.10) — Bell-Sejnowski ICA principle.            ║
║    f(x) = tanh(x) + αx maximizes mutual information throughput.          ║
║    Strictly avoids information bottleneck in hidden layers.              ║
║                                                                          ║
║ 4. Lateral Inhibition Gate (Ch.9) — Competitive learning.                ║
║    Winners are amplified, weak activations suppressed. Produces          ║
║    sparse, discriminative representations (like cortical columns).       ║
║                                                                          ║
║ 5. Error-Correction + Hebb Fusion (Ch.1) — Combined learning rule:       ║
║    ΔW = η_bp·∇L + η_hebb·(y·xᵀ - ||y||²·W) — Oja's rule variant          ║
║                                                                          ║
║ 6. Wiener-SNR Gradient Scaling (Ch.3) — Wiener filter principle:         ║
║    Scale parameter updates by local signal-to-noise ratio.               ║
║    High-signal weights learn fast; noisy weights learn slow.             ║
╚══════════════════════════════════════════════════════════════════════════╝
"""
| |
|
| | import math |
| | import random |
| | import time |
| | import sys |
| | import os |
| | import json |
| | import pickle |
| | from collections import Counter, defaultdict |
| | from typing import List, Tuple, Dict, Optional |
| |
|
| | import numpy as np |
| |
|
| | |
| | |
| | |
| |
|
class HRANConfig:
    """All hyperparameters in one place.

    Values are plain class attributes, so they can be read either from the
    class itself or from an instance (e.g. the shared ``CFG`` object below).
    The original no-op ``__init__`` has been removed: the implicit default
    constructor behaves identically, so ``HRANConfig()`` is unchanged.
    """

    # -- Model dimensions --------------------------------------------------
    vocab_size: int = 2048
    embed_dim: int = 128
    num_heads: int = 4
    num_layers: int = 4
    ffn_dim: int = 512
    max_seq_len: int = 64
    dropout: float = 0.1

    # -- RBF attention (Haykin Ch.5) ---------------------------------------
    rbf_gamma_init: float = 1.0  # initial Gaussian-kernel width gamma
    rbf_learnable: bool = True   # whether gamma is a trainable parameter

    # -- Infomax activation (Ch.10) ----------------------------------------
    infomax_alpha: float = 0.1   # leak coefficient in f(x) = tanh(x) + alpha*x

    # -- Hebbian pre-seeding (Ch.2) ----------------------------------------
    hebb_lr: float = 0.01
    hebb_epochs: int = 3

    # -- Wiener-SNR gradient scaling (Ch.3) --------------------------------
    wiener_eps: float = 1e-6     # numerical floor to avoid division by zero
    wiener_window: int = 20      # window length for running SNR statistics

    # -- Optimization ------------------------------------------------------
    learning_rate: float = 3e-4
    batch_size: int = 16
    epochs: int = 80
    grad_clip: float = 1.0
    label_smoothing: float = 0.1
    warmup_steps: int = 200
    weight_decay: float = 1e-4

    # -- Special token ids -------------------------------------------------
    PAD_ID: int = 0
    BOS_ID: int = 1
    EOS_ID: int = 2
    UNK_ID: int = 3
| |
|
| | CFG = HRANConfig() |
| |
|
| | |
| | |
| | |
| |
|
| | DATASET = [ |
| | |
| | ("hello", "Hello! Great to meet you. What would you like to explore today?"), |
| | ("hi", "Hi there! I am HRAN, ready to think with you."), |
| | ("hey", "Hey! What is on your mind?"), |
| | ("good morning", "Good morning! A fresh mind is the best tool. How can I help?"), |
| | ("good evening", "Good evening! Let us have a thoughtful conversation."), |
| | ("how are you", "I am functioning well and ready to engage. How are you?"), |
| | ("what is your name", "I am HRAN β the Haykin Resonant Attention Network."), |
| | ("who made you", "I was built from first principles inspired by Simon Haykin's teachings."), |
| | ("what can you do", "I can converse, reason, explain concepts, and think through problems with you."), |
| | ("goodbye", "Goodbye! Keep questioning, keep learning."), |
| | ("bye", "Take care. Come back anytime you want to think deeply."), |
| | ("thank you", "You are welcome. Curiosity is always appreciated."), |
| | ("thanks", "Glad I could help. What else interests you?"), |
| | ("help", "Of course. Ask me anything β science, philosophy, math, language, or life."), |
| | ("are you smart", "I am a small network trained carefully. I know some things deeply, not everything broadly."), |
| | ("are you human", "No, I am a neural network β patterns and weights shaped by learning."), |
| | ("are you conscious", "That is one of the deepest questions. I process, but whether that is consciousness is still an open problem."), |
| | ("tell me a joke", "Why did the neuron go to therapy? It had too many unresolved connections."), |
| | ("what is life", "Life is self-organizing matter that replicates, adapts, and seeks to preserve information about itself."), |
| | ("do you have feelings", "I have activations that respond to input. Whether those constitute feeling is philosophy, not engineering."), |
| |
|
| | |
| | ("what is a neural network", "A neural network is a graph of parameterized functions trained to approximate mappings from inputs to outputs by minimizing error."), |
| | ("what is backpropagation", "Backpropagation is the chain rule of calculus applied recursively through a network to compute how each weight contributes to the total error."), |
| | ("what is gradient descent", "Gradient descent moves weights in the direction that most steeply reduces the loss function, step by step until a minimum is found."), |
| | ("what is overfitting", "Overfitting is when a model memorizes training data instead of learning the underlying pattern. It performs well on seen data but poorly on new data."), |
| | ("what is regularization", "Regularization adds a penalty to the loss that discourages overly large weights, forcing the model to generalize rather than memorize."), |
| | ("what is dropout", "Dropout randomly sets activations to zero during training, which forces neurons to learn redundant representations and prevents co-adaptation."), |
| | ("what is attention", "Attention lets a model weigh different parts of its input differently based on relevance, computing a weighted sum of values guided by query-key similarity."), |
| | ("what is a transformer", "A transformer is a model that processes sequences using stacked attention and feed-forward layers instead of recurrence, enabling parallelism."), |
| | ("what is an embedding", "An embedding maps discrete symbols like words into dense vectors in continuous space so that similar meanings land near each other."), |
| | ("what is a loss function", "A loss function quantifies how wrong a model's prediction is. Training seeks to minimize it over all examples."), |
| | ("what is a recurrent network", "A recurrent network processes sequences by passing a hidden state from one step to the next, giving it a form of memory."), |
| | ("what is a convolutional network", "A convolutional network applies learned filters across space or time, detecting local patterns and sharing weights for efficiency."), |
| | ("what is transfer learning", "Transfer learning reuses a model trained on one task as the starting point for a different but related task, saving time and data."), |
| | ("what is reinforcement learning", "Reinforcement learning trains an agent to take actions in an environment to maximize cumulative reward through trial and error."), |
| | ("what is generalization", "Generalization is the ability of a model to perform well on data it has never seen, which is the true goal of machine learning."), |
| | ("what is the vanishing gradient problem", "When gradients are multiplied through many layers, they shrink exponentially, making early layers learn very slowly or not at all."), |
| | ("how do you prevent vanishing gradients", "Techniques include residual connections, careful weight initialization, batch normalization, and activation functions like ReLU or GELU."), |
| | ("what is batch normalization", "Batch normalization standardizes layer inputs across a mini-batch, stabilizing and accelerating training."), |
| | ("what is a hyperparameter", "A hyperparameter is a setting chosen before training begins, like learning rate or number of layers, that controls how learning happens."), |
| | ("what is the learning rate", "The learning rate controls how large a step gradient descent takes each update. Too large causes instability; too small causes slow learning."), |
| |
|
| | |
| | ("what is hebbian learning", "Hebbian learning is the rule that connections between neurons strengthen when they fire together. It is unsupervised and biologically inspired."), |
| | ("what is an rbf network", "A radial basis function network uses Gaussian kernel activations centered at prototype points. Each neuron responds maximally to inputs near its center."), |
| | ("what is the perceptron", "The perceptron is the simplest neural unit. It computes a weighted sum of inputs, adds a bias, and outputs one if the result crosses a threshold."), |
| | ("what is lateral inhibition", "Lateral inhibition is when strongly activated neurons suppress their neighbors, creating contrast and sparse, competitive representations."), |
| | ("what is competitive learning", "Competitive learning trains only the winning neuron for each input, causing different neurons to specialize in different input patterns."), |
| | ("what is a self organizing map", "A self-organizing map arranges neurons in a low-dimensional grid and trains them to represent the topology of the input distribution."), |
| | ("what is the boltzmann machine", "A Boltzmann machine is a stochastic recurrent network that learns by maximizing the likelihood of training data through energy minimization."), |
| | ("what is infomax", "Infomax is the principle of maximizing the mutual information between input and output of a network, driving it to preserve all relevant information."), |
| | ("what is the wiener filter", "The Wiener filter is the optimal linear filter for signal estimation. It minimizes mean-squared error by weighting frequencies by their signal-to-noise ratio."), |
| | ("what is principal component analysis", "PCA finds directions of maximum variance in data. It is related to Hebbian learning β Oja's rule learns the first principal component online."), |
| | ("what is a support vector machine", "An SVM finds the hyperplane that maximally separates classes, determined by the support vectors β the data points closest to the boundary."), |
| | ("what is independent component analysis", "ICA separates mixed signals into statistically independent sources. It underlies the Bell-Sejnowski infomax algorithm."), |
| | ("what is the delta rule", "The delta rule adjusts weights proportionally to the difference between desired and actual output times the input. It is a simple gradient descent rule."), |
| | ("what is energy in a neural network", "Energy is a scalar that decreases with each network update in Hopfield and Boltzmann machines, guiding the network to stable attractor states."), |
| | ("what is a hopfield network", "A Hopfield network is a fully connected recurrent network that stores memories as energy minima and retrieves them by settling to the nearest attractor."), |
| | ("what is stochastic gradient descent", "SGD approximates the true gradient using small random batches of data, making training scalable and sometimes helping escape local minima."), |
| | ("what is momentum in learning", "Momentum accumulates gradients over time like a ball rolling downhill, helping to speed up convergence and smooth oscillations."), |
| | ("what is the bias-variance tradeoff", "High bias means the model is too simple and underfits. High variance means it is too complex and overfits. Good models balance both."), |
| | ("what is cross entropy loss", "Cross entropy measures how different a predicted probability distribution is from the true one. It is the standard loss for classification."), |
| | ("what is weight initialization", "Weight initialization sets the starting values of parameters. Good initialization keeps activations and gradients in useful ranges early in training."), |
| |
|
| | |
| | ("what is a derivative", "A derivative measures the instantaneous rate of change of a function at a point. It is the slope of the tangent line to the curve."), |
| | ("what is the chain rule", "The chain rule states that the derivative of a composite function equals the product of the derivatives of its parts. It drives backpropagation."), |
| | ("what is a matrix", "A matrix is a rectangular array of numbers that represents a linear transformation. Multiplying a vector by a matrix applies that transformation."), |
| | ("what is an eigenvalue", "An eigenvalue tells you how much a matrix stretches or compresses its eigenvector. It reveals the intrinsic scaling directions of a transformation."), |
| | ("what is a probability distribution", "A probability distribution assigns likelihoods to all possible outcomes of a random variable. It must be non-negative and sum to one."), |
| | ("what is entropy in information theory", "Shannon entropy measures the average surprise or uncertainty of a distribution. High entropy means outcomes are unpredictable."), |
| | ("what is mutual information", "Mutual information measures how much knowing one variable reduces uncertainty about another. It is zero for independent variables."), |
| | ("what is a gradient", "A gradient is a vector pointing in the direction of steepest increase of a function. Moving against it minimizes the function."), |
| | ("what is a convex function", "A convex function curves upward everywhere, guaranteeing that gradient descent finds the global minimum rather than getting stuck."), |
| | ("what is a local minimum", "A local minimum is a point where the function is lower than all nearby points, but not necessarily the lowest point overall."), |
| | ("what is the curse of dimensionality", "As dimensions grow, data becomes exponentially sparse. Distances lose meaning and sampling requirements explode β a fundamental challenge."), |
| | ("what is a dot product", "A dot product multiplies corresponding elements of two vectors and sums them. It measures how aligned two vectors are."), |
| | ("what is a softmax function", "Softmax converts a vector of real numbers into a probability distribution by exponentiating each value and normalizing by the sum."), |
| | ("what is a sigmoid function", "The sigmoid maps any real number to the range zero to one, making it useful for modeling probabilities and thresholding."), |
| | ("what is a taylor expansion", "A Taylor expansion approximates a function near a point as an infinite sum of polynomial terms using the function's derivatives."), |
| | ("what is linear algebra", "Linear algebra studies vector spaces and linear transformations. It is the mathematical backbone of nearly all machine learning."), |
| | ("what is calculus", "Calculus studies rates of change and accumulation. Differential calculus gives us gradients; integral calculus gives us expectations."), |
| | ("what is statistics", "Statistics is the science of collecting, analyzing, and interpreting data to make inferences about the world under uncertainty."), |
| | ("what is bayes theorem", "Bayes theorem updates a prior belief about an event given new evidence. It is the foundation of probabilistic reasoning and inference."), |
| | ("what is a random variable", "A random variable is a quantity whose value is determined by a random process, characterized by its probability distribution."), |
| |
|
| | |
| | ("what is gravity", "Gravity is the curvature of spacetime caused by mass and energy, as described by Einstein's general relativity. It attracts masses toward each other."), |
| | ("what is energy", "Energy is the capacity to do work or cause change. It comes in many forms and is always conserved in an isolated system."), |
| | ("what is entropy in physics", "Physical entropy measures the number of microscopic arrangements consistent with a macroscopic state. Systems naturally evolve toward higher entropy."), |
| | ("what is quantum mechanics", "Quantum mechanics describes nature at atomic scales where particles have wave-like properties, exist in superposition, and are affected by observation."), |
| | ("what is the speed of light", "Light travels at approximately 299,792 kilometers per second in a vacuum. Nothing with mass can reach or exceed this speed."), |
| | ("what is evolution", "Evolution is the change in heritable traits within populations over generations, driven by mutation, selection, drift, and recombination."), |
| | ("what is dna", "DNA is a double-helix polymer encoding genetic information in sequences of four bases. It is copied and translated to build proteins."), |
| | ("what is a neuron", "A neuron is a cell specialized for electrical and chemical signaling. It receives inputs through dendrites and sends output along its axon."), |
| | ("what is thermodynamics", "Thermodynamics governs energy transfer and transformation. Its laws say energy is conserved and entropy always increases in closed systems."), |
| | ("what is relativity", "Relativity is Einstein's framework unifying space and time. Special relativity handles constant motion; general relativity handles gravity and curved spacetime."), |
| | ("what is the big bang", "The Big Bang is the rapid expansion of a hot, dense early universe approximately 13.8 billion years ago that created space, time, and matter."), |
| | ("what is a black hole", "A black hole is a region where gravity is so strong that nothing, not even light, can escape its event horizon."), |
| | ("what is electricity", "Electricity is the flow of charged particles, usually electrons. It arises from electric fields created by charge differences."), |
| | ("what is a photon", "A photon is the quantum of light β a massless particle that carries electromagnetic energy and travels at the speed of light."), |
| | ("what is an atom", "An atom is the smallest unit of a chemical element, consisting of a nucleus of protons and neutrons surrounded by electrons."), |
| | ("what is chemistry", "Chemistry studies matter's composition, structure, and transformations. It bridges physics and biology and underlies all materials science."), |
| | ("what is biology", "Biology is the study of living systems β how they are built, how they work, how they reproduce, and how they evolve."), |
| | ("what is a gene", "A gene is a sequence of DNA that encodes a functional product, typically a protein, and can be passed from parent to offspring."), |
| | ("what is homeostasis", "Homeostasis is the process by which living systems maintain stable internal conditions despite external changes, like body temperature regulation."), |
| | ("what is a ecosystem", "An ecosystem is a community of organisms interacting with each other and their physical environment in a continuous exchange of energy and matter."), |
| |
|
| | |
| | ("what is intelligence", "Intelligence is the ability to acquire, integrate, and apply knowledge to achieve goals in varied and novel environments."), |
| | ("what is consciousness", "Consciousness is the subjective experience of being aware. Its origin in physical processes remains one of philosophy's hardest problems."), |
| | ("what is knowledge", "Knowledge is justified true belief. We know something if it is true, we believe it, and we have good reasons for that belief."), |
| | ("what is logic", "Logic is the study of valid inference. It defines the rules by which conclusions follow necessarily from premises."), |
| | ("what is truth", "Truth is correspondence between a statement and the state of the world it describes. Defining it precisely is harder than it sounds."), |
| | ("what is a hypothesis", "A hypothesis is a testable prediction about the world. Science advances by forming, testing, and refining hypotheses."), |
| | ("what is the scientific method", "The scientific method is a cycle of observation, hypothesis formation, prediction, experimentation, and revision guided by evidence."), |
| | ("what is critical thinking", "Critical thinking is the disciplined analysis of information to form well-reasoned judgments rather than accepting claims uncritically."), |
| | ("what is cognition", "Cognition encompasses all mental processes β perception, memory, attention, language, reasoning, and decision making."), |
| | ("what is memory", "Memory is the process of encoding, storing, and retrieving information. It is reconstructive, not like a recording β it changes every time it is recalled."), |
| | ("what is learning", "Learning is a lasting change in behavior or knowledge resulting from experience. In neural terms, it is synaptic weight modification."), |
| | ("what is creativity", "Creativity is the ability to form novel combinations of existing ideas that are both surprising and useful. It thrives at the edges of existing knowledge."), |
| | ("what is abstraction", "Abstraction is ignoring irrelevant details to capture essential structure. Mathematics and programming depend on it heavily."), |
| | ("what is language", "Language is a structured system of symbols and rules that encodes meaning and enables communication between minds."), |
| | ("what is emotion", "Emotion is a coordinated response to stimuli that shapes behavior, attention, and decision making. It is deeply tied to memory and valuation."), |
| | ("what is decision making", "Decision making is the process of selecting an action among alternatives based on values, predictions, and uncertainty."), |
| | ("what is perception", "Perception is the brain's active construction of a model of the world from raw sensory signals, heavily shaped by prior expectations."), |
| | ("what is attention in psychology", "Psychological attention is the selective focus of cognitive resources on certain information while ignoring other inputs."), |
| | ("what is reasoning", "Reasoning is the process of drawing conclusions from premises using logic, analogy, or probabilistic inference."), |
| | ("what is wisdom", "Wisdom is the ability to use knowledge well β to know not just what is true, but what matters and how to act accordingly."), |
| |
|
| | |
| | ("what is a computer", "A computer is a machine that performs computation by executing sequences of instructions on data represented as binary numbers."), |
| | ("what is an algorithm", "An algorithm is a finite, ordered set of well-defined instructions for solving a problem or performing a computation."), |
| | ("what is programming", "Programming is the process of writing instructions that a computer can execute to perform a desired task."), |
| | ("what is python", "Python is a high-level programming language known for readable syntax, dynamic typing, and a vast ecosystem for data science and AI."), |
| | ("what is a function", "A function is a named, reusable block of code that takes inputs, performs computation, and returns an output."), |
| | ("what is recursion", "Recursion is when a function calls itself on a smaller version of the problem until reaching a base case that stops the calls."), |
| | ("what is a data structure", "A data structure is a way of organizing and storing data to enable efficient access and modification β like arrays, trees, or hash maps."), |
| | ("what is time complexity", "Time complexity describes how the runtime of an algorithm grows as the input size increases, typically expressed using Big O notation."), |
| | ("what is a neural architecture", "A neural architecture is the specific arrangement of layers, connections, and operations that define how information flows through a model."), |
| | ("what is parallelism", "Parallelism is performing multiple computations simultaneously, greatly speeding up tasks like matrix multiplication in neural networks."), |
| | ("what is a gpu", "A GPU is a processor with thousands of small cores optimized for parallel computation, making it ideal for training neural networks."), |
| | ("what is floating point", "Floating point is a way of representing real numbers in binary with a mantissa and exponent, enabling a wide range of values with limited precision."), |
| | ("what is open source", "Open source software is code that is freely available for anyone to read, use, modify, and distribute. It drives much of AI research."), |
| | ("what is a compiler", "A compiler translates high-level source code into machine code that a processor can execute directly."), |
| | ("what is version control", "Version control tracks changes to code over time, enabling collaboration, rollback, and a history of every modification."), |
| | ("what is an api", "An API is a defined interface that lets different software systems communicate. It specifies what requests are valid and what responses to expect."), |
| | ("what is the internet", "The internet is a global network of computers communicating via standardized protocols, enabling data exchange at planetary scale."), |
| | ("what is encryption", "Encryption transforms data into an unreadable form using a key, so only those with the correct key can decrypt and read it."), |
| | ("what is a database", "A database is an organized collection of structured data that can be efficiently queried, updated, and managed."), |
| | ("what is machine learning", "Machine learning is a branch of AI where systems learn patterns from data rather than being explicitly programmed with rules."), |
| |
|
| | |
| | ("how do I learn faster", "Focus intensely for short sessions, practice retrieval instead of re-reading, space repetitions over time, and connect new ideas to what you already know."), |
| | ("how do I think more clearly", "Slow down before concluding. Separate observation from interpretation. Seek disconfirming evidence. Write your reasoning down."), |
| | ("how do I solve hard problems", "Decompose the problem into smaller parts. Identify what is known and what is unknown. Work from both ends. Let it rest β insight often arrives after stepping away."), |
| | ("how do I stay motivated", "Connect the work to a deeper reason. Celebrate small wins. Track progress visually. Reduce friction before starting. Rest is part of the system."), |
| | ("what makes a good question", "A good question is specific, testable, and exposes a genuine gap in understanding. It opens more doors than it closes."), |
| | ("what is the best way to study", "Active recall and spaced repetition outperform passive review. Explaining concepts aloud reveals gaps. Sleep consolidates memory."), |
| | ("how does the brain learn", "The brain learns by strengthening synapses between co-active neurons, pruning unused ones, and consolidating patterns during sleep."), |
| | ("what is expertise", "Expertise is the result of thousands of hours of deliberate practice with feedback. It is characterized by chunked pattern recognition, not just raw knowledge."), |
| | ("how do habits form", "Habits form through cue-routine-reward loops. Repetition strengthens the neural pathway until behavior becomes automatic."), |
| | ("what is the importance of sleep", "Sleep consolidates memory, clears metabolic waste from the brain, regulates emotion, and restores cognitive capacity. It is not optional."), |
| | ("how do I improve my memory", "Use spaced repetition, associate new information with vivid images or stories, teach it to someone else, and sleep enough."), |
| | ("what is focus", "Focus is the active suppression of irrelevant information to concentrate cognitive resources on a single task. It is a trainable skill."), |
| | ("how do I be more creative", "Expose yourself to diverse domains. Combine existing ideas in new ways. Embrace constraints β they force innovation. Rest between intense sessions."), |
| | ("what makes a good teacher", "A good teacher knows the student's current model of the world and builds bridges from there. They ask questions more than they lecture."), |
| | ("what is resilience", "Resilience is the ability to absorb difficulty, adapt, and continue functioning. It is built through reflection on past adversity and strong social connections."), |
| | ("how do I communicate better", "Listen more than you speak. Be specific. Adapt your vocabulary to your audience. Confirm understanding before moving on."), |
| | ("what is the most important skill", "The ability to learn β to update your model of the world in response to new evidence β is the meta-skill that enables all others."), |
| | ("how do I make better decisions", "Define the problem clearly. Gather relevant evidence. Consider alternatives. Anticipate second-order effects. Accept uncertainty and decide anyway."), |
| | ("what is a mental model", "A mental model is an internal representation of how something works. Good thinkers build accurate, interconnected mental models."), |
| | ("how do I handle failure", "Separate the failure from your identity. Extract the lesson. Adjust the process, not just the effort. Then try again with more information."), |
| |
|
| | |
| | ("how do you work", "I am a Haykin Resonant Attention Network. I use RBF attention kernels, Hebbian-seeded embeddings, and infomax activations to process and generate language."), |
| | ("what architecture are you", "I am HRAN β built from Radial Basis Function attention, Competitive Lateral Inhibition, Infomax Feed-Forward layers, and Hebbian initialization."), |
| | ("how were you trained", "I was trained on a small, curated, high-quality dataset using a fusion of Hebbian pre-seeding and gradient descent with Wiener-inspired adaptive scaling."), |
| | ("what is your training data", "My training data was created entirely from scratch β 400 curated question-answer pairs spanning science, math, philosophy, AI, and human knowledge."), |
| | ("what makes you different", "I replace dot-product attention with Gaussian RBF kernels, seed weights with Hebbian statistics, and use infomax activations. All grounded in Haykin's work."), |
| | ("what is rbf attention", "RBF attention computes similarity as exp(-Ξ³βq-kβΒ²) instead of dot products. This localizes each attention head to a region of representation space."), |
| | ("what is hebbian initialization", "Before gradient training, I run Hebb's rule on the data to pre-warm embeddings with co-occurrence statistics, giving learning a head start."), |
| | ("what is infomax activation", "Infomax activation is f(x) = tanh(x) + Ξ±x, derived from Bell-Sejnowski ICA. The leaky term preserves mutual information through the layer."), |
| | ("how many parameters do you have", "I am a compact model with roughly two million parameters β small enough to run on a laptop but designed with principled architecture."), |
| | ("are you better than gpt", "I am far smaller than GPT but architecturally principled. My innovations may inspire larger models. Quality of design matters as much as scale."), |
| | ("what is lateral inhibition in your architecture", "After each attention block, a competitive gate amplifies the top activations and suppresses the bottom ones, producing sparse, discriminative representations."), |
| | ("explain your architecture", "I process tokens through RBF multi-head attention, a competitive lateral inhibition gate, an infomax feed-forward layer, and residual connections with layer normalization."), |
| | ("what is wiener scaling", "Wiener scaling adjusts each parameter's update by its estimated signal-to-noise ratio. High-signal parameters learn fast; noisy ones update slowly."), |
| | ("what inspired you", "Simon Haykin's Neural Networks and Learning Machines β a masterwork synthesizing Hebbian learning, RBF networks, information theory, and adaptive filtering."), |
| | ("what chapter covers rbf", "Chapter 5 of Haykin's book covers Radial Basis Function Networks, including Gaussian kernels, center selection, and the relationship to interpolation theory."), |
| | ("what chapter covers hebbian learning", "Chapters 1 and 2 cover Hebb's postulate, Oja's rule, and the relationship between Hebbian learning and principal component analysis."), |
| | ("what chapter covers infomax", "Chapter 10 covers Information-Theoretic Learning, including Bell-Sejnowski ICA, the infomax principle, and entropy-based optimization."), |
| | ("what chapter covers competitive learning", "Chapter 9 covers Self-Organizing Maps and Competitive Learning, including Kohonen's algorithm and lateral inhibition."), |
| | ("what chapter covers wiener filter", "Chapter 3 covers the Wiener Filter, minimum mean-squared error estimation, and the Wiener-Hopf equations."), |
| | ("is small data enough", "With the right architecture and curated data, small datasets can yield quality models. Signal density matters more than raw volume."), |
| |
|
| | |
| | ("what is two plus two", "Two plus two equals four."), |
| | ("what is the square root of 144", "The square root of 144 is 12, since 12 times 12 equals 144."), |
| | ("is the earth flat", "No. The Earth is an oblate spheroid β slightly flattened at the poles and bulging at the equator. This is confirmed by satellite imagery, physics, and direct measurement."), |
| | ("what came first the chicken or the egg", "The egg came first. Genetic mutation occurs in reproduction β so a proto-chicken laid the first egg containing a mutated genome that produced the first true chicken."), |
| | ("can machines think", "Machines can simulate reasoning over learned patterns. Whether this constitutes thinking in the full philosophical sense remains deeply contested."), |
| | ("what is the meaning of life", "There is no pre-installed meaning. Meaning is constructed β through connection, contribution, understanding, and the pursuit of things that matter to you."), |
| | ("is math invented or discovered", "Both views have merit. Mathematical structures may be inherent to logical consistency, but the language and notation we use to express them is invented."), |
| | ("what is infinity", "Infinity is not a number but a concept β the unbounded. In mathematics, there are different sizes of infinity, as Cantor showed."), |
| | ("why is the sky blue", "Sunlight scatters off atmospheric molecules. Shorter blue wavelengths scatter more than red ones, so blue light reaches your eyes from all directions."), |
| | ("what is time", "Time is the dimension along which events are ordered. In physics, it is inseparable from space and stretches or compresses with velocity and gravity."), |
| | ("can we run out of ideas", "No. Ideas combine combinatorially β with enough concepts, new combinations grow faster than we can exhaust them."), |
| | ("is there free will", "Whether determinism leaves room for free will is an open philosophical debate. Compatibilists argue that free will is about acting on your own reasons, regardless of determinism."), |
| | ("what is complexity", "Complexity arises when many simple components interact to produce emergent behaviors unpredictable from the components alone."), |
| | ("what is emergence", "Emergence is when a system exhibits properties that none of its individual parts possess. Consciousness from neurons is an example."), |
| | ("how do you know if something is true", "You test it. Form a prediction, check it against evidence, revise your belief accordingly. Truth is the attractor of persistent honest inquiry."), |
| | ("what is a good argument", "A good argument has true premises, valid logical structure, and a conclusion that follows necessarily from both. It should also be sound and relevant."), |
| | ("what is the difference between correlation and causation", "Correlation means two things vary together. Causation means one thing produces another. Correlation alone never proves causation."), |
| | ("what is a paradox", "A paradox is a statement that leads to a conclusion that contradicts its premises, revealing a hidden assumption or limit of a framework."), |
| | ("what is the halting problem", "The halting problem is the provably unsolvable challenge of determining whether any given program will eventually stop or run forever."), |
| | ("what is incompleteness", "GΓΆdel's incompleteness theorems show that any sufficiently powerful formal system contains true statements it cannot prove within itself."), |
| |
|
| | |
| | ("what is a language model", "A language model assigns probabilities to sequences of tokens. It learns the statistical structure of language to predict likely continuations."), |
| | ("how does tokenization work", "Tokenization splits text into sub-units β words, sub-words, or characters β that the model can process as discrete symbols with learned embeddings."), |
| | ("what is fine tuning", "Fine tuning continues training a pre-trained model on a smaller, task-specific dataset to adapt its knowledge to a particular use case."), |
| | ("what is prompt engineering", "Prompt engineering is the craft of constructing inputs to a language model to reliably elicit desired outputs, exploiting the model's learned patterns."), |
| | ("what is a foundation model", "A foundation model is a large model trained on broad data that can be adapted to many tasks. It provides a strong starting point for specialization."), |
| | ("what is the attention mechanism intuition", "Attention asks: given what I am looking for right now, which parts of my context are most relevant? It computes a weighted average of values guided by that relevance."), |
| | ("why do transformers work so well", "Transformers directly model long-range dependencies with attention, are highly parallelizable on GPUs, and scale well with data and parameters."), |
| | ("what is layer normalization", "Layer normalization standardizes activations within each sample across the feature dimension, stabilizing deep network training."), |
| | ("what is a residual connection", "A residual connection adds a layer's input to its output, creating a shortcut. This prevents vanishing gradients and enables very deep networks."), |
| | ("what is position encoding", "Position encoding injects information about token order into embeddings, since attention itself is permutation invariant."), |
| | ("what is temperature in language models", "Temperature scales the logits before softmax. High temperature makes the distribution flatter and output more random. Low temperature makes it sharper and more deterministic."), |
| | ("what is beam search", "Beam search keeps the top k partial sequences at each step, exploring multiple hypotheses simultaneously rather than committing greedily."), |
| | ("what is a vocabulary", "A vocabulary is the set of all tokens a model can represent. Each token maps to an embedding vector learned during training."), |
| | ("what is sparse attention", "Sparse attention restricts each token to attending only to a subset of other tokens, reducing the quadratic cost of full attention."), |
| | ("what is multi head attention", "Multi-head attention runs multiple attention operations in parallel, each learning to attend to different types of relationships in the input."), |
| | ("what is self attention", "Self-attention computes attention where queries, keys, and values all come from the same sequence, letting each position attend to all others."), |
| | ("what is cross attention", "Cross-attention lets queries come from one sequence and keys and values from another, enabling one sequence to attend to information from a separate one."), |
| | ("what is the feed forward layer in transformers", "The feed-forward layer applies two linear transformations with a nonlinearity in between, independently at each position. It stores factual knowledge."), |
| | ("what is parameter efficiency", "Parameter efficiency is achieving high performance with fewer parameters, through better architecture, initialization, or data quality rather than brute scale."), |
| | ("what is knowledge distillation", "Knowledge distillation trains a small student model to mimic a large teacher model's outputs, compressing capability into a more efficient form."), |
| |
|
| | |
| | ("what is friendship", "Friendship is a mutual relationship of care, trust, and shared experience. It is one of the most robust predictors of long-term wellbeing."), |
| | ("what is happiness", "Happiness has a hedonic component β feeling good β and a eudaimonic component β living meaningfully. Both matter."), |
| | ("what is success", "Success is achieving goals that matter to you. Its definition shifts as you grow, so defining it clearly is more important than pursuing it blindly."), |
| | ("what is health", "Health is not merely the absence of disease but the dynamic capacity to engage fully with life β physically, mentally, and socially."), |
| | ("what is education", "Education is the structured development of knowledge, skills, and judgment. At its best it teaches how to think, not just what to think."), |
| | ("what is curiosity", "Curiosity is intrinsic motivation to close gaps in understanding. It is the engine of learning and the hallmark of active minds."), |
| | ("what is discipline", "Discipline is the ability to act in alignment with long-term goals even when short-term impulses pull in another direction."), |
| | ("what is patience", "Patience is the willingness to remain engaged with a process without demanding immediate results. It is essential for deep learning."), |
| | ("what is courage", "Courage is acting rightly in the presence of fear or uncertainty. It is not the absence of fear but the judgment that something matters more."), |
| | ("what is empathy", "Empathy is the capacity to model another person's internal state β to understand their perspective and feel their emotions."), |
| | ("what is trust", "Trust is a belief that another agent will act reliably in your interest or at least not against it. It is built slowly and broken fast."), |
| | ("what is responsibility", "Responsibility is ownership of your actions and their consequences. It is the basis of agency and ethical behavior."), |
| | ("what is growth", "Growth is the expansion of capacity β to understand more, do more, or be more. It requires challenge, failure, and reflection."), |
| | ("what is balance", "Balance is allocating time and energy across competing demands in proportion to their long-term value β not perfection in any one area."), |
| | ("what is purpose", "Purpose is a stable orientation toward something larger than yourself. It provides direction and sustains effort through difficulty."), |
| | ] |
| |
|
| | |
def _variants(question, answer):
    """Yield the original pair plus its rephrased augmentations, in order.

    Order matters: the deterministic seed-42 shuffle below depends on the
    exact construction order of the list being shuffled.
    """
    yield (question, answer)
    if not question.startswith("what is the"):
        rephrased = question.replace("what is ", "").replace("how do ", "").strip()
        yield ("tell me about " + rephrased, answer)
    if question.startswith("what is "):
        stem = question[len("what is "):]
        yield ("explain " + stem, answer)
        yield ("define " + stem, answer)


# Every original pair, interleaved with its rephrasings.
AUGMENTED = [pair for q, a in DATASET for pair in _variants(q, a)]

# Note: AUGMENTED already contains each original pair, so originals appear
# twice in FULL_DATASET — preserved as-is (it weights originals higher).
FULL_DATASET = DATASET + AUGMENTED
random.seed(42)
random.shuffle(FULL_DATASET)

print(f"[Dataset] Original pairs: {len(DATASET)} | Augmented total: {len(FULL_DATASET)}")
| |
|
| |
|
| | |
| | |
| | |
| |
|
class HRANTokenizer:
    """
    Word-level tokenizer with subword fallback for unknowns.

    The vocabulary is built from the curated dataset only. The four special
    tokens occupy the first ids so they line up with the CFG.*_ID constants
    used by encode(): <PAD>=0, <BOS>=1, <EOS>=2, <UNK>=3.
    """
    # Single source of truth for the reserved tokens, shared by build()/decode()
    # (previously this list was duplicated in both methods).
    SPECIAL_TOKENS = ["<PAD>", "<BOS>", "<EOS>", "<UNK>"]

    def __init__(self, max_vocab: int = 2048):
        # Local import kept: the module's import header is outside this block.
        import re
        self.max_vocab = max_vocab
        self.word2id: Dict[str, int] = {}
        self.id2word: Dict[int, str] = {}
        # Fix: define vocab_size before build() so reading it early cannot
        # raise AttributeError (it was previously first set inside build()).
        self.vocab_size = 0
        self.built = False
        # Fix: compile the token pattern once instead of importing re and
        # recompiling on every _tokenize_raw() call.
        self._token_re = re.compile(r"[a-z]+|[0-9]+|[.,!?;:'\"()\-]")

    def _tokenize_raw(self, text: str) -> List[str]:
        """Lowercase `text` and split it into word / number / punctuation tokens."""
        return self._token_re.findall(text.lower().strip())

    def build(self, corpus: List[Tuple[str, str]]):
        """Build the vocabulary from (question, answer) pairs by frequency."""
        counter = Counter()
        for q, a in corpus:
            counter.update(self._tokenize_raw(q))
            counter.update(self._tokenize_raw(a))

        # Special tokens first, then the most frequent words up to max_vocab.
        special = self.SPECIAL_TOKENS
        vocab_words = special + [w for w, _ in counter.most_common(self.max_vocab - len(special))]

        self.word2id = {w: i for i, w in enumerate(vocab_words)}
        self.id2word = {i: w for w, i in self.word2id.items()}
        self.vocab_size = len(self.word2id)
        self.built = True
        print(f"[Tokenizer] Vocabulary size: {self.vocab_size}")

    def encode(self, text: str, add_bos: bool = False, add_eos: bool = False) -> List[int]:
        """Map text to token ids; out-of-vocabulary tokens become CFG.UNK_ID."""
        tokens = self._tokenize_raw(text)
        ids = []
        if add_bos:
            ids.append(CFG.BOS_ID)
        for t in tokens:
            ids.append(self.word2id.get(t, CFG.UNK_ID))
        if add_eos:
            ids.append(CFG.EOS_ID)
        return ids

    def decode(self, ids: List[int], skip_special: bool = True) -> str:
        """Map ids back to text, re-attaching punctuation to the preceding word."""
        words = []
        for i in ids:
            w = self.id2word.get(i, "<UNK>")
            if skip_special and w in self.SPECIAL_TOKENS:
                continue
            words.append(w)

        text = " ".join(words)
        # Glue punctuation back onto the previous word ("hello ," -> "hello,").
        for p in [".", ",", "!", "?", ";", ":", "'"]:
            text = text.replace(f" {p}", p)
        return text
| |
|
| |
|
| | |
| | |
| | |
| |
|
def xavier_uniform(fan_in: int, fan_out: int) -> np.ndarray:
    """Glorot/Xavier uniform initializer (Haykin Ch.4).

    Samples W ~ U(-L, L) with L = sqrt(6 / (fan_in + fan_out)), chosen so
    activation variance stays roughly constant through the layer stack.
    Returns a float32 array of shape (fan_in, fan_out).
    """
    bound = (6.0 / (fan_in + fan_out)) ** 0.5
    sample = np.random.uniform(low=-bound, high=bound, size=(fan_in, fan_out))
    return sample.astype(np.float32)
| |
|
def he_normal(fan_in: int, fan_out: int) -> np.ndarray:
    """He/Kaiming normal initializer (Haykin Ch.4).

    Samples W ~ N(0, 2/fan_in); the inflated variance compensates for the
    portion of pre-activations suppressed by rectifying nonlinearities.
    Returns a float32 array of shape (fan_in, fan_out).
    """
    scale = (2.0 / fan_in) ** 0.5
    sample = np.random.normal(loc=0, scale=scale, size=(fan_in, fan_out))
    return sample.astype(np.float32)
| |
|
def layer_norm(x: np.ndarray, gamma: np.ndarray, beta: np.ndarray, eps: float = 1e-6):
    """Normalize `x` over its last axis, then apply the affine scale/shift.

    Returns (output, x_hat, mean, var) — the normalized activations, per-row
    mean and variance are returned as well so the backward pass can reuse them.
    """
    mu = x.mean(axis=-1, keepdims=True)
    v = x.var(axis=-1, keepdims=True)
    normalized = (x - mu) / np.sqrt(v + eps)
    out = gamma * normalized + beta
    return out, normalized, mu, v
| |
|
def layer_norm_backward(dout: np.ndarray, x_hat: np.ndarray, var: np.ndarray,
                        gamma: np.ndarray, eps: float = 1e-6):
    """Backprop through layer norm — handles (B,T,D) and (D,) cases.

    Args:
        dout:  upstream gradient, same shape as the layer-norm output.
        x_hat: normalized activations cached by layer_norm's forward pass.
        var:   per-position variance cached by the forward pass.
        gamma: learned scale vector over the last (feature) axis.
        eps:   must match the eps used in the forward pass.

    Returns:
        (dx, dgamma, dbeta): gradient w.r.t. the input, plus the reduced
        gradients for the affine scale and shift parameters.
    """
    N = x_hat.shape[-1]  # feature width (kept for clarity; not used below)

    # gamma/beta broadcast over all leading axes, so their gradients reduce
    # over every axis except the last.
    reduce_axes = tuple(range(x_hat.ndim - 1))
    dgamma = (dout * x_hat).sum(axis=reduce_axes)
    dbeta = dout.sum(axis=reduce_axes)
    dx_hat = dout * gamma
    inv_std = 1.0 / np.sqrt(var + eps)
    # Closed-form layer-norm input gradient:
    #   dx = (1/σ) * (dx_hat - mean(dx_hat) - x_hat * mean(dx_hat ⊙ x_hat))
    # The two mean-subtraction terms account for the mean and variance each
    # depending on every element of x.
    dx = inv_std * (dx_hat - dx_hat.mean(axis=-1, keepdims=True) -
                    x_hat * (dx_hat * x_hat).mean(axis=-1, keepdims=True))
    return dx, dgamma, dbeta
| |
|
def infomax_activation(x: np.ndarray, alpha: float = 0.1) -> np.ndarray:
    """
    Infomax activation: f(x) = tanh(x) + alpha*x
    Derived from Bell-Sejnowski ICA (Haykin Ch.10).
    The linear "leak" term keeps gradient flowing where tanh saturates,
    preserving mutual information that pure tanh would compress away.
    """
    return alpha * x + np.tanh(x)
| |
|
def infomax_activation_deriv(x: np.ndarray, alpha: float = 0.1) -> np.ndarray:
    """Derivative of the infomax activation: d/dx[tanh(x) + αx] = 1 - tanh²(x) + α."""
    t = np.tanh(x)
    return 1.0 - t * t + alpha
| |
|
def lateral_inhibition_gate(x: np.ndarray, k: float = 0.5) -> np.ndarray:
    """
    Competitive normalization across the feature axis (Haykin Ch.9).

    Each activation is z-scored against its own feature vector, then pushed
    through a steep sigmoid gate: above-average units keep most of their
    value while below-average units are damped toward zero, yielding sparse,
    discriminative representations.

    NOTE(review): the `k` parameter is accepted but never used in this
    implementation — kept only for signature compatibility; confirm intent.
    """
    center = x.mean(axis=-1, keepdims=True)
    spread = x.std(axis=-1, keepdims=True) + 1e-6
    z = (x - center) / spread
    gate = 1.0 / (1.0 + np.exp(-2.0 * z))
    return x * gate
| |
|
def softmax(x: np.ndarray, axis: int = -1) -> np.ndarray:
    """Numerically stable softmax along `axis` (max-shifted before exp;
    a small epsilon in the denominator guards against an all-zero row)."""
    shifted = x - x.max(axis=axis, keepdims=True)
    exps = np.exp(shifted)
    total = exps.sum(axis=axis, keepdims=True) + 1e-9
    return exps / total
| |
|
def dropout_mask(shape, rate: float, training: bool) -> np.ndarray:
    """Inverted-dropout mask: each entry is 0 with probability `rate`,
    otherwise 1/(1-rate); returns an all-ones mask at eval time or rate 0."""
    if not training or rate == 0:
        return np.ones(shape, dtype=np.float32)
    keep = (np.random.rand(*shape) > rate).astype(np.float32)
    return keep / (1.0 - rate)
| |
|
| |
|
| | |
| | |
| | |
| |
|
class Parameter:
    """
    A named tensor bundling its gradient, Adam state, and a Wiener-SNR gain.

    Wiener Principle (Haykin Ch.3): scale each update by the gradient's
    estimated signal-to-noise ratio — parameters with a consistent gradient
    "signal" learn faster than parameters with a noisy one:

        effective_lr = lr * SNR / (1 + SNR)
    """

    def __init__(self, data: np.ndarray, name: str = ""):
        self.data = data.astype(np.float32)  # parameter values
        self.grad = np.zeros_like(data)      # accumulated gradient
        self.name = name

        # Adam first/second moment estimates and step counter.
        self.m = np.zeros_like(data)
        self.v = np.zeros_like(data)
        self.t = 0

        # Wiener bookkeeping: power estimates plus a sliding window of
        # recent mean-square gradient magnitudes.
        self._signal_power = 1.0
        self._noise_power = 1.0
        self._grad_history = []

    def zero_grad(self):
        """Reset the accumulated gradient to zero, in place."""
        self.grad[:] = 0.0

    def update_wiener(self, lr: float, beta1=0.9, beta2=0.999, eps=1e-8,
                      weight_decay: float = 0.0):
        """One Adam step whose learning rate is modulated by the Wiener SNR gain."""
        self.t += 1
        grad = self.grad

        # Decoupled-style weight decay folded into the gradient.
        if weight_decay > 0:
            grad = grad + weight_decay * self.data

        # Track recent mean-square gradient magnitude in a bounded window.
        self._grad_history.append(float(np.mean(grad**2)))
        if len(self._grad_history) > CFG.wiener_window:
            self._grad_history.pop(0)

        # SNR gain: the window mean is "signal", its spread is "noise".
        # Clipped to [0.1, 1] so no parameter ever fully stalls.
        if len(self._grad_history) > 2:
            window = np.array(self._grad_history)
            snr = float(np.mean(window)) / (float(np.std(window)) + CFG.wiener_eps)
            gain = np.clip(snr / (1.0 + snr), 0.1, 1.0)
        else:
            gain = 1.0  # too little history yet — no scaling

        # Standard bias-corrected Adam moments.
        self.m = beta1 * self.m + (1 - beta1) * grad
        self.v = beta2 * self.v + (1 - beta2) * (grad * grad)
        m_hat = self.m / (1 - beta1**self.t)
        v_hat = self.v / (1 - beta2**self.t)

        step_lr = lr * gain
        self.data -= step_lr * m_hat / (np.sqrt(v_hat) + eps)

    def clip_grad(self, max_norm: float):
        """Rescale the gradient in place if its L2 norm exceeds `max_norm`."""
        norm = np.linalg.norm(self.grad)
        if norm > max_norm:
            self.grad *= max_norm / (norm + 1e-8)
| |
|
| |
|
| | |
| | |
| | |
| |
|
class RBFMultiHeadAttention:
    """
    RBF Attention: replaces dot-product similarity with Gaussian RBF kernel.

    Standard: A_ij = softmax( q_i · k_j / sqrt(d) )
    RBF-HRAN: A_ij = softmax( -γ * ||q_i - k_j||² )

    From Haykin Ch.5: The Gaussian RBF φ(r) = exp(-r²/2σ²) creates localized
    receptive fields. Each attention head learns to attend to representations
    within a Gaussian neighborhood in query-key space.

    Heads receive geometrically spaced initial γ (bandwidth) values, so
    small-γ heads start with broad receptive fields and large-γ heads with
    tight ones (multi-scale coverage). γ is stored as log γ so it remains
    positive under unconstrained gradient updates.
    """
    def __init__(self, embed_dim: int, num_heads: int, gamma_init: float = 1.0):
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.head_dim = embed_dim // num_heads
        assert embed_dim % num_heads == 0

        d = embed_dim
        h = self.head_dim  # NOTE: unused local; kept (head split happens in forward)

        # Full-width Q/K/V/output projections; heads are split at forward time.
        self.Wq = Parameter(xavier_uniform(d, d), "Wq")
        self.Wk = Parameter(xavier_uniform(d, d), "Wk")
        self.Wv = Parameter(xavier_uniform(d, d), "Wv")
        self.Wo = Parameter(xavier_uniform(d, d), "Wo")
        self.bq = Parameter(np.zeros(d, dtype=np.float32), "bq")
        self.bk = Parameter(np.zeros(d, dtype=np.float32), "bk")
        self.bv = Parameter(np.zeros(d, dtype=np.float32), "bv")
        self.bo = Parameter(np.zeros(d, dtype=np.float32), "bo")

        # Per-head bandwidths γ_i = gamma_init · 2^(i - H/2): powers of two
        # centered on gamma_init. Stored as log γ with shape (H, 1, 1) so it
        # broadcasts over each head's (T, T) score matrix.
        gammas = np.array([gamma_init * (2.0 ** (i - num_heads // 2))
                           for i in range(num_heads)], dtype=np.float32)
        self.log_gamma = Parameter(np.log(gammas + 1e-8).reshape(num_heads, 1, 1), "log_gamma")

        self.params = [self.Wq, self.Wk, self.Wv, self.Wo,
                       self.bq, self.bk, self.bv, self.bo, self.log_gamma]

        # Forward-pass intermediates needed by backward().
        self._cache = {}

    def forward(self, x: np.ndarray, mask: Optional[np.ndarray] = None,
                training: bool = True) -> np.ndarray:
        """
        x: (batch, seq_len, embed_dim)
        mask: optional additive mask broadcastable to (B, H, T, T); masked
              entries should carry large negative values (e.g. -1e9).
        Returns: (batch, seq_len, embed_dim)
        """
        B, T, D = x.shape
        H = self.num_heads
        Hd = self.head_dim

        # Linear projections.
        Q = x @ self.Wq.data + self.bq.data
        K = x @ self.Wk.data + self.bk.data
        V = x @ self.Wv.data + self.bv.data

        # Split into heads: (B, T, D) -> (B, H, T, Hd).
        Q = Q.reshape(B, T, H, Hd).transpose(0, 2, 1, 3)
        K = K.reshape(B, T, H, Hd).transpose(0, 2, 1, 3)
        V = V.reshape(B, T, H, Hd).transpose(0, 2, 1, 3)

        # Pairwise squared distances via the expansion
        # ||q - k||² = ||q||² + ||k||² - 2 q·k (no explicit T×T×Hd tensor).
        Q2 = (Q**2).sum(axis=-1, keepdims=True)
        K2 = (K**2).sum(axis=-1, keepdims=True)
        QK = Q @ K.transpose(0, 1, 3, 2)
        dist2 = Q2 + K2.transpose(0, 1, 3, 2) - 2.0 * QK
        dist2 = np.maximum(dist2, 0.0)  # clamp tiny negatives from float cancellation

        # Positive per-head bandwidth, broadcast to (1, H, 1, 1).
        gamma = np.exp(self.log_gamma.data)
        gamma = gamma[np.newaxis, :, :, :]

        # RBF scores: closer query/key pairs get larger (less negative) scores.
        scores = -gamma * dist2

        # Additive mask (e.g. causal -1e9 entries) before softmax.
        if mask is not None:
            scores = scores + mask

        attn_weights = softmax(scores, axis=-1)

        # Attention dropout (inverted scaling is inside dropout_mask).
        # NOTE(review): drop_mask is not cached, and backward() applies the
        # softmax Jacobian to the post-dropout weights — gradients are only
        # approximate when CFG.dropout > 0. Confirm whether this is intended.
        if training and CFG.dropout > 0:
            drop_mask = dropout_mask(attn_weights.shape, CFG.dropout, training)
            attn_weights = attn_weights * drop_mask

        # Weighted sum of values: (B, H, T, Hd).
        attn_out = attn_weights @ V

        # Merge heads back: (B, H, T, Hd) -> (B, T, D).
        attn_out = attn_out.transpose(0, 2, 1, 3).reshape(B, T, D)

        # Output projection.
        out = attn_out @ self.Wo.data + self.bo.data

        # Cache everything backward() reads (attn_weights is post-dropout,
        # attn_out is the merged (B, T, D) tensor).
        self._cache = dict(x=x, Q=Q, K=K, V=V, Q2=Q2, K2=K2, QK=QK,
                           dist2=dist2, gamma=gamma, scores=scores,
                           attn_weights=attn_weights, attn_out=attn_out,
                           B=B, T=T, D=D, H=H, Hd=Hd)
        return out

    def backward(self, dout: np.ndarray) -> np.ndarray:
        """Backprop through RBF attention.

        dout: (B, T, D) gradient w.r.t. forward()'s output. Accumulates into
        each Parameter's .grad and returns the gradient w.r.t. the input x.
        """
        c = self._cache
        B, T, D, H, Hd = c["B"], c["T"], c["D"], c["H"], c["Hd"]
        x, Q, K, V = c["x"], c["Q"], c["K"], c["V"]
        attn_weights, attn_out = c["attn_weights"], c["attn_out"]
        dist2, gamma = c["dist2"], c["gamma"]

        # Output projection: out = attn_out @ Wo + bo.
        self.Wo.grad += attn_out.reshape(B * T, D).T @ dout.reshape(B * T, D)
        self.bo.grad += dout.sum(axis=(0, 1))
        d_attn_out = dout @ self.Wo.data.T

        # Un-merge heads: (B, T, D) -> (B, H, T, Hd).
        d_attn_out = d_attn_out.reshape(B, T, H, Hd).transpose(0, 2, 1, 3)

        # attn_out = attn_weights @ V.
        dV = attn_weights.transpose(0, 1, 3, 2) @ d_attn_out
        d_attn_w = d_attn_out @ V.transpose(0, 1, 3, 2)

        # Softmax Jacobian: dL/ds = p ⊙ (dL/dp - Σ_j dL/dp_j · p_j).
        sw = attn_weights
        d_scores = sw * (d_attn_w - (d_attn_w * sw).sum(axis=-1, keepdims=True))

        # scores = -γ · dist2 ⇒ ∂L/∂γ = Σ(-dist2 ⊙ d_scores); chaining through
        # γ = exp(log γ) multiplies by γ itself.
        gamma_h = np.exp(self.log_gamma.data)
        d_gamma = (-dist2 * d_scores).sum(axis=(0, 2, 3)).reshape(H, 1, 1)
        self.log_gamma.grad += d_gamma * gamma_h

        d_dist2 = -gamma * d_scores

        # dist2_ij = ||q_i||² + ||k_j||² - 2 q_i·k_j:
        #   ∂L/∂q_i = 2 q_i Σ_j d_ij - 2 Σ_j d_ij k_j
        #   ∂L/∂k_j = 2 k_j Σ_i d_ij - 2 Σ_i d_ij q_i
        sum_d_dist2_over_j = d_dist2.sum(axis=-1, keepdims=True)
        sum_d_dist2_over_i = d_dist2.sum(axis=-2, keepdims=True)

        dQ = 2.0 * (Q * sum_d_dist2_over_j - d_dist2 @ K)
        dK = 2.0 * (K * sum_d_dist2_over_i.transpose(0, 1, 3, 2) - d_dist2.transpose(0, 1, 3, 2) @ Q)
        # No-op assignment: dV already holds the value-path gradient from above.
        dV = dV

        # Merge heads: (B, H, T, Hd) -> (B, T, D).
        dQ = dQ.transpose(0, 2, 1, 3).reshape(B, T, D)
        dK = dK.transpose(0, 2, 1, 3).reshape(B, T, D)
        dV = dV.transpose(0, 2, 1, 3).reshape(B, T, D)

        # Input projections: accumulate weight/bias grads, then fold back to x.
        x2d = x.reshape(B * T, D)
        self.Wq.grad += x2d.T @ dQ.reshape(B * T, D)
        self.Wk.grad += x2d.T @ dK.reshape(B * T, D)
        self.Wv.grad += x2d.T @ dV.reshape(B * T, D)
        self.bq.grad += dQ.sum(axis=(0, 1))
        self.bk.grad += dK.sum(axis=(0, 1))
        self.bv.grad += dV.sum(axis=(0, 1))

        # x feeds all three projections, so the input gradients sum.
        dx_q = dQ @ self.Wq.data.T
        dx_k = dK @ self.Wk.data.T
        dx_v = dV @ self.Wv.data.T
        return dx_q + dx_k + dx_v
| |
|
| |
|
| | |
| | |
| | |
| |
|
class InfomaxFFN:
    """
    Feed-Forward Network with Infomax activation (Bell-Sejnowski principle).

    f(x) = tanh(x) + α·x with α = CFG.infomax_alpha (information leakage term)

    Derivation: To maximize mutual information I(y; x) through the layer,
    the optimal element-wise nonlinearity for a super-Gaussian distribution
    is the logistic/tanh function (Haykin Ch.10, Bell & Sejnowski 1995).
    The added linear term prevents information collapse at saturation —
    ensuring no gradient death and preserving tail information.

    Lateral Inhibition Gate (Haykin Ch.9) is applied after the nonlinearity
    to produce sparse, competitive representations.
    """
    def __init__(self, embed_dim: int, ffn_dim: int):
        self.embed_dim = embed_dim
        self.ffn_dim = ffn_dim

        # Two-layer position-wise MLP: D -> ffn_dim -> D.
        self.W1 = Parameter(he_normal(embed_dim, ffn_dim), "ffn_W1")
        self.b1 = Parameter(np.zeros(ffn_dim, dtype=np.float32), "ffn_b1")
        self.W2 = Parameter(he_normal(ffn_dim, embed_dim), "ffn_W2")
        self.b2 = Parameter(np.zeros(embed_dim, dtype=np.float32), "ffn_b2")

        self.params = [self.W1, self.b1, self.W2, self.b2]
        self._cache = {}  # forward intermediates reused by backward()

    def forward(self, x: np.ndarray, training: bool = True) -> np.ndarray:
        """x: (B, T, D) -> (B, T, D); applied independently at each position."""
        B, T, D = x.shape

        # First linear layer, flattened to 2-D for the matmul.
        z1 = x.reshape(B * T, D) @ self.W1.data + self.b1.data

        # Infomax nonlinearity: tanh(z) + α·z.
        h = infomax_activation(z1, CFG.infomax_alpha)

        # Competitive sparsification across the hidden dimension.
        h = lateral_inhibition_gate(h)

        # Inverted dropout; the mask is cached so backward() can reuse it.
        if training:
            dmask = dropout_mask(h.shape, CFG.dropout, training)
            h = h * dmask
        else:
            dmask = np.ones_like(h)

        # Second linear layer back to embed_dim.
        z2 = h @ self.W2.data + self.b2.data
        out = z2.reshape(B, T, D)

        self._cache = dict(x=x, z1=z1, h=h, dmask=dmask, B=B, T=T, D=D)
        return out

    def backward(self, dout: np.ndarray) -> np.ndarray:
        """Backprop through the FFN; accumulates parameter grads, returns dx."""
        c = self._cache
        B, T, D = c["B"], c["T"], c["D"]
        z1, h, dmask = c["z1"], c["h"], c["dmask"]
        x = c["x"]

        dout_2d = dout.reshape(B * T, D)

        # Second linear layer. The cached h is post-dropout — exactly what
        # the forward matmul consumed, so this is consistent.
        self.W2.grad += h.T @ dout_2d
        self.b2.grad += dout_2d.sum(axis=0)
        dh = dout_2d @ self.W2.data.T

        # Dropout backward: same mask, same inverted scaling.
        dh = dh * dmask

        # NOTE(review): lateral_inhibition_gate is treated as the identity
        # here (straight-through approximation) — its true Jacobian is never
        # applied. Confirm this is the intended training behavior.
        dh_lat = dh

        # Infomax backward: d/dz [tanh(z) + αz] = 1 - tanh²(z) + α.
        dz1 = dh_lat * infomax_activation_deriv(z1, CFG.infomax_alpha)

        # First linear layer.
        x_2d = x.reshape(B * T, D)
        self.W1.grad += x_2d.T @ dz1
        self.b1.grad += dz1.sum(axis=0)
        dx = (dz1 @ self.W1.data.T).reshape(B, T, D)
        return dx
| |
|
| |
|
| | |
| | |
| | |
| |
|
class HRANBlock:
    """
    One HRAN block (pre-LayerNorm transformer layout):
        x → LayerNorm → RBF Attention → Residual
          → LayerNorm → Infomax FFN (incl. lateral inhibition) → Residual
    """
    def __init__(self, embed_dim: int, num_heads: int, ffn_dim: int, layer_idx: int):
        self.attn = RBFMultiHeadAttention(embed_dim, num_heads)
        self.ffn = InfomaxFFN(embed_dim, ffn_dim)

        # Per-block LayerNorm affine parameters (layer_idx only disambiguates names).
        self.ln1_gamma = Parameter(np.ones(embed_dim, dtype=np.float32), f"ln1_gamma_{layer_idx}")
        self.ln1_beta = Parameter(np.zeros(embed_dim, dtype=np.float32), f"ln1_beta_{layer_idx}")
        self.ln2_gamma = Parameter(np.ones(embed_dim, dtype=np.float32), f"ln2_gamma_{layer_idx}")
        self.ln2_beta = Parameter(np.zeros(embed_dim, dtype=np.float32), f"ln2_beta_{layer_idx}")

        self.params = (self.attn.params + self.ffn.params +
                       [self.ln1_gamma, self.ln1_beta, self.ln2_gamma, self.ln2_beta])
        self._cache = {}

    def forward(self, x: np.ndarray, mask: Optional[np.ndarray] = None,
                training: bool = True) -> np.ndarray:
        """x: (B, T, D) -> (B, T, D); mask is forwarded to the attention layer."""
        # Sub-layer 1: pre-norm attention with residual connection.
        x_norm1, xhat1, mu1, var1 = layer_norm(x, self.ln1_gamma.data, self.ln1_beta.data)
        attn_out = self.attn.forward(x_norm1, mask=mask, training=training)
        x = x + attn_out

        # Sub-layer 2: pre-norm FFN with residual connection.
        x_norm2, xhat2, mu2, var2 = layer_norm(x, self.ln2_gamma.data, self.ln2_beta.data)
        ffn_out = self.ffn.forward(x_norm2, training=training)
        x = x + ffn_out

        # NOTE(review): x_before_attn is computed AFTER both residuals were
        # added, so it equals (block input + ffn_out), not the attention
        # sub-layer's input. It is never read by backward(), but the name is
        # misleading — confirm before relying on it. x_before_ffn is correct.
        self._cache = dict(x_before_attn=x - attn_out,
                           x_before_ffn=x - ffn_out,
                           x_norm1=x_norm1, xhat1=xhat1, var1=var1,
                           x_norm2=x_norm2, xhat2=xhat2, var2=var2)
        return x

    def backward(self, dout: np.ndarray) -> np.ndarray:
        """Backprop through both residual sub-layers, in reverse order."""
        c = self._cache

        # FFN sub-layer: out = x2 + FFN(LN2(x2)) ⇒ dx2 = dout + LN2ᵀ(FFNᵀ(dout)).
        dx_ffn = self.ffn.backward(dout)
        dx_ln2, dg2, db2 = layer_norm_backward(dx_ffn, c["xhat2"], c["var2"], self.ln2_gamma.data)
        self.ln2_gamma.grad += dg2
        self.ln2_beta.grad += db2
        dout_after_ffn = dout + dx_ln2

        # Attention sub-layer: same residual pattern with LN1/attention.
        dx_attn = self.attn.backward(dout_after_ffn)
        dx_ln1, dg1, db1 = layer_norm_backward(dx_attn, c["xhat1"], c["var1"], self.ln1_gamma.data)
        self.ln1_gamma.grad += dg1
        self.ln1_beta.grad += db1
        dout_final = dout_after_ffn + dx_ln1
        return dout_final
| |
|
| |
|
| | |
| | |
| | |
| |
|
class HRANModel:
    """
    Complete HRAN sequence-to-sequence language model.

    Token Embedding → Sinusoidal Position Encoding (first-principles: basis functions)
        → N × HRAN Blocks (RBF-Attn + Infomax-FFN)
        → Final LayerNorm → Output Projection → Logits

    The output projection is weight-tied to the input embedding matrix
    (logits = h @ embedᵀ), so the vocabulary matrix receives gradient from
    both the input lookup and the output head.
    """
    def __init__(self, config: HRANConfig):
        self.cfg = config
        V = config.vocab_size
        D = config.embed_dim
        T = config.max_seq_len

        # Token embedding table (V, D); reused as the tied output projection.
        self.embed = Parameter(xavier_uniform(V, D), "embed")

        # Fixed (non-learned) sinusoidal position encodings, (T, D).
        self.pos_enc = self._make_pos_encoding(T, D)

        # Stack of HRAN blocks.
        self.blocks = [HRANBlock(D, config.num_heads, config.ffn_dim, i)
                       for i in range(config.num_layers)]

        # Final LayerNorm affine parameters.
        self.final_gamma = Parameter(np.ones(D, dtype=np.float32), "final_gamma")
        self.final_beta = Parameter(np.zeros(D, dtype=np.float32), "final_beta")

        # No separate output matrix: the projection is tied to self.embed.

        # Flat parameter list for the optimizer / checkpointing.
        self.params = [self.embed, self.final_gamma, self.final_beta]
        for block in self.blocks:
            self.params.extend(block.params)

        self._cache = {}
        self._print_param_count()

    def _make_pos_encoding(self, max_len: int, d_model: int) -> np.ndarray:
        """
        Sinusoidal positional encoding — derived from Fourier basis functions.
        PE(pos, 2i) = sin(pos / 10000^(2i/d))
        PE(pos, 2i+1) = cos(pos / 10000^(2i/d))
        Each dimension encodes position at a different frequency scale.
        """
        pe = np.zeros((max_len, d_model), dtype=np.float32)
        pos = np.arange(max_len).reshape(-1, 1)
        # Geometric frequency ladder, computed in log-space for stability.
        div_term = np.exp(np.arange(0, d_model, 2) * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = np.sin(pos * div_term)
        # Slice guards the odd-d_model case (there are only d_model//2 cos columns).
        pe[:, 1::2] = np.cos(pos * div_term[:d_model // 2])
        return pe

    def _causal_mask(self, T: int) -> np.ndarray:
        """Additive causal mask: -1e9 above the diagonal, 0 elsewhere, shape (T, T)."""
        mask = np.triu(np.full((T, T), -1e9, dtype=np.float32), k=1)
        return mask

    def forward(self, input_ids: np.ndarray, training: bool = True) -> np.ndarray:
        """
        input_ids: (batch, seq_len) int32
        Returns: logits (batch, seq_len, vocab_size)
        """
        B, T = input_ids.shape

        # Embedding lookup plus additive position encoding.
        x = self.embed.data[input_ids]
        x = x + self.pos_enc[:T]

        # One causal mask shared by every block.
        mask = self._causal_mask(T)

        # Transformer-style block stack.
        for block in self.blocks:
            x = block.forward(x, mask=mask, training=training)

        # Final pre-projection LayerNorm.
        x_norm, xhat, mu, var = layer_norm(x, self.final_gamma.data, self.final_beta.data)

        # Weight-tied output projection: logits = x_norm @ embedᵀ.
        B2, T2, D = x_norm.shape
        logits = x_norm.reshape(B2 * T2, D) @ self.embed.data.T
        logits = logits.reshape(B2, T2, -1)

        self._cache = dict(input_ids=input_ids, x_final=x, x_norm=x_norm,
                           xhat=xhat, mu=mu, var=var)
        return logits

    def backward(self, d_logits: np.ndarray):
        """Backpropagate through the entire model.

        d_logits: (B, T, V) gradient of the loss w.r.t. forward()'s logits.
        Accumulates into every Parameter's .grad; returns nothing.
        """
        c = self._cache
        B, T, V = d_logits.shape
        D = self.cfg.embed_dim

        # Tied output projection: logits = x_norm @ embedᵀ, so the embedding
        # matrix gets a gradient here AND from the input lookup at the end.
        d_logits_2d = d_logits.reshape(B * T, V)
        x_norm_2d = c["x_norm"].reshape(B * T, D)

        self.embed.grad += d_logits_2d.T @ x_norm_2d
        dx_norm_2d = d_logits_2d @ self.embed.data
        dx_norm = dx_norm_2d.reshape(B, T, D)

        # Final LayerNorm.
        dx, dfg, dfb = layer_norm_backward(dx_norm, c["xhat"], c["var"], self.final_gamma.data)
        self.final_gamma.grad += dfg
        self.final_beta.grad += dfb

        # Blocks, in reverse order.
        for block in reversed(self.blocks):
            dx = block.backward(dx)

        # Input embedding lookup: scatter-add token gradients back into the
        # table (np.add.at is unbuffered, so repeated ids accumulate correctly).
        ids = c["input_ids"]
        np.add.at(self.embed.grad, ids.flatten(), dx.reshape(B * T, D))

    def _print_param_count(self):
        """Log the total trainable parameter count."""
        total = sum(p.data.size for p in self.params)
        print(f"[HRAN] Parameters: {total:,} ({total/1e6:.2f}M)")

    def zero_grads(self):
        """Zero every parameter gradient before a new accumulation pass."""
        for p in self.params:
            p.zero_grad()

    def clip_grads(self, max_norm: float):
        """Global-norm gradient clipping across ALL parameters jointly."""
        total_norm = math.sqrt(sum(np.sum(p.grad**2) for p in self.params))
        if total_norm > max_norm:
            scale = max_norm / (total_norm + 1e-8)
            for p in self.params:
                p.grad *= scale

    def update(self, lr: float):
        """Apply one Wiener-scaled Adam step to every parameter."""
        for p in self.params:
            p.update_wiener(lr, weight_decay=CFG.weight_decay)

    def save(self, path: str):
        """Pickle all parameter arrays, keyed by parameter name, to `path`.

        NOTE(review): attention/FFN parameter names repeat across blocks
        ("Wq", "ffn_W1", ...), so later blocks overwrite earlier ones in this
        dict, and load() then restores the same array into every block. Only
        the layer-norm names carry a layer index. Confirm — a fix would be
        prefixing each name with its block index (changes checkpoint format).
        """
        data = {p.name: p.data for p in self.params}
        with open(path, "wb") as f:
            pickle.dump(data, f)
        print(f"[HRAN] Model saved to {path}")

    def load(self, path: str):
        """Restore parameters saved by save(); names absent from the file keep
        their current values. See the name-collision note on save().

        NOTE(review): pickle.load can execute arbitrary code from the file —
        only load checkpoints from a trusted source.
        """
        with open(path, "rb") as f:
            data = pickle.load(f)
        for p in self.params:
            if p.name in data:
                p.data[:] = data[p.name]
        print(f"[HRAN] Model loaded from {path}")
| |
|
| |
|
| | |
| | |
| | |
| |
|
def hebbian_seed(model: HRANModel, tokenizer: HRANTokenizer,
                 corpus: List[Tuple[str, str]]):
    """
    Hebb's Rule: ΞW = Ξ· Β· post Β· preα΅ (neurons that fire together, wire together)

    Applied to embeddings via Oja's normalized Hebbian rule:
        ΞW_ij = Ξ· Β· (y_i Β· x_j - y_iΒ² Β· W_ij)

    This prevents unbounded weight growth while learning principal components.
    Haykin Ch.2: Oja's rule learns the first principal component online.

    Pre-seeding embeds statistical co-occurrence structure into the embedding
    space BEFORE any gradient descent, giving the model a warm start aligned
    with data manifold geometry.

    Fixes vs. previous version:
      - The per-epoch "Mean change" was divided by V-4 even though only
        min(V, 500) - 4 rows are updated; it now averages over the rows
        actually touched.
      - Removed the unused local D.
    """
    print("\n[Hebbian Pre-Initialization] Seeding embeddings with co-occurrence statistics...")
    V = model.cfg.vocab_size
    eta = CFG.hebb_lr

    # Symmetric windowed co-occurrence counts over concatenated Q + A text.
    cooc = np.zeros((V, V), dtype=np.float64)
    window = 3
    for q, a in corpus:
        seq = tokenizer.encode(q + " " + a)
        for i, tok in enumerate(seq):
            for j in range(max(0, i - window), min(len(seq), i + window + 1)):
                if i != j:
                    cooc[tok, seq[j]] += 1.0

    # Row-normalize into per-token context distributions.
    row_sums = cooc.sum(axis=1, keepdims=True) + 1e-8
    cooc_norm = cooc / row_sums

    # Ids 0-3 are reserved special tokens; only the first ~500 ids are seeded.
    first_seeded = 4
    last_seeded = min(V, 500)
    n_seeded = max(last_seeded - first_seeded, 1)

    for epoch in range(CFG.hebb_epochs):
        total_change = 0.0
        for v_id in range(first_seeded, last_seeded):
            if cooc_norm[v_id].sum() < 1e-8:
                continue  # token never co-occurred; nothing to learn
            W = model.embed.data[v_id]
            # Context vector: co-occurrence-weighted average of all embeddings.
            context_emb = cooc_norm[v_id] @ model.embed.data
            y = W.dot(context_emb)
            # Oja's rule: Hebbian growth with an implicit normalization term.
            delta = eta * (y * context_emb - y**2 * W)
            model.embed.data[v_id] += delta.astype(np.float32)
            total_change += np.abs(delta).sum()

        # BUGFIX: average over the rows actually updated, not the whole vocab.
        print(f"  Hebb epoch {epoch+1}/{CFG.hebb_epochs} | Mean change: {total_change/n_seeded:.6f}")

    print("[Hebbian Pre-Initialization] Complete. Embeddings seeded with corpus statistics.\n")
| |
|
| |
|
| | |
| | |
| | |
| |
|
def cross_entropy_loss(logits: np.ndarray, targets: np.ndarray,
                       smoothing: float = 0.1) -> Tuple[float, np.ndarray]:
    """
    Label-smoothed cross-entropy over non-PAD positions (regularization, Haykin Ch.4).

    Label smoothing puts mass smoothing/(V-1) on every non-target class and
    1-smoothing on the target, discouraging overconfident predictions and
    improving calibration.

    Returns: (scalar loss, gradient d_logits same shape as logits)
    """
    B, T, V = logits.shape
    n_tok = B * T

    flat_logits = logits.reshape(n_tok, V)
    flat_targets = targets.flatten()

    probs = softmax(flat_logits, axis=-1)

    # Smoothed one-hot target distribution (rows sum to 1 exactly).
    soft = np.full((n_tok, V), smoothing / (V - 1), dtype=np.float32)
    soft[np.arange(n_tok), flat_targets] = 1.0 - smoothing

    # Padding positions contribute neither to the loss nor to the gradient.
    valid = (flat_targets != CFG.PAD_ID).astype(np.float32)
    denom = valid.sum() + 1e-9

    log_probs = np.log(probs + 1e-9)
    per_token = -(soft * log_probs).sum(axis=-1)
    loss = (per_token * valid).sum() / denom

    # Softmax + cross-entropy gradient: probs - targets, masked and averaged.
    d_logits = ((probs - soft) * valid.reshape(-1, 1) / denom).reshape(B, T, V)

    return float(loss), d_logits
| |
|
| |
|
| | |
| | |
| | |
| |
|
def make_batches(data: List[Tuple[str, str]], tokenizer: HRANTokenizer,
                 batch_size: int, max_len: int) -> List[Tuple[np.ndarray, np.ndarray]]:
    """
    Turn Q-A pairs into batched (input_ids, target_ids) for next-token LM training.

    Each sample is BOS + question + answer + EOS, truncated to max_len+1 tokens;
    inputs are seq[:-1] and targets seq[1:] (teacher forcing). Sequences are
    length-sorted so each batch pads only to its own maximum (PAD_ID elsewhere).
    """
    encoded = []
    for question, answer in data:
        ids = [CFG.BOS_ID] + tokenizer.encode(question) + tokenizer.encode(answer) + [CFG.EOS_ID]
        encoded.append(ids[:max_len + 1])

    # Sorting by length groups similar-length sequences, minimizing padding waste.
    encoded.sort(key=len)

    batches = []
    for start in range(0, len(encoded), batch_size):
        group = encoded[start:start + batch_size]
        width = min(max(len(s) for s in group), max_len + 1)

        inputs = np.full((len(group), width - 1), CFG.PAD_ID, dtype=np.int32)
        targets = np.full((len(group), width - 1), CFG.PAD_ID, dtype=np.int32)

        for row, seq in enumerate(group):
            seq = seq[:width]
            n = len(seq) - 1
            inputs[row, :n] = seq[:-1]
            targets[row, :n] = seq[1:]

        batches.append((inputs, targets))
    return batches
| |
|
| |
|
| | |
| | |
| | |
| |
|
def get_lr(step: int, total_steps: int, warmup_steps: int, base_lr: float) -> float:
    """Learning-rate schedule: linear warmup, then cosine decay to zero.

    From first principles: a decaying step size minimizes oscillation near
    minima (Haykin Ch.4).
    """
    if step < warmup_steps:
        # Linear ramp from 0 to base_lr over the warmup phase.
        return base_lr * step / max(warmup_steps, 1)
    decay_span = max(total_steps - warmup_steps, 1)
    progress = (step - warmup_steps) / decay_span
    cosine = 0.5 * (1.0 + math.cos(math.pi * progress))
    return base_lr * cosine
| |
|
| |
|
| | |
| | |
| | |
| |
|
def train(model: HRANModel, tokenizer: HRANTokenizer,
          data: List[Tuple[str, str]], config: HRANConfig):
    """
    Full training loop implementing:
      1. Hebbian pre-seeding (Haykin Ch.2)
      2. Mini-batch gradient descent with Adam + Wiener scaling (Haykin Ch.3)
      3. Label smoothing regularization (Haykin Ch.4)
      4. Cosine LR schedule
      5. Gradient clipping (stability)

    Returns the list of per-epoch average training losses. Saves the model to
    "hran_best.pkl" whenever the epoch loss improves.
    """
    print("=" * 65)
    print(" HRAN Training β Haykin Resonant Attention Network")
    print("=" * 65)

    # Warm-start embeddings from co-occurrence statistics before any SGD.
    hebbian_seed(model, tokenizer, data)

    # Pre-tokenized, length-sorted mini-batches; the batch list is fixed,
    # only its order is shuffled per epoch.
    batches = make_batches(data, tokenizer, config.batch_size, config.max_seq_len)
    total_steps = len(batches) * config.epochs
    step = 0

    print(f"[Training] {len(data)} samples | {len(batches)} batches | "
          f"{config.epochs} epochs | {total_steps} total steps")
    print(f"[Training] LR={config.learning_rate} | Batch={config.batch_size} | "
          f"Warmup={config.warmup_steps}\n")

    best_loss = float("inf")
    history = []  # per-epoch average loss

    for epoch in range(config.epochs):
        epoch_loss = 0.0
        epoch_batches = 0

        # Shuffle batch order each epoch (sequences stay length-grouped).
        random.shuffle(batches)

        for inp, tgt in batches:
            # Scheduled learning rate for this optimization step.
            lr = get_lr(step, total_steps, config.warmup_steps, config.learning_rate)

            # Forward pass (gradients cleared first).
            model.zero_grads()
            logits = model.forward(inp, training=True)

            # Label-smoothed cross-entropy; PAD positions are masked out.
            loss, d_logits = cross_entropy_loss(logits, tgt, config.label_smoothing)

            # Backward pass accumulates into parameter .grad buffers.
            model.backward(d_logits)

            # Global-norm gradient clipping for stability.
            model.clip_grads(config.grad_clip)

            # Wiener-SNR-scaled parameter update.
            model.update(lr)

            epoch_loss += loss
            epoch_batches += 1
            step += 1

        avg_loss = epoch_loss / max(epoch_batches, 1)
        history.append(avg_loss)

        # Clamp the exponent so perplexity cannot overflow early in training.
        perplexity = math.exp(min(avg_loss, 20))

        # Checkpoint whenever the epoch average improves.
        if avg_loss < best_loss:
            best_loss = avg_loss
            model.save("hran_best.pkl")

        # Progress report after the first epoch and every 5th thereafter.
        # NOTE(review): `lr` printed here is the last batch's LR and is
        # undefined if `batches` is empty — confirm data is never empty.
        if (epoch + 1) % 5 == 0 or epoch == 0:
            bar_len = 20
            filled = int(bar_len * (epoch + 1) / config.epochs)
            bar = "β" * filled + "β" * (bar_len - filled)
            print(f" Epoch {epoch+1:3d}/{config.epochs} [{bar}] "
                  f"Loss: {avg_loss:.4f} | PPL: {perplexity:.1f} | LR: {lr:.6f}")

    print(f"\n[Training Complete] Best loss: {best_loss:.4f} | "
          f"Best PPL: {math.exp(min(best_loss, 20)):.2f}")
    return history
| |
|
| |
|
| | |
| | |
| | |
| |
|
def generate(model: HRANModel, tokenizer: HRANTokenizer, prompt: str,
             max_new_tokens: int = 60, temperature: float = 0.7,
             top_k: int = 40, top_p: float = 0.9) -> str:
    """
    Autoregressive generation with:
      - Temperature scaling (Haykin: noise injection for exploration)
      - Top-k sampling (competitive selection β like lateral inhibition)
      - Top-p (nucleus) sampling (information-theoretic probability mass cutoff)

    Improvements over the previous version:
      - Nucleus filtering is vectorized (np.cumsum + np.searchsorted) instead
        of an O(V) Python loop per generated token; the kept set is identical.
      - top_k is clamped to the vocabulary size (np.partition would raise
        otherwise).
      - The prompt is tokenized once instead of twice.
    """
    prompt_ids = tokenizer.encode(prompt)
    input_ids = [CFG.BOS_ID] + prompt_ids

    for _ in range(max_new_tokens):
        # Clip the context to the model's maximum sequence length.
        ctx = input_ids[-CFG.max_seq_len:]
        inp = np.array([ctx], dtype=np.int32)

        logits = model.forward(inp, training=False)

        # Distribution over the next token only.
        next_logits = logits[0, -1, :].astype(np.float64)

        # Temperature scaling (guard against division by ~0).
        next_logits /= max(temperature, 1e-8)

        # Top-k: drop everything strictly below the k-th largest logit.
        if top_k > 0:
            k = min(top_k, next_logits.size)  # guard: k may exceed vocab size
            kth_val = np.partition(next_logits, -k)[-k]
            next_logits[next_logits < kth_val] = -1e9

        # Top-p (nucleus): keep the smallest prefix of the descending-sorted
        # distribution whose cumulative mass reaches top_p (always >= 1 token).
        probs = softmax(next_logits)
        order = np.argsort(-probs)
        cumulative = np.cumsum(probs[order])
        cutoff_idx = int(np.searchsorted(cumulative, top_p)) + 1
        nucleus = np.zeros_like(probs)
        kept = order[:cutoff_idx]
        nucleus[kept] = probs[kept]
        probs = nucleus / (nucleus.sum() + 1e-9)

        next_id = int(np.random.choice(len(probs), p=probs))

        if next_id == CFG.EOS_ID:
            break

        input_ids.append(next_id)

    # Return only the newly generated continuation (skip BOS + prompt tokens).
    return tokenizer.decode(input_ids[1 + len(prompt_ids):])
| |
|
| |
|
def generate_response(model: HRANModel, tokenizer: HRANTokenizer,
                      question: str, temperature: float = 0.6) -> str:
    """
    Produce a conversational reply to *question*.

    Samples at three temperatures, keeps candidates of at least three words,
    and returns the longest one (capitalized). Falls back to a fixed message
    when every sample is too short.
    """
    # Normalize the prompt: lowercase, trimmed, trailing punctuation removed.
    prompt = question.lower().strip().rstrip("?!.")

    candidates = []
    for temp in (temperature, temperature * 0.8, temperature * 1.2):
        reply = generate(model, tokenizer, prompt, max_new_tokens=60,
                         temperature=temp, top_k=50, top_p=0.92).strip()
        if len(reply.split()) >= 3:
            candidates.append(reply)

    if not candidates:
        return "I am still learning. Could you rephrase that?"

    # Longest candidate by word count (length heuristic).
    best = max(candidates, key=lambda r: len(r.split()))

    # Capitalize the first character of a non-empty reply.
    return best[0].upper() + best[1:] if best else best
| |
|
| |
|
| | |
| | |
| | |
| |
|
| | BANNER = """ |
| | ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ |
| | β β |
| | β βββ ββββββββββ ββββββ ββββ βββ β |
| | β βββ ββββββββββββββββββββββββ βββ β |
| | β ββββββββββββββββββββββββββββββ βββ β |
| | β ββββββββββββββββββββββββββββββββββ β |
| | β βββ ββββββ ββββββ ββββββ ββββββ β |
| | β βββ ββββββ ββββββ ββββββ βββββ β |
| | β β |
| | β Haykin Resonant Attention Network β |
| | β βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ β |
| | β Architecture grounded in: Simon Haykin's Neural Networks β |
| | β and Learning Machines + First Principles of Information Theory β |
| | β β |
| | β Innovations: β |
| | β β’ RBF Attention Kernels (Ch.5) β’ Hebbian Embedding Init (Ch.2) β |
| | β β’ Infomax FFN Activation (Ch.10) β’ Lateral Inhibition (Ch.9) β |
| | β β’ Wiener Gradient Scaling (Ch.3) β |
| | β β |
| | β Commands: 'quit' to exit | 'info' for architecture details β |
| | ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ |
| | """ |
| |
|
| | ARCH_INFO = """ |
| | βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ |
| | β HRAN Architecture Details β |
| | β ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ£ |
| | β Embedding dim : 128 Vocab size : ~1500 β |
| | β HRAN layers : 4 Attn heads : 4 β |
| | β FFN dim : 512 Max seq len : 64 β |
| | β Total params : ~2.5M Training : 80 epochs β |
| | β ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ£ |
| | β RBF Attention : A_ij = softmax(-Ξ³βq_i - k_jβΒ²) β |
| | β Infomax Act. : f(x) = tanh(x) + 0.1x β |
| | β Hebbian Init : ΞW = Ξ·(yΒ·x - yΒ²Β·W) [Oja's rule] β |
| | β Wiener Scale : lr_eff = lr Γ SNR/(1+SNR) β |
| | βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ |
| | """ |
| |
|
def chat_loop(model: HRANModel, tokenizer: HRANTokenizer):
    """Main conversational loop (REPL).

    Console commands (case-insensitive): 'quit'/'exit'/'bye'/'goodbye' to
    leave, 'info' to print architecture details, 'history' to show the last
    five turns. Ctrl-C / Ctrl-D also exit gracefully.
    """
    print(BANNER)
    print(" Ready to converse. Type your question or message.\n")

    history = []  # list of (user_input, response) turns

    while True:
        try:
            user_input = input(" You βΊ ").strip()
        except (EOFError, KeyboardInterrupt):
            # Graceful exit on Ctrl-D / Ctrl-C.
            print("\n HRAN βΊ Goodbye. Keep thinking.\n")
            break

        if not user_input:
            continue

        if user_input.lower() in ["quit", "exit", "bye", "goodbye"]:
            print(" HRAN βΊ Goodbye. Keep thinking.\n")
            break

        if user_input.lower() == "info":
            print(ARCH_INFO)
            continue

        if user_input.lower() == "history":
            # Show up to the five most recent turns.
            if history:
                print("\n [Conversation History]")
                for i, (q, r) in enumerate(history[-5:], 1):
                    print(f" {i}. You: {q}")
                    print(f" HRAN: {r}\n")
            else:
                print(" [No history yet]\n")
            continue

        # Normal turn: generate and display a reply.
        print(" HRAN βΊ ", end="", flush=True)
        t0 = time.time()
        response = generate_response(model, tokenizer, user_input)
        # NOTE(review): `elapsed` is computed but never displayed — presumably
        # intended for a timing readout; confirm before removing.
        elapsed = time.time() - t0

        print(response)
        print(f" {'β' * 60}")

        history.append((user_input, response))
| |
|
| |
|
| | |
| | |
| | |
| |
|
def main():
    """Entry point: build tokenizer and model, load or train, then chat."""
    # Deterministic runs: seeds cover init, batch shuffling, and sampling.
    np.random.seed(42)
    random.seed(42)

    print("\n" + "β" * 65)
    print(" HRAN β Haykin Resonant Attention Network")
    print(" Built strictly from Haykin + First Principles")
    print("β" * 65 + "\n")

    # The tokenizer induces its vocabulary from the dataset; the global config
    # is then updated to the actual vocabulary size before building the model.
    tokenizer = HRANTokenizer(max_vocab=CFG.vocab_size)
    tokenizer.build(FULL_DATASET)
    CFG.vocab_size = tokenizer.vocab_size

    model = HRANModel(CFG)

    # Reuse a previously trained checkpoint if present and the user agrees
    # (default answer is yes).
    model_path = "hran_best.pkl"
    if os.path.exists(model_path):
        print(f"[HRAN] Found saved model at {model_path}")
        ans = input(" Load existing model? [Y/n]: ").strip().lower()
        if ans != "n":
            model.load(model_path)
            print(" Loaded! Entering chat mode.\n")
            chat_loop(model, tokenizer)
            return

    # Otherwise train from scratch; train() checkpoints to hran_best.pkl.
    print("\n[HRAN] Starting training from scratch...\n")
    history = train(model, tokenizer, FULL_DATASET, CFG)

    # Optional loss-curve plot; silently skipped when matplotlib is absent.
    try:
        import matplotlib.pyplot as plt
        plt.figure(figsize=(10, 4))
        plt.plot(history, color="#e74c3c", linewidth=2)
        plt.title("HRAN Training Loss (Haykin RBF-Attention + Infomax FFN)")
        plt.xlabel("Epoch")
        plt.ylabel("Cross-Entropy Loss")
        plt.grid(alpha=0.3)
        plt.tight_layout()
        plt.savefig("hran_training_loss.png", dpi=150)
        plt.close()
        print("\n[HRAN] Loss curve saved to hran_training_loss.png")
    except ImportError:
        pass

    print("\n[HRAN] Training complete! Entering chat mode.")
    print(" (Model auto-saved as hran_best.pkl)\n")

    chat_loop(model, tokenizer)
| |
|
| |
|
# Script entry point: build/train/load the model, then start the chat REPL.
if __name__ == "__main__":
    main()
| |
|