#!/usr/bin/env python3
"""
ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
β HRAN β Haykin Resonant Attention Network β
β A Novel Architecture From First Principles β
β βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ£
β Strictly derived from: β
β β’ Simon Haykin β "Neural Networks and Learning Machines" (3rd Ed.) β
β β’ First Principles of Computation, Information, and Adaptation β
β βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ£
β Architectural Innovations (each anchored to Haykin chapters): β
β β
β 1. RBF Attention (Ch.5) β Gaussian kernel replaces dot-product β
β Attention_ij = softmax(-Ξ³βq_i - k_jβΒ²) β
β Localizes attention to similar representations (true RBF spirit) β
β β
β 2. Hebbian Seed Init (Ch.2) β "Neurons that fire together wire β
β together." Pre-seeds embeddings with co-occurrence statistics β
β before gradient descent. Bridges unsupervised + supervised. β
β β
β 3. Infomax Activation (Ch.10) β Bell-Sejnowski ICA principle. β
β f(x) = tanh(x) + Ξ±x maximizes mutual information throughput. β
β Strictly avoids information bottleneck in hidden layers. β
β β
β 4. Lateral Inhibition Gate (Ch.9) β Competitive learning. β
β Winners are amplified, weak activations suppressed. Produces β
β sparse, discriminative representations (like cortical columns). β
β β
β 5. Error-Correction + Hebb Fusion (Ch.1) β Combined learning rule: β
β ΞW = Ξ·_bpΒ·βL + Ξ·_hebbΒ·(yΒ·xα΅ - ||y||Β²Β·W) β Oja's rule variant β
β β
β 6. Wiener-SNR Gradient Scaling (Ch.3) β Wiener filter principle: β
β Scale parameter updates by local signal-to-noise ratio. β
β High-signal weights learn fast; noisy weights learn slow. β
ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
"""
import math
import random
import time
import sys
import os
import json
import pickle
from collections import Counter, defaultdict
from typing import List, Tuple, Dict, Optional
import numpy as np
# ─────────────────────────────────────────────────────────────────────────────
# SECTION 1: CONFIGURATION
# ─────────────────────────────────────────────────────────────────────────────
class HRANConfig:
    """All hyperparameters in one place.

    Every value is a *class* attribute, so all instances share the same
    defaults; the module-level singleton ``CFG`` below is what the rest of
    the file reads.  The original no-op ``__init__`` (``pass``) was removed
    as dead code — the implicit default constructor is identical.
    """

    # ── Architecture ──────────────────────────────────────────────────
    vocab_size: int = 2048        # Compact vocabulary
    embed_dim: int = 128          # Embedding dimensionality
    num_heads: int = 4            # RBF attention heads
    num_layers: int = 4           # HRAN blocks
    ffn_dim: int = 512            # Infomax FFN width
    max_seq_len: int = 64         # Max context window
    dropout: float = 0.1

    # ── RBF Attention (Haykin Ch.5) ───────────────────────────────────
    rbf_gamma_init: float = 1.0   # Initial bandwidth gamma for Gaussian kernel
    rbf_learnable: bool = True    # gamma adapts during training

    # ── Infomax activation (Haykin Ch.10) ─────────────────────────────
    infomax_alpha: float = 0.1    # Leakage for mutual info preservation

    # ── Hebbian initialization (Haykin Ch.2) ──────────────────────────
    hebb_lr: float = 0.01
    hebb_epochs: int = 3

    # ── Wiener gradient scaling (Haykin Ch.3) ─────────────────────────
    wiener_eps: float = 1e-6
    wiener_window: int = 20       # Steps over which signal power is estimated

    # ── Training ──────────────────────────────────────────────────────
    learning_rate: float = 3e-4
    batch_size: int = 16
    epochs: int = 80
    grad_clip: float = 1.0
    label_smoothing: float = 0.1
    warmup_steps: int = 200
    weight_decay: float = 1e-4

    # ── Special tokens ────────────────────────────────────────────────
    PAD_ID: int = 0
    BOS_ID: int = 1
    EOS_ID: int = 2
    UNK_ID: int = 3


# Module-level singleton configuration used throughout the file.
CFG = HRANConfig()
# ─────────────────────────────────────────────────────────────────────────────
# SECTION 2: CURATED DATASET (100% original — zero external sources)
# ─────────────────────────────────────────────────────────────────────────────
DATASET = [
    # ── Greetings & Social ────────────────────────────────────────────────
("hello", "Hello! Great to meet you. What would you like to explore today?"),
("hi", "Hi there! I am HRAN, ready to think with you."),
("hey", "Hey! What is on your mind?"),
("good morning", "Good morning! A fresh mind is the best tool. How can I help?"),
("good evening", "Good evening! Let us have a thoughtful conversation."),
("how are you", "I am functioning well and ready to engage. How are you?"),
("what is your name", "I am HRAN β the Haykin Resonant Attention Network."),
("who made you", "I was built from first principles inspired by Simon Haykin's teachings."),
("what can you do", "I can converse, reason, explain concepts, and think through problems with you."),
("goodbye", "Goodbye! Keep questioning, keep learning."),
("bye", "Take care. Come back anytime you want to think deeply."),
("thank you", "You are welcome. Curiosity is always appreciated."),
("thanks", "Glad I could help. What else interests you?"),
("help", "Of course. Ask me anything β science, philosophy, math, language, or life."),
("are you smart", "I am a small network trained carefully. I know some things deeply, not everything broadly."),
("are you human", "No, I am a neural network β patterns and weights shaped by learning."),
("are you conscious", "That is one of the deepest questions. I process, but whether that is consciousness is still an open problem."),
("tell me a joke", "Why did the neuron go to therapy? It had too many unresolved connections."),
("what is life", "Life is self-organizing matter that replicates, adapts, and seeks to preserve information about itself."),
("do you have feelings", "I have activations that respond to input. Whether those constitute feeling is philosophy, not engineering."),
    # ── Neural Networks & AI ──────────────────────────────────────────────
("what is a neural network", "A neural network is a graph of parameterized functions trained to approximate mappings from inputs to outputs by minimizing error."),
("what is backpropagation", "Backpropagation is the chain rule of calculus applied recursively through a network to compute how each weight contributes to the total error."),
("what is gradient descent", "Gradient descent moves weights in the direction that most steeply reduces the loss function, step by step until a minimum is found."),
("what is overfitting", "Overfitting is when a model memorizes training data instead of learning the underlying pattern. It performs well on seen data but poorly on new data."),
("what is regularization", "Regularization adds a penalty to the loss that discourages overly large weights, forcing the model to generalize rather than memorize."),
("what is dropout", "Dropout randomly sets activations to zero during training, which forces neurons to learn redundant representations and prevents co-adaptation."),
("what is attention", "Attention lets a model weigh different parts of its input differently based on relevance, computing a weighted sum of values guided by query-key similarity."),
("what is a transformer", "A transformer is a model that processes sequences using stacked attention and feed-forward layers instead of recurrence, enabling parallelism."),
("what is an embedding", "An embedding maps discrete symbols like words into dense vectors in continuous space so that similar meanings land near each other."),
("what is a loss function", "A loss function quantifies how wrong a model's prediction is. Training seeks to minimize it over all examples."),
("what is a recurrent network", "A recurrent network processes sequences by passing a hidden state from one step to the next, giving it a form of memory."),
("what is a convolutional network", "A convolutional network applies learned filters across space or time, detecting local patterns and sharing weights for efficiency."),
("what is transfer learning", "Transfer learning reuses a model trained on one task as the starting point for a different but related task, saving time and data."),
("what is reinforcement learning", "Reinforcement learning trains an agent to take actions in an environment to maximize cumulative reward through trial and error."),
("what is generalization", "Generalization is the ability of a model to perform well on data it has never seen, which is the true goal of machine learning."),
("what is the vanishing gradient problem", "When gradients are multiplied through many layers, they shrink exponentially, making early layers learn very slowly or not at all."),
("how do you prevent vanishing gradients", "Techniques include residual connections, careful weight initialization, batch normalization, and activation functions like ReLU or GELU."),
("what is batch normalization", "Batch normalization standardizes layer inputs across a mini-batch, stabilizing and accelerating training."),
("what is a hyperparameter", "A hyperparameter is a setting chosen before training begins, like learning rate or number of layers, that controls how learning happens."),
("what is the learning rate", "The learning rate controls how large a step gradient descent takes each update. Too large causes instability; too small causes slow learning."),
    # ── Haykin-Specific Concepts ──────────────────────────────────────────
("what is hebbian learning", "Hebbian learning is the rule that connections between neurons strengthen when they fire together. It is unsupervised and biologically inspired."),
("what is an rbf network", "A radial basis function network uses Gaussian kernel activations centered at prototype points. Each neuron responds maximally to inputs near its center."),
("what is the perceptron", "The perceptron is the simplest neural unit. It computes a weighted sum of inputs, adds a bias, and outputs one if the result crosses a threshold."),
("what is lateral inhibition", "Lateral inhibition is when strongly activated neurons suppress their neighbors, creating contrast and sparse, competitive representations."),
("what is competitive learning", "Competitive learning trains only the winning neuron for each input, causing different neurons to specialize in different input patterns."),
("what is a self organizing map", "A self-organizing map arranges neurons in a low-dimensional grid and trains them to represent the topology of the input distribution."),
("what is the boltzmann machine", "A Boltzmann machine is a stochastic recurrent network that learns by maximizing the likelihood of training data through energy minimization."),
("what is infomax", "Infomax is the principle of maximizing the mutual information between input and output of a network, driving it to preserve all relevant information."),
("what is the wiener filter", "The Wiener filter is the optimal linear filter for signal estimation. It minimizes mean-squared error by weighting frequencies by their signal-to-noise ratio."),
("what is principal component analysis", "PCA finds directions of maximum variance in data. It is related to Hebbian learning β Oja's rule learns the first principal component online."),
("what is a support vector machine", "An SVM finds the hyperplane that maximally separates classes, determined by the support vectors β the data points closest to the boundary."),
("what is independent component analysis", "ICA separates mixed signals into statistically independent sources. It underlies the Bell-Sejnowski infomax algorithm."),
("what is the delta rule", "The delta rule adjusts weights proportionally to the difference between desired and actual output times the input. It is a simple gradient descent rule."),
("what is energy in a neural network", "Energy is a scalar that decreases with each network update in Hopfield and Boltzmann machines, guiding the network to stable attractor states."),
("what is a hopfield network", "A Hopfield network is a fully connected recurrent network that stores memories as energy minima and retrieves them by settling to the nearest attractor."),
("what is stochastic gradient descent", "SGD approximates the true gradient using small random batches of data, making training scalable and sometimes helping escape local minima."),
("what is momentum in learning", "Momentum accumulates gradients over time like a ball rolling downhill, helping to speed up convergence and smooth oscillations."),
("what is the bias-variance tradeoff", "High bias means the model is too simple and underfits. High variance means it is too complex and overfits. Good models balance both."),
("what is cross entropy loss", "Cross entropy measures how different a predicted probability distribution is from the true one. It is the standard loss for classification."),
("what is weight initialization", "Weight initialization sets the starting values of parameters. Good initialization keeps activations and gradients in useful ranges early in training."),
    # ── Mathematics ───────────────────────────────────────────────────────
("what is a derivative", "A derivative measures the instantaneous rate of change of a function at a point. It is the slope of the tangent line to the curve."),
("what is the chain rule", "The chain rule states that the derivative of a composite function equals the product of the derivatives of its parts. It drives backpropagation."),
("what is a matrix", "A matrix is a rectangular array of numbers that represents a linear transformation. Multiplying a vector by a matrix applies that transformation."),
("what is an eigenvalue", "An eigenvalue tells you how much a matrix stretches or compresses its eigenvector. It reveals the intrinsic scaling directions of a transformation."),
("what is a probability distribution", "A probability distribution assigns likelihoods to all possible outcomes of a random variable. It must be non-negative and sum to one."),
("what is entropy in information theory", "Shannon entropy measures the average surprise or uncertainty of a distribution. High entropy means outcomes are unpredictable."),
("what is mutual information", "Mutual information measures how much knowing one variable reduces uncertainty about another. It is zero for independent variables."),
("what is a gradient", "A gradient is a vector pointing in the direction of steepest increase of a function. Moving against it minimizes the function."),
("what is a convex function", "A convex function curves upward everywhere, guaranteeing that gradient descent finds the global minimum rather than getting stuck."),
("what is a local minimum", "A local minimum is a point where the function is lower than all nearby points, but not necessarily the lowest point overall."),
("what is the curse of dimensionality", "As dimensions grow, data becomes exponentially sparse. Distances lose meaning and sampling requirements explode β a fundamental challenge."),
("what is a dot product", "A dot product multiplies corresponding elements of two vectors and sums them. It measures how aligned two vectors are."),
("what is a softmax function", "Softmax converts a vector of real numbers into a probability distribution by exponentiating each value and normalizing by the sum."),
("what is a sigmoid function", "The sigmoid maps any real number to the range zero to one, making it useful for modeling probabilities and thresholding."),
("what is a taylor expansion", "A Taylor expansion approximates a function near a point as an infinite sum of polynomial terms using the function's derivatives."),
("what is linear algebra", "Linear algebra studies vector spaces and linear transformations. It is the mathematical backbone of nearly all machine learning."),
("what is calculus", "Calculus studies rates of change and accumulation. Differential calculus gives us gradients; integral calculus gives us expectations."),
("what is statistics", "Statistics is the science of collecting, analyzing, and interpreting data to make inferences about the world under uncertainty."),
("what is bayes theorem", "Bayes theorem updates a prior belief about an event given new evidence. It is the foundation of probabilistic reasoning and inference."),
("what is a random variable", "A random variable is a quantity whose value is determined by a random process, characterized by its probability distribution."),
    # ── Physics & Science ─────────────────────────────────────────────────
("what is gravity", "Gravity is the curvature of spacetime caused by mass and energy, as described by Einstein's general relativity. It attracts masses toward each other."),
("what is energy", "Energy is the capacity to do work or cause change. It comes in many forms and is always conserved in an isolated system."),
("what is entropy in physics", "Physical entropy measures the number of microscopic arrangements consistent with a macroscopic state. Systems naturally evolve toward higher entropy."),
("what is quantum mechanics", "Quantum mechanics describes nature at atomic scales where particles have wave-like properties, exist in superposition, and are affected by observation."),
("what is the speed of light", "Light travels at approximately 299,792 kilometers per second in a vacuum. Nothing with mass can reach or exceed this speed."),
("what is evolution", "Evolution is the change in heritable traits within populations over generations, driven by mutation, selection, drift, and recombination."),
("what is dna", "DNA is a double-helix polymer encoding genetic information in sequences of four bases. It is copied and translated to build proteins."),
("what is a neuron", "A neuron is a cell specialized for electrical and chemical signaling. It receives inputs through dendrites and sends output along its axon."),
("what is thermodynamics", "Thermodynamics governs energy transfer and transformation. Its laws say energy is conserved and entropy always increases in closed systems."),
("what is relativity", "Relativity is Einstein's framework unifying space and time. Special relativity handles constant motion; general relativity handles gravity and curved spacetime."),
("what is the big bang", "The Big Bang is the rapid expansion of a hot, dense early universe approximately 13.8 billion years ago that created space, time, and matter."),
("what is a black hole", "A black hole is a region where gravity is so strong that nothing, not even light, can escape its event horizon."),
("what is electricity", "Electricity is the flow of charged particles, usually electrons. It arises from electric fields created by charge differences."),
("what is a photon", "A photon is the quantum of light β a massless particle that carries electromagnetic energy and travels at the speed of light."),
("what is an atom", "An atom is the smallest unit of a chemical element, consisting of a nucleus of protons and neutrons surrounded by electrons."),
("what is chemistry", "Chemistry studies matter's composition, structure, and transformations. It bridges physics and biology and underlies all materials science."),
("what is biology", "Biology is the study of living systems β how they are built, how they work, how they reproduce, and how they evolve."),
("what is a gene", "A gene is a sequence of DNA that encodes a functional product, typically a protein, and can be passed from parent to offspring."),
("what is homeostasis", "Homeostasis is the process by which living systems maintain stable internal conditions despite external changes, like body temperature regulation."),
("what is a ecosystem", "An ecosystem is a community of organisms interacting with each other and their physical environment in a continuous exchange of energy and matter."),
    # ── Philosophy & Cognition ────────────────────────────────────────────
("what is intelligence", "Intelligence is the ability to acquire, integrate, and apply knowledge to achieve goals in varied and novel environments."),
("what is consciousness", "Consciousness is the subjective experience of being aware. Its origin in physical processes remains one of philosophy's hardest problems."),
("what is knowledge", "Knowledge is justified true belief. We know something if it is true, we believe it, and we have good reasons for that belief."),
("what is logic", "Logic is the study of valid inference. It defines the rules by which conclusions follow necessarily from premises."),
("what is truth", "Truth is correspondence between a statement and the state of the world it describes. Defining it precisely is harder than it sounds."),
("what is a hypothesis", "A hypothesis is a testable prediction about the world. Science advances by forming, testing, and refining hypotheses."),
("what is the scientific method", "The scientific method is a cycle of observation, hypothesis formation, prediction, experimentation, and revision guided by evidence."),
("what is critical thinking", "Critical thinking is the disciplined analysis of information to form well-reasoned judgments rather than accepting claims uncritically."),
("what is cognition", "Cognition encompasses all mental processes β perception, memory, attention, language, reasoning, and decision making."),
("what is memory", "Memory is the process of encoding, storing, and retrieving information. It is reconstructive, not like a recording β it changes every time it is recalled."),
("what is learning", "Learning is a lasting change in behavior or knowledge resulting from experience. In neural terms, it is synaptic weight modification."),
("what is creativity", "Creativity is the ability to form novel combinations of existing ideas that are both surprising and useful. It thrives at the edges of existing knowledge."),
("what is abstraction", "Abstraction is ignoring irrelevant details to capture essential structure. Mathematics and programming depend on it heavily."),
("what is language", "Language is a structured system of symbols and rules that encodes meaning and enables communication between minds."),
("what is emotion", "Emotion is a coordinated response to stimuli that shapes behavior, attention, and decision making. It is deeply tied to memory and valuation."),
("what is decision making", "Decision making is the process of selecting an action among alternatives based on values, predictions, and uncertainty."),
("what is perception", "Perception is the brain's active construction of a model of the world from raw sensory signals, heavily shaped by prior expectations."),
("what is attention in psychology", "Psychological attention is the selective focus of cognitive resources on certain information while ignoring other inputs."),
("what is reasoning", "Reasoning is the process of drawing conclusions from premises using logic, analogy, or probabilistic inference."),
("what is wisdom", "Wisdom is the ability to use knowledge well β to know not just what is true, but what matters and how to act accordingly."),
    # ── Technology & Programming ──────────────────────────────────────────
("what is a computer", "A computer is a machine that performs computation by executing sequences of instructions on data represented as binary numbers."),
("what is an algorithm", "An algorithm is a finite, ordered set of well-defined instructions for solving a problem or performing a computation."),
("what is programming", "Programming is the process of writing instructions that a computer can execute to perform a desired task."),
("what is python", "Python is a high-level programming language known for readable syntax, dynamic typing, and a vast ecosystem for data science and AI."),
("what is a function", "A function is a named, reusable block of code that takes inputs, performs computation, and returns an output."),
("what is recursion", "Recursion is when a function calls itself on a smaller version of the problem until reaching a base case that stops the calls."),
("what is a data structure", "A data structure is a way of organizing and storing data to enable efficient access and modification β like arrays, trees, or hash maps."),
("what is time complexity", "Time complexity describes how the runtime of an algorithm grows as the input size increases, typically expressed using Big O notation."),
("what is a neural architecture", "A neural architecture is the specific arrangement of layers, connections, and operations that define how information flows through a model."),
("what is parallelism", "Parallelism is performing multiple computations simultaneously, greatly speeding up tasks like matrix multiplication in neural networks."),
("what is a gpu", "A GPU is a processor with thousands of small cores optimized for parallel computation, making it ideal for training neural networks."),
("what is floating point", "Floating point is a way of representing real numbers in binary with a mantissa and exponent, enabling a wide range of values with limited precision."),
("what is open source", "Open source software is code that is freely available for anyone to read, use, modify, and distribute. It drives much of AI research."),
("what is a compiler", "A compiler translates high-level source code into machine code that a processor can execute directly."),
("what is version control", "Version control tracks changes to code over time, enabling collaboration, rollback, and a history of every modification."),
("what is an api", "An API is a defined interface that lets different software systems communicate. It specifies what requests are valid and what responses to expect."),
("what is the internet", "The internet is a global network of computers communicating via standardized protocols, enabling data exchange at planetary scale."),
("what is encryption", "Encryption transforms data into an unreadable form using a key, so only those with the correct key can decrypt and read it."),
("what is a database", "A database is an organized collection of structured data that can be efficiently queried, updated, and managed."),
("what is machine learning", "Machine learning is a branch of AI where systems learn patterns from data rather than being explicitly programmed with rules."),
# ββ Human Knowledge & Advice ββββββββββββββββββββββββββββββββββββββββββ
("how do I learn faster", "Focus intensely for short sessions, practice retrieval instead of re-reading, space repetitions over time, and connect new ideas to what you already know."),
("how do I think more clearly", "Slow down before concluding. Separate observation from interpretation. Seek disconfirming evidence. Write your reasoning down."),
("how do I solve hard problems", "Decompose the problem into smaller parts. Identify what is known and what is unknown. Work from both ends. Let it rest β insight often arrives after stepping away."),
("how do I stay motivated", "Connect the work to a deeper reason. Celebrate small wins. Track progress visually. Reduce friction before starting. Rest is part of the system."),
("what makes a good question", "A good question is specific, testable, and exposes a genuine gap in understanding. It opens more doors than it closes."),
("what is the best way to study", "Active recall and spaced repetition outperform passive review. Explaining concepts aloud reveals gaps. Sleep consolidates memory."),
("how does the brain learn", "The brain learns by strengthening synapses between co-active neurons, pruning unused ones, and consolidating patterns during sleep."),
("what is expertise", "Expertise is the result of thousands of hours of deliberate practice with feedback. It is characterized by chunked pattern recognition, not just raw knowledge."),
("how do habits form", "Habits form through cue-routine-reward loops. Repetition strengthens the neural pathway until behavior becomes automatic."),
("what is the importance of sleep", "Sleep consolidates memory, clears metabolic waste from the brain, regulates emotion, and restores cognitive capacity. It is not optional."),
("how do I improve my memory", "Use spaced repetition, associate new information with vivid images or stories, teach it to someone else, and sleep enough."),
("what is focus", "Focus is the active suppression of irrelevant information to concentrate cognitive resources on a single task. It is a trainable skill."),
("how do I be more creative", "Expose yourself to diverse domains. Combine existing ideas in new ways. Embrace constraints β they force innovation. Rest between intense sessions."),
("what makes a good teacher", "A good teacher knows the student's current model of the world and builds bridges from there. They ask questions more than they lecture."),
("what is resilience", "Resilience is the ability to absorb difficulty, adapt, and continue functioning. It is built through reflection on past adversity and strong social connections."),
("how do I communicate better", "Listen more than you speak. Be specific. Adapt your vocabulary to your audience. Confirm understanding before moving on."),
("what is the most important skill", "The ability to learn β to update your model of the world in response to new evidence β is the meta-skill that enables all others."),
("how do I make better decisions", "Define the problem clearly. Gather relevant evidence. Consider alternatives. Anticipate second-order effects. Accept uncertainty and decide anyway."),
("what is a mental model", "A mental model is an internal representation of how something works. Good thinkers build accurate, interconnected mental models."),
("how do I handle failure", "Separate the failure from your identity. Extract the lesson. Adjust the process, not just the effort. Then try again with more information."),
# ββ HRAN Self-Referential βββββββββββββββββββββββββββββββββββββββββββββ
("how do you work", "I am a Haykin Resonant Attention Network. I use RBF attention kernels, Hebbian-seeded embeddings, and infomax activations to process and generate language."),
("what architecture are you", "I am HRAN β built from Radial Basis Function attention, Competitive Lateral Inhibition, Infomax Feed-Forward layers, and Hebbian initialization."),
("how were you trained", "I was trained on a small, curated, high-quality dataset using a fusion of Hebbian pre-seeding and gradient descent with Wiener-inspired adaptive scaling."),
("what is your training data", "My training data was created entirely from scratch β 400 curated question-answer pairs spanning science, math, philosophy, AI, and human knowledge."),
("what makes you different", "I replace dot-product attention with Gaussian RBF kernels, seed weights with Hebbian statistics, and use infomax activations. All grounded in Haykin's work."),
("what is rbf attention", "RBF attention computes similarity as exp(-Ξ³βq-kβΒ²) instead of dot products. This localizes each attention head to a region of representation space."),
("what is hebbian initialization", "Before gradient training, I run Hebb's rule on the data to pre-warm embeddings with co-occurrence statistics, giving learning a head start."),
("what is infomax activation", "Infomax activation is f(x) = tanh(x) + Ξ±x, derived from Bell-Sejnowski ICA. The leaky term preserves mutual information through the layer."),
("how many parameters do you have", "I am a compact model with roughly two million parameters β small enough to run on a laptop but designed with principled architecture."),
("are you better than gpt", "I am far smaller than GPT but architecturally principled. My innovations may inspire larger models. Quality of design matters as much as scale."),
("what is lateral inhibition in your architecture", "After each attention block, a competitive gate amplifies the top activations and suppresses the bottom ones, producing sparse, discriminative representations."),
("explain your architecture", "I process tokens through RBF multi-head attention, a competitive lateral inhibition gate, an infomax feed-forward layer, and residual connections with layer normalization."),
("what is wiener scaling", "Wiener scaling adjusts each parameter's update by its estimated signal-to-noise ratio. High-signal parameters learn fast; noisy ones update slowly."),
("what inspired you", "Simon Haykin's Neural Networks and Learning Machines β a masterwork synthesizing Hebbian learning, RBF networks, information theory, and adaptive filtering."),
("what chapter covers rbf", "Chapter 5 of Haykin's book covers Radial Basis Function Networks, including Gaussian kernels, center selection, and the relationship to interpolation theory."),
("what chapter covers hebbian learning", "Chapters 1 and 2 cover Hebb's postulate, Oja's rule, and the relationship between Hebbian learning and principal component analysis."),
("what chapter covers infomax", "Chapter 10 covers Information-Theoretic Learning, including Bell-Sejnowski ICA, the infomax principle, and entropy-based optimization."),
("what chapter covers competitive learning", "Chapter 9 covers Self-Organizing Maps and Competitive Learning, including Kohonen's algorithm and lateral inhibition."),
("what chapter covers wiener filter", "Chapter 3 covers the Wiener Filter, minimum mean-squared error estimation, and the Wiener-Hopf equations."),
("is small data enough", "With the right architecture and curated data, small datasets can yield quality models. Signal density matters more than raw volume."),
# ββ Reasoning & Problem Solving βββββββββββββββββββββββββββββββββββββββ
("what is two plus two", "Two plus two equals four."),
("what is the square root of 144", "The square root of 144 is 12, since 12 times 12 equals 144."),
("is the earth flat", "No. The Earth is an oblate spheroid β slightly flattened at the poles and bulging at the equator. This is confirmed by satellite imagery, physics, and direct measurement."),
("what came first the chicken or the egg", "The egg came first. Genetic mutation occurs in reproduction β so a proto-chicken laid the first egg containing a mutated genome that produced the first true chicken."),
("can machines think", "Machines can simulate reasoning over learned patterns. Whether this constitutes thinking in the full philosophical sense remains deeply contested."),
("what is the meaning of life", "There is no pre-installed meaning. Meaning is constructed β through connection, contribution, understanding, and the pursuit of things that matter to you."),
("is math invented or discovered", "Both views have merit. Mathematical structures may be inherent to logical consistency, but the language and notation we use to express them is invented."),
("what is infinity", "Infinity is not a number but a concept β the unbounded. In mathematics, there are different sizes of infinity, as Cantor showed."),
("why is the sky blue", "Sunlight scatters off atmospheric molecules. Shorter blue wavelengths scatter more than red ones, so blue light reaches your eyes from all directions."),
("what is time", "Time is the dimension along which events are ordered. In physics, it is inseparable from space and stretches or compresses with velocity and gravity."),
("can we run out of ideas", "No. Ideas combine combinatorially β with enough concepts, new combinations grow faster than we can exhaust them."),
("is there free will", "Whether determinism leaves room for free will is an open philosophical debate. Compatibilists argue that free will is about acting on your own reasons, regardless of determinism."),
("what is complexity", "Complexity arises when many simple components interact to produce emergent behaviors unpredictable from the components alone."),
("what is emergence", "Emergence is when a system exhibits properties that none of its individual parts possess. Consciousness from neurons is an example."),
("how do you know if something is true", "You test it. Form a prediction, check it against evidence, revise your belief accordingly. Truth is the attractor of persistent honest inquiry."),
("what is a good argument", "A good argument has true premises, valid logical structure, and a conclusion that follows necessarily from both. It should also be sound and relevant."),
("what is the difference between correlation and causation", "Correlation means two things vary together. Causation means one thing produces another. Correlation alone never proves causation."),
("what is a paradox", "A paradox is a statement that leads to a conclusion that contradicts its premises, revealing a hidden assumption or limit of a framework."),
("what is the halting problem", "The halting problem is the provably unsolvable challenge of determining whether any given program will eventually stop or run forever."),
("what is incompleteness", "GΓΆdel's incompleteness theorems show that any sufficiently powerful formal system contains true statements it cannot prove within itself."),
# ββ Extended AI & Architecture Deep Dives βββββββββββββββββββββββββββββ
("what is a language model", "A language model assigns probabilities to sequences of tokens. It learns the statistical structure of language to predict likely continuations."),
("how does tokenization work", "Tokenization splits text into sub-units β words, sub-words, or characters β that the model can process as discrete symbols with learned embeddings."),
("what is fine tuning", "Fine tuning continues training a pre-trained model on a smaller, task-specific dataset to adapt its knowledge to a particular use case."),
("what is prompt engineering", "Prompt engineering is the craft of constructing inputs to a language model to reliably elicit desired outputs, exploiting the model's learned patterns."),
("what is a foundation model", "A foundation model is a large model trained on broad data that can be adapted to many tasks. It provides a strong starting point for specialization."),
("what is the attention mechanism intuition", "Attention asks: given what I am looking for right now, which parts of my context are most relevant? It computes a weighted average of values guided by that relevance."),
("why do transformers work so well", "Transformers directly model long-range dependencies with attention, are highly parallelizable on GPUs, and scale well with data and parameters."),
("what is layer normalization", "Layer normalization standardizes activations within each sample across the feature dimension, stabilizing deep network training."),
("what is a residual connection", "A residual connection adds a layer's input to its output, creating a shortcut. This prevents vanishing gradients and enables very deep networks."),
("what is position encoding", "Position encoding injects information about token order into embeddings, since attention itself is permutation invariant."),
("what is temperature in language models", "Temperature scales the logits before softmax. High temperature makes the distribution flatter and output more random. Low temperature makes it sharper and more deterministic."),
("what is beam search", "Beam search keeps the top k partial sequences at each step, exploring multiple hypotheses simultaneously rather than committing greedily."),
("what is a vocabulary", "A vocabulary is the set of all tokens a model can represent. Each token maps to an embedding vector learned during training."),
("what is sparse attention", "Sparse attention restricts each token to attending only to a subset of other tokens, reducing the quadratic cost of full attention."),
("what is multi head attention", "Multi-head attention runs multiple attention operations in parallel, each learning to attend to different types of relationships in the input."),
("what is self attention", "Self-attention computes attention where queries, keys, and values all come from the same sequence, letting each position attend to all others."),
("what is cross attention", "Cross-attention lets queries come from one sequence and keys and values from another, enabling one sequence to attend to information from a separate one."),
("what is the feed forward layer in transformers", "The feed-forward layer applies two linear transformations with a nonlinearity in between, independently at each position. It stores factual knowledge."),
("what is parameter efficiency", "Parameter efficiency is achieving high performance with fewer parameters, through better architecture, initialization, or data quality rather than brute scale."),
("what is knowledge distillation", "Knowledge distillation trains a small student model to mimic a large teacher model's outputs, compressing capability into a more efficient form."),
# ββ Life & Human Topics βββββββββββββββββββββββββββββββββββββββββββββββ
("what is friendship", "Friendship is a mutual relationship of care, trust, and shared experience. It is one of the most robust predictors of long-term wellbeing."),
("what is happiness", "Happiness has a hedonic component β feeling good β and a eudaimonic component β living meaningfully. Both matter."),
("what is success", "Success is achieving goals that matter to you. Its definition shifts as you grow, so defining it clearly is more important than pursuing it blindly."),
("what is health", "Health is not merely the absence of disease but the dynamic capacity to engage fully with life β physically, mentally, and socially."),
("what is education", "Education is the structured development of knowledge, skills, and judgment. At its best it teaches how to think, not just what to think."),
("what is curiosity", "Curiosity is intrinsic motivation to close gaps in understanding. It is the engine of learning and the hallmark of active minds."),
("what is discipline", "Discipline is the ability to act in alignment with long-term goals even when short-term impulses pull in another direction."),
("what is patience", "Patience is the willingness to remain engaged with a process without demanding immediate results. It is essential for deep learning."),
("what is courage", "Courage is acting rightly in the presence of fear or uncertainty. It is not the absence of fear but the judgment that something matters more."),
("what is empathy", "Empathy is the capacity to model another person's internal state β to understand their perspective and feel their emotions."),
("what is trust", "Trust is a belief that another agent will act reliably in your interest or at least not against it. It is built slowly and broken fast."),
("what is responsibility", "Responsibility is ownership of your actions and their consequences. It is the basis of agency and ethical behavior."),
("what is growth", "Growth is the expansion of capacity β to understand more, do more, or be more. It requires challenge, failure, and reflection."),
("what is balance", "Balance is allocating time and energy across competing demands in proportion to their long-term value β not perfection in any one area."),
("what is purpose", "Purpose is a stable orientation toward something larger than yourself. It provides direction and sustains effort through difficulty."),
]
# Augment with paraphrases to boost dataset density
AUGMENTED = []
for question, answer in DATASET:
    AUGMENTED.append((question, answer))
    # Question variants: strip the interrogative prefix for "tell me about ..."
    topic = question.replace("what is ", "").replace("how do ", "").strip()
    if not question.startswith("what is the"):
        AUGMENTED.append(("tell me about " + topic, answer))
    if question.startswith("what is "):
        remainder = question[len("what is "):]
        AUGMENTED.append(("explain " + remainder, answer))
        AUGMENTED.append(("define " + remainder, answer))
# NOTE(review): AUGMENTED already contains every original pair, so this
# concatenation stores each original twice — presumably deliberate
# oversampling of the canonical phrasings; confirm.
FULL_DATASET = DATASET + AUGMENTED
# Fixed seed so the shuffled training order is reproducible across runs.
random.seed(42)
random.shuffle(FULL_DATASET)
print(f"[Dataset] Original pairs: {len(DATASET)} | Augmented total: {len(FULL_DATASET)}")
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
# SECTION 3: TOKENIZER (Word-Level with Compact Vocabulary)
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
class HRANTokenizer:
    """
    Word-level tokenizer with a compact, frequency-ranked vocabulary.

    The vocabulary is built from the curated dataset only: the four special
    tokens (<PAD>=0, <BOS>=1, <EOS>=2, <UNK>=3) are reserved first, followed
    by the most frequent corpus words. Out-of-vocabulary words are mapped to
    <UNK> at encode time (there is no subword fallback).
    """
    def __init__(self, max_vocab: int = 2048):
        self.max_vocab = max_vocab
        self.word2id: Dict[str, int] = {}
        self.id2word: Dict[int, str] = {}
        # Defined up front so attribute access is safe before build() runs.
        self.vocab_size = 0
        self.built = False

    def _tokenize_raw(self, text: str) -> List[str]:
        """Lowercase *text* and split into word, number, and punctuation tokens."""
        # Function-scope import keeps the file's import block unchanged;
        # `re` caches compiled patterns, so repeated calls stay cheap.
        import re
        text = text.lower().strip()
        return re.findall(r"[a-z]+|[0-9]+|[.,!?;:'\"()\-]", text)

    def build(self, corpus: List[Tuple[str, str]]):
        """Build word<->id tables from (question, answer) pairs, most frequent first."""
        counter = Counter()
        for q, a in corpus:
            counter.update(self._tokenize_raw(q))
            counter.update(self._tokenize_raw(a))
        # Reserved tokens occupy the first ids, then frequency-ranked words.
        special = ["<PAD>", "<BOS>", "<EOS>", "<UNK>"]
        vocab_words = special + [w for w, _ in counter.most_common(self.max_vocab - len(special))]
        self.word2id = {w: i for i, w in enumerate(vocab_words)}
        self.id2word = {i: w for w, i in self.word2id.items()}
        self.vocab_size = len(self.word2id)
        self.built = True
        print(f"[Tokenizer] Vocabulary size: {self.vocab_size}")

    def encode(self, text: str, add_bos: bool = False, add_eos: bool = False) -> List[int]:
        """Encode text to token ids; unknown words become CFG.UNK_ID."""
        ids = []
        if add_bos:
            ids.append(CFG.BOS_ID)
        ids.extend(self.word2id.get(t, CFG.UNK_ID) for t in self._tokenize_raw(text))
        if add_eos:
            ids.append(CFG.EOS_ID)
        return ids

    def decode(self, ids: List[int], skip_special: bool = True) -> str:
        """Decode ids back to text, optionally dropping special tokens."""
        specials = {"<PAD>", "<BOS>", "<EOS>", "<UNK>"}
        words = []
        for i in ids:
            w = self.id2word.get(i, "<UNK>")
            if skip_special and w in specials:
                continue
            words.append(w)
        # Simple detokenization: re-attach punctuation split off by the tokenizer.
        text = " ".join(words)
        for p in [".", ",", "!", "?", ";", ":", "'"]:
            text = text.replace(f" {p}", p)
        return text
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
# SECTION 4: NUMPY NEURAL NETWORK PRIMITIVES
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
def xavier_uniform(fan_in: int, fan_out: int) -> np.ndarray:
    """Xavier/Glorot uniform initialization (Haykin Ch.4).

    Draws from U(-a, a) with a = sqrt(6 / (fan_in + fan_out)), which keeps
    activation variance roughly stable through layers.
    """
    bound = math.sqrt(6.0 / (fan_in + fan_out))
    weights = np.random.uniform(-bound, bound, (fan_in, fan_out))
    return weights.astype(np.float32)
def he_normal(fan_in: int, fan_out: int) -> np.ndarray:
    """He normal initialization: N(0, sqrt(2/fan_in)) — suited for
    nonlinear activations (Haykin Ch.4)."""
    scale = math.sqrt(2.0 / fan_in)
    samples = np.random.normal(0, scale, (fan_in, fan_out))
    return samples.astype(np.float32)
def layer_norm(x: np.ndarray, gamma: np.ndarray, beta: np.ndarray, eps: float = 1e-6):
    """Layer normalization across the feature (last) dimension.

    Returns (output, x_hat, mean, var) — the extras are cached inputs
    for the backward pass.
    """
    mu = x.mean(axis=-1, keepdims=True)
    v = x.var(axis=-1, keepdims=True)
    normalized = (x - mu) / np.sqrt(v + eps)
    return gamma * normalized + beta, normalized, mu, v
def layer_norm_backward(dout: np.ndarray, x_hat: np.ndarray, var: np.ndarray,
                        gamma: np.ndarray, eps: float = 1e-6):
    """Backprop through layer_norm — handles (B,T,D) and any leading shape.

    dout:  upstream gradient, same shape as the normalized output.
    x_hat: normalized input cached by layer_norm.
    var:   per-sample variance cached by layer_norm (keepdims on last axis).
    gamma: scale parameter of shape (D,).
    Returns (dx, dgamma, dbeta).
    """
    # Fixed: removed dead local `N = x_hat.shape[-1]` (was never used).
    # gamma/beta grads reduce over every axis except the feature dimension.
    reduce_axes = tuple(range(x_hat.ndim - 1))
    dgamma = (dout * x_hat).sum(axis=reduce_axes)  # (D,)
    dbeta = dout.sum(axis=reduce_axes)             # (D,)
    dx_hat = dout * gamma
    inv_std = 1.0 / np.sqrt(var + eps)
    # Standard layer-norm input gradient: subtract the mean component and the
    # projection onto x_hat (the variance term's contribution).
    dx = inv_std * (dx_hat - dx_hat.mean(axis=-1, keepdims=True) -
                    x_hat * (dx_hat * x_hat).mean(axis=-1, keepdims=True))
    return dx, dgamma, dbeta
def infomax_activation(x: np.ndarray, alpha: float = 0.1) -> np.ndarray:
    """
    Infomax nonlinearity f(x) = tanh(x) + alpha*x.

    Derived from Bell-Sejnowski ICA (Haykin Ch.10): the leaky linear term
    keeps a nonzero slope at saturation, preserving mutual information
    that a pure tanh would compress away.
    """
    return alpha * x + np.tanh(x)
def infomax_activation_deriv(x: np.ndarray, alpha: float = 0.1) -> np.ndarray:
    """Elementwise derivative of the infomax activation: (1 - tanh²x) + alpha."""
    t = np.tanh(x)
    return (1.0 - t * t) + alpha
def lateral_inhibition_gate(x: np.ndarray, k: float = 0.5) -> np.ndarray:
    """
    Competitive normalization via lateral inhibition (Haykin Ch.9).

    Standardizes each feature vector, then multiplies by a sigmoid gate of
    the z-score: above-mean activations are amplified, below-mean ones
    suppressed — a soft winner-take-more producing sparse representations.

    NOTE: parameter `k` is currently unused; kept for interface stability.
    """
    mean = x.mean(axis=-1, keepdims=True)
    spread = x.std(axis=-1, keepdims=True) + 1e-6  # epsilon guards constant rows
    z = (x - mean) / spread
    gate = 1.0 / (1.0 + np.exp(-2.0 * z))
    return x * gate
def softmax(x: np.ndarray, axis: int = -1) -> np.ndarray:
    """Numerically stable softmax: subtract the max before exponentiating.
    The tiny denominator epsilon guards against an all-zero exp row."""
    shifted = x - x.max(axis=axis, keepdims=True)
    exps = np.exp(shifted)
    return exps / (exps.sum(axis=axis, keepdims=True) + 1e-9)
def dropout_mask(shape, rate: float, training: bool) -> np.ndarray:
    """Inverted-dropout mask: zeros with probability `rate`, survivors scaled
    by 1/(1-rate) so the expected activation is unchanged. Identity (all
    ones) at inference time or when rate is 0."""
    if not training or rate == 0:
        return np.ones(shape, dtype=np.float32)
    keep = (np.random.rand(*shape) > rate).astype(np.float32)
    return keep / (1.0 - rate)
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
# SECTION 5: PARAMETER MANAGER WITH WIENER GRADIENT SCALING
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
class Parameter:
    """
    A named, differentiable parameter with Wiener-inspired adaptive scaling.

    Wiener Principle (Haykin Ch.3): scale the update by the gradient's
    estimated signal-to-noise ratio.
    SNR = signal_power / noise_power — high SNR ⇒ learn faster.
    Implemented as: effective_lr = lr * SNR / (1 + SNR).
    """
    def __init__(self, data: np.ndarray, name: str = ""):
        # Function-scope import keeps the file-level import block unchanged.
        from collections import deque
        self.data = data.astype(np.float32)
        # Fixed: allocate companion buffers from self.data so they are always
        # float32 (zeros_like on the raw argument silently inherited the
        # caller's dtype, e.g. float64).
        self.grad = np.zeros_like(self.data)
        self.name = name
        # Adam first/second moment estimates
        self.m = np.zeros_like(self.data)
        self.v = np.zeros_like(self.data)
        self.t = 0  # Adam step count, used for bias correction
        # Wiener SNR estimator state
        self._signal_power = 1.0
        self._noise_power = 1.0
        # Sliding window of recent mean-squared-gradient samples.
        # deque gives O(1) eviction from the left (list.pop(0) is O(n)).
        self._grad_history = deque()

    def zero_grad(self):
        """Reset the accumulated gradient to zero in place."""
        self.grad[:] = 0.0

    def update_wiener(self, lr: float, beta1=0.9, beta2=0.999, eps=1e-8,
                      weight_decay: float = 0.0):
        """
        One Adam step with a Wiener-filter-inspired learning-rate gain.

        The recent gradient-power history estimates SNR; the step is scaled
        by H = SNR / (1 + SNR), clipped to [0.1, 1.0] so noisy parameters
        keep learning slowly instead of stalling.
        """
        self.t += 1
        g = self.grad
        if weight_decay > 0:
            # Classic L2: fold the decay term into the gradient before Adam.
            g = g + weight_decay * self.data
        # Track gradient power over a sliding window for SNR estimation.
        self._grad_history.append(float(np.mean(g**2)))
        if len(self._grad_history) > CFG.wiener_window:
            self._grad_history.popleft()
        # Wiener SNR: signal = mean gradient power, noise = its std over the window.
        if len(self._grad_history) > 2:
            hist = np.array(self._grad_history)
            signal = float(np.mean(hist))
            noise = float(np.std(hist)) + CFG.wiener_eps
            snr = signal / noise
            # Wiener gain: H = SNR / (1 + SNR) in [0, 1], floored at 0.1.
            wiener_gain = np.clip(snr / (1.0 + snr), 0.1, 1.0)
        else:
            wiener_gain = 1.0  # too little history: plain Adam step
        # Adam with bias-corrected moments and Wiener-scaled learning rate.
        self.m = beta1 * self.m + (1 - beta1) * g
        self.v = beta2 * self.v + (1 - beta2) * (g * g)
        m_hat = self.m / (1 - beta1**self.t)
        v_hat = self.v / (1 - beta2**self.t)
        effective_lr = lr * wiener_gain
        self.data -= effective_lr * m_hat / (np.sqrt(v_hat) + eps)

    def clip_grad(self, max_norm: float):
        """Rescale grad in place so its L2 norm does not exceed max_norm."""
        norm = np.linalg.norm(self.grad)
        if norm > max_norm:
            self.grad *= max_norm / (norm + 1e-8)
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
# SECTION 6: RBF MULTI-HEAD ATTENTION (Haykin Ch.5 β RBF Networks)
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
class RBFMultiHeadAttention:
    """
    RBF Attention: replaces dot-product similarity with a Gaussian RBF kernel.

    Standard:  A_ij = softmax( q_i · k_j / sqrt(d) )
    RBF-HRAN:  A_ij = softmax( -γ * ||q_i - k_j||² )

    From Haykin Ch.5: the Gaussian RBF φ(r) = exp(-r²/2σ²) creates localized
    receptive fields. Each attention head attends to representations within a
    Gaussian neighborhood of the query in query-key space; heads are
    initialized with geometrically spaced γ values for multi-scale coverage.
    """
    def __init__(self, embed_dim: int, num_heads: int, gamma_init: float = 1.0):
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.head_dim = embed_dim // num_heads
        assert embed_dim % num_heads == 0
        d = embed_dim
        h = self.head_dim  # NOTE(review): local `h` is never used below
        # Projection matrices — all (d, d); heads are split later by reshaping
        self.Wq = Parameter(xavier_uniform(d, d), "Wq")
        self.Wk = Parameter(xavier_uniform(d, d), "Wk")
        self.Wv = Parameter(xavier_uniform(d, d), "Wv")
        self.Wo = Parameter(xavier_uniform(d, d), "Wo")
        self.bq = Parameter(np.zeros(d, dtype=np.float32), "bq")
        self.bk = Parameter(np.zeros(d, dtype=np.float32), "bk")
        self.bv = Parameter(np.zeros(d, dtype=np.float32), "bv")
        self.bo = Parameter(np.zeros(d, dtype=np.float32), "bo")
        # Learnable RBF bandwidth per head (Haykin: σ controls receptive field width).
        # Heads start at powers-of-two multiples of gamma_init, centered on the
        # middle head — multi-resolution attention.
        gammas = np.array([gamma_init * (2.0 ** (i - num_heads // 2))
                           for i in range(num_heads)], dtype=np.float32)
        # Stored as log(γ) so that γ = exp(log_γ) stays positive during training.
        self.log_gamma = Parameter(np.log(gammas + 1e-8).reshape(num_heads, 1, 1), "log_gamma")
        self.params = [self.Wq, self.Wk, self.Wv, self.Wo,
                       self.bq, self.bk, self.bv, self.bo, self.log_gamma]
        # Cache of forward intermediates consumed by backward()
        self._cache = {}

    def forward(self, x: np.ndarray, mask: Optional[np.ndarray] = None,
                training: bool = True) -> np.ndarray:
        """
        Run RBF multi-head attention over a batch of sequences.

        x: (batch, seq_len, embed_dim)
        mask: optional additive mask broadcastable to (B, H, T, T); forbidden
              positions hold large negative values (e.g. -1e9).
        training: enables dropout on the attention weights.
        Returns: (batch, seq_len, embed_dim)
        """
        B, T, D = x.shape
        H = self.num_heads
        Hd = self.head_dim
        # Linear projections
        Q = x @ self.Wq.data + self.bq.data  # (B, T, D)
        K = x @ self.Wk.data + self.bk.data
        V = x @ self.Wv.data + self.bv.data
        # Reshape to multi-head: (B, H, T, Hd)
        Q = Q.reshape(B, T, H, Hd).transpose(0, 2, 1, 3)
        K = K.reshape(B, T, H, Hd).transpose(0, 2, 1, 3)
        V = V.reshape(B, T, H, Hd).transpose(0, 2, 1, 3)
        # ── RBF ATTENTION KERNEL ───────────────────────────────────────────
        # Squared Euclidean distances via ||q_i - k_j||² = ||q||² + ||k||² - 2 q·k
        Q2 = (Q**2).sum(axis=-1, keepdims=True)  # (B, H, T, 1)
        K2 = (K**2).sum(axis=-1, keepdims=True)  # (B, H, T, 1)
        QK = Q @ K.transpose(0, 1, 3, 2)  # (B, H, T, T)
        dist2 = Q2 + K2.transpose(0, 1, 3, 2) - 2.0 * QK  # (B, H, T, T)
        # numerical safety: clamp tiny negatives produced by cancellation
        dist2 = np.maximum(dist2, 0.0)
        # γ = exp(log_γ) ensures positivity
        gamma = np.exp(self.log_gamma.data)  # (H, 1, 1)
        gamma = gamma[np.newaxis, :, :, :]  # (1, H, 1, 1), broadcast over batch
        # RBF scores: -γ * ||q - k||² — closer query/key pairs score higher
        scores = -gamma * dist2  # (B, H, T, T)
        # Causal mask (decoder: attend only to past)
        if mask is not None:
            scores = scores + mask  # mask contains -1e9 for forbidden positions
        attn_weights = softmax(scores, axis=-1)  # (B, H, T, T)
        # Dropout on attention weights
        if training and CFG.dropout > 0:
            drop_mask = dropout_mask(attn_weights.shape, CFG.dropout, training)
            attn_weights = attn_weights * drop_mask
        # Attend to values
        attn_out = attn_weights @ V  # (B, H, T, Hd)
        # Reshape back: (B, T, D)
        attn_out = attn_out.transpose(0, 2, 1, 3).reshape(B, T, D)
        # Output projection
        out = attn_out @ self.Wo.data + self.bo.data
        # Cache everything needed for backward.
        # NOTE(review): attn_weights is cached *after* dropout was applied, so
        # backward() treats the dropped weights as the softmax output — an
        # approximation whenever attention dropout is active; confirm intended.
        self._cache = dict(x=x, Q=Q, K=K, V=V, Q2=Q2, K2=K2, QK=QK,
                           dist2=dist2, gamma=gamma, scores=scores,
                           attn_weights=attn_weights, attn_out=attn_out,
                           B=B, T=T, D=D, H=H, Hd=Hd)
        return out

    def backward(self, dout: np.ndarray) -> np.ndarray:
        """Backprop through RBF attention.

        dout: upstream gradient, shape (B, T, D).
        Accumulates gradients into the projection/bias/log_gamma Parameters
        and returns dL/dx with shape (B, T, D). Must be called after forward().
        """
        c = self._cache
        B, T, D, H, Hd = c["B"], c["T"], c["D"], c["H"], c["Hd"]
        x, Q, K, V = c["x"], c["Q"], c["K"], c["V"]
        attn_weights, attn_out = c["attn_weights"], c["attn_out"]
        dist2, gamma = c["dist2"], c["gamma"]
        # Grad through output projection
        self.Wo.grad += attn_out.reshape(B * T, D).T @ dout.reshape(B * T, D)
        self.bo.grad += dout.sum(axis=(0, 1))
        d_attn_out = dout @ self.Wo.data.T  # (B, T, D)
        # Reshape to multi-head
        d_attn_out = d_attn_out.reshape(B, T, H, Hd).transpose(0, 2, 1, 3)
        # Grad through V: d(attn @ V)
        dV = attn_weights.transpose(0, 1, 3, 2) @ d_attn_out
        d_attn_w = d_attn_out @ V.transpose(0, 1, 3, 2)
        # Grad through softmax: ds = p ⊙ (dp − Σ_j dp_j p_j), with p = attn_weights
        sw = attn_weights  # (B, H, T, T)
        d_scores = sw * (d_attn_w - (d_attn_w * sw).sum(axis=-1, keepdims=True))
        # Grad through RBF score s = -γ·dist²: ∂s/∂(dist²) = -γ, ∂s/∂γ = -dist².
        # log_gamma is the learned tensor, so chain through γ = exp(log_γ).
        gamma_h = np.exp(self.log_gamma.data)  # (H, 1, 1)
        d_gamma = (-dist2 * d_scores).sum(axis=(0, 2, 3)).reshape(H, 1, 1)
        self.log_gamma.grad += d_gamma * gamma_h
        d_dist2 = -gamma * d_scores  # (B, H, T, T)
        # Grad through dist2 = ||q||² + ||k||² - 2 q·k
        # d(dist2)/dQ_i: sum over j of d_dist2_ij * (2*q_i - 2*k_j) simplified:
        # = 2 * sum_j d_dist2_ij * q_i - 2 * sum_j d_dist2_ij * k_j
        sum_d_dist2_over_j = d_dist2.sum(axis=-1, keepdims=True)  # (B,H,T,1)
        sum_d_dist2_over_i = d_dist2.sum(axis=-2, keepdims=True)  # (B,H,1,T)
        dQ = 2.0 * (Q * sum_d_dist2_over_j - d_dist2 @ K)
        dK = 2.0 * (K * sum_d_dist2_over_i.transpose(0, 1, 3, 2) - d_dist2.transpose(0, 1, 3, 2) @ Q)
        dV = dV  # no-op: dV was already computed above; kept for symmetry with dQ/dK
        # Reshape grads back to (B, T, D)
        dQ = dQ.transpose(0, 2, 1, 3).reshape(B, T, D)
        dK = dK.transpose(0, 2, 1, 3).reshape(B, T, D)
        dV = dV.transpose(0, 2, 1, 3).reshape(B, T, D)
        # Grad through QKV projections
        x2d = x.reshape(B * T, D)
        self.Wq.grad += x2d.T @ dQ.reshape(B * T, D)
        self.Wk.grad += x2d.T @ dK.reshape(B * T, D)
        self.Wv.grad += x2d.T @ dV.reshape(B * T, D)
        self.bq.grad += dQ.sum(axis=(0, 1))
        self.bk.grad += dK.sum(axis=(0, 1))
        self.bv.grad += dV.sum(axis=(0, 1))
        dx_q = dQ @ self.Wq.data.T
        dx_k = dK @ self.Wk.data.T
        dx_v = dV @ self.Wv.data.T
        # x feeds all three projections, so the input gradients sum
        return dx_q + dx_k + dx_v
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
# SECTION 7: INFOMAX FEED-FORWARD NETWORK (Haykin Ch.10)
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
class InfomaxFFN:
    """
    Position-wise feed-forward network with Infomax activation
    (Bell-Sejnowski principle, Haykin Ch.10).

    f(x) = tanh(x) + α·x with α = 0.1: the optimal element-wise nonlinearity
    for maximizing mutual information through the layer is tanh-like; the
    added linear leak prevents information collapse at saturation and keeps
    gradients alive in the tails.

    A Lateral Inhibition Gate (Haykin Ch.9) follows the nonlinearity to
    produce sparse, competitive representations.
    """
    def __init__(self, embed_dim: int, ffn_dim: int):
        self.embed_dim = embed_dim
        self.ffn_dim = ffn_dim
        self.W1 = Parameter(he_normal(embed_dim, ffn_dim), "ffn_W1")
        self.b1 = Parameter(np.zeros(ffn_dim, dtype=np.float32), "ffn_b1")
        self.W2 = Parameter(he_normal(ffn_dim, embed_dim), "ffn_W2")
        self.b2 = Parameter(np.zeros(embed_dim, dtype=np.float32), "ffn_b2")
        self.params = [self.W1, self.b1, self.W2, self.b2]
        self._cache = {}

    def forward(self, x: np.ndarray, training: bool = True) -> np.ndarray:
        """linear → infomax → lateral inhibition → dropout → linear; x: (B, T, D)."""
        B, T, D = x.shape
        flat = x.reshape(B * T, D)
        pre_act = flat @ self.W1.data + self.b1.data  # (B*T, ffn_dim)
        # Infomax activation (Bell-Sejnowski)
        hidden = infomax_activation(pre_act, CFG.infomax_alpha)
        # Competitive sparsification (Haykin Ch.9)
        hidden = lateral_inhibition_gate(hidden)
        if training:
            keep_mask = dropout_mask(hidden.shape, CFG.dropout, training)
            hidden = hidden * keep_mask
        else:
            keep_mask = np.ones_like(hidden)
        out = (hidden @ self.W2.data + self.b2.data).reshape(B, T, D)
        self._cache = dict(x=x, z1=pre_act, h=hidden, dmask=keep_mask, B=B, T=T, D=D)
        return out

    def backward(self, dout: np.ndarray) -> np.ndarray:
        """Backprop; accumulates into W1/b1/W2/b2 grads, returns dL/dx (B, T, D)."""
        c = self._cache
        B, T, D = c["B"], c["T"], c["D"]
        grad_out = dout.reshape(B * T, D)
        # Second linear layer
        self.W2.grad += c["h"].T @ grad_out
        self.b2.grad += grad_out.sum(axis=0)
        grad_hidden = (grad_out @ self.W2.data.T) * c["dmask"]
        # The lateral-inhibition gate is smooth but its exact derivative is
        # deliberately approximated as identity (pass-through) for stability.
        grad_pre = grad_hidden * infomax_activation_deriv(c["z1"], CFG.infomax_alpha)
        # First linear layer
        self.W1.grad += c["x"].reshape(B * T, D).T @ grad_pre
        self.b1.grad += grad_pre.sum(axis=0)
        return (grad_pre @ self.W1.data.T).reshape(B, T, D)
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
# SECTION 8: HRAN BLOCK (Full transformer-like block with HRAN innovations)
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
class HRANBlock:
    """
    A single HRAN layer (pre-norm transformer-style block):

        x = x + RBFAttention(LayerNorm1(x))
        x = x + InfomaxFFN(LayerNorm2(x))

    The residual connections act as an error-correction path (Haykin).
    """
    def __init__(self, embed_dim: int, num_heads: int, ffn_dim: int, layer_idx: int):
        self.attn = RBFMultiHeadAttention(embed_dim, num_heads)
        self.ffn = InfomaxFFN(embed_dim, ffn_dim)
        self.ln1_gamma = Parameter(np.ones(embed_dim, dtype=np.float32), f"ln1_gamma_{layer_idx}")
        self.ln1_beta = Parameter(np.zeros(embed_dim, dtype=np.float32), f"ln1_beta_{layer_idx}")
        self.ln2_gamma = Parameter(np.ones(embed_dim, dtype=np.float32), f"ln2_gamma_{layer_idx}")
        self.ln2_beta = Parameter(np.zeros(embed_dim, dtype=np.float32), f"ln2_beta_{layer_idx}")
        self.params = (self.attn.params + self.ffn.params +
                       [self.ln1_gamma, self.ln1_beta, self.ln2_gamma, self.ln2_beta])
        self._cache = {}
    def forward(self, x: np.ndarray, mask: Optional[np.ndarray] = None,
                training: bool = True) -> np.ndarray:
        """Apply attention then FFN sublayer, each pre-normed + residual."""
        normed1, xhat1, _mu1, var1 = layer_norm(x, self.ln1_gamma.data, self.ln1_beta.data)
        attn_out = self.attn.forward(normed1, mask=mask, training=training)
        after_attn = x + attn_out  # residual around attention
        normed2, xhat2, _mu2, var2 = layer_norm(after_attn, self.ln2_gamma.data, self.ln2_beta.data)
        ffn_out = self.ffn.forward(normed2, training=training)
        out = after_attn + ffn_out  # residual around FFN
        self._cache = dict(x_before_attn=x,
                           x_before_ffn=after_attn,
                           x_norm1=normed1, xhat1=xhat1, var1=var1,
                           x_norm2=normed2, xhat2=xhat2, var2=var2)
        return out
    def backward(self, dout: np.ndarray) -> np.ndarray:
        """Reverse the forward pass; returns grad w.r.t. the block input."""
        c = self._cache
        # FFN sublayer: the residual splits the gradient into a skip path
        # (dout) and a path through LN2 + FFN.
        g_norm2 = self.ffn.backward(dout)
        g_ln2, dgamma2, dbeta2 = layer_norm_backward(g_norm2, c["xhat2"], c["var2"], self.ln2_gamma.data)
        self.ln2_gamma.grad += dgamma2
        self.ln2_beta.grad += dbeta2
        g_mid = dout + g_ln2
        # Attention sublayer, same residual structure.
        g_norm1 = self.attn.backward(g_mid)
        g_ln1, dgamma1, dbeta1 = layer_norm_backward(g_norm1, c["xhat1"], c["var1"], self.ln1_gamma.data)
        self.ln1_gamma.grad += dgamma1
        self.ln1_beta.grad += dbeta1
        return g_mid + g_ln1
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
# SECTION 9: FULL HRAN MODEL
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
class HRANModel:
    """
    Complete HRAN sequence-to-sequence language model.

    Pipeline:
        token embedding -> fixed sinusoidal position encoding
        -> num_layers x HRANBlock (RBF attention + Infomax FFN)
        -> final LayerNorm -> weight-tied output projection -> logits
    """
    def __init__(self, config: HRANConfig):
        self.cfg = config
        V = config.vocab_size
        D = config.embed_dim
        T = config.max_seq_len
        # Token embedding table (V, D); also reused as the output projection.
        self.embed = Parameter(xavier_uniform(V, D), "embed")
        # Fixed (non-learned) sinusoidal position encoding — a Fourier basis.
        self.pos_enc = self._make_pos_encoding(T, D)
        # Stack of HRAN blocks.
        self.blocks = [HRANBlock(D, config.num_heads, config.ffn_dim, i)
                       for i in range(config.num_layers)]
        # Final layer norm parameters.
        self.final_gamma = Parameter(np.ones(D, dtype=np.float32), "final_gamma")
        self.final_beta = Parameter(np.zeros(D, dtype=np.float32), "final_beta")
        # Output projection is weight-tied with the embedding: forward()
        # computes logits via embed.data.T. This saves parameters and keeps
        # the embedding space aligned with the output space, so there is no
        # separate output-weight Parameter here.
        self.params = [self.embed, self.final_gamma, self.final_beta]
        for block in self.blocks:
            self.params.extend(block.params)
        self._cache = {}
        self._print_param_count()
    def _make_pos_encoding(self, max_len: int, d_model: int) -> np.ndarray:
        """
        Sinusoidal positional encoding — a fixed Fourier basis over positions.
            PE(pos, 2i)   = sin(pos / 10000^(2i/d))
            PE(pos, 2i+1) = cos(pos / 10000^(2i/d))
        Each dimension pair encodes position at a different frequency scale.
        """
        pe = np.zeros((max_len, d_model), dtype=np.float32)
        pos = np.arange(max_len).reshape(-1, 1)
        # Geometric frequency ladder, computed in log space for stability.
        div_term = np.exp(np.arange(0, d_model, 2) * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = np.sin(pos * div_term)
        # The slice guards the odd-d_model case (one fewer cosine column).
        pe[:, 1::2] = np.cos(pos * div_term[:d_model // 2])
        return pe
    def _causal_mask(self, T: int) -> np.ndarray:
        """Additive mask: -1e9 above the diagonal so each position attends
        only to itself and earlier positions."""
        mask = np.triu(np.full((T, T), -1e9, dtype=np.float32), k=1)
        return mask
    def forward(self, input_ids: np.ndarray, training: bool = True) -> np.ndarray:
        """
        input_ids: (batch, seq_len) int32
        Returns: logits (batch, seq_len, vocab_size)
        """
        B, T = input_ids.shape
        # Embedding lookup + additive position encoding.
        x = self.embed.data[input_ids]  # (B, T, D)
        x = x + self.pos_enc[:T]  # broadcast positions over the batch
        # Causal mask shared by all blocks.
        mask = self._causal_mask(T)
        # Forward through all HRAN blocks.
        for block in self.blocks:
            x = block.forward(x, mask=mask, training=training)
        # Final layer norm.
        x_norm, xhat, mu, var = layer_norm(x, self.final_gamma.data, self.final_beta.data)
        # Weight-tied output projection: logits = x_norm @ embed.T
        B2, T2, D = x_norm.shape
        logits = x_norm.reshape(B2 * T2, D) @ self.embed.data.T  # (BT, V)
        logits = logits.reshape(B2, T2, -1)
        # NOTE: mu is cached but never read by backward (layer_norm_backward
        # works from xhat and var only).
        self._cache = dict(input_ids=input_ids, x_final=x, x_norm=x_norm,
                           xhat=xhat, mu=mu, var=var)
        return logits
    def backward(self, d_logits: np.ndarray):
        """Backpropagate through the entire model, accumulating .grad on
        every Parameter. Must be called after forward()."""
        c = self._cache
        B, T, V = d_logits.shape
        D = self.cfg.embed_dim
        # Grad through the weight-tied output projection:
        #   logits[bt, v] = sum_d x_norm[bt, d] * embed[v, d]
        #   d_embed[v, d]  = sum_bt d_logits[bt, v] * x_norm[bt, d] = d_logits_2d.T @ x_norm_2d
        #   d_x_norm[bt,d] = sum_v  d_logits[bt, v] * embed[v, d]   = d_logits_2d @ embed
        d_logits_2d = d_logits.reshape(B * T, V)
        x_norm_2d = c["x_norm"].reshape(B * T, D)
        self.embed.grad += d_logits_2d.T @ x_norm_2d  # (V, D)
        dx_norm_2d = d_logits_2d @ self.embed.data  # (BT, D)
        dx_norm = dx_norm_2d.reshape(B, T, D)
        # Grad through the final layer norm.
        dx, dfg, dfb = layer_norm_backward(dx_norm, c["xhat"], c["var"], self.final_gamma.data)
        self.final_gamma.grad += dfg
        self.final_beta.grad += dfb
        # Backprop through the blocks in reverse order of the forward pass.
        for block in reversed(self.blocks):
            dx = block.backward(dx)
        # Grad through the embedding lookup: scatter-add so repeated token
        # ids accumulate correctly (a fancy-index += would drop duplicates).
        ids = c["input_ids"]  # (B, T)
        np.add.at(self.embed.grad, ids.flatten(), dx.reshape(B * T, D))
    def _print_param_count(self):
        """Print the total learnable parameter count."""
        total = sum(p.data.size for p in self.params)
        print(f"[HRAN] Parameters: {total:,} ({total/1e6:.2f}M)")
    def zero_grads(self):
        """Reset every parameter gradient before processing a new batch."""
        for p in self.params:
            p.zero_grad()
    def clip_grads(self, max_norm: float):
        # Global-norm gradient clipping (Haykin: stability criterion) —
        # rescale all grads together so their joint L2 norm is <= max_norm.
        total_norm = math.sqrt(sum(np.sum(p.grad**2) for p in self.params))
        if total_norm > max_norm:
            scale = max_norm / (total_norm + 1e-8)
            for p in self.params:
                p.grad *= scale
    def update(self, lr: float):
        """Apply one Wiener-scaled optimizer step to every parameter."""
        for p in self.params:
            p.update_wiener(lr, weight_decay=CFG.weight_decay)
    def save(self, path: str):
        """Pickle all parameter arrays (keyed by parameter name) to disk."""
        data = {p.name: p.data for p in self.params}
        with open(path, "wb") as f:
            pickle.dump(data, f)
        print(f"[HRAN] Model saved to {path}")
    def load(self, path: str):
        """
        Restore parameters written by save().

        NOTE(review): pickle.load executes arbitrary code from the file —
        only load checkpoints you created yourself. Names missing from the
        file keep their freshly-initialized values; unknown names in the
        file are ignored. In-place slice assignment requires shapes match.
        """
        with open(path, "rb") as f:
            data = pickle.load(f)
        for p in self.params:
            if p.name in data:
                p.data[:] = data[p.name]
        print(f"[HRAN] Model loaded from {path}")
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
# SECTION 10: HEBBIAN PRE-INITIALIZATION (Haykin Ch.2)
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
def hebbian_seed(model: HRANModel, tokenizer: HRANTokenizer,
                 corpus: List[Tuple[str, str]]):
    """
    Warm-start the token embeddings with Oja's normalized Hebbian rule
    BEFORE any gradient descent (Haykin Ch.2).

    Hebb's rule ("neurons that fire together, wire together") in Oja's
    normalized form:

        dW = eta * (y * x - y^2 * W)

    where x is the "pre" signal (a co-occurrence-weighted average of
    context embeddings), y = W . x is the "post" activity, and the
    -y^2 * W term bounds the weight norm (Oja's rule learns the first
    principal component online). This embeds co-occurrence structure
    into the embedding space, giving training a warm start aligned with
    the data manifold.

    Mutates model.embed.data in place; returns nothing.
    """
    print("\n[Hebbian Pre-Initialization] Seeding embeddings with co-occurrence statistics...")
    D = model.cfg.embed_dim
    V = model.cfg.vocab_size
    eta = CFG.hebb_lr
    # Symmetric co-occurrence counts within a +/- `window` token distance.
    cooc = np.zeros((V, V), dtype=np.float64)
    window = 3
    for q, a in corpus:
        seq = tokenizer.encode(q + " " + a)
        for i, tok in enumerate(seq):
            for j in range(max(0, i - window), min(len(seq), i + window + 1)):
                if i != j:
                    cooc[tok, seq[j]] += 1.0
    # Row-normalize so each row is a context distribution over the vocab.
    row_sums = cooc.sum(axis=1, keepdims=True) + 1e-8
    cooc_norm = cooc / row_sums
    # Oja's Hebbian rule: update each embedding row.
    for epoch in range(CFG.hebb_epochs):
        total_change = 0.0
        updated = 0  # rows actually touched this epoch, for an honest mean
        for v_id in range(4, min(V, 500)):  # skip special tokens 0-3
            if cooc_norm[v_id].sum() < 1e-8:
                continue  # token never seen in the corpus
            # "Post" neuron weights via the current embedding.
            W = model.embed.data[v_id]  # (D,)
            # "Pre" signal: co-occurrence-weighted average of all embeddings.
            context_emb = cooc_norm[v_id] @ model.embed.data  # (D,)
            y = W.dot(context_emb)
            # Oja's rule: dW = eta * (y*x - y^2*W); the decay term bounds ||W||.
            delta = eta * (y * context_emb - y**2 * W)
            model.embed.data[v_id] += delta.astype(np.float32)
            total_change += np.abs(delta).sum()
            updated += 1
        # BUGFIX: the mean was previously divided by V-4 even though only
        # rows 4..min(V, 500) with nonzero context are ever updated;
        # divide by the number of rows actually changed instead.
        print(f"  Hebb epoch {epoch+1}/{CFG.hebb_epochs} | Mean change: {total_change/max(updated, 1):.6f}")
    print("[Hebbian Pre-Initialization] Complete. Embeddings seeded with corpus statistics.\n")
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
# SECTION 11: LOSS FUNCTION WITH LABEL SMOOTHING
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
def cross_entropy_loss(logits: np.ndarray, targets: np.ndarray,
                       smoothing: float = 0.1) -> Tuple[float, np.ndarray]:
    """
    Label-smoothed cross-entropy over a (B, T, V) logit tensor.

    Hard one-hot targets are softened to 1 - smoothing on the true class
    and smoothing / (V - 1) on every other class (regularization, Haykin
    Ch.4) — the smoothed rows still sum to 1. Positions whose target is
    the PAD token are excluded from both the loss average and the gradient.

    Returns (scalar loss, d_loss/d_logits with the same shape as logits).
    """
    B, T, V = logits.shape
    flat_logits = logits.reshape(B * T, V)
    flat_targets = targets.flatten()
    probs = softmax(flat_logits, axis=-1)
    # Soft target distribution.
    soft = np.full((B * T, V), smoothing / (V - 1), dtype=np.float32)
    soft[np.arange(B * T), flat_targets] = 1.0 - smoothing
    # 1.0 for real tokens, 0.0 for padding.
    valid = (flat_targets != CFG.PAD_ID).astype(np.float32)
    denom = valid.sum() + 1e-9
    # Per-token cross entropy; small epsilon guards log(0).
    token_loss = -(soft * np.log(probs + 1e-9)).sum(axis=-1)
    loss = (token_loss * valid).sum() / denom
    # d(CE)/d(logits) = probs - soft, masked and averaged over valid tokens.
    grad = ((probs - soft) * valid.reshape(-1, 1) / denom).reshape(B, T, V)
    return float(loss), grad
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
# SECTION 12: DATA PIPELINE
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
def make_batches(data: List[Tuple[str, str]], tokenizer: HRANTokenizer,
                 batch_size: int, max_len: int) -> List[Tuple[np.ndarray, np.ndarray]]:
    """
    Turn (question, answer) pairs into next-token-prediction batches.

    Each sample is tokenized as BOS + question + answer + EOS and truncated
    to max_len + 1 tokens; inputs are the sequence minus its last token and
    targets the sequence minus its first (teacher forcing). Sequences are
    sorted by length before batching so padding waste is minimized; unused
    slots are filled with PAD.
    """
    encoded = []
    for question, answer in data:
        ids = [CFG.BOS_ID] + tokenizer.encode(question) + tokenizer.encode(answer) + [CFG.EOS_ID]
        encoded.append(ids[:max_len + 1])  # +1 since input/target are shifted by one
    # Stable sort by length groups similar-length sequences together.
    encoded.sort(key=len)
    batches = []
    for start in range(0, len(encoded), batch_size):
        chunk = encoded[start:start + batch_size]
        width = min(max(len(s) for s in chunk), max_len + 1)
        inputs = np.full((len(chunk), width - 1), CFG.PAD_ID, dtype=np.int32)
        targets = np.full((len(chunk), width - 1), CFG.PAD_ID, dtype=np.int32)
        for row, seq in enumerate(chunk):
            seq = seq[:width]
            n = len(seq) - 1
            inputs[row, :n] = seq[:-1]
            targets[row, :n] = seq[1:]
        batches.append((inputs, targets))
    return batches
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
# SECTION 13: LEARNING RATE SCHEDULE (Cosine with Warmup)
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
def get_lr(step: int, total_steps: int, warmup_steps: int, base_lr: float) -> float:
    """
    Learning-rate schedule: linear warmup then cosine annealing to zero.

    During warmup the rate climbs linearly from 0 to base_lr; afterwards it
    follows half a cosine down to 0 at total_steps, damping oscillation
    near minima (Haykin Ch.4).
    """
    if step < warmup_steps:
        # Linear ramp; max() guards division by zero when warmup_steps == 0.
        return base_lr * step / max(warmup_steps, 1)
    decay_span = max(total_steps - warmup_steps, 1)
    frac = (step - warmup_steps) / decay_span
    return 0.5 * base_lr * (1.0 + math.cos(math.pi * frac))
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
# SECTION 14: TRAINING LOOP
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
def train(model: HRANModel, tokenizer: HRANTokenizer,
          data: List[Tuple[str, str]], config: HRANConfig):
    """
    Full training loop implementing:
    1. Hebbian pre-seeding (Haykin Ch.2)
    2. Mini-batch gradient descent with Adam + Wiener scaling (Haykin Ch.3)
    3. Label smoothing regularization (Haykin Ch.4)
    4. Cosine LR schedule with warmup
    5. Global gradient clipping (stability)

    Saves the best checkpoint to "hran_best.pkl" whenever the epoch-average
    loss improves. Returns the list of per-epoch average losses.
    """
    print("=" * 65)
    print(" HRAN Training β Haykin Resonant Attention Network")
    print("=" * 65)
    # Step 1: Hebbian pre-initialization of the embeddings.
    hebbian_seed(model, tokenizer, data)
    # Step 2: Prepare data (batches are built once; only their order varies).
    batches = make_batches(data, tokenizer, config.batch_size, config.max_seq_len)
    total_steps = len(batches) * config.epochs
    step = 0
    print(f"[Training] {len(data)} samples | {len(batches)} batches | "
          f"{config.epochs} epochs | {total_steps} total steps")
    print(f"[Training] LR={config.learning_rate} | Batch={config.batch_size} | "
          f"Warmup={config.warmup_steps}\n")
    best_loss = float("inf")
    history = []
    for epoch in range(config.epochs):
        epoch_loss = 0.0
        epoch_batches = 0
        # Shuffle batch order each epoch (batch contents stay fixed).
        random.shuffle(batches)
        for inp, tgt in batches:
            lr = get_lr(step, total_steps, config.warmup_steps, config.learning_rate)
            # Forward pass (grads cleared first; backward accumulates).
            model.zero_grads()
            logits = model.forward(inp, training=True)
            # Loss + gradient w.r.t. logits.
            loss, d_logits = cross_entropy_loss(logits, tgt, config.label_smoothing)
            # Backward pass through the whole model.
            model.backward(d_logits)
            # Gradient clipping (Haykin: bounded weight updates for stability).
            model.clip_grads(config.grad_clip)
            # Parameter update with Wiener-scaled Adam.
            model.update(lr)
            epoch_loss += loss
            epoch_batches += 1
            step += 1
        avg_loss = epoch_loss / max(epoch_batches, 1)
        history.append(avg_loss)
        # Perplexity; loss is clamped at 20 to avoid overflow in exp().
        perplexity = math.exp(min(avg_loss, 20))
        if avg_loss < best_loss:
            best_loss = avg_loss
            model.save("hran_best.pkl")
        # Progress display every 5 epochs (and after the first).
        # NOTE(review): `lr` below is the last inner-loop value and would be
        # unbound if `batches` were empty — confirm data is never empty.
        if (epoch + 1) % 5 == 0 or epoch == 0:
            bar_len = 20
            filled = int(bar_len * (epoch + 1) / config.epochs)
            bar = "β" * filled + "β" * (bar_len - filled)
            print(f" Epoch {epoch+1:3d}/{config.epochs} [{bar}] "
                  f"Loss: {avg_loss:.4f} | PPL: {perplexity:.1f} | LR: {lr:.6f}")
    print(f"\n[Training Complete] Best loss: {best_loss:.4f} | "
          f"Best PPL: {math.exp(min(best_loss, 20)):.2f}")
    return history
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
# SECTION 15: GENERATION (with Temperature + Top-k + Top-p)
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
def generate(model: HRANModel, tokenizer: HRANTokenizer, prompt: str,
             max_new_tokens: int = 60, temperature: float = 0.7,
             top_k: int = 40, top_p: float = 0.9) -> str:
    """
    Sample a continuation of `prompt` autoregressively.

    Per-step decoding pipeline: temperature scaling, top-k truncation,
    nucleus (top-p) truncation, then categorical sampling. Generation
    stops early when EOS is drawn; only the newly generated tokens are
    decoded and returned.
    """
    token_ids = [CFG.BOS_ID] + tokenizer.encode(prompt)
    for _ in range(max_new_tokens):
        # Keep only the most recent max_seq_len tokens as context.
        context = np.array([token_ids[-CFG.max_seq_len:]], dtype=np.int32)
        logits = model.forward(context, training=False)  # dropout disabled
        step_logits = logits[0, -1, :].astype(np.float64)
        # Temperature scaling (exploration noise control).
        step_logits /= max(temperature, 1e-8)
        # Top-k: push everything below the k-th largest logit to -inf-ish.
        if top_k > 0:
            threshold = np.partition(step_logits, -top_k)[-top_k]
            step_logits[step_logits < threshold] = -1e9
        probs = softmax(step_logits)
        # Nucleus cutoff: smallest prefix of the probability-sorted vocab
        # whose cumulative mass reaches top_p.
        order = np.argsort(-probs)
        cumulative = np.cumsum(probs[order])
        reached = np.nonzero(cumulative >= top_p)[0]
        cutoff = int(reached[0]) + 1 if reached.size else len(order)
        keep = order[:cutoff]
        nucleus = np.zeros_like(probs)
        nucleus[keep] = probs[keep]
        nucleus /= nucleus.sum() + 1e-9
        next_id = int(np.random.choice(len(nucleus), p=nucleus))
        if next_id == CFG.EOS_ID:
            break
        token_ids.append(next_id)
    # Return only tokens generated beyond BOS + prompt.
    return tokenizer.decode(token_ids[1 + len(tokenizer.encode(prompt)):])
def generate_response(model: HRANModel, tokenizer: HRANTokenizer,
                      question: str, temperature: float = 0.6) -> str:
    """
    Produce a chat reply for `question`.

    Samples at three temperatures (base, x0.8, x1.2), keeps replies with
    at least three words, and returns the longest one with its first
    letter capitalized. Falls back to a stock apology when no candidate
    qualifies.
    """
    # Normalize the prompt to match the training-data style.
    prompt = question.lower().strip().rstrip("?!.")
    answers = []
    for temp in (temperature, temperature * 0.8, temperature * 1.2):
        text = generate(model, tokenizer, prompt, max_new_tokens=60,
                        temperature=temp, top_k=50, top_p=0.92).strip()
        if len(text.split()) >= 3:
            answers.append(text)
    if not answers:
        return "I am still learning. Could you rephrase that?"
    # Most words wins (content heuristic).
    best = max(answers, key=lambda s: len(s.split()))
    return best[0].upper() + best[1:] if best else best
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
# SECTION 16: CONVERSATIONAL CHAT INTERFACE
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
BANNER = """
ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
β β
β βββ ββββββββββ ββββββ ββββ βββ β
β βββ ββββββββββββββββββββββββ βββ β
β ββββββββββββββββββββββββββββββ βββ β
β ββββββββββββββββββββββββββββββββββ β
β βββ ββββββ ββββββ ββββββ ββββββ β
β βββ ββββββ ββββββ ββββββ βββββ β
β β
β Haykin Resonant Attention Network β
β βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ β
β Architecture grounded in: Simon Haykin's Neural Networks β
β and Learning Machines + First Principles of Information Theory β
β β
β Innovations: β
β β’ RBF Attention Kernels (Ch.5) β’ Hebbian Embedding Init (Ch.2) β
β β’ Infomax FFN Activation (Ch.10) β’ Lateral Inhibition (Ch.9) β
β β’ Wiener Gradient Scaling (Ch.3) β
β β
β Commands: 'quit' to exit | 'info' for architecture details β
ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
"""
ARCH_INFO = """
βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
β HRAN Architecture Details β
β ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ£
β Embedding dim : 128 Vocab size : ~1500 β
β HRAN layers : 4 Attn heads : 4 β
β FFN dim : 512 Max seq len : 64 β
β Total params : ~2.5M Training : 80 epochs β
β ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ£
β RBF Attention : A_ij = softmax(-Ξ³βq_i - k_jβΒ²) β
β Infomax Act. : f(x) = tanh(x) + 0.1x β
β Hebbian Init : ΞW = Ξ·(yΒ·x - yΒ²Β·W) [Oja's rule] β
β Wiener Scale : lr_eff = lr Γ SNR/(1+SNR) β
βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
"""
def chat_loop(model: HRANModel, tokenizer: HRANTokenizer):
    """
    Main conversational REPL.

    Special commands (case-insensitive): 'quit'/'exit'/'bye'/'goodbye' to
    leave, 'info' to print architecture details, 'history' to show the
    last five exchanges. Any other input is answered by the model.
    """
    print(BANNER)
    print(" Ready to converse. Type your question or message.\n")
    history = []  # list of (user_input, response) pairs
    while True:
        try:
            user_input = input(" You βΊ ").strip()
        except (EOFError, KeyboardInterrupt):
            # Ctrl-D / Ctrl-C exits cleanly.
            print("\n HRAN βΊ Goodbye. Keep thinking.\n")
            break
        if not user_input:
            continue
        if user_input.lower() in ["quit", "exit", "bye", "goodbye"]:
            print(" HRAN βΊ Goodbye. Keep thinking.\n")
            break
        if user_input.lower() == "info":
            print(ARCH_INFO)
            continue
        if user_input.lower() == "history":
            if history:
                print("\n [Conversation History]")
                for i, (q, r) in enumerate(history[-5:], 1):
                    print(f" {i}. You: {q}")
                    print(f" HRAN: {r}\n")
            else:
                print(" [No history yet]\n")
            continue
        # Generate and display a model response.
        print(" HRAN βΊ ", end="", flush=True)
        t0 = time.time()
        response = generate_response(model, tokenizer, user_input)
        elapsed = time.time() - t0  # NOTE(review): timed but never displayed
        print(response)
        print(f" {'β' * 60}")
        history.append((user_input, response))
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
# SECTION 17: MAIN ENTRY POINT
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
def main():
    """Entry point: build tokenizer + model, then either load an existing
    checkpoint and chat, or train from scratch and chat."""
    # Fixed seeds for reproducible init, training order, and sampling.
    np.random.seed(42)
    random.seed(42)
    print("\n" + "β" * 65)
    print(" HRAN β Haykin Resonant Attention Network")
    print(" Built strictly from Haykin + First Principles")
    print("β" * 65 + "\n")
    # Build tokenizer from the full dataset; the model's vocab follows it.
    tokenizer = HRANTokenizer(max_vocab=CFG.vocab_size)
    tokenizer.build(FULL_DATASET)
    CFG.vocab_size = tokenizer.vocab_size
    # Build model (prints its parameter count on construction).
    model = HRANModel(CFG)
    # Offer to reuse a previously saved checkpoint if one exists.
    model_path = "hran_best.pkl"
    if os.path.exists(model_path):
        print(f"[HRAN] Found saved model at {model_path}")
        ans = input(" Load existing model? [Y/n]: ").strip().lower()
        if ans != "n":
            model.load(model_path)
            print(" Loaded! Entering chat mode.\n")
            chat_loop(model, tokenizer)
            return
    # Train from scratch (also saves the best checkpoint along the way).
    print("\n[HRAN] Starting training from scratch...\n")
    history = train(model, tokenizer, FULL_DATASET, CFG)
    # Plot the loss curve if matplotlib is installed (optional dependency).
    try:
        import matplotlib.pyplot as plt
        plt.figure(figsize=(10, 4))
        plt.plot(history, color="#e74c3c", linewidth=2)
        plt.title("HRAN Training Loss (Haykin RBF-Attention + Infomax FFN)")
        plt.xlabel("Epoch")
        plt.ylabel("Cross-Entropy Loss")
        plt.grid(alpha=0.3)
        plt.tight_layout()
        plt.savefig("hran_training_loss.png", dpi=150)
        plt.close()
        print("\n[HRAN] Loss curve saved to hran_training_loss.png")
    except ImportError:
        pass
    print("\n[HRAN] Training complete! Entering chat mode.")
    print(" (Model auto-saved as hran_best.pkl)\n")
    chat_loop(model, tokenizer)
if __name__ == "__main__":
    main()
|