Adrian Gabriel commited on
Commit
9891cc7
Β·
1 Parent(s): cd838e1

fix elementwise addition error and json import error

Browse files
Files changed (3) hide show
  1. static/index.html +176 -97
  2. tinytorch/core/conv.py +2127 -0
  3. tracer.py +16 -7
static/index.html CHANGED
@@ -1248,8 +1248,14 @@ box("attn_scores", [k_transpose, scores, scaled_scores, softmaxed_scores, attn_s
1248
 
1249
  // Helper to generate label from group data using current tensor names
1250
  function getGroupLabel(g) {
1251
- const output = tensors[g.outputId];
1252
- const inputs = (g.inputIds || []).map(id => tensors[id]).filter(Boolean);
 
 
 
 
 
 
1253
  const outputName = output?.name || g.opType;
1254
 
1255
  // For linear layers, only show the original input (not weight/bias)
@@ -1284,8 +1290,15 @@ box("attn_scores", [k_transpose, scores, scaled_scores, softmaxed_scores, attn_s
1284
 
1285
  // Helper to build op object from group data
1286
  function getGroupOp(g) {
1287
- const output = tensors[g.outputId];
1288
- const inputs = (g.inputIds || []).map(id => tensors[id]).filter(Boolean);
 
 
 
 
 
 
 
1289
  return { type: g.opType, inputs, output, meta: g.meta || {} };
1290
  }
1291
 
@@ -1313,57 +1326,37 @@ box("attn_scores", [k_transpose, scores, scaled_scores, softmaxed_scores, attn_s
1313
 
1314
  // If skipLayout is set, just position groups using their saved coordinates
1315
  if (skipLayout) {
1316
- const PAD = 20;
1317
- const LABEL_H = 24;
 
 
 
 
 
1318
 
1319
  // Use saved positions - no layout calculation
1320
  groups.forEach(g => {
1321
  const el = groupElById[g.id];
1322
  if (el) {
1323
- el.style.left = (g.x || 0) + 'px';
1324
- el.style.top = (g.y || 0) + 'px';
 
 
 
 
 
1325
  }
1326
  });
1327
 
1328
- // Recompute box bounds (supports nested boxes)
1329
- function recomputeBoxBounds(box) {
1330
- let minX = Infinity, minY = Infinity, maxX = -Infinity, maxY = -Infinity;
1331
-
1332
- // Include direct groups
1333
- (box.groupIds || []).forEach(gid => {
1334
- const g = groups.find(gr => gr.id === gid);
1335
- const el = groupElById[gid];
1336
- if (g && el) {
1337
- minX = Math.min(minX, g.x || 0);
1338
- minY = Math.min(minY, g.y || 0);
1339
- maxX = Math.max(maxX, (g.x || 0) + el.offsetWidth);
1340
- maxY = Math.max(maxY, (g.y || 0) + el.offsetHeight);
1341
- }
1342
- });
1343
-
1344
- // Include child boxes
1345
- (box.childBoxIds || []).forEach(bid => {
1346
- const childBox = boxes.find(b => b.id === bid);
1347
- if (childBox) {
1348
- if (childBox.w <= 0) recomputeBoxBounds(childBox);
1349
- if (childBox.w > 0) {
1350
- minX = Math.min(minX, childBox.x);
1351
- minY = Math.min(minY, childBox.y);
1352
- maxX = Math.max(maxX, childBox.x + childBox.w);
1353
- maxY = Math.max(maxY, childBox.y + childBox.h);
1354
- }
1355
- }
1356
- });
1357
-
1358
- if (minX < Infinity) {
1359
- box.x = minX - PAD;
1360
- box.y = minY - PAD - LABEL_H;
1361
- box.w = (maxX - minX) + 2 * PAD;
1362
- box.h = (maxY - minY) + 2 * PAD + LABEL_H;
1363
- }
1364
- }
1365
 
1366
- boxes.forEach(recomputeBoxBounds);
1367
  skipLayout = false; // Reset flag
1368
  } else {
1369
  // Create boxes from pending box() calls - this handles all layout
@@ -1622,6 +1615,7 @@ box("attn_scores", [k_transpose, scores, scaled_scores, softmaxed_scores, attn_s
1622
 
1623
  // ==================== Element-wise Hover Highlighting ====================
1624
  // For operations where output[i,j] corresponds to input[i,j] (ReLU, activations, etc.)
 
1625
  function setupElementwiseHover(inputCards, outputCard) {
1626
  const outputTable = outputCard.querySelector('.matrix-table');
1627
  if (!outputTable) return;
@@ -1629,21 +1623,35 @@ box("attn_scores", [k_transpose, scores, scaled_scores, softmaxed_scores, attn_s
1629
  const inputTables = inputCards.map(card => card.querySelector('.matrix-table')).filter(Boolean);
1630
  if (inputTables.length === 0) return;
1631
 
 
 
 
 
 
 
 
 
1632
  const outputCells = outputTable.querySelectorAll('td[data-row][data-col]');
1633
 
1634
  outputCells.forEach(cell => {
1635
  cell.style.cursor = 'pointer';
1636
 
1637
  cell.addEventListener('mouseenter', () => {
1638
- const row = cell.dataset.row;
1639
- const col = cell.dataset.col;
1640
 
1641
  // Highlight the output cell
1642
  cell.classList.add('highlight-cell');
1643
 
1644
- // Highlight corresponding cells in all input matrices
1645
- inputTables.forEach(inputTable => {
1646
- const inputCell = inputTable.querySelector(`td[data-row="${row}"][data-col="${col}"]`);
 
 
 
 
 
 
1647
  if (inputCell) {
1648
  inputCell.classList.add('highlight-cell');
1649
  }
@@ -1806,15 +1814,26 @@ box("attn_scores", [k_transpose, scores, scaled_scores, softmaxed_scores, attn_s
1806
  // ==================== Render Operation ====================
1807
  function renderOp(container, op) {
1808
  const { type, inputs, output, meta } = op;
1809
- const isElementwise = ['add', 'sub', 'mul', 'div'].includes(type) && inputs.length >= 2;
1810
- const isMatmul = type === 'matmul' && inputs.length >= 2;
1811
- const isLoss = ['mseloss', 'crossentropyloss', 'bceloss'].includes(type) && inputs.length >= 2;
1812
- const isLinear = type === 'linear' && inputs.length >= 2 && meta?.has_weight;
 
 
 
 
 
 
 
 
 
 
 
1813
 
1814
  // Determine output orientation for reduction operations
1815
  let outputOrientation = 'auto';
1816
- if (['sum', 'mean', 'max'].includes(type) && inputs[0] && output.shape && output.shape.length === 1) {
1817
- const inputShape = inputs[0].shape || [];
1818
  const axis = meta?.axis ?? meta?.arg0;
1819
 
1820
  if (inputShape.length >= 2) {
@@ -1831,13 +1850,25 @@ box("attn_scores", [k_transpose, scores, scaled_scores, softmaxed_scores, attn_s
1831
  // Get operator symbol for element-wise operations
1832
  const opSymbols = { add: '+', sub: 'βˆ’', mul: 'Γ—', div: 'Γ·' };
1833
 
1834
- if (isElementwise) {
1835
  // Element-wise: inputs side by side, result below left input
1836
  const wrapper = document.createElement('div');
1837
  wrapper.className = 'layout-elementwise';
1838
 
 
 
 
 
 
 
 
 
 
 
 
 
1839
  // First operand (top-left)
1840
- const left = createMatrixCard(inputs[0].name || inputs[0].id, inputs[0], 'auto', 'input');
1841
  left.classList.add('elem-left');
1842
  wrapper.appendChild(left);
1843
 
@@ -1847,13 +1878,13 @@ box("attn_scores", [k_transpose, scores, scaled_scores, softmaxed_scores, attn_s
1847
  opSym.textContent = opSymbols[type] || '+';
1848
  wrapper.appendChild(opSym);
1849
 
1850
- // Second operand (top-right)
1851
- const right = createMatrixCard(inputs[1].name || inputs[1].id, inputs[1], 'auto', 'input');
1852
  right.classList.add('elem-right');
1853
  wrapper.appendChild(right);
1854
 
1855
  // Result (bottom-left, below first operand)
1856
- const result = createMatrixCard(output.name || type, output, outputOrientation, 'output');
1857
  result.classList.add('elem-result');
1858
  wrapper.appendChild(result);
1859
 
@@ -1864,13 +1895,13 @@ box("attn_scores", [k_transpose, scores, scaled_scores, softmaxed_scores, attn_s
1864
  return;
1865
  }
1866
 
1867
- if (isLoss) {
1868
  // Loss functions: predictions | targets, loss below
1869
  const wrapper = document.createElement('div');
1870
  wrapper.className = 'layout-elementwise';
1871
 
1872
  // Predictions (top-left)
1873
- const preds = createMatrixCard(inputs[0].name || 'predictions', inputs[0], 'auto', 'input');
1874
  preds.classList.add('elem-left');
1875
  wrapper.appendChild(preds);
1876
 
@@ -1882,12 +1913,12 @@ box("attn_scores", [k_transpose, scores, scaled_scores, softmaxed_scores, attn_s
1882
  wrapper.appendChild(arrow);
1883
 
1884
  // Targets (top-right)
1885
- const targs = createMatrixCard(inputs[1].name || 'targets', inputs[1], 'auto', 'input');
1886
  targs.classList.add('elem-right');
1887
  wrapper.appendChild(targs);
1888
 
1889
  // Loss value (bottom-left)
1890
- const lossCard = createMatrixCard(output.name || 'loss', output, 'auto', 'output');
1891
  lossCard.classList.add('elem-result');
1892
  wrapper.appendChild(lossCard);
1893
 
@@ -1898,40 +1929,38 @@ box("attn_scores", [k_transpose, scores, scaled_scores, softmaxed_scores, attn_s
1898
  return;
1899
  }
1900
 
1901
- if (isLinear) {
1902
  // Linear layer: show as matmul layout for W @ X.T
1903
  // Instrumentation inserts weight at front: inputs[0] = weight, inputs[1] = x
1904
- const weight = inputs[0];
1905
- const x = inputs[1];
1906
- const bias = inputs.length > 2 ? inputs[2] : null;
1907
 
1908
  // Create transposed version of x for visualization
1909
  // The computation is (W @ x.T).T, so we show x.T on top
1910
  const xT = {
1911
- ...x,
1912
- name: (x.name || 'X') + '.T',
1913
- shape: x.shape ? [...x.shape].reverse() : x.shape,
1914
- data: x.data
1915
  };
1916
  // Transpose the actual data for display
1917
- if (x.data && Array.isArray(x.data) && x.data.length > 0) {
1918
  if (Array.isArray(x.data[0])) {
1919
  xT.data = x.data[0].map((_, colIdx) => x.data.map(row => row[colIdx]));
1920
  }
1921
  }
1922
 
1923
  // Create transposed version of output for visualization
1924
- // The actual output is (4,16) but visually we show W @ X.T = (16,4)
1925
  const outputT = {
1926
- ...output,
1927
- name: output.name || 'y',
1928
- shape: output.shape ? [...output.shape].reverse() : output.shape,
1929
- data: output.data
1930
  };
1931
  // Transpose the output data for display
1932
- if (output.data && Array.isArray(output.data) && output.data.length > 0) {
1933
- if (Array.isArray(output.data[0])) {
1934
- outputT.data = output.data[0].map((_, colIdx) => output.data.map(row => row[colIdx]));
1935
  }
1936
  }
1937
 
@@ -1965,20 +1994,20 @@ box("attn_scores", [k_transpose, scores, scaled_scores, softmaxed_scores, attn_s
1965
  return;
1966
  }
1967
 
1968
- if (isMatmul) {
1969
  // Matrix multiplication: keep grid layout for proper alignment
1970
- const grid = document.createElement('div');
1971
  grid.className = 'layout-grid layout-binary';
1972
 
1973
- const left = createMatrixCard(inputs[0].name || inputs[0].id, inputs[0], 'auto', 'left');
1974
  left.classList.add('pos-left');
1975
  grid.appendChild(left);
1976
 
1977
- const top = createMatrixCard(inputs[1].name || inputs[1].id, inputs[1], 'auto', 'top');
1978
  top.classList.add('pos-top');
1979
  grid.appendChild(top);
1980
 
1981
- const res = createMatrixCard(output.name || type, output, outputOrientation, 'result');
1982
  res.classList.add('pos-result');
1983
  grid.appendChild(res);
1984
 
@@ -1995,6 +2024,7 @@ box("attn_scores", [k_transpose, scores, scaled_scores, softmaxed_scores, attn_s
1995
 
1996
  // Helper to get display name for a tensor, falling back to producer's output name
1997
  function getInputDisplayName(tensor) {
 
1998
  if (tensor.name) return tensor.name;
1999
  // Look for a group that produced this tensor
2000
  const producerGroup = groups.find(g => g.outputId === tensor.id);
@@ -2012,13 +2042,13 @@ box("attn_scores", [k_transpose, scores, scaled_scores, softmaxed_scores, attn_s
2012
  }
2013
 
2014
  const inputCards = [];
2015
- if (inputs[0]) {
2016
- const inputLabel = getInputDisplayName(inputs[0]);
2017
- const inputCard = createMatrixCard(inputLabel, inputs[0], 'auto', 'input');
2018
  grid.appendChild(inputCard);
2019
  inputCards.push(inputCard);
2020
  }
2021
- const outputCard = createMatrixCard(output.name || type, output, outputOrientation, 'output');
2022
  grid.appendChild(outputCard);
2023
 
2024
  // Add hover highlighting based on operation type
@@ -2365,6 +2395,16 @@ box("attn_scores", [k_transpose, scores, scaled_scores, softmaxed_scores, attn_s
2365
  x: b.x, y: b.y, w: b.w, h: b.h,
2366
  fromCode: b.fromCode
2367
  })),
 
 
 
 
 
 
 
 
 
 
2368
  notes: canvasNotes.map(n => ({ id: n.id, x: n.x, y: n.y, text: n.text }))
2369
  };
2370
  const blob = new Blob([JSON.stringify(state, null, 2)], { type: 'application/json' });
@@ -2385,17 +2425,31 @@ box("attn_scores", [k_transpose, scores, scaled_scores, softmaxed_scores, attn_s
2385
  // Restore code
2386
  if (state.code) document.getElementById('code').value = state.code;
2387
 
 
 
 
 
 
 
 
 
 
 
 
2388
  // Restore groups
2389
- if (state.groups) {
2390
  groups.length = 0;
2391
  state.groups.forEach(g => groups.push(g));
 
 
2392
  }
2393
 
2394
  // Restore boxes
2395
- if (state.boxes) {
2396
  boxes.length = 0;
2397
  state.boxes.forEach(b => boxes.push(b));
2398
- nextBoxId = Math.max(...boxes.map(b => parseInt(b.id.replace('b', '')) || 0), 0) + 1;
 
2399
  }
2400
 
2401
  // Restore notes
@@ -2407,13 +2461,38 @@ box("attn_scores", [k_transpose, scores, scaled_scores, softmaxed_scores, attn_s
2407
  // Set flag to preserve loaded positions
2408
  skipLayout = true;
2409
 
2410
- // Render without running code (data already loaded)
 
 
 
 
 
 
 
 
 
 
2411
  render();
2412
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2413
  } catch(err) {
2414
- console.error(err);
2415
  alert('Invalid JSON: ' + err.message);
2416
  }
 
 
2417
  };
2418
  reader.readAsText(file);
2419
  };
 
1248
 
1249
  // Helper to generate label from group data using current tensor names
1250
  function getGroupLabel(g) {
1251
+ // Helper to get tensor or create placeholder for missing tensor
1252
+ function getOrPlaceholder(id) {
1253
+ if (tensors[id]) return tensors[id];
1254
+ return { id, shape: [], data: [], name: id };
1255
+ }
1256
+
1257
+ const output = getOrPlaceholder(g.outputId);
1258
+ const inputs = (g.inputIds || []).map(id => getOrPlaceholder(id));
1259
  const outputName = output?.name || g.opType;
1260
 
1261
  // For linear layers, only show the original input (not weight/bias)
 
1290
 
1291
  // Helper to build op object from group data
1292
  function getGroupOp(g) {
1293
+ // Create placeholder tensor if missing (for backward-compatible JSON loading)
1294
+ function getOrPlaceholder(id) {
1295
+ if (tensors[id]) return tensors[id];
1296
+ // Create placeholder with just id (will render as empty matrix)
1297
+ return { id, shape: [], data: [], name: id };
1298
+ }
1299
+
1300
+ const output = getOrPlaceholder(g.outputId);
1301
+ const inputs = (g.inputIds || []).map(id => getOrPlaceholder(id));
1302
  return { type: g.opType, inputs, output, meta: g.meta || {} };
1303
  }
1304
 
 
1326
 
1327
  // If skipLayout is set, just position groups using their saved coordinates
1328
  if (skipLayout) {
1329
+ // Find minimum Y to calculate offset if needed (ensure content is below toolbar)
1330
+ const MIN_Y = 80; // Minimum Y to ensure content is visible below toolbar
1331
+ let minLoadedY = Infinity;
1332
+ groups.forEach(g => {
1333
+ if (typeof g.y === 'number') minLoadedY = Math.min(minLoadedY, g.y);
1334
+ });
1335
+ const yOffset = (minLoadedY < MIN_Y && minLoadedY !== Infinity) ? (MIN_Y - minLoadedY) : 0;
1336
 
1337
  // Use saved positions - no layout calculation
1338
  groups.forEach(g => {
1339
  const el = groupElById[g.id];
1340
  if (el) {
1341
+ const xPos = typeof g.x === 'number' ? g.x : 0;
1342
+ const yPos = (typeof g.y === 'number' ? g.y : 0) + yOffset;
1343
+ el.style.left = xPos + 'px';
1344
+ el.style.top = yPos + 'px';
1345
+ // Update the group object with adjusted position
1346
+ g.x = xPos;
1347
+ g.y = yPos;
1348
  }
1349
  });
1350
 
1351
+ // Boxes already have their x, y, w, h from JSON - apply same offset
1352
+ boxes.forEach(box => {
1353
+ if (typeof box.x !== 'number') box.x = 0;
1354
+ if (typeof box.y !== 'number') box.y = 0;
1355
+ else box.y = box.y + yOffset; // Apply same offset
1356
+ if (typeof box.w !== 'number' || box.w <= 0) box.w = 100;
1357
+ if (typeof box.h !== 'number' || box.h <= 0) box.h = 100;
1358
+ });
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1359
 
 
1360
  skipLayout = false; // Reset flag
1361
  } else {
1362
  // Create boxes from pending box() calls - this handles all layout
 
1615
 
1616
  // ==================== Element-wise Hover Highlighting ====================
1617
  // For operations where output[i,j] corresponds to input[i,j] (ReLU, activations, etc.)
1618
+ // Handles NumPy broadcasting: 1D tensors broadcast as rows (1, N) or columns (N, 1)
1619
  function setupElementwiseHover(inputCards, outputCard) {
1620
  const outputTable = outputCard.querySelector('.matrix-table');
1621
  if (!outputTable) return;
 
1623
  const inputTables = inputCards.map(card => card.querySelector('.matrix-table')).filter(Boolean);
1624
  if (inputTables.length === 0) return;
1625
 
1626
+ // Detect the shape of each input table (for broadcasting)
1627
+ const inputShapes = inputTables.map(table => {
1628
+ const rows = table.querySelectorAll('tr');
1629
+ const numRows = rows.length;
1630
+ const numCols = rows[0] ? rows[0].querySelectorAll('td[data-col]').length : 0;
1631
+ return { rows: numRows, cols: numCols };
1632
+ });
1633
+
1634
  const outputCells = outputTable.querySelectorAll('td[data-row][data-col]');
1635
 
1636
  outputCells.forEach(cell => {
1637
  cell.style.cursor = 'pointer';
1638
 
1639
  cell.addEventListener('mouseenter', () => {
1640
+ const row = parseInt(cell.dataset.row);
1641
+ const col = parseInt(cell.dataset.col);
1642
 
1643
  // Highlight the output cell
1644
  cell.classList.add('highlight-cell');
1645
 
1646
+ // Highlight corresponding cells in all input matrices (with broadcasting)
1647
+ inputTables.forEach((inputTable, idx) => {
1648
+ const shape = inputShapes[idx];
1649
+ // Handle broadcasting: if input has only 1 row, always use row 0
1650
+ // If input has only 1 col, always use col 0
1651
+ const targetRow = shape.rows === 1 ? 0 : row;
1652
+ const targetCol = shape.cols === 1 ? 0 : col;
1653
+
1654
+ const inputCell = inputTable.querySelector(`td[data-row="${targetRow}"][data-col="${targetCol}"]`);
1655
  if (inputCell) {
1656
  inputCell.classList.add('highlight-cell');
1657
  }
 
1814
  // ==================== Render Operation ====================
1815
  function renderOp(container, op) {
1816
  const { type, inputs, output, meta } = op;
1817
+
1818
+ // Safe helper to get tensor name with fallback
1819
+ function getName(t, fallback = '?') {
1820
+ if (!t) return fallback;
1821
+ return t.name || t.id || fallback;
1822
+ }
1823
+
1824
+ // Ensure inputs is always an array
1825
+ const safeInputs = inputs || [];
1826
+ const safeOutput = output || { id: '?', name: type, shape: [], data: [] };
1827
+
1828
+ const isElementwise = ['add', 'sub', 'mul', 'div'].includes(type) && safeInputs.length >= 2;
1829
+ const isMatmul = type === 'matmul' && safeInputs.length >= 2;
1830
+ const isLoss = ['mseloss', 'crossentropyloss', 'bceloss'].includes(type) && safeInputs.length >= 2;
1831
+ const isLinear = type === 'linear' && safeInputs.length >= 2 && meta?.has_weight;
1832
 
1833
  // Determine output orientation for reduction operations
1834
  let outputOrientation = 'auto';
1835
+ if (['sum', 'mean', 'max'].includes(type) && safeInputs[0] && safeOutput.shape && safeOutput.shape.length === 1) {
1836
+ const inputShape = safeInputs[0].shape || [];
1837
  const axis = meta?.axis ?? meta?.arg0;
1838
 
1839
  if (inputShape.length >= 2) {
 
1850
  // Get operator symbol for element-wise operations
1851
  const opSymbols = { add: '+', sub: 'βˆ’', mul: 'Γ—', div: 'Γ·' };
1852
 
1853
+ if (isElementwise && safeInputs[0] && safeInputs[1]) {
1854
  // Element-wise: inputs side by side, result below left input
1855
  const wrapper = document.createElement('div');
1856
  wrapper.className = 'layout-elementwise';
1857
 
1858
+ // Determine orientation for 1D tensors based on NumPy broadcasting rules:
1859
+ // A 1D tensor (N,) broadcasts as (1, N) - a ROW, not a column
1860
+ // This affects how we display the second operand (typically bias)
1861
+ const input0Shape = safeInputs[0].shape || [];
1862
+ const input1Shape = safeInputs[1].shape || [];
1863
+
1864
+ // If second input is 1D and first is 2D, display second as ROW (how NumPy broadcasts it)
1865
+ let rightOrientation = 'auto';
1866
+ if (input1Shape.length === 1 && input0Shape.length >= 2) {
1867
+ rightOrientation = 'row'; // 1D broadcasts as row in NumPy
1868
+ }
1869
+
1870
  // First operand (top-left)
1871
+ const left = createMatrixCard(getName(safeInputs[0]), safeInputs[0], 'auto', 'input');
1872
  left.classList.add('elem-left');
1873
  wrapper.appendChild(left);
1874
 
 
1878
  opSym.textContent = opSymbols[type] || '+';
1879
  wrapper.appendChild(opSym);
1880
 
1881
+ // Second operand (top-right) - use row orientation for 1D bias
1882
+ const right = createMatrixCard(getName(safeInputs[1]), safeInputs[1], rightOrientation, 'input');
1883
  right.classList.add('elem-right');
1884
  wrapper.appendChild(right);
1885
 
1886
  // Result (bottom-left, below first operand)
1887
+ const result = createMatrixCard(getName(safeOutput, type), safeOutput, outputOrientation, 'output');
1888
  result.classList.add('elem-result');
1889
  wrapper.appendChild(result);
1890
 
 
1895
  return;
1896
  }
1897
 
1898
+ if (isLoss && safeInputs[0] && safeInputs[1]) {
1899
  // Loss functions: predictions | targets, loss below
1900
  const wrapper = document.createElement('div');
1901
  wrapper.className = 'layout-elementwise';
1902
 
1903
  // Predictions (top-left)
1904
+ const preds = createMatrixCard(getName(safeInputs[0], 'predictions'), safeInputs[0], 'auto', 'input');
1905
  preds.classList.add('elem-left');
1906
  wrapper.appendChild(preds);
1907
 
 
1913
  wrapper.appendChild(arrow);
1914
 
1915
  // Targets (top-right)
1916
+ const targs = createMatrixCard(getName(safeInputs[1], 'targets'), safeInputs[1], 'auto', 'input');
1917
  targs.classList.add('elem-right');
1918
  wrapper.appendChild(targs);
1919
 
1920
  // Loss value (bottom-left)
1921
+ const lossCard = createMatrixCard(getName(safeOutput, 'loss'), safeOutput, 'auto', 'output');
1922
  lossCard.classList.add('elem-result');
1923
  wrapper.appendChild(lossCard);
1924
 
 
1929
  return;
1930
  }
1931
 
1932
+ if (isLinear && safeInputs[0] && safeInputs[1]) {
1933
  // Linear layer: show as matmul layout for W @ X.T
1934
  // Instrumentation inserts weight at front: inputs[0] = weight, inputs[1] = x
1935
+ const weight = safeInputs[0];
1936
+ const x = safeInputs[1];
 
1937
 
1938
  // Create transposed version of x for visualization
1939
  // The computation is (W @ x.T).T, so we show x.T on top
1940
  const xT = {
1941
+ ...(x || {}),
1942
+ name: getName(x, 'X') + '.T',
1943
+ shape: x?.shape ? [...x.shape].reverse() : [],
1944
+ data: x?.data || []
1945
  };
1946
  // Transpose the actual data for display
1947
+ if (x?.data && Array.isArray(x.data) && x.data.length > 0) {
1948
  if (Array.isArray(x.data[0])) {
1949
  xT.data = x.data[0].map((_, colIdx) => x.data.map(row => row[colIdx]));
1950
  }
1951
  }
1952
 
1953
  // Create transposed version of output for visualization
 
1954
  const outputT = {
1955
+ ...(safeOutput || {}),
1956
+ name: getName(safeOutput, 'y'),
1957
+ shape: safeOutput?.shape ? [...safeOutput.shape].reverse() : [],
1958
+ data: safeOutput?.data || []
1959
  };
1960
  // Transpose the output data for display
1961
+ if (safeOutput?.data && Array.isArray(safeOutput.data) && safeOutput.data.length > 0) {
1962
+ if (Array.isArray(safeOutput.data[0])) {
1963
+ outputT.data = safeOutput.data[0].map((_, colIdx) => safeOutput.data.map(row => row[colIdx]));
1964
  }
1965
  }
1966
 
 
1994
  return;
1995
  }
1996
 
1997
+ if (isMatmul && safeInputs[0] && safeInputs[1]) {
1998
  // Matrix multiplication: keep grid layout for proper alignment
1999
+ const grid = document.createElement('div');
2000
  grid.className = 'layout-grid layout-binary';
2001
 
2002
+ const left = createMatrixCard(getName(safeInputs[0]), safeInputs[0], 'auto', 'left');
2003
  left.classList.add('pos-left');
2004
  grid.appendChild(left);
2005
 
2006
+ const top = createMatrixCard(getName(safeInputs[1]), safeInputs[1], 'auto', 'top');
2007
  top.classList.add('pos-top');
2008
  grid.appendChild(top);
2009
 
2010
+ const res = createMatrixCard(getName(safeOutput, type), safeOutput, outputOrientation, 'result');
2011
  res.classList.add('pos-result');
2012
  grid.appendChild(res);
2013
 
 
2024
 
2025
  // Helper to get display name for a tensor, falling back to producer's output name
2026
  function getInputDisplayName(tensor) {
2027
+ if (!tensor) return '?';
2028
  if (tensor.name) return tensor.name;
2029
  // Look for a group that produced this tensor
2030
  const producerGroup = groups.find(g => g.outputId === tensor.id);
 
2042
  }
2043
 
2044
  const inputCards = [];
2045
+ if (safeInputs[0]) {
2046
+ const inputLabel = getInputDisplayName(safeInputs[0]);
2047
+ const inputCard = createMatrixCard(inputLabel, safeInputs[0], 'auto', 'input');
2048
  grid.appendChild(inputCard);
2049
  inputCards.push(inputCard);
2050
  }
2051
+ const outputCard = createMatrixCard(getName(safeOutput, type), safeOutput, outputOrientation, 'output');
2052
  grid.appendChild(outputCard);
2053
 
2054
  // Add hover highlighting based on operation type
 
2395
  x: b.x, y: b.y, w: b.w, h: b.h,
2396
  fromCode: b.fromCode
2397
  })),
2398
+ tensors: Object.fromEntries(
2399
+ Object.entries(tensors)
2400
+ .filter(([id, t]) => t != null) // Skip null/undefined tensors
2401
+ .map(([id, t]) => [id, {
2402
+ id: t.id || id,
2403
+ shape: t.shape || [],
2404
+ data: t.data || [],
2405
+ name: t.name || null
2406
+ }])
2407
+ ),
2408
  notes: canvasNotes.map(n => ({ id: n.id, x: n.x, y: n.y, text: n.text }))
2409
  };
2410
  const blob = new Blob([JSON.stringify(state, null, 2)], { type: 'application/json' });
 
2425
  // Restore code
2426
  if (state.code) document.getElementById('code').value = state.code;
2427
 
2428
+ // Restore tensors (needed for rendering)
2429
+ if (state.tensors && typeof state.tensors === 'object') {
2430
+ // Clear and restore tensors
2431
+ Object.keys(tensors).forEach(k => delete tensors[k]);
2432
+ Object.entries(state.tensors).forEach(([id, t]) => {
2433
+ if (t != null) {
2434
+ tensors[id] = t;
2435
+ }
2436
+ });
2437
+ }
2438
+
2439
  // Restore groups
2440
+ if (state.groups && Array.isArray(state.groups)) {
2441
  groups.length = 0;
2442
  state.groups.forEach(g => groups.push(g));
2443
+ const groupNums = groups.map(g => parseInt((g.id || '').replace('g', '')) || 0);
2444
+ nextGroupId = groupNums.length > 0 ? Math.max(...groupNums) + 1 : 1;
2445
  }
2446
 
2447
  // Restore boxes
2448
+ if (state.boxes && Array.isArray(state.boxes)) {
2449
  boxes.length = 0;
2450
  state.boxes.forEach(b => boxes.push(b));
2451
+ const boxNums = boxes.map(b => parseInt((b.id || '').replace('b', '')) || 0);
2452
+ nextBoxId = boxNums.length > 0 ? Math.max(...boxNums) + 1 : 1;
2453
  }
2454
 
2455
  // Restore notes
 
2461
  // Set flag to preserve loaded positions
2462
  skipLayout = true;
2463
 
2464
+ // If tensors weren't in the JSON (old format), warn user
2465
+ if (!state.tensors || Object.keys(state.tensors).length === 0) {
2466
+ alert('This JSON was saved in an old format without tensor data.\n\n' +
2467
+ 'The layout will be restored, but matrices will be empty.\n' +
2468
+ 'To get full data: Run code, then Export JSON again.');
2469
+ }
2470
+
2471
+ // Reset zoom to 100% for proper positioning (set before render so toolbar picks it up)
2472
+ zoomLevel = 1;
2473
+
2474
+ // Render without running code (data already loaded or using placeholders)
2475
  render();
2476
 
2477
+ // After render, ensure zoom select shows correct value and scroll to top
2478
+ setTimeout(() => {
2479
+ const zoomSelect = document.querySelector('#toolbar select');
2480
+ if (zoomSelect) zoomSelect.value = '100%';
2481
+ applyZoom();
2482
+
2483
+ const canvas = document.getElementById('canvas');
2484
+ if (canvas) {
2485
+ canvas.scrollTop = 0;
2486
+ canvas.scrollLeft = 0;
2487
+ }
2488
+ }, 50);
2489
+
2490
  } catch(err) {
2491
+ console.error('JSON load error:', err);
2492
  alert('Invalid JSON: ' + err.message);
2493
  }
2494
+ // Reset file input so same file can be loaded again
2495
+ e.target.value = '';
2496
  };
2497
  reader.readAsText(file);
2498
  };
tinytorch/core/conv.py ADDED
@@ -0,0 +1,2127 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # ---
2
+ # jupyter:
3
+ # jupytext:
4
+ # text_representation:
5
+ # extension: .py
6
+ # format_name: percent
7
+ # format_version: '1.3'
8
+ # jupytext_version: 1.17.1
9
+ # kernelspec:
10
+ # display_name: Python 3 (ipykernel)
11
+ # language: python
12
+ # name: python3
13
+ # ---
14
+
15
+ # %% [markdown]
16
+ """
17
+ # Module 09: Convolutions - Processing Images with Convolutions
18
+
19
+ Welcome to Module 09! You'll implement spatial operations that transform machine learning from working with simple vectors to understanding images and spatial patterns.
20
+
21
+ ## πŸ”— Prerequisites & Progress
22
+ **You've Built**: Complete training pipeline with MLPs, optimizers, and data loaders
23
+ **You'll Build**: Spatial operations - Conv2d, MaxPool2d, AvgPool2d for image processing
24
+ **You'll Enable**: Convolutional Neural Networks (CNNs) for computer vision
25
+
26
+ **Connection Map**:
27
+ ```
28
+ Training Pipeline β†’ Spatial Operations β†’ CNN (Milestone 03)
29
+ (MLPs) (Conv/Pool) (Computer Vision)
30
+ ```
31
+
32
+ ## 🎯 Learning Objectives
33
+ By the end of this module, you will:
34
+ 1. Implement Conv2d with explicit loops to understand O(NΒ²MΒ²KΒ²) complexity
35
+ 2. Build pooling operations (Max and Average) for spatial reduction
36
+ 3. Understand receptive fields and spatial feature extraction
37
+ 4. Analyze memory vs computation trade-offs in spatial operations
38
+
39
+ Let's get started!
40
+
41
+ ## πŸ“¦ Where This Code Lives in the Final Package
42
+
43
+ **Learning Side:** You work in `modules/09_convolutions/convolutions_dev.py`
44
+ **Building Side:** Code exports to `tinytorch.core.spatial`
45
+
46
+ ```python
47
+ # How to use this module:
48
+ from tinytorch.core.spatial import Conv2d, MaxPool2d, AvgPool2d
49
+ ```
50
+
51
+ **Why this matters:**
52
+ - **Learning:** Complete spatial processing system in one focused module for deep understanding
53
+ - **Production:** Proper organization like PyTorch's torch.nn.Conv2d with all spatial operations together
54
+ - **Consistency:** All convolution and pooling operations in core.spatial
55
+ - **Integration:** Works seamlessly with existing layers for complete CNN architectures
56
+ """
57
+
58
+ # %% nbgrader={"grade": false, "grade_id": "spatial-setup", "solution": true}
59
+
60
+
61
+ #| default_exp core.spatial
62
+
63
+ #| export
64
+ import numpy as np
65
+ import time
66
+
67
+ from tinytorch.core.tensor import Tensor
68
+
69
+ # Constants for convolution defaults
70
+ DEFAULT_KERNEL_SIZE = 3 # Default kernel size for convolutions
71
+ DEFAULT_STRIDE = 1 # Default stride for convolutions
72
+ DEFAULT_PADDING = 0 # Default padding for convolutions
73
+
74
+ # %% [markdown]
75
+ """
76
+ ## πŸ’‘ Introduction - What are Spatial Operations?
77
+
78
+ Spatial operations transform machine learning from working with simple vectors to understanding images and spatial patterns. When you look at a photo, your brain naturally processes spatial relationships - edges, textures, objects. Spatial operations give neural networks this same capability.
79
+
80
+ ### The Two Core Spatial Operations
81
+
82
+ **Convolution**: Detects local patterns by sliding filters across the input
83
+ **Pooling**: Reduces spatial dimensions while preserving important features
84
+
85
+ ### Visual Example: How Convolution Works
86
+
87
+ ```
88
+ Input Image (5Γ—5): Kernel (3Γ—3): Output (3Γ—3):
89
+ β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”
90
+ β”‚ 1 2 3 4 5 β”‚ β”‚ 1 0 -1 β”‚ β”‚ ? ? ? β”‚
91
+ β”‚ 6 7 8 9 0 β”‚ * β”‚ 1 0 -1 β”‚ = β”‚ ? ? ? β”‚
92
+ β”‚ 1 2 3 4 5 β”‚ β”‚ 1 0 -1 β”‚ β”‚ ? ? ? β”‚
93
+ β”‚ 6 7 8 9 0 β”‚ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜
94
+ β”‚ 1 2 3 4 5 β”‚
95
+ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜
96
+
97
+ Sliding Window Process:
98
+ Position (0,0): [1,2,3] Position (0,1): [2,3,4] Position (0,2): [3,4,5]
99
+ [6,7,8] * [7,8,9] * [8,9,0] *
100
+ [1,2,3] [2,3,4] [3,4,5]
101
+ = Output[0,0] = Output[0,1] = Output[0,2]
102
+ ```
103
+
104
+ Each output pixel summarizes a local neighborhood, allowing the network to detect patterns like edges, corners, and textures.
105
+
106
+ ### Why Spatial Operations Transform ML
107
+
108
+ ```
109
+ Without Convolution: With Convolution:
110
+ 32Γ—32Γ—3 image = 3,072 inputs 32Γ—32Γ—3 β†’ Conv β†’ 32Γ—32Γ—16
111
+ ↓ ↓ ↓
112
+ Dense(3072 β†’ 1000) = 3M parameters Shared 3Γ—3 kernel = 432 parameters
113
+ ↓ ↓ ↓
114
+ Memory explosion + no spatial awareness Efficient + preserves spatial structure
115
+ ```
116
+
117
+ Convolution achieves dramatic parameter reduction (1000Γ— fewer!) while preserving the spatial relationships that matter for visual understanding.
118
+ """
119
+
120
+ # %% [markdown]
121
+ """
122
+ ## πŸ“ Mathematical Foundations
123
+
124
+ ### Understanding Convolution Step by Step
125
+
126
+ Convolution sounds complex, but it's just "sliding window multiplication and summation." Let's see exactly how it works:
127
+
128
+ ```
129
+ Step 1: Position the kernel over input
130
+ Input: Kernel:
131
+ β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”Œβ”€β”€β”€β”€β”€β”
132
+ β”‚ 1 2 3 4 β”‚ β”‚ 1 0 β”‚ ← Place kernel at position (0,0)
133
+ β”‚ 5 6 7 8 β”‚ Γ— β”‚ 0 1 β”‚
134
+ β”‚ 9 0 1 2 β”‚ β””β”€β”€β”€β”€β”€β”˜
135
+ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜
136
+
137
+ Step 2: Multiply corresponding elements
138
+ Overlap: Computation:
139
+ β”Œβ”€β”€β”€β”€β”€β” 1Γ—1 + 2Γ—0 + 5Γ—0 + 6Γ—1 = 1 + 0 + 0 + 6 = 7
140
+ β”‚ 1 2 β”‚
141
+ β”‚ 5 6 β”‚
142
+ β””β”€β”€β”€β”€β”€β”˜
143
+
144
+ Step 3: Slide kernel and repeat
145
+ Position (0,1): Position (1,0): Position (1,1):
146
+ β”Œβ”€β”€β”€β”€β”€β” β”Œβ”€β”€β”€β”€β”€β” β”Œβ”€β”€β”€β”€β”€β”
147
+ β”‚ 2 3 β”‚ β”‚ 5 6 β”‚ β”‚ 6 7 β”‚
148
+ β”‚ 6 7 β”‚ β”‚ 9 0 β”‚ β”‚ 0 1 β”‚
149
+ β””β”€β”€β”€β”€β”€β”˜ β””β”€β”€β”€β”€β”€β”˜ β””β”€β”€β”€β”€β”€β”˜
150
+ Result: 9 Result: 5 Result: 8
151
+
152
+ Final Output: β”Œβ”€β”€β”€β”€β”€β”
153
+ β”‚ 7 9 β”‚
154
+ β”‚ 5 8 β”‚
155
+ β””β”€β”€β”€β”€β”€β”˜
156
+ ```
157
+
158
+ ### The Mathematical Formula
159
+
160
+ For 2D convolution, we slide kernel K across input I:
161
+ ```
162
+ O[i,j] = Ξ£ Ξ£ I[i+m, j+n] Γ— K[m,n]
163
+ m n
164
+ ```
165
+
166
+ This formula captures the "multiply and sum" operation for each kernel position.
167
+
168
+ ### Pooling: Spatial Summarization
169
+
170
+ ```
171
+ Max Pooling Example (2Γ—2 window):
172
+ Input: Output:
173
+ β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”Œβ”€β”€β”€β”€β”€β”€β”€β”
174
+ β”‚ 1 3 2 4 β”‚ β”‚ 6 8 β”‚ ← max([1,3,5,6])=6, max([2,4,7,8])=8
175
+ β”‚ 5 6 7 8 β”‚ β”‚ 9 9 β”‚ ← max([5,2,9,1])=9, max([7,4,9,3])=9
176
+ β”‚ 2 9 1 3 β”‚ β””β”€β”€β”€β”€β”€β”€β”€β”˜
177
+ β”‚ 0 1 9 3 β”‚
178
+ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜
179
+
180
+ Average Pooling (same window):
181
+ β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”
182
+ β”‚ 3.75 5.25 β”‚ ← avg([1,3,5,6])=3.75, avg([2,4,7,8])=5.25
183
+ β”‚ 2.75 5.75 β”‚ ← avg([5,2,9,1])=4.25, avg([7,4,9,3])=5.75
184
+ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜
185
+ ```
186
+
187
+ ### Why This Complexity Matters
188
+
189
+ For convolution with input (1, 3, 224, 224) and kernel (64, 3, 3, 3):
190
+ - **Operations**: 1 Γ— 64 Γ— 3 Γ— 3 Γ— 3 Γ— 224 Γ— 224 = 86.7 million multiply-adds
191
+ - **Memory**: Input (600KB) + Weights (6.9KB) + Output (12.8MB) = ~13.4MB
192
+
193
+ This is why kernel size matters enormously - a 7Γ—7 kernel would require 5.4Γ— more computation!
194
+
195
+ ### Key Properties That Enable Deep Learning
196
+
197
+ **Translation Equivariance**: Move the cat β†’ detection moves the same way
198
+ **Parameter Sharing**: Same edge detector works everywhere in the image
199
+ **Local Connectivity**: Each output only looks at nearby inputs (like human vision)
200
+ **Hierarchical Features**: Early layers detect edges β†’ later layers detect objects
201
+ """
202
+
203
+ # %% [markdown]
204
+ """
205
+ ## πŸ—οΈ Implementation - Building Spatial Operations
206
+
207
+ Now we'll implement convolution step by step, using explicit loops so you can see and feel the computational complexity. This helps you understand why modern optimizations matter!
208
+
209
+ ### Conv2d: Detecting Patterns with Sliding Windows
210
+
211
+ Convolution slides a small filter (kernel) across the entire input, computing weighted sums at each position. Think of it like using a template to find matching patterns everywhere in an image.
212
+
213
+ ```
214
+ Convolution Visualization:
215
+ Input (4Γ—4): Kernel (3Γ—3): Output (2Γ—2):
216
+ β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”
217
+ β”‚ a b c d β”‚ β”‚ k1 k2 k3β”‚ β”‚ o1 o2 β”‚
218
+ β”‚ e f g h β”‚ Γ— β”‚ k4 k5 k6β”‚ = β”‚ o3 o4 β”‚
219
+ β”‚ i j k l β”‚ β”‚ k7 k8 k9β”‚ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜
220
+ β”‚ m n o p β”‚ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜
221
+ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜
222
+
223
+ Computation Details:
224
+ o1 = aΓ—k1 + bΓ—k2 + cΓ—k3 + eΓ—k4 + fΓ—k5 + gΓ—k6 + iΓ—k7 + jΓ—k8 + kΓ—k9
225
+ o2 = bΓ—k1 + cΓ—k2 + dΓ—k3 + fΓ—k4 + gΓ—k5 + hΓ—k6 + jΓ—k7 + kΓ—k8 + lΓ—k9
226
+ o3 = eΓ—k1 + fΓ—k2 + gΓ—k3 + iΓ—k4 + jΓ—k5 + kΓ—k6 + mΓ—k7 + nΓ—k8 + oΓ—k9
227
+ o4 = fΓ—k1 + gΓ—k2 + hΓ—k3 + jΓ—k4 + kΓ—k5 + lΓ—k6 + nΓ—k7 + oΓ—k8 + pΓ—k9
228
+ ```
229
+
230
+ ### The Six Nested Loops of Convolution
231
+
232
+ Our implementation will use explicit loops to show exactly where the computational cost comes from:
233
+
234
+ ```
235
+ for batch in range(B): # Loop 1: Process each sample
236
+ for out_ch in range(C_out): # Loop 2: Generate each output channel
237
+ for out_h in range(H_out): # Loop 3: Each output row
238
+ for out_w in range(W_out): # Loop 4: Each output column
239
+ for k_h in range(K_h): # Loop 5: Each kernel row
240
+ for k_w in range(K_w): # Loop 6: Each kernel column
241
+ for in_ch in range(C_in): # Loop 7: Each input channel
242
+ # The actual multiply-accumulate operation
243
+ result += input[...] * kernel[...]
244
+ ```
245
+
246
+ Total operations: B Γ— C_out Γ— H_out Γ— W_out Γ— K_h Γ— K_w Γ— C_in
247
+
248
+ For typical values (B=32, C_out=64, H_out=224, W_out=224, K_h=3, K_w=3, C_in=3):
249
+ That's 32 Γ— 64 Γ— 224 Γ— 224 Γ— 3 Γ— 3 Γ— 3 = **2.8 billion operations** per forward pass!
250
+ """
251
+
252
+ # %% [markdown]
253
+ """
254
+ ### Conv2d Implementation - Building the Core of Computer Vision
255
+
256
+ Conv2d is the workhorse of computer vision. It slides learned filters across images to detect patterns like edges, textures, and eventually complex objects.
257
+
258
+ #### How Conv2d Transforms Machine Learning
259
+
260
+ ```
261
+ Before Conv2d (Dense Only): After Conv2d (Spatial Aware):
262
+ Input: 32Γ—32Γ—3 = 3,072 values Input: 32Γ—32Γ—3 structured as image
263
+ ↓ ↓
264
+ Dense(3072β†’1000) = 3M params Conv2d(3β†’16, 3Γ—3) = 448 params
265
+ ↓ ↓
266
+ No spatial awareness Preserves spatial relationships
267
+ Massive parameter count Parameter sharing across space
268
+ ```
269
+
270
+ #### Weight Initialization: He Initialization for ReLU Networks
271
+
272
+ Our Conv2d uses He initialization, specifically designed for ReLU activations:
273
+ - **Problem**: Wrong initialization β†’ vanishing/exploding gradients
274
+ - **Solution**: std = sqrt(2 / fan_in) where fan_in = channels Γ— kernel_height Γ— kernel_width
275
+ - **Why it works**: Maintains variance through ReLU nonlinearity
276
+
277
+ #### The 6-Loop Implementation Strategy
278
+
279
+ We'll implement convolution with explicit loops to show the true computational cost:
280
+
281
+ ```
282
+ Nested Loop Structure:
283
+ for batch: ← Process each sample in parallel (in practice)
284
+ for out_channel: ← Generate each output feature map
285
+ for out_h: ← Each row of output
286
+ for out_w: ← Each column of output
287
+ for k_h: ← Each row of kernel
288
+ for k_w: ← Each column of kernel
289
+ for in_ch: ← Accumulate across input channels
290
+ result += input[...] * weight[...]
291
+ ```
292
+
293
+ This reveals why convolution is expensive: O(BΓ—C_outΓ—HΓ—WΓ—K_hΓ—K_wΓ—C_in) operations!
294
+ """
295
+
296
+ # %% nbgrader={"grade": false, "grade_id": "conv2d-class", "solution": true}
297
+
298
+ #| export
299
+
300
class Conv2d:
    """
    2D Convolution layer for spatial feature extraction.

    Implements convolution with explicit loops to demonstrate
    computational complexity and memory access patterns.

    Args:
        in_channels: Number of input channels
        out_channels: Number of output feature maps
        kernel_size: Size of convolution kernel (int or tuple)
        stride: Stride of convolution (default: 1)
        padding: Zero-padding added to input (default: 0)
        bias: Whether to add learnable bias (default: True)
    """

    def __init__(self, in_channels, out_channels, kernel_size, stride=1, padding=0, bias=True):
        """
        Initialize Conv2d layer with proper weight initialization.

        Weights use He initialization (std = sqrt(2 / fan_in), with
        fan_in = in_channels * kernel_h * kernel_w), which keeps activation
        variance stable through ReLU nonlinearities. Bias (if enabled)
        starts at zero.
        """
        super().__init__()

        ### BEGIN SOLUTION
        self.in_channels = in_channels
        self.out_channels = out_channels

        # Accept kernel_size as a single int (square kernel) or an (h, w) tuple
        if isinstance(kernel_size, int):
            self.kernel_size = (kernel_size, kernel_size)
        else:
            self.kernel_size = kernel_size

        self.stride = stride
        self.padding = padding

        # He initialization for ReLU networks: std = sqrt(2 / fan_in)
        kernel_h, kernel_w = self.kernel_size
        fan_in = in_channels * kernel_h * kernel_w
        std = np.sqrt(2.0 / fan_in)

        # Weight shape: (out_channels, in_channels, kernel_h, kernel_w)
        self.weight = Tensor(np.random.normal(0, std,
                             (out_channels, in_channels, kernel_h, kernel_w)))

        # Bias initialization: zeros, or None when disabled
        if bias:
            self.bias = Tensor(np.zeros(out_channels))
        else:
            self.bias = None
        ### END SOLUTION

    def forward(self, x):
        """
        Forward pass through Conv2d layer.

        Args:
            x: Input Tensor of shape (batch, in_channels, height, width)

        Returns:
            Tensor of shape (batch, out_channels, out_height, out_width),
            where out_height = (height + 2*padding - kernel_h) // stride + 1
            (and likewise for width).

        Raises:
            ValueError: If the input is not 4D, or its channel count does
                not match this layer's in_channels.

        EXAMPLE:
        >>> conv = Conv2d(3, 16, kernel_size=3, padding=1)
        >>> x = Tensor(np.random.randn(2, 3, 32, 32))  # batch=2, RGB, 32x32
        >>> out = conv(x)
        >>> print(out.shape)  # Should be (2, 16, 32, 32)
        """
        ### BEGIN SOLUTION
        # Input validation and shape extraction
        if len(x.shape) != 4:
            raise ValueError(f"Expected 4D input (batch, channels, height, width), got {x.shape}")

        batch_size, in_channels, in_height, in_width = x.shape
        # Catch channel mismatches early instead of producing garbage output
        if in_channels != self.in_channels:
            raise ValueError(f"Expected {self.in_channels} input channels, got {in_channels}")

        out_channels = self.out_channels
        kernel_h, kernel_w = self.kernel_size

        # Calculate output dimensions
        out_height = (in_height + 2 * self.padding - kernel_h) // self.stride + 1
        out_width = (in_width + 2 * self.padding - kernel_w) // self.stride + 1

        # Apply zero-padding to the spatial dimensions only
        if self.padding > 0:
            padded_input = np.pad(x.data,
                                  ((0, 0), (0, 0), (self.padding, self.padding), (self.padding, self.padding)),
                                  mode='constant', constant_values=0)
        else:
            padded_input = x.data

        # Initialize output
        output = np.zeros((batch_size, out_channels, out_height, out_width))
        weight = self.weight.data  # hoist attribute lookup out of the hot loops

        # Explicit 7-nested-loop convolution to expose the full
        # O(B x C_out x H_out x W_out x K_h x K_w x C_in) cost
        for b in range(batch_size):
            for out_ch in range(out_channels):
                for out_h in range(out_height):
                    for out_w in range(out_width):
                        # Top-left corner of the input region for this output pixel
                        in_h_start = out_h * self.stride
                        in_w_start = out_w * self.stride

                        # Multiply-accumulate over kernel window and input channels
                        conv_sum = 0.0
                        for k_h in range(kernel_h):
                            for k_w in range(kernel_w):
                                for in_ch in range(in_channels):
                                    input_val = padded_input[b, in_ch,
                                                             in_h_start + k_h,
                                                             in_w_start + k_w]
                                    conv_sum += input_val * weight[out_ch, in_ch, k_h, k_w]

                        # Store result
                        output[b, out_ch, out_h, out_w] = conv_sum

        # Add bias if present: broadcast one scalar per output channel
        # across the batch and spatial dimensions
        if self.bias is not None:
            output += self.bias.data.reshape(1, -1, 1, 1)

        return Tensor(output)
        ### END SOLUTION

    def parameters(self):
        """Return trainable parameters (weight, plus bias when present)."""
        params = [self.weight]
        if self.bias is not None:
            params.append(self.bias)
        return params

    def __call__(self, x):
        """Enable model(x) syntax."""
        return self.forward(x)
468
+
469
+ # %% [markdown]
470
+ """
471
+ ### πŸ§ͺ Unit Test: Conv2d Implementation
472
+ This test validates our convolution implementation with different configurations.
473
+ **What we're testing**: Shape preservation, padding, stride effects
474
+ **Why it matters**: Convolution is the foundation of computer vision
475
+ **Expected**: Correct output shapes and reasonable value ranges
476
+ """
477
+
478
+ # %% nbgrader={"grade": true, "grade_id": "test-conv2d", "locked": true, "points": 15}
479
+
480
+
481
def test_unit_conv2d():
    """πŸ”¬ Test Conv2d implementation with multiple configurations."""
    print("πŸ”¬ Unit Test: Conv2d...")

    # Test 1: basic convolution without padding -> output shrinks by kernel-1
    print(" Testing basic convolution...")
    conv1 = Conv2d(in_channels=3, out_channels=16, kernel_size=3)
    out1 = conv1(Tensor(np.random.randn(2, 3, 32, 32)))
    valid_hw = (32 - 3) + 1  # 30 on both axes
    assert out1.shape == (2, 16, valid_hw, valid_hw), f"Expected (2, 16, 30, 30), got {out1.shape}"

    # Test 2: padding=1 with a 3x3 kernel preserves spatial size
    print(" Testing convolution with padding...")
    conv2 = Conv2d(in_channels=3, out_channels=8, kernel_size=3, padding=1)
    out2 = conv2(Tensor(np.random.randn(1, 3, 28, 28)))
    assert out2.shape == (1, 8, 28, 28), f"Expected (1, 8, 28, 28), got {out2.shape}"

    # Test 3: stride=2 roughly halves each spatial dimension
    print(" Testing convolution with stride...")
    conv3 = Conv2d(in_channels=1, out_channels=4, kernel_size=3, stride=2)
    out3 = conv3(Tensor(np.random.randn(1, 1, 16, 16)))
    strided_hw = (16 - 3) // 2 + 1  # 7 on both axes
    assert out3.shape == (1, 4, strided_hw, strided_hw), f"Expected (1, 4, 7, 7), got {out3.shape}"

    # Test 4: parameter count = weight (128*64*3*3) + bias (128) = 73,856
    print(" Testing parameter counting...")
    conv4 = Conv2d(in_channels=64, out_channels=128, kernel_size=3, bias=True)
    params = conv4.parameters()
    total_params = 128 * 64 * 3 * 3 + 128
    actual_total = np.prod(conv4.weight.shape)
    if conv4.bias is not None:
        actual_total += np.prod(conv4.bias.shape)
    assert actual_total == total_params, f"Expected {total_params} parameters, got {actual_total}"
    assert len(params) == 2, f"Expected 2 parameter tensors, got {len(params)}"

    # Test 5: bias=False yields a single parameter tensor and bias is None
    print(" Testing no bias configuration...")
    conv5 = Conv2d(in_channels=3, out_channels=16, kernel_size=5, bias=False)
    params5 = conv5.parameters()
    assert len(params5) == 1, f"Expected 1 parameter tensor (no bias), got {len(params5)}"
    assert conv5.bias is None, "Bias should be None when bias=False"

    print("βœ… Conv2d works correctly!")

if __name__ == "__main__":
    test_unit_conv2d()
544
+
545
+ # %% [markdown]
546
+ """
547
+ ## πŸ—οΈ Pooling Operations - Spatial Dimension Reduction
548
+
549
+ Pooling operations compress spatial information while keeping the most important features. Think of them as creating "thumbnail summaries" of local regions.
550
+
551
+ ### MaxPool2d: Keeping the Strongest Signals
552
+
553
+ Max pooling finds the strongest activation in each window, preserving sharp features like edges and corners.
554
+
555
+ ```
556
+ MaxPool2d Example (2Γ—2 kernel, stride=2):
557
+ Input (4Γ—4): Windows: Output (2Γ—2):
558
+ β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”Œβ”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β” β”Œβ”€β”€β”€β”€β”€β”€β”€β”
559
+ β”‚ 1 3 β”‚ 2 8 β”‚ β”‚ 1 3 β”‚ 2 8 β”‚ β”‚ 6 8 β”‚
560
+ β”‚ 5 6 β”‚ 7 4 β”‚ β†’ β”‚ 5 6 β”‚ 7 4 β”‚ β†’ β”‚ 9 7 β”‚
561
+ β”œβ”€β”€β”€β”€β”€β”€β”Όβ”€β”€β”€β”€β”€β”€β”€ β”œβ”€β”€β”€β”€β”€β”Όβ”€β”€β”€β”€β”€β”€ β””β”€β”€β”€β”€β”€β”€β”€β”˜
562
+ β”‚ 2 9 β”‚ 1 7 β”‚ β”‚ 2 9 β”‚ 1 7 β”‚
563
+ β”‚ 0 1 β”‚ 3 6 β”‚ β”‚ 0 1 β”‚ 3 6 β”‚
564
+ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β””β”€β”€β”€β”€β”€β”΄β”€β”€β”€β”€β”€β”˜
565
+
566
+ Window Computations:
567
+ Top-left: max(1,3,5,6) = 6 Top-right: max(2,8,7,4) = 8
568
+ Bottom-left: max(2,9,0,1) = 9 Bottom-right: max(1,7,3,6) = 7
569
+ ```
570
+
571
+ ### AvgPool2d: Smoothing Local Features
572
+
573
+ Average pooling computes the mean of each window, creating smoother, more general features.
574
+
575
+ ```
576
+ AvgPool2d Example (same 2Γ—2 kernel, stride=2):
577
+ Input (4Γ—4): Output (2Γ—2):
578
+ β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”
579
+ β”‚ 1 3 β”‚ 2 8 β”‚ β”‚ 3.75 5.25 β”‚
580
+ β”‚ 5 6 β”‚ 7 4 β”‚ β†’ β”‚ 3.0 4.25 β”‚
581
+ β”œβ”€β”€β”€β”€β”€β”€β”Όβ”€β”€β”€β”€β”€β”€β”€ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜
582
+ β”‚ 2 9 β”‚ 1 7 β”‚
583
+ β”‚ 0 1 β”‚ 3 6 β”‚
584
+ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜
585
+
586
+ Window Computations:
587
+ Top-left: (1+3+5+6)/4 = 3.75 Top-right: (2+8+7+4)/4 = 5.25
588
+ Bottom-left: (2+9+0+1)/4 = 3.0 Bottom-right: (1+7+3+6)/4 = 4.25
589
+ ```
590
+
591
+ ### Why Pooling Matters for Computer Vision
592
+
593
+ ```
594
+ Memory Impact:
595
+ Input: 224Γ—224Γ—64 = 3.2M values After 2Γ—2 pooling: 112Γ—112Γ—64 = 0.8M values
596
+ Memory reduction: 4Γ— less! Computation reduction: 4Γ— less!
597
+
598
+ Information Trade-off:
599
+ βœ… Preserves important features ⚠️ Loses fine spatial detail
600
+ βœ… Provides translation invariance ⚠️ Reduces localization precision
601
+ βœ… Reduces overfitting ⚠️ May lose small objects
602
+ ```
603
+
604
+ ### Sliding Window Pattern
605
+
606
+ Both pooling operations follow the same sliding window pattern:
607
+
608
+ ```
609
+ Sliding 2Γ—2 window with stride=2:
610
+ Step 1: Step 2: Step 3: Step 4:
611
+ β”Œβ”€β”€β” β”Œβ”€β”€β”
612
+ β”‚β–“β–“β”‚ β”‚β–“β–“β”‚
613
+ β””β”€β”€β”˜ β””β”€β”€β”˜ β”Œβ”€β”€β” β”Œβ”€β”€β”
614
+ β”‚β–“β–“β”‚ β”‚β–“β–“β”‚
615
+ β””β”€β”€β”˜ β””β”€β”€β”˜
616
+
617
+ Non-overlapping windows β†’ Each input pixel used exactly once
618
+ Stride=2 β†’ Output dimensions halved in each direction
619
+ ```
620
+
621
+ The key difference: MaxPool takes max(window), AvgPool takes mean(window).
622
+ """
623
+
624
+ # %% [markdown]
625
+ """
626
+ ### MaxPool2d Implementation - Preserving Strong Features
627
+
628
+ MaxPool2d finds the strongest activation in each spatial window, creating a compressed representation that keeps the most important information.
629
+
630
+ #### Why Max Pooling Works for Computer Vision
631
+
632
+ ```
633
+ Edge Detection Example:
634
+ Input Window (2Γ—2): Max Pooling Result:
635
+ β”Œβ”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”
636
+ β”‚ 0.1 β”‚ 0.8 β”‚ ← Strong edge signal
637
+ β”œβ”€β”€β”€β”€β”€β”Όβ”€β”€β”€β”€β”€β”€
638
+ β”‚ 0.2 β”‚ 0.1 β”‚ Output: 0.8 (preserves edge)
639
+ β””β”€β”€β”€β”€β”€β”΄β”€β”€β”€β”€β”€β”˜
640
+
641
+ Noise Reduction Example:
642
+ Input Window (2Γ—2):
643
+ β”Œβ”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”
644
+ β”‚ 0.9 β”‚ 0.1 β”‚ ← Feature + noise
645
+ β”œβ”€β”€β”€β”€β”€β”Όβ”€β”€β”€β”€β”€β”€
646
+ β”‚ 0.2 β”‚ 0.1 β”‚ Output: 0.9 (removes noise)
647
+ β””β”€β”€β”€β”€β”€β”΄β”€β”€β”€β”€β”€β”˜
648
+ ```
649
+
650
+ #### The Sliding Window Pattern
651
+
652
+ ```
653
+ MaxPool with 2Γ—2 kernel, stride=2:
654
+
655
+ Input (4Γ—4): Output (2Γ—2):
656
+ β”Œβ”€β”€β”€β”¬β”€β”€β”€β”¬β”€β”€β”€β”¬β”€β”€β”€β” β”Œβ”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”
657
+ β”‚ a β”‚ b β”‚ c β”‚ d β”‚ β”‚max(a,bβ”‚max(c,dβ”‚
658
+ β”œβ”€β”€β”€β”Όβ”€β”€β”€β”Όβ”€β”€β”€β”Όβ”€β”€β”€β”€ β†’ β”‚ e,f)β”‚ g,h)β”‚
659
+ β”‚ e β”‚ f β”‚ g β”‚ h β”‚ β”œβ”€β”€β”€β”€β”€β”€β”€β”Όβ”€β”€β”€β”€β”€β”€β”€β”€
660
+ β”œβ”€β”€β”€β”Όβ”€β”€β”€β”Όβ”€β”€β”€β”Όβ”€β”€β”€β”€ β”‚max(i,jβ”‚max(k,lβ”‚
661
+ β”‚ i β”‚ j β”‚ k β”‚ l β”‚ β”‚ m,n)β”‚ o,p)β”‚
662
+ β”œβ”€β”€β”€β”Όβ”€β”€β”€β”Όβ”€β”€β”€β”Όβ”€β”€β”€β”€ β””β”€β”€β”€β”€β”€β”€β”€β”΄β”€β”€β”€β”€β”€β”€β”€β”˜
663
+ β”‚ m β”‚ n β”‚ o β”‚ p β”‚
664
+ β””β”€β”€β”€β”΄β”€β”€β”€β”΄β”€β”€β”€β”΄β”€β”€β”€β”˜
665
+
666
+ Benefits:
667
+ βœ“ Translation invariance (cat moved 1 pixel still detected)
668
+ βœ“ Computational efficiency (4Γ— fewer values to process)
669
+ βœ“ Hierarchical feature building (next layer sees larger receptive field)
670
+ ```
671
+
672
+ #### Memory and Computation Impact
673
+
674
+ For input (1, 64, 224, 224) with 2Γ—2 pooling:
675
+ - **Input memory**: 64 Γ— 224 Γ— 224 Γ— 4 bytes = 12.8 MB
676
+ - **Output memory**: 64 Γ— 112 Γ— 112 Γ— 4 bytes = 3.2 MB
677
+ - **Memory reduction**: 4Γ— less memory needed
678
+ - **Computation**: No parameters, minimal compute cost
679
+ """
680
+
681
+ # %% nbgrader={"grade": false, "grade_id": "maxpool2d-class", "solution": true}
682
+
683
+ #| export
684
+
685
class MaxPool2d:
    """
    2D Max Pooling layer for spatial dimension reduction.

    Applies maximum operation over spatial windows, preserving
    the strongest activations while reducing computational load.
    Pooling has no trainable parameters.

    Args:
        kernel_size: Size of pooling window (int or tuple)
        stride: Stride of pooling operation (default: same as kernel_size)
        padding: Zero-padding added to input (default: 0)
    """

    def __init__(self, kernel_size, stride=None, padding=0):
        """
        Initialize MaxPool2d layer.

        kernel_size is normalized to an (h, w) tuple. When stride is
        omitted, it defaults to the full kernel_size tuple so windows do
        not overlap -- this also handles non-square kernels correctly
        (e.g. kernel_size=(2, 3) -> stride=(2, 3), not 2 on both axes).
        """
        super().__init__()

        ### BEGIN SOLUTION
        # Normalize kernel_size to an (h, w) tuple
        if isinstance(kernel_size, int):
            self.kernel_size = (kernel_size, kernel_size)
        else:
            self.kernel_size = kernel_size

        # Default stride equals kernel_size (non-overlapping windows).
        # Explicit int strides are normalized to (h, w) tuples too.
        if stride is None:
            self.stride = self.kernel_size
        elif isinstance(stride, int):
            self.stride = (stride, stride)
        else:
            self.stride = stride

        self.padding = padding
        ### END SOLUTION

    def forward(self, x):
        """
        Forward pass through MaxPool2d layer.

        Args:
            x: Input Tensor of shape (batch, channels, height, width)

        Returns:
            Tensor of shape (batch, channels, out_height, out_width), where
            out_height = (height + 2*padding - kernel_h) // stride_h + 1
            (and likewise for width).

        Raises:
            ValueError: If the input is not 4D.

        EXAMPLE:
        >>> pool = MaxPool2d(kernel_size=2, stride=2)
        >>> x = Tensor(np.random.randn(1, 3, 8, 8))
        >>> out = pool(x)
        >>> print(out.shape)  # Should be (1, 3, 4, 4)
        """
        ### BEGIN SOLUTION
        # Input validation and shape extraction
        if len(x.shape) != 4:
            raise ValueError(f"Expected 4D input (batch, channels, height, width), got {x.shape}")

        batch_size, channels, in_height, in_width = x.shape
        kernel_h, kernel_w = self.kernel_size
        # Tolerate a manually-assigned int stride as well as the normal tuple
        if isinstance(self.stride, int):
            stride_h, stride_w = self.stride, self.stride
        else:
            stride_h, stride_w = self.stride

        # Calculate output dimensions (per-axis stride)
        out_height = (in_height + 2 * self.padding - kernel_h) // stride_h + 1
        out_width = (in_width + 2 * self.padding - kernel_w) // stride_w + 1

        # Pad with -inf so padded positions can never win the max
        if self.padding > 0:
            padded_input = np.pad(x.data,
                                  ((0, 0), (0, 0), (self.padding, self.padding), (self.padding, self.padding)),
                                  mode='constant', constant_values=-np.inf)
        else:
            padded_input = x.data

        # Initialize output
        output = np.zeros((batch_size, channels, out_height, out_width))

        # Explicit nested-loop max pooling
        for b in range(batch_size):
            for c in range(channels):
                for out_h in range(out_height):
                    for out_w in range(out_width):
                        # Top-left corner of this pooling window
                        in_h_start = out_h * stride_h
                        in_w_start = out_w * stride_w

                        # Scan the window for its maximum activation
                        max_val = -np.inf
                        for k_h in range(kernel_h):
                            for k_w in range(kernel_w):
                                input_val = padded_input[b, c,
                                                         in_h_start + k_h,
                                                         in_w_start + k_w]
                                max_val = max(max_val, input_val)

                        # Store result
                        output[b, c, out_h, out_w] = max_val

        return Tensor(output)
        ### END SOLUTION

    def parameters(self):
        """Return empty list (pooling has no parameters)."""
        return []

    def __call__(self, x):
        """Enable model(x) syntax."""
        return self.forward(x)
818
+
819
+ # %% [markdown]
820
+ """
821
+ ### AvgPool2d Implementation - Smoothing and Generalizing Features
822
+
823
+ AvgPool2d computes the average of each spatial window, creating smoother features that are less sensitive to noise and exact pixel positions.
824
+
825
+ #### MaxPool vs AvgPool: Different Philosophies
826
+
827
+ ```
828
+ Same Input Window (2Γ—2): MaxPool Output: AvgPool Output:
829
+ β”Œβ”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”
830
+ β”‚ 0.1 β”‚ 0.9 β”‚ 0.9 0.4
831
+ β”œβ”€β”€β”€β”€β”€β”Όβ”€β”€β”€β”€β”€β”€ (max) (mean)
832
+ β”‚ 0.3 β”‚ 0.3 β”‚
833
+ β””β”€β”€β”€β”€β”€β”΄β”€β”€β”€β”€β”€β”˜
834
+
835
+ Interpretation:
836
+ MaxPool: "What's the strongest feature here?"
837
+ AvgPool: "What's the general feature level here?"
838
+ ```
839
+
840
+ #### When to Use Average Pooling
841
+
842
+ ```
843
+ Use Cases:
844
+ βœ“ Global Average Pooling (GAP) for classification
845
+ βœ“ When you want smoother, less noisy features
846
+ βœ“ When exact feature location doesn't matter
847
+ βœ“ In shallower networks where sharp features aren't critical
848
+
849
+ Typical Pattern:
850
+ Feature Maps β†’ Global Average Pool β†’ Dense β†’ Classification
851
+ (256Γ—7Γ—7) β†’ (256Γ—1Γ—1) β†’ FC β†’ (10)
852
+ Replaces flatten+dense with parameter reduction
853
+ ```
854
+
855
+ #### Mathematical Implementation
856
+
857
+ ```
858
+ Average Pooling Computation:
859
+ Window: [a, b] Result = (a + b + c + d) / 4
860
+ [c, d]
861
+
862
+ For efficiency, we:
863
+ 1. Sum all values in window: window_sum = a + b + c + d
864
+ 2. Divide by window area: result = window_sum / (kernel_h Γ— kernel_w)
865
+ 3. Store result at output position
866
+
867
+ Memory access pattern identical to MaxPool, just different aggregation!
868
+ ```
869
+
870
+ #### Practical Considerations
871
+
872
+ - **Memory**: Same 4Γ— reduction as MaxPool
873
+ - **Computation**: Slightly more expensive (sum + divide vs max)
874
+ - **Features**: Smoother, more generalized than MaxPool
875
+ - **Use**: Often in final layers (Global Average Pooling) to reduce parameters
876
+ """
877
+
878
+ # %% nbgrader={"grade": false, "grade_id": "avgpool2d-class", "solution": true}
879
+
880
+ #| export
881
+
882
class AvgPool2d:
    """
    2D Average Pooling layer for spatial dimension reduction.

    Slides a window across each (batch, channel) plane and replaces every
    window with its arithmetic mean, smoothing features while shrinking
    spatial resolution.

    Args:
        kernel_size: Size of pooling window (int or tuple)
        stride: Stride of pooling operation (default: same as kernel_size)
        padding: Zero-padding added to input (default: 0)
    """

    def __init__(self, kernel_size, stride=None, padding=0):
        """
        Initialize AvgPool2d layer.

        Stores the pooling hyperparameters. ``kernel_size`` may be an int
        (square window) or an (h, w) tuple; a missing ``stride`` defaults
        to the first kernel dimension, giving non-overlapping windows for
        square kernels.
        """
        super().__init__()

        ### BEGIN SOLUTION
        # Normalize kernel_size to an (h, w) tuple.
        if isinstance(kernel_size, int):
            self.kernel_size = (kernel_size, kernel_size)
        else:
            self.kernel_size = kernel_size

        # Default: stride equals the first kernel dimension, so square
        # kernels produce non-overlapping windows.
        self.stride = self.kernel_size[0] if stride is None else stride

        self.padding = padding
        ### END SOLUTION

    def forward(self, x):
        """
        Average-pool a 4D tensor laid out as (batch, channels, height, width).

        Each output cell is the mean of one kernel-sized window of the
        (optionally zero-padded) input. Raises ValueError for non-4D input.
        """
        ### BEGIN SOLUTION
        # Only NCHW inputs are supported.
        if len(x.shape) != 4:
            raise ValueError(f"Expected 4D input (batch, channels, height, width), got {x.shape}")

        n_batch, n_chan, h_in, w_in = x.shape
        kh, kw = self.kernel_size

        # Standard pooling output-size formula.
        h_out = (h_in + 2 * self.padding - kh) // self.stride + 1
        w_out = (w_in + 2 * self.padding - kw) // self.stride + 1

        # Zero-pad spatial dims only; the zeros participate in the average.
        if self.padding > 0:
            src = np.pad(x.data,
                         ((0, 0), (0, 0), (self.padding, self.padding), (self.padding, self.padding)),
                         mode='constant', constant_values=0)
        else:
            src = x.data

        out = np.zeros((n_batch, n_chan, h_out, w_out))

        # Explicit nested loops for clarity: one window per output cell.
        for n in range(n_batch):
            for ch in range(n_chan):
                for oy in range(h_out):
                    for ox in range(w_out):
                        # Top-left corner of this window in the padded input.
                        y0 = oy * self.stride
                        x0 = ox * self.stride

                        # Accumulate the window, then divide by its area.
                        acc = 0.0
                        for dy in range(kh):
                            for dx in range(kw):
                                acc += src[n, ch, y0 + dy, x0 + dx]

                        out[n, ch, oy, ox] = acc / (kh * kw)

        # Propagate requires_grad so downstream layers behave consistently.
        return Tensor(out, requires_grad=x.requires_grad)
        ### END SOLUTION

    def parameters(self):
        """Return empty list (pooling has no parameters)."""
        return []

    def __call__(self, x):
        """Enable model(x) syntax."""
        return self.forward(x)
1008
+
1009
+ # %% [markdown]
1010
+ """
1011
+ ## πŸ—οΈ Batch Normalization - Stabilizing Deep Network Training
1012
+
1013
+ Batch Normalization (BatchNorm) is one of the most important techniques for training deep networks. It normalizes activations across the batch dimension, dramatically improving training stability and speed.
1014
+
1015
+ ### Why BatchNorm Matters
1016
+
1017
+ ```
1018
+ Without BatchNorm: With BatchNorm:
1019
+ Layer outputs can have Layer outputs are normalized
1020
+ wildly varying scales: to consistent scale:
1021
+
1022
+ Layer 1: mean=0.5, std=0.3 Layer 1: meanβ‰ˆ0, stdβ‰ˆ1
1023
+ Layer 5: mean=12.7, std=8.4 β†’ Layer 5: meanβ‰ˆ0, stdβ‰ˆ1
1024
+ Layer 10: mean=0.001, std=0.0003 Layer 10: meanβ‰ˆ0, stdβ‰ˆ1
1025
+
1026
+ Result: Unstable gradients Result: Stable training
1027
+ Slow convergence Fast convergence
1028
+ Careful learning rate Robust to hyperparameters
1029
+ ```
1030
+
1031
+ ### The BatchNorm Computation
1032
+
1033
+ For each channel c, BatchNorm computes:
1034
+ ```
1035
+ 1. Batch Statistics (during training):
1036
+ ΞΌ_c = mean(x[:, c, :, :]) # Mean over batch and spatial dims
1037
+ σ²_c = var(x[:, c, :, :]) # Variance over batch and spatial dims
1038
+
1039
+ 2. Normalize:
1040
+ xΜ‚_c = (x[:, c, :, :] - ΞΌ_c) / sqrt(σ²_c + Ξ΅)
1041
+
1042
+ 3. Scale and Shift (learnable parameters):
1043
+ y_c = Ξ³_c * xΜ‚_c + Ξ²_c # Ξ³ (gamma) and Ξ² (beta) are learned
1044
+ ```
1045
+
1046
+ ### Train vs Eval Mode
1047
+
1048
+ This is a critical systems concept:
1049
+
1050
+ ```
1051
+ Training Mode: Eval Mode:
1052
+ β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”
1053
+ β”‚ Use batch stats β”‚ β”‚ Use running stats β”‚
1054
+ β”‚ Update running β”‚ β”‚ (accumulated from β”‚
1055
+ β”‚ mean/variance β”‚ β”‚ training) β”‚
1056
+ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜
1057
+ ↓ ↓
1058
+ Computes ΞΌ, σ² from Uses frozen ΞΌ, σ² for
1059
+ current batch consistent inference
1060
+ ```
1061
+
1062
+ **Why this matters**: During inference, you might process just 1 image. Batch statistics from 1 sample would be meaningless. Running statistics provide stable normalization.
1063
+ """
1064
+
1065
+ # %% nbgrader={"grade": false, "grade_id": "batchnorm2d-class", "solution": true}
1066
+
1067
+ #| export
1068
+
1069
class BatchNorm2d:
    """
    Batch Normalization for 2D spatial inputs (images).

    For each channel, activations are normalized using statistics gathered
    over the batch and spatial axes, then rescaled with learnable gamma
    (scale) and beta (shift) parameters.

    Key behaviors:
    - Training: normalizes with the current batch's statistics and folds
      them into exponential moving averages.
    - Eval: normalizes with the frozen running statistics so inference is
      independent of batch size.

    Args:
        num_features: Number of channels (C in NCHW format)
        eps: Small constant for numerical stability (default: 1e-5)
        momentum: Momentum for running statistics update (default: 0.1)
    """

    def __init__(self, num_features, eps=1e-5, momentum=0.1):
        """
        Initialize BatchNorm2d layer.

        Creates per-channel learnable parameters (gamma, beta) and the
        running statistics used at eval time. Starts in training mode.
        """
        super().__init__()

        ### BEGIN SOLUTION
        self.num_features = num_features
        self.eps = eps
        self.momentum = momentum

        # Learnable scale/shift initialized to the identity transform:
        # gamma=1 and beta=0 leave the normalized activations unchanged.
        self.gamma = Tensor(np.ones(num_features), requires_grad=True)
        self.beta = Tensor(np.zeros(num_features), requires_grad=True)

        # Running statistics: accumulated during training, frozen for eval.
        self.running_mean = np.zeros(num_features)
        self.running_var = np.ones(num_features)

        # Mode flag, mirroring the usual framework convention.
        self.training = True
        ### END SOLUTION

    def train(self):
        """Set layer to training mode."""
        self.training = True
        return self

    def eval(self):
        """Set layer to evaluation mode."""
        self.training = False
        return self

    def forward(self, x):
        """
        Normalize a 4D NCHW tensor per channel and apply gamma/beta.

        Raises ValueError for non-4D input or a channel-count mismatch.
        Returns a Tensor of the same shape as the input.
        """
        ### BEGIN SOLUTION
        # Shape checks: NCHW layout with the configured channel count.
        if len(x.shape) != 4:
            raise ValueError(f"Expected 4D input (batch, channels, height, width), got {x.shape}")

        _, channels, _, _ = x.shape
        if channels != self.num_features:
            raise ValueError(f"Expected {self.num_features} channels, got {channels}")

        if self.training:
            # Per-channel statistics over batch + spatial axes (0, 2, 3).
            mu = np.mean(x.data, axis=(0, 2, 3))   # shape (C,)
            var = np.var(x.data, axis=(0, 2, 3))   # shape (C,)

            # Exponential moving average of the batch statistics.
            self.running_mean = (1 - self.momentum) * self.running_mean + self.momentum * mu
            self.running_var = (1 - self.momentum) * self.running_var + self.momentum * var
        else:
            # Eval mode: use the frozen running statistics.
            mu = self.running_mean
            var = self.running_var

        # Broadcast (C,) statistics and parameters against (N, C, H, W).
        bshape = (1, channels, 1, 1)
        normalized = (x.data - mu.reshape(bshape)) / np.sqrt(var.reshape(bshape) + self.eps)
        scaled = self.gamma.data.reshape(bshape) * normalized + self.beta.data.reshape(bshape)

        # Track gradients if either the input or the parameters need them.
        return Tensor(scaled, requires_grad=x.requires_grad or self.gamma.requires_grad)
        ### END SOLUTION

    def parameters(self):
        """Return learnable parameters (gamma and beta)."""
        return [self.gamma, self.beta]

    def __call__(self, x):
        """Enable model(x) syntax."""
        return self.forward(x)
1220
+
1221
+ # %% [markdown]
1222
+ """
1223
+ ### πŸ§ͺ Unit Test: BatchNorm2d
1224
+ This test validates batch normalization implementation.
1225
+ **What we're testing**: Normalization behavior, train/eval mode, running statistics
1226
+ **Why it matters**: BatchNorm is essential for training deep CNNs effectively
1227
+ **Expected**: Normalized outputs with proper mean/variance characteristics
1228
+ """
1229
+
1230
+ # %% nbgrader={"grade": true, "grade_id": "test-batchnorm2d", "locked": true, "points": 10}
1231
+
1232
+
1233
def test_unit_batchnorm2d():
    """πŸ”¬ Test BatchNorm2d implementation."""
    print("πŸ”¬ Unit Test: BatchNorm2d...")

    # Test 1: Basic forward pass shape
    # BatchNorm is shape-preserving: output must match input exactly.
    print(" Testing basic forward pass...")
    bn = BatchNorm2d(num_features=16)
    x = Tensor(np.random.randn(4, 16, 8, 8))  # batch=4, channels=16, 8x8
    y = bn(x)

    assert y.shape == x.shape, f"Output shape should match input, got {y.shape}"

    # Test 2: Training mode normalization
    print(" Testing training mode normalization...")
    bn2 = BatchNorm2d(num_features=8)
    bn2.train()  # Ensure training mode

    # Create input with known statistics per channel
    x2 = Tensor(np.random.randn(32, 8, 4, 4) * 10 + 5)  # Mean~5, std~10
    y2 = bn2(x2)

    # After normalization, each channel should have meanβ‰ˆ0, stdβ‰ˆ1
    # (before gamma/beta are applied, since gamma=1, beta=0)
    # NOTE: 0.1 tolerance accommodates sampling noise on 32*4*4 samples.
    for c in range(8):
        channel_mean = np.mean(y2.data[:, c, :, :])
        channel_std = np.std(y2.data[:, c, :, :])
        assert abs(channel_mean) < 0.1, f"Channel {c} mean should be ~0, got {channel_mean:.3f}"
        assert abs(channel_std - 1.0) < 0.1, f"Channel {c} std should be ~1, got {channel_std:.3f}"

    # Test 3: Running statistics update
    print(" Testing running statistics update...")
    initial_running_mean = bn2.running_mean.copy()

    # Forward pass updates running stats
    x3 = Tensor(np.random.randn(16, 8, 4, 4) + 3)  # Offset mean
    _ = bn2(x3)

    # Running mean should have moved toward batch mean
    assert not np.allclose(bn2.running_mean, initial_running_mean), \
        "Running mean should update during training"

    # Test 4: Eval mode uses running statistics
    print(" Testing eval mode behavior...")
    bn3 = BatchNorm2d(num_features=4)

    # Train on some data to establish running stats
    for _ in range(10):
        x_train = Tensor(np.random.randn(8, 4, 4, 4) * 2 + 1)
        _ = bn3(x_train)

    saved_running_mean = bn3.running_mean.copy()
    saved_running_var = bn3.running_var.copy()

    # Switch to eval mode
    bn3.eval()

    # Process different data - running stats should NOT change
    x_eval = Tensor(np.random.randn(2, 4, 4, 4) * 5)  # Different distribution
    _ = bn3(x_eval)

    assert np.allclose(bn3.running_mean, saved_running_mean), \
        "Running mean should not change in eval mode"
    assert np.allclose(bn3.running_var, saved_running_var), \
        "Running var should not change in eval mode"

    # Test 5: Parameter counting
    # Exactly two learnable tensors: gamma (scale) and beta (shift).
    print(" Testing parameter counting...")
    bn4 = BatchNorm2d(num_features=64)
    params = bn4.parameters()

    assert len(params) == 2, f"Should have 2 parameters (gamma, beta), got {len(params)}"
    assert params[0].shape == (64,), f"Gamma shape should be (64,), got {params[0].shape}"
    assert params[1].shape == (64,), f"Beta shape should be (64,), got {params[1].shape}"

    print("βœ… BatchNorm2d works correctly!")

if __name__ == "__main__":
    test_unit_batchnorm2d()
1311
+
1312
+ # %% [markdown]
1313
+ """
1314
+ ### πŸ§ͺ Unit Test: Pooling Operations
1315
+ This test validates both max and average pooling implementations.
1316
+ **What we're testing**: Dimension reduction, aggregation correctness
1317
+ **Why it matters**: Pooling is essential for computational efficiency in CNNs
1318
+ **Expected**: Correct output shapes and proper value aggregation
1319
+ """
1320
+
1321
+ # %% nbgrader={"grade": true, "grade_id": "test-pooling", "locked": true, "points": 10}
1322
+
1323
+
1324
def test_unit_pooling():
    """πŸ”¬ Test MaxPool2d and AvgPool2d implementations."""
    print("πŸ”¬ Unit Test: Pooling Operations...")

    # Test 1: MaxPool2d basic functionality
    # 2x2 window with stride 2 halves each spatial dimension.
    print(" Testing MaxPool2d...")
    maxpool = MaxPool2d(kernel_size=2, stride=2)
    x1 = Tensor(np.random.randn(1, 3, 8, 8))
    out1 = maxpool(x1)

    expected_shape = (1, 3, 4, 4)  # 8/2 = 4
    assert out1.shape == expected_shape, f"MaxPool expected {expected_shape}, got {out1.shape}"

    # Test 2: AvgPool2d basic functionality
    print(" Testing AvgPool2d...")
    avgpool = AvgPool2d(kernel_size=2, stride=2)
    x2 = Tensor(np.random.randn(2, 16, 16, 16))
    out2 = avgpool(x2)

    expected_shape = (2, 16, 8, 8)  # 16/2 = 8
    assert out2.shape == expected_shape, f"AvgPool expected {expected_shape}, got {out2.shape}"

    # Test 3: MaxPool vs AvgPool on known data
    print(" Testing max vs avg behavior...")
    # Create simple test case with known values
    test_data = np.array([[[[1, 2, 3, 4],
                            [5, 6, 7, 8],
                            [9, 10, 11, 12],
                            [13, 14, 15, 16]]]], dtype=np.float32)
    x3 = Tensor(test_data)

    maxpool_test = MaxPool2d(kernel_size=2, stride=2)
    avgpool_test = AvgPool2d(kernel_size=2, stride=2)

    max_out = maxpool_test(x3)
    avg_out = avgpool_test(x3)

    # For 2x2 windows:
    # Top-left: max([1,2,5,6]) = 6, avg = 3.5
    # Top-right: max([3,4,7,8]) = 8, avg = 5.5
    # Bottom-left: max([9,10,13,14]) = 14, avg = 11.5
    # Bottom-right: max([11,12,15,16]) = 16, avg = 13.5

    expected_max = np.array([[[[6, 8], [14, 16]]]])
    expected_avg = np.array([[[[3.5, 5.5], [11.5, 13.5]]]])

    assert np.allclose(max_out.data, expected_max), f"MaxPool values incorrect: {max_out.data} vs {expected_max}"
    assert np.allclose(avg_out.data, expected_avg), f"AvgPool values incorrect: {avg_out.data} vs {expected_avg}"

    # Test 4: Overlapping pooling (stride < kernel_size)
    # With stride 1 the 3x3 windows overlap, shrinking each dim by 2.
    print(" Testing overlapping pooling...")
    overlap_pool = MaxPool2d(kernel_size=3, stride=1)
    x4 = Tensor(np.random.randn(1, 1, 5, 5))
    out4 = overlap_pool(x4)

    # Output: (5-3)/1 + 1 = 3
    expected_shape = (1, 1, 3, 3)
    assert out4.shape == expected_shape, f"Overlapping pool expected {expected_shape}, got {out4.shape}"

    # Test 5: No parameters in pooling layers
    print(" Testing parameter counts...")
    assert len(maxpool.parameters()) == 0, "MaxPool should have no parameters"
    assert len(avgpool.parameters()) == 0, "AvgPool should have no parameters"

    print("βœ… Pooling operations work correctly!")

if __name__ == "__main__":
    test_unit_pooling()
1392
+
1393
+ # %% [markdown]
1394
+ """
1395
+ ## πŸ“Š Systems Analysis - Understanding Spatial Operation Performance
1396
+
1397
+ Now let's analyze the computational complexity and memory trade-offs of spatial operations. This analysis reveals why certain design choices matter for real-world performance.
1398
+
1399
+ ### Key Questions We'll Answer:
1400
+ 1. How does convolution complexity scale with input size and kernel size?
1401
+ 2. What's the memory vs computation trade-off in different approaches?
1402
+ 3. How do modern optimizations (like im2col) change the performance characteristics?
1403
+ """
1404
+
1405
+ # %% nbgrader={"grade": false, "grade_id": "spatial-analysis", "solution": true}
1406
+
1407
+
1408
def analyze_convolution_complexity():
    """πŸ“Š Analyze convolution computational complexity across different configurations.

    For several (input shape, kernel) configurations this prints the
    theoretical FLOP count, an estimate of the float32 memory footprint,
    and the measured wall-clock time of one forward pass.
    """
    print("πŸ“Š Analyzing Convolution Complexity...")

    # Test configurations optimized for educational demonstration (smaller sizes)
    configs = [
        {"input": (1, 3, 16, 16), "conv": (8, 3, 3), "name": "Small (16Γ—16)"},
        {"input": (1, 3, 24, 24), "conv": (12, 3, 3), "name": "Medium (24Γ—24)"},
        {"input": (1, 3, 32, 32), "conv": (16, 3, 3), "name": "Large (32Γ—32)"},
        {"input": (1, 3, 16, 16), "conv": (8, 3, 5), "name": "Large Kernel (5Γ—5)"},
    ]

    print(f"{'Configuration':<20} {'FLOPs':<15} {'Memory (MB)':<12} {'Time (ms)':<10}")
    print("-" * 70)

    for config in configs:
        # Create convolution layer; padding=k//2 keeps output HΓ—W equal to input
        in_ch = config["input"][1]
        out_ch, k_size = config["conv"][0], config["conv"][1]
        conv = Conv2d(in_ch, out_ch, kernel_size=k_size, padding=k_size//2)

        # Create input tensor
        x = Tensor(np.random.randn(*config["input"]))

        # Calculate theoretical FLOPs
        batch, in_channels, h, w = config["input"]
        out_channels, kernel_size = config["conv"][0], config["conv"][1]

        # Each output element requires in_channels * kernel_sizeΒ² multiply-adds
        flops_per_output = in_channels * kernel_size * kernel_size * 2  # 2 for MAC
        total_outputs = batch * out_channels * h * w  # Assuming same size with padding
        total_flops = flops_per_output * total_outputs

        # Estimate memory usage (float32 = 4 bytes per element)
        input_memory = np.prod(config["input"]) * 4
        weight_memory = out_channels * in_channels * kernel_size * kernel_size * 4
        output_memory = batch * out_channels * h * w * 4
        total_memory = (input_memory + weight_memory + output_memory) / (1024 * 1024)  # MB

        # Measure execution time. perf_counter() is monotonic and high
        # resolution; time.time() can jump with system clock adjustments
        # and is too coarse for millisecond-scale intervals.
        start_time = time.perf_counter()
        _ = conv(x)
        end_time = time.perf_counter()
        exec_time = (end_time - start_time) * 1000  # ms

        print(f"{config['name']:<20} {total_flops:<15,} {total_memory:<12.2f} {exec_time:<10.2f}")

    print("\nπŸ’‘ Key Insights:")
    print("πŸ”Έ FLOPs scale as O(HΓ—WΓ—C_inΓ—C_outΓ—KΒ²) - quadratic in spatial and kernel size")
    print("πŸ”Έ Memory scales linearly with spatial dimensions and channels")
    print("πŸ”Έ Large kernels dramatically increase computational cost")
    print("πŸš€ This motivates depthwise separable convolutions and attention mechanisms")
1460
+
1461
+ # Analysis will be called in main execution
1462
+
1463
+ # %% nbgrader={"grade": false, "grade_id": "pooling-analysis", "solution": true}
1464
+
1465
+
1466
def analyze_pooling_effects():
    """πŸ“Š Analyze pooling's impact on spatial dimensions and features."""
    print("\nπŸ“Š Analyzing Pooling Effects...")

    # Build an 8Γ—8 test image containing a cross (one vertical and one
    # horizontal bar) so the pooling variants have an edge pattern to keep.
    canvas = np.zeros((1, 1, 8, 8))
    canvas[0, 0, :, 3:5] = 1.0  # vertical bar
    canvas[0, 0, 3:5, :] = 1.0  # horizontal bar
    image = Tensor(canvas)

    print("Original 8Γ—8 pattern:")
    print(image.data[0, 0])

    # Pooling strategies to compare: operator type Γ— window size.
    strategies = [
        (MaxPool2d(2, stride=2), "MaxPool 2Γ—2"),
        (AvgPool2d(2, stride=2), "AvgPool 2Γ—2"),
        (MaxPool2d(4, stride=4), "MaxPool 4Γ—4"),
        (AvgPool2d(4, stride=4), "AvgPool 4Γ—4"),
    ]

    print(f"\n{'Operation':<15} {'Output Shape':<15} {'Feature Preservation'}")
    print("-" * 60)

    for layer, label in strategies:
        pooled = layer(image)
        # Fraction of output cells still "active" (> 0.1) after pooling.
        kept = np.sum(pooled.data > 0.1) / np.prod(pooled.shape)
        print(f"{label:<15} {str(pooled.shape):<15} {kept:<.2%}")

        print(f" Output:")
        print(f" {pooled.data[0, 0]}")
        print()

    print("πŸ’‘ Key Insights:")
    print("πŸ”Έ MaxPool preserves sharp features better (edge detection)")
    print("πŸ”Έ AvgPool smooths features (noise reduction)")
    print("πŸ”Έ Larger pooling windows lose more spatial detail")
    print("πŸš€ Choice depends on task: classification vs detection vs segmentation")
1506
+
1507
+ # Analysis will be called in main execution
1508
+
1509
+ # %% [markdown]
1510
+ """
1511
+ ## πŸ”§ Integration - Building a Complete CNN
1512
+
1513
+ Now let's combine convolution and pooling into a complete CNN architecture. You'll see how spatial operations work together to transform raw pixels into meaningful features.
1514
+
1515
+ ### CNN Architecture: From Pixels to Predictions
1516
+
1517
+ A CNN processes images through alternating convolution and pooling layers, gradually extracting higher-level features:
1518
+
1519
+ ```
1520
+ Complete CNN Pipeline:
1521
+
1522
+ Input Image (32Γ—32Γ—3) Raw RGB pixels
1523
+ ↓
1524
+ Conv2d(3β†’16, 3Γ—3) Detect edges, textures
1525
+ ↓
1526
+ ReLU Activation Remove negative values
1527
+ ↓
1528
+ MaxPool(2Γ—2) Reduce to (16Γ—16Γ—16)
1529
+ ↓
1530
+ Conv2d(16β†’32, 3Γ—3) Detect shapes, patterns
1531
+ ↓
1532
+ ReLU Activation Remove negative values
1533
+ ↓
1534
+ MaxPool(2Γ—2) Reduce to (8Γ—8Γ—32)
1535
+ ↓
1536
+ Flatten Reshape to vector (2048,)
1537
+ ↓
1538
+ Linear(2048β†’10) Final classification
1539
+ ↓
1540
+ Softmax Probability distribution
1541
+ ```
1542
+
1543
+ ### The Parameter Efficiency Story
1544
+
1545
+ ```
1546
+ CNN vs Dense Network Comparison:
1547
+
1548
+ CNN Approach: Dense Approach:
1549
+ β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”
1550
+ β”‚ Conv1: 3β†’16 β”‚ β”‚ Input: 32Γ—32Γ—3 β”‚
1551
+ β”‚ Params: 448 β”‚ β”‚ = 3,072 values β”‚
1552
+ β”œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€ β”œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€
1553
+ β”‚ Conv2: 16β†’32 β”‚ β”‚ Hidden: 1,000 β”‚
1554
+ β”‚ Params: 4,640 β”‚ β”‚ Params: 3M+ β”‚
1555
+ β”œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€ β”œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€
1556
+ β”‚ Linear: 2048β†’10 β”‚ β”‚ Output: 10 β”‚
1557
+ β”‚ Params: 20,490 β”‚ β”‚ Params: 10K β”‚
1558
+ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜
1559
+ Total: ~25K params Total: ~3M params
1560
+
1561
+ CNN wins with 120Γ— fewer parameters!
1562
+ ```
1563
+
1564
+ ### Spatial Hierarchy: Why This Architecture Works
1565
+
1566
+ ```
1567
+ Layer-by-Layer Feature Evolution:
1568
+
1569
+ Layer 1 (Conv 3β†’16): Layer 2 (Conv 16β†’32):
1570
+ β”Œβ”€β”€β”€β”€β”€β” β”Œβ”€β”€β”€β”€β”€β” β”Œβ”€β”€β”€β”€β”€β” β”Œβ”€β”€β”€β”€β”€β” β”Œβ”€β”€β”€β”€β”€β” β”Œβ”€β”€β”€β”€β”€β”
1571
+ β”‚Edge β”‚ β”‚Edge β”‚ β”‚Edge β”‚ β”‚Shapeβ”‚ β”‚Cornerβ”‚ β”‚Textureβ”‚
1572
+ β”‚ \\ /β”‚ β”‚ | β”‚ β”‚ / \\β”‚ β”‚ β—‡ β”‚ β”‚ L β”‚ β”‚ β‰ˆβ‰ˆβ‰ˆ β”‚
1573
+ β””β”€β”€β”€β”€β”€β”˜ β””β”€β”€β”€β”€β”€β”˜ β””β”€β”€β”€β”€β”€β”˜ β””β”€β”€β”€β”€β”€β”˜ β””β”€β”€β”€β”€β”€β”˜ β””β”€β”€β”€β”€β”€β”˜
1574
+ Simple features Complex combinations
1575
+
1576
+ Why pooling between layers:
1577
+ βœ“ Reduces computation for next layer
1578
+ βœ“ Increases receptive field (each conv sees larger input area)
1579
+ βœ“ Provides translation invariance (cat moved 1 pixel still detected)
1580
+ ```
1581
+
1582
+ This hierarchical approach mirrors human vision: we first detect edges, then shapes, then objects!
1583
+ """
1584
+
1585
+ # %% [markdown]
1586
+ """
1587
+ ### SimpleCNN Implementation - Putting It All Together
1588
+
1589
+ Now we'll build a complete CNN that demonstrates how convolution and pooling work together. This is your first step from processing individual tensors to understanding complete images!
1590
+
1591
+ #### The CNN Architecture Pattern
1592
+
1593
+ ```
1594
+ SimpleCNN Architecture Visualization:
1595
+
1596
+ Input: (batch, 3, 32, 32) ← RGB images (CIFAR-10 size)
1597
+ ↓
1598
+ β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”
1599
+ β”‚ Conv2d(3β†’16, 3Γ—3, p=1) β”‚ ← Detect edges, textures
1600
+ β”‚ ReLU() β”‚ ← Remove negative values
1601
+ β”‚ MaxPool(2Γ—2) β”‚ ← Reduce to (batch, 16, 16, 16)
1602
+ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜
1603
+ ↓
1604
+ β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”
1605
+ β”‚ Conv2d(16β†’32, 3Γ—3, p=1) β”‚ ← Detect shapes, patterns
1606
+ β”‚ ReLU() β”‚ ← Remove negative values
1607
+ β”‚ MaxPool(2Γ—2) β”‚ ← Reduce to (batch, 32, 8, 8)
1608
+ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜
1609
+ ↓
1610
+ β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”
1611
+ β”‚ Flatten() β”‚ ← Reshape to (batch, 2048)
1612
+ β”‚ Linear(2048β†’10) β”‚ ← Final classification
1613
+ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜
1614
+ ↓
1615
+ Output: (batch, 10) ← Class probabilities
1616
+ ```
1617
+
1618
+ #### Why This Architecture Works
1619
+
1620
+ ```
1621
+ Feature Hierarchy Development:
1622
+
1623
+ Layer 1 Features (3β†’16): Layer 2 Features (16β†’32):
1624
+ β”Œβ”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β” β”Œβ”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”
1625
+ β”‚Edge β”‚Edge β”‚Edge β”‚Blob β”‚ β”‚Shapeβ”‚Cornerβ”‚Tex-β”‚Pat- β”‚
1626
+ β”‚ \\ β”‚ | β”‚ / β”‚ β—‹ β”‚ β”‚ β—‡ β”‚ L β”‚tureβ”‚tern β”‚
1627
+ β””β”€β”€β”€β”€β”€β”΄β”€β”€β”€β”€β”€β”΄β”€β”€β”€β”€β”€β”΄β”€β”€β”€β”€β”€β”˜ β””β”€β”€β”€β”€β”€β”΄β”€β”€β”€β”€β”€β”΄β”€β”€β”€β”€β”€β”΄β”€β”€β”€β”€β”€β”˜
1628
+ Simple features Complex combinations
1629
+
1630
+ Spatial Dimension Reduction:
1631
+ 32Γ—32 β†’ 16Γ—16 β†’ 8Γ—8
1632
+ 1024 256 64 (per channel)
1633
+
1634
+ Channel Expansion:
1635
+ 3 β†’ 16 β†’ 32
1636
+ More feature types at each level
1637
+ ```
1638
+
1639
+ #### Parameter Efficiency Demonstration
1640
+
1641
+ ```
1642
+ CNN vs Dense Comparison for 32Γ—32Γ—3 β†’ 10 classes:
1643
+
1644
+ CNN Approach: Dense Approach:
1645
+ β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”
1646
+ β”‚ Conv1: 3β†’16, 3Γ—3 β”‚ β”‚ Input: 3072 values β”‚
1647
+ β”‚ Params: 448 β”‚ β”‚ ↓ β”‚
1648
+ β”œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€ β”‚ Dense: 3072β†’512 β”‚
1649
+ β”‚ Conv2: 16β†’32, 3Γ—3 β”‚ β”‚ Params: 1.57M β”‚
1650
+ β”‚ Params: 4,640 β”‚ β”œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€
1651
+ β”œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€ β”‚ Dense: 512β†’10 β”‚
1652
+ β”‚ Dense: 2048β†’10 β”‚ β”‚ Params: 5,120 β”‚
1653
+ β”‚ Params: 20,490 β”‚ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜
1654
+ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ Total: 1.58M params
1655
+ Total: 25,578 params
1656
+
1657
+ CNN has 62Γ— fewer parameters while preserving spatial structure!
1658
+ ```
1659
+
1660
+ #### Receptive Field Growth
1661
+
1662
+ ```
1663
+ How each layer sees progressively larger input regions:
1664
+
1665
+ Layer 1 Conv (3Γ—3): Layer 2 Conv (3Γ—3):
1666
+ Each output pixel sees Each output pixel sees
1667
+ 3Γ—3 = 9 input pixels 8Γ—8 = 64 input pixels
1668
+ (due to pooling+conv)
1669
+
1670
+ Final Result: Layer 2 can detect complex patterns
1671
+ spanning 8Γ—8 regions of original image!
1672
+ ```
1673
+ """
1674
+
1675
+ # %% nbgrader={"grade": false, "grade_id": "simple-cnn", "solution": true}
1676
+
1677
+ #| export
1678
+
1679
class SimpleCNN:
    """
    Simple CNN demonstrating spatial operations integration.

    Architecture:
    - Conv2d(3β†’16, 3Γ—3) + ReLU + MaxPool(2Γ—2)
    - Conv2d(16β†’32, 3Γ—3) + ReLU + MaxPool(2Γ—2)
    - Flatten + Linear(features→num_classes)
    """

    def __init__(self, num_classes=10):
        """
        Initialize SimpleCNN.

        Args:
            num_classes: Number of output classes for the final Linear
                layer (the Linear layer itself is added in a later module).

        Layer plan (for a 32Γ—32 input):
        1. Conv layer 1: 3 β†’ 16 channels, 3Γ—3 kernel, padding=1
        2. Pool layer 1: 2Γ—2 max pooling (32Γ—32 β†’ 16Γ—16)
        3. Conv layer 2: 16 β†’ 32 channels, 3Γ—3 kernel, padding=1
        4. Pool layer 2: 2Γ—2 max pooling (16Γ—16 β†’ 8Γ—8)
        5. Flatten: 32 channels Γ— 8Γ—8 = 2048 features
        """
        super().__init__()

        ### BEGIN SOLUTION
        # Convolutional layers. padding=1 preserves spatial size through
        # each conv, so only the 2Γ—2 pools shrink the feature maps.
        self.conv1 = Conv2d(in_channels=3, out_channels=16, kernel_size=3, padding=1)
        self.pool1 = MaxPool2d(kernel_size=2, stride=2)

        self.conv2 = Conv2d(in_channels=16, out_channels=32, kernel_size=3, padding=1)
        self.pool2 = MaxPool2d(kernel_size=2, stride=2)

        # Flattened feature count for a 32Γ—32 input:
        # 32Γ—32 β†’ Pool1 β†’ 16Γ—16 β†’ Pool2 β†’ 8Γ—8, with 32 output channels.
        # Used when the final Linear classification layer is added.
        self.flattened_size = 32 * 8 * 8

        # Number of output classes for the future Linear layer.
        self.num_classes = num_classes
        ### END SOLUTION

    def forward(self, x):
        """
        Forward pass through SimpleCNN.

        Pipeline: conv1 β†’ ReLU β†’ pool1 β†’ conv2 β†’ ReLU β†’ pool2 β†’ flatten.

        Returns flattened features of shape (batch, channels*H*W) rather
        than class scores, since the Linear layer from the layers module
        is not imported here yet.
        """
        ### BEGIN SOLUTION
        # First conv block
        x = self.conv1(x)
        x = self.relu(x)  # ReLU activation
        x = self.pool1(x)

        # Second conv block
        x = self.conv2(x)
        x = self.relu(x)  # ReLU activation
        x = self.pool2(x)

        # Flatten spatial dimensions to (batch, features) for classification.
        batch_size = x.shape[0]
        x_flat = x.data.reshape(batch_size, -1)

        # In a complete implementation this would pass through a Linear layer.
        return Tensor(x_flat)
        ### END SOLUTION

    def relu(self, x):
        """Simple ReLU implementation for CNN."""
        return Tensor(np.maximum(0, x.data))

    def parameters(self):
        """Return all trainable parameters (conv weights and biases)."""
        params = []
        params.extend(self.conv1.parameters())
        params.extend(self.conv2.parameters())
        # Linear layer parameters would be added here
        return params

    def __call__(self, x):
        """Enable model(x) syntax."""
        return self.forward(x)
1778
+
1779
+ # %% [markdown]
1780
+ """
1781
+ ### πŸ§ͺ Unit Test: SimpleCNN Integration
1782
+ This test validates that spatial operations work together in a complete CNN architecture.
1783
+ **What we're testing**: End-to-end spatial processing pipeline
1784
+ **Why it matters**: Spatial operations must compose correctly for real CNNs
1785
+ **Expected**: Proper dimension reduction and feature extraction
1786
+ """
1787
+
1788
+ # %% nbgrader={"grade": true, "grade_id": "test-simple-cnn", "locked": true, "points": 10}
1789
+
1790
+
1791
def test_unit_simple_cnn():
    """πŸ”¬ Test SimpleCNN integration with spatial operations."""
    print("πŸ”¬ Unit Test: SimpleCNN Integration...")

    # Test 1: forward pass with a CIFAR-10 sized batch.
    print(" Testing forward pass...")
    cnn = SimpleCNN(num_classes=10)
    batch = Tensor(np.random.randn(2, 3, 32, 32))  # batch of 2, RGB, 32Γ—32
    out = cnn(batch)

    # 32Γ—32 β†’ pool β†’ 16Γ—16 β†’ pool β†’ 8Γ—8; 32 channels Γ— 8Γ—8 = 2048 features.
    want = (2, 2048)
    assert out.shape == want, f"Expected {want}, got {out.shape}"

    # Test 2: parameter counting across both conv layers.
    print(" Testing parameter counting...")
    param_list = cnn.parameters()

    # Conv1: weights (16, 3, 3, 3) + bias (16,) = 448
    # Conv2: weights (32, 16, 3, 3) + bias (32,) = 4640
    want_total = (16 * 3 * 3 * 3 + 16) + (32 * 16 * 3 * 3 + 32)
    got_total = sum(np.prod(p.shape) for p in param_list)
    assert got_total == want_total, f"Expected {want_total} parameters, got {got_total}"

    # Test 3: a smaller spatial input still flows through the same layers.
    print(" Testing different input sizes...")
    small_out = cnn(Tensor(np.random.randn(1, 3, 16, 16)))

    # 16Γ—16 β†’ 8Γ—8 β†’ 4Γ—4, so 32 channels Γ— 4Γ—4 = 512 features.
    want_small = (1, 512)
    assert small_out.shape == want_small, f"Expected {want_small}, got {small_out.shape}"

    # Test 4: the batch dimension is preserved for larger batches.
    print(" Testing batch processing...")
    big_out = cnn(Tensor(np.random.randn(8, 3, 32, 32)))

    want_big = (8, 2048)
    assert big_out.shape == want_big, f"Expected {want_big}, got {big_out.shape}"

    print("βœ… SimpleCNN integration works correctly!")

if __name__ == "__main__":
    test_unit_simple_cnn()
1844
+
1845
+ # %% [markdown]
1846
+ """
1847
+ ## πŸ§ͺ Module Integration Test
1848
+
1849
+ Final validation that everything works together correctly.
1850
+ """
1851
+
1852
+ # %% nbgrader={"grade": true, "grade_id": "module-integration", "locked": true, "points": 15}
1853
+
1854
+
1855
def test_module():
    """πŸ§ͺ Module Test: Complete Integration

    Comprehensive test of entire spatial module functionality.

    This final test runs before module summary to ensure:
    - All unit tests pass
    - Functions work together correctly
    - Module is ready for integration with TinyTorch
    """
    print("πŸ§ͺ RUNNING MODULE INTEGRATION TEST")
    print("=" * 50)

    # Run all unit tests first so integration failures are easier to localize.
    print("Running unit tests...")
    test_unit_conv2d()
    test_unit_batchnorm2d()
    test_unit_pooling()
    test_unit_simple_cnn()

    print("\nRunning integration scenarios...")

    # Test realistic CNN workflow with BatchNorm
    print("πŸ”¬ Integration Test: Complete CNN pipeline with BatchNorm...")

    # Create a mini CNN for CIFAR-10 with BatchNorm (modern architecture).
    conv1 = Conv2d(3, 8, kernel_size=3, padding=1)
    bn1 = BatchNorm2d(8)
    pool1 = MaxPool2d(2, stride=2)
    conv2 = Conv2d(8, 16, kernel_size=3, padding=1)
    bn2 = BatchNorm2d(16)
    pool2 = AvgPool2d(2, stride=2)

    # Process a batch of images (training mode: BatchNorm uses batch stats).
    batch_images = Tensor(np.random.randn(4, 3, 32, 32))

    # Forward pass: Conv β†’ BatchNorm β†’ ReLU β†’ Pool (modern pattern)
    x = conv1(batch_images)             # (4, 8, 32, 32)
    x = bn1(x)                          # (4, 8, 32, 32) - normalized
    x = Tensor(np.maximum(0, x.data))   # ReLU
    x = pool1(x)                        # (4, 8, 16, 16)

    x = conv2(x)                        # (4, 16, 16, 16)
    x = bn2(x)                          # (4, 16, 16, 16) - normalized
    x = Tensor(np.maximum(0, x.data))   # ReLU
    features = pool2(x)                 # (4, 16, 8, 8)

    # Validate shapes at the end of the pipeline.
    assert features.shape[0] == 4, f"Batch size should be preserved, got {features.shape[0]}"
    assert features.shape == (4, 16, 8, 8), f"Final features shape incorrect: {features.shape}"

    # Test parameter collection across all layers.
    all_params = []
    all_params.extend(conv1.parameters())
    all_params.extend(bn1.parameters())
    all_params.extend(conv2.parameters())
    all_params.extend(bn2.parameters())

    # Pooling has no parameters.
    assert len(pool1.parameters()) == 0
    assert len(pool2.parameters()) == 0

    # BatchNorm has 2 params each (gamma, beta).
    assert len(bn1.parameters()) == 2, f"BatchNorm should have 2 parameters, got {len(bn1.parameters())}"

    # Total: Conv1 (2) + BN1 (2) + Conv2 (2) + BN2 (2) = 8 parameters.
    assert len(all_params) == 8, f"Expected 8 parameter tensors total, got {len(all_params)}"

    # Test train/eval mode switching.
    print("πŸ”¬ Integration Test: Train/Eval mode switching...")
    bn1.eval()
    bn2.eval()

    # Run inference with a single sample (would fail with batch stats).
    single_image = Tensor(np.random.randn(1, 3, 32, 32))
    x = conv1(single_image)
    x = bn1(x)  # Uses running stats, not batch stats
    # Plain string message (the original used an f-string with no placeholders).
    assert x.shape == (1, 8, 32, 32), "Single sample inference should work in eval mode"

    print("βœ… CNN pipeline with BatchNorm works correctly!")

    # Test memory efficiency comparison.
    print("πŸ”¬ Integration Test: Memory efficiency analysis...")

    # Compare different pooling strategies (reduced size for faster execution).
    input_data = Tensor(np.random.randn(1, 16, 32, 32))

    # No pooling: maintain spatial size.
    conv_only = Conv2d(16, 32, kernel_size=3, padding=1)
    no_pool_out = conv_only(input_data)
    no_pool_size = np.prod(no_pool_out.shape) * 4  # float32 bytes

    # With pooling: spatial size halves in each dimension.
    conv_with_pool = Conv2d(16, 32, kernel_size=3, padding=1)
    pool = MaxPool2d(2, stride=2)
    pool_out = pool(conv_with_pool(input_data))
    pool_size = np.prod(pool_out.shape) * 4  # float32 bytes

    memory_reduction = no_pool_size / pool_size
    assert memory_reduction == 4.0, f"2Γ—2 pooling should give 4Γ— memory reduction, got {memory_reduction:.1f}Γ—"

    print(f"  Memory reduction with pooling: {memory_reduction:.1f}Γ—")
    print("βœ… Memory efficiency analysis complete!")

    print("\n" + "=" * 50)
    print("πŸŽ‰ ALL TESTS PASSED! Module ready for export.")
    print("Run: tito module complete 09")

# Run module test when this cell is executed
if __name__ == "__main__":
    test_module()
1966
+
1967
+ # %% [markdown]
1968
+ """
1969
+ ## πŸ”§ Main Execution Block
1970
+
1971
+ Running all module components including systems analysis and final validation.
1972
+ """
1973
+
1974
+ # %% nbgrader={"grade": false, "grade_id": "main-execution", "solution": true}
1975
+
1976
+ if __name__ == "__main__":
1977
+ print("=" * 70)
1978
+ print("MODULE 09: SPATIAL OPERATIONS - TEST EXECUTION")
1979
+ print("=" * 70)
1980
+
1981
+ test_module()
1982
+
1983
+ print("\n" + "="*70)
1984
+ print("MODULE 09 TESTS COMPLETE!")
1985
+ print("="*70)
1986
+
1987
+
1988
+ # %% [markdown]
1989
+ """
1990
+ ## πŸ€” ML Systems Reflection Questions
1991
+
1992
+ Before completing this module, reflect on what you've learned about spatial operations and their systems implications:
1993
+
1994
+ ### Question 1: Conv2d Memory Footprint
1995
+ A Conv2d layer with 64 filters (3Γ—3) processes a (224Γ—224Γ—3) image.
1996
+ - Calculate the memory footprint during the forward pass
1997
+ - Consider: input activations, output activations, filter weights, and biases
1998
+ - What happens when batch size increases from 1 to 32?
1999
+
2000
+ **Think about**: Why do modern vision models use techniques like gradient checkpointing?
2001
+
2002
+ ### Question 2: Spatial Locality and CPU Performance
2003
+ Why are CNNs faster on CPUs than fully-connected networks of similar parameter count?
2004
+
2005
+ **Consider**:
2006
+ - Cache locality in convolution operations
2007
+ - Data reuse patterns in sliding windows
2008
+ - Memory access patterns (sequential vs random)
2009
+
2010
+ **Hint**: Think about what happens when the same filter is applied across the image.
2011
+
2012
+ ### Question 3: Im2col Trade-off
2013
+ The im2col algorithm transforms convolution into matrix multiplication, using more memory but speeding up computation.
2014
+
2015
+ **When is this trade-off worthwhile?**
2016
+ - Small vs large batch sizes
2017
+ - Small vs large images
2018
+ - Training vs inference
2019
+ - Mobile vs server deployment
2020
+
2021
+ **Think about**: Why don't mobile devices always use im2col?
2022
+
2023
+ ### Question 4: Pooling's Systems Benefits
2024
+ MaxPool2d reduces spatial dimensions (e.g., 224Γ—224 β†’ 112Γ—112).
2025
+
2026
+ **What's the systems benefit beyond reducing parameters?**
2027
+ - Memory bandwidth requirements
2028
+ - Computation in subsequent layers
2029
+ - Gradient memory during backpropagation
2030
+ - Cache efficiency in deeper layers
2031
+
2032
+ **Calculate**: If 5 layers each use 2Γ—2 pooling, what's the total memory reduction?
2033
+
2034
+ ### Question 5: Mobile ML Deployment
2035
+ Why do mobile ML models prefer depthwise-separable convolutions over standard Conv2d?
2036
+
2037
+ **Analyze the FLOPs**:
2038
+ - Standard 3Γ—3 conv: C_in Γ— C_out Γ— H Γ— W Γ— 9
2039
+ - Depthwise + Pointwise: (C_in Γ— H Γ— W Γ— 9) + (C_in Γ— C_out Γ— H Γ— W)
2040
+
2041
+ **When does the trade-off favor depthwise separable?**
2042
+ - As number of channels increases
2043
+ - As spatial dimensions change
2044
+ - Energy consumption vs accuracy
2045
+
2046
+ **Real-world context**: This is why MobileNet and EfficientNet architectures exist.
2047
+
2048
+ ---
2049
+
2050
+ **These questions help you think like an ML systems engineer, not just an algorithm implementer.**
2051
+ """
2052
+
2053
+ # %% [markdown]
2054
+ """
2055
+ ## ⭐ Aha Moment: Convolution Extracts Features
2056
+
2057
+ **What you built:** Convolutional layers that process spatial data like images.
2058
+
2059
+ **Why it matters:** Conv2d looks at local neighborhoods, detecting edges, textures, and patterns.
2060
+ Unlike Linear layers that see pixels independently, Conv2d understands that nearby pixels are
2061
+ related. This is why CNNs revolutionized computer vision!
2062
+
2063
+ In the milestones, you'll use these spatial operations to build a CNN that recognizes digits.
2064
+ """
2065
+
2066
+ # %%
2067
def demo_spatial():
    """🎯 See Conv2d process spatial data."""
    print("🎯 AHA MOMENT: Convolution Extracts Features")
    print("=" * 45)

    # Create a simple 8Γ—8 "image" with 1 channel (NCHW layout).
    image = Tensor(np.random.randn(1, 1, 8, 8))

    # Conv2d: 1 input channel β†’ 4 feature maps, 3Γ—3 kernel, no padding.
    conv = Conv2d(in_channels=1, out_channels=4, kernel_size=3)

    output = conv(image)

    print(f"Input: {image.shape} ← 1 image, 1 channel, 8Γ—8")
    print(f"Output: {output.shape} ← 1 image, 4 features, 6Γ—6")
    # Plain strings below (the originals were f-strings with no placeholders).
    print("\nConv kernel: 3Γ—3 sliding window")
    print("Output smaller: 8 - 3 + 1 = 6 (no padding)")

    print("\n✨ Conv2d detects spatial patterns in images!")

# %%
if __name__ == "__main__":
    test_module()
    print("\n")
    demo_spatial()
2092
+
2093
+ # %% [markdown]
2094
+ """
2095
+ ## 🎯 Module Summary
2096
+
2097
+ ## πŸš€ MODULE SUMMARY: Spatial Operations
2098
+
2099
+ Congratulations! You've built the spatial processing foundation that powers computer vision!
2100
+
2101
+ ### Key Accomplishments
2102
+ - Built Conv2d with explicit loops showing O(NΒ²MΒ²KΒ²) complexity βœ…
2103
+ - Implemented BatchNorm2d with train/eval mode and running statistics βœ…
2104
+ - Implemented MaxPool2d and AvgPool2d for spatial dimension reduction βœ…
2105
+ - Created SimpleCNN demonstrating spatial operation integration βœ…
2106
+ - Analyzed computational complexity and memory trade-offs in spatial processing βœ…
2107
+ - All tests pass including complete CNN pipeline validation βœ…
2108
+
2109
+ ### Systems Insights Discovered
2110
+ - **Convolution Complexity**: Quadratic scaling with spatial size, kernel size significantly impacts cost
2111
+ - **Batch Normalization**: Train vs eval mode is critical - batch stats during training, running stats during inference
2112
+ - **Memory Patterns**: Pooling provides 4Γ— memory reduction while preserving important features
2113
+ - **Architecture Design**: Strategic spatial reduction enables parameter-efficient feature extraction
2114
+ - **Cache Performance**: Spatial locality in convolution benefits from optimal memory access patterns
2115
+
2116
+ ### Ready for Next Steps
2117
+ Your spatial operations enable building complete CNNs for computer vision tasks!
2118
+ Export with: `tito module complete 09`
2119
+
2120
+ **Next**: Milestone 03 will combine your spatial operations with training pipeline to build a CNN for CIFAR-10!
2121
+
2122
+ Your implementation shows why:
2123
+ - Modern CNNs use small kernels (3Γ—3) instead of large ones (computational efficiency)
2124
+ - Pooling layers are crucial for managing memory in deep networks (4Γ— reduction per layer)
2125
+ - Explicit loops reveal the true computational cost hidden by optimized implementations
2126
+ - Spatial operations unlock computer vision - from MLPs processing vectors to CNNs understanding images!
2127
+ """
tracer.py CHANGED
@@ -44,16 +44,25 @@ class Tracer:
44
  self.sink = sink
45
  self._next_tid = 1
46
  self._next_cid = 1
47
- self._id_map: Dict[int, str] = {} # id(obj)->tN
48
  self._names: Dict[str, str] = {} # tN->name
49
 
50
  def _tid(self, obj: Any) -> str:
51
- oid = id(obj)
52
- tid = self._id_map.get(oid)
53
- if tid is None:
54
- tid = f"t{self._next_tid}"
55
- self._next_tid += 1
56
- self._id_map[oid] = tid
 
 
 
 
 
 
 
 
 
 
57
  return tid
58
 
59
  def _cid(self) -> str:
 
44
  self.sink = sink
45
  self._next_tid = 1
46
  self._next_cid = 1
 
47
  self._names: Dict[str, str] = {} # tN->name
48
 
49
  def _tid(self, obj: Any) -> str:
50
+ """
51
+ Get or assign a unique ID for a tensor-like object.
52
+
53
+ We store the ID directly on the object to avoid issues with Python's
54
+ id() function, which can return the same value for different objects
55
+ if one is garbage collected and the other reuses its memory address.
56
+ """
57
+ # Check if we've already assigned an ID to this object
58
+ if hasattr(obj, '_tracer_id'):
59
+ return obj._tracer_id
60
+
61
+ # Create new ID and store it on the object
62
+ tid = f"t{self._next_tid}"
63
+ self._next_tid += 1
64
+ obj._tracer_id = tid
65
+
66
  return tid
67
 
68
  def _cid(self) -> str: