Adrian Gabriel committed on
Commit
0ffa4e9
·
1 Parent(s): 108a2f3

latest changes

Browse files
app.py CHANGED
@@ -19,12 +19,11 @@ from instrumentation import Instrumentor
19
  import numpy as np
20
  from tinytorch.core.tensor import Tensor
21
  from tinytorch.core.layers import Linear, Dropout, Layer, Sequential
22
- from tinytorch.core.activations import ReLU, Sigmoid, Tanh, GELU, Softmax
23
  from tinytorch.core.losses import MSELoss, CrossEntropyLoss, log_softmax
24
 
25
  # Import additional modules
26
  from tinytorch.core.autograd import Function, enable_autograd
27
- from tinytorch.core.dataloader import Dataset, TensorDataset, DataLoader, RandomHorizontalFlip, RandomCrop, Compose
28
  from tinytorch.core.optimizers import Optimizer, SGD, Adam, AdamW
29
  from tinytorch.core.tokenization import Tokenizer, CharTokenizer, BPETokenizer, create_tokenizer, tokenize_dataset
30
  from tinytorch.core.training import CosineSchedule, clip_grad_norm, Trainer
@@ -169,6 +168,7 @@ def _make_exec_env(tracer: Tracer) -> Dict[str, Any]:
169
  "Tanh": Tanh,
170
  "GELU": GELU,
171
  "Softmax": Softmax,
 
172
  # Losses
173
  "MSELoss": MSELoss,
174
  "CrossEntropyLoss": CrossEntropyLoss,
@@ -176,13 +176,6 @@ def _make_exec_env(tracer: Tracer) -> Dict[str, Any]:
176
  # Autograd
177
  "Function": Function,
178
  "enable_autograd": enable_autograd,
179
- # DataLoader
180
- "Dataset": Dataset,
181
- "TensorDataset": TensorDataset,
182
- "DataLoader": DataLoader,
183
- "RandomHorizontalFlip": RandomHorizontalFlip,
184
- "RandomCrop": RandomCrop,
185
- "Compose": Compose,
186
  # Optimizers
187
  "Optimizer": Optimizer,
188
  "SGD": SGD,
 
19
  import numpy as np
20
  from tinytorch.core.tensor import Tensor
21
  from tinytorch.core.layers import Linear, Dropout, Layer, Sequential
22
+ from tinytorch.core.activations import ReLU, Sigmoid, Tanh, GELU, Softmax, LogSoftmax
23
  from tinytorch.core.losses import MSELoss, CrossEntropyLoss, log_softmax
24
 
25
  # Import additional modules
26
  from tinytorch.core.autograd import Function, enable_autograd
 
27
  from tinytorch.core.optimizers import Optimizer, SGD, Adam, AdamW
28
  from tinytorch.core.tokenization import Tokenizer, CharTokenizer, BPETokenizer, create_tokenizer, tokenize_dataset
29
  from tinytorch.core.training import CosineSchedule, clip_grad_norm, Trainer
 
168
  "Tanh": Tanh,
169
  "GELU": GELU,
170
  "Softmax": Softmax,
171
+ "LogSoftmax": LogSoftmax,
172
  # Losses
173
  "MSELoss": MSELoss,
174
  "CrossEntropyLoss": CrossEntropyLoss,
 
176
  # Autograd
177
  "Function": Function,
178
  "enable_autograd": enable_autograd,
 
 
 
 
 
 
 
179
  # Optimizers
180
  "Optimizer": Optimizer,
181
  "SGD": SGD,
instrumentation.py CHANGED
@@ -7,7 +7,7 @@ emit trace events for visualization.
7
 
8
  from tinytorch.core.tensor import Tensor
9
  from tinytorch.core.layers import Layer
10
- from tinytorch.core.activations import ReLU, Sigmoid, Tanh, GELU, Softmax
11
  from tinytorch.core.losses import MSELoss, CrossEntropyLoss
12
 
13
 
@@ -220,7 +220,7 @@ class Instrumentor:
220
  self._wrap_layer_forward()
221
 
222
  # Activations - these don't inherit from Layer
223
- for activation_cls in [ReLU, Sigmoid, Tanh, GELU, Softmax]:
224
  self._wrap_activation(activation_cls)
225
 
226
  # Losses
@@ -238,6 +238,7 @@ class Instrumentor:
238
  "Tanh": Tanh,
239
  "GELU": GELU,
240
  "Softmax": Softmax,
 
241
  "MSELoss": MSELoss,
242
  "CrossEntropyLoss": CrossEntropyLoss,
243
  }
 
7
 
8
  from tinytorch.core.tensor import Tensor
9
  from tinytorch.core.layers import Layer
10
+ from tinytorch.core.activations import ReLU, Sigmoid, Tanh, GELU, Softmax, LogSoftmax
11
  from tinytorch.core.losses import MSELoss, CrossEntropyLoss
12
 
13
 
 
220
  self._wrap_layer_forward()
221
 
222
  # Activations - these don't inherit from Layer
223
+ for activation_cls in [ReLU, Sigmoid, Tanh, GELU, Softmax, LogSoftmax]:
224
  self._wrap_activation(activation_cls)
225
 
226
  # Losses
 
238
  "Tanh": Tanh,
239
  "GELU": GELU,
240
  "Softmax": Softmax,
241
+ "LogSoftmax": LogSoftmax,
242
  "MSELoss": MSELoss,
243
  "CrossEntropyLoss": CrossEntropyLoss,
244
  }
static/index.html CHANGED
@@ -1739,7 +1739,7 @@ box("Loss Computation", [y_pred, target_probs, loss], "6")
1739
  grid.appendChild(outputCard);
1740
 
1741
  // Add element-wise hover highlighting for activations and element-wise ops
1742
- const isElementwiseUnary = ['relu', 'sigmoid', 'tanh', 'gelu', 'softmax'].includes(type);
1743
  if (isElementwiseUnary && inputCards.length > 0) {
1744
  setupElementwiseHover(inputCards, outputCard);
1745
  }
 
1739
  grid.appendChild(outputCard);
1740
 
1741
  // Add element-wise hover highlighting for activations and element-wise ops
1742
+ const isElementwiseUnary = ['relu', 'sigmoid', 'tanh', 'gelu', 'softmax', 'logsoftmax'].includes(type);
1743
  if (isElementwiseUnary && inputCards.length > 0) {
1744
  setupElementwiseHover(inputCards, outputCard);
1745
  }
tinytorch/core/activations.py CHANGED
@@ -781,6 +781,46 @@ class Softmax:
781
  """Allows the activation to be called like a function."""
782
  return self.forward(x, dim)
783
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
784
  # %% [markdown]
785
  """
786
  ### 🔬 Unit Test: Softmax
 
781
  """Allows the activation to be called like a function."""
782
  return self.forward(x, dim)
783
 
784
+
785
class LogSoftmax:
    """
    Log-Softmax activation: log(softmax(x)).

    Implemented with the log-sum-exp trick so the computation stays
    numerically stable even for large-magnitude inputs. Commonly paired
    with negative log-likelihood to form a cross-entropy style loss.
    """

    def parameters(self):
        """Activations carry no learnable parameters; return an empty list."""
        return []

    def forward(self, x: Tensor, dim: int = -1) -> Tensor:
        """
        Compute log-softmax of ``x`` along ``dim``.

        Identity used:
            log_softmax(x) = (x - m) - log(sum(exp(x - m))),  m = max(x, dim)
        """
        data = x.data
        # Shift by the per-slice maximum so exp() can never overflow.
        stabilized = data - np.max(data, axis=dim, keepdims=True)
        # exp(stabilized) is in (0, 1], so the sum and its log are well defined.
        lse = np.log(np.sum(np.exp(stabilized), axis=dim, keepdims=True))
        return Tensor(stabilized - lse)

    def __call__(self, x: Tensor, dim: int = -1) -> Tensor:
        """Allows the activation to be called like a function."""
        return self.forward(x, dim)
822
+
823
+
824
  # %% [markdown]
825
  """
826
  ### 🔬 Unit Test: Softmax
tinytorch/core/dataloader.py DELETED
@@ -1,1990 +0,0 @@
1
- # ---
2
- # jupyter:
3
- # jupytext:
4
- # text_representation:
5
- # extension: .py
6
- # format_name: percent
7
- # format_version: '1.3'
8
- # jupytext_version: 1.17.1
9
- # kernelspec:
10
- # display_name: Python 3 (ipykernel)
11
- # language: python
12
- # name: python3
13
- # ---
14
-
15
- #| default_exp core.dataloader
16
- #| export
17
-
18
- # %% [markdown]
19
- """
20
- # Module 05: DataLoader - Efficient Data Pipeline for ML Training
21
-
22
- Welcome to Module 05! You're about to build the data loading infrastructure that transforms how ML models consume data during training.
23
-
24
- ## 🔗 Prerequisites & Progress
25
- **You've Built**: Tensor operations, activations, layers, and losses
26
- **You'll Build**: Dataset abstraction, DataLoader with batching/shuffling, and real dataset support
27
- **You'll Enable**: Efficient data pipelines that will feed hungry neural networks with properly formatted batches
28
-
29
- **Connection Map**:
30
- ```
31
- Losses → DataLoader → Autograd → Optimizers → Training
32
- (Module 04) (Module 05) (Module 06) (Module 07) (Module 08)
33
- ```
34
-
35
- ## 🎯 Learning Objectives
36
- By the end of this module, you will:
37
- 1. Understand the data pipeline: individual samples → batches → training
38
- 2. Implement Dataset abstraction and TensorDataset for tensor-based data
39
- 3. Build DataLoader with intelligent batching, shuffling, and memory-efficient iteration
40
- 4. Experience data pipeline performance characteristics firsthand
41
- 5. Create download functions for real computer vision datasets
42
-
43
- Let's transform scattered data into organized learning batches!
44
-
45
- ## 📦 Where This Code Lives in the Final Package
46
-
47
- **Learning Side:** You work in `modules/05_dataloader/dataloader.ipynb`
48
- **Building Side:** Code exports to `tinytorch.core.dataloader`
49
-
50
- ```python
51
- # How to use this module:
52
- from tinytorch.core.dataloader import Dataset, DataLoader, TensorDataset
53
- from tinytorch.core.dataloader import download_mnist, download_cifar10
54
- ```
55
-
56
- **Why this matters:**
57
- - **Learning:** Complete data loading system in one focused module for deep understanding
58
- - **Production:** Proper organization like PyTorch's torch.utils.data with all core data utilities
59
- - **Efficiency:** Optimized data pipelines are crucial for training speed and memory usage
60
- - **Integration:** Works seamlessly with training loops to create complete ML systems
61
- """
62
-
63
- # %%
64
- #| export
65
- # Essential imports for data loading
66
- import numpy as np
67
- import random
68
- import time
69
- import sys
70
- from typing import Iterator, Tuple, List, Optional, Union
71
- from abc import ABC, abstractmethod
72
-
73
- # Import real Tensor class from tinytorch package
74
- from tinytorch.core.tensor import Tensor
75
-
76
- # %% [markdown]
77
- """
78
- ## 💡 Understanding the Data Pipeline
79
-
80
- Before we implement anything, let's understand what happens when neural networks "eat" data. The journey from raw data to trained models follows a specific pipeline that every ML engineer must master.
81
-
82
- ### The Data Pipeline Journey
83
-
84
- Imagine you have 50,000 images of cats and dogs, and you want to train a neural network to classify them:
85
-
86
- ```
87
- Raw Data Storage Dataset Interface DataLoader Batching Training Loop
88
- ┌─────────────────┐ ┌──────────────────┐ ┌────────────────────┐ ┌─────────────┐
89
- │ cat_001.jpg │ │ dataset[0] │ │ Batch 1: │ │ model(batch)│
90
- │ dog_023.jpg │ ───> │ dataset[1] │ ───> │ [cat, dog, cat] │ ───> │ optimizer │
91
- │ cat_045.jpg │ │ dataset[2] │ │ Batch 2: │ │ loss │
92
- │ ... │ │ ... │ │ [dog, cat, dog] │ │ backward │
93
- │ (50,000 files) │ │ dataset[49999] │ │ ... │ │ step │
94
- └─────────────────┘ └──────────────────┘ └────────────────────┘ └─────────────┘
95
- ```
96
-
97
- ### Why This Pipeline Matters
98
-
99
- **Individual Access (Dataset)**: Neural networks can't process 50,000 files at once. We need a way to access one sample at a time: "Give me image #1,247".
100
-
101
- **Batch Processing (DataLoader)**: GPUs are parallel machines - they're much faster processing 32 images simultaneously than 1 image 32 times.
102
-
103
- **Memory Efficiency**: Loading all 50,000 images into memory would require ~150GB. Instead, we load only the current batch (~150MB).
104
-
105
- **Training Variety**: Shuffling ensures the model sees different combinations each epoch, preventing memorization.
106
-
107
- ### The Dataset Abstraction
108
-
109
- The Dataset class provides a uniform interface for accessing data, regardless of whether it's stored as files, in memory, in databases, or generated on-the-fly:
110
-
111
- ```
112
- Dataset Interface
113
- ┌─────────────────────────────────────┐
114
- │ __len__() → "How many samples?" │
115
- │ __getitem__(i) → "Give me sample i" │
116
- └─────────────────────────────────────┘
117
- ↑ ↑
118
- Enables for Enables indexing
119
- loops/iteration dataset[index]
120
- ```
121
-
122
- **Connection to systems**: This abstraction is crucial because it separates *how data is stored* from *how it's accessed*, enabling optimizations like caching, prefetching, and parallel loading.
123
- """
124
-
125
- # %% nbgrader={"grade": false, "grade_id": "dataset-implementation", "solution": true}
126
- #| export
127
class Dataset(ABC):
    """
    Abstract interface that every dataset implements.

    Concrete subclasses must provide:
    - __len__(): total number of samples
    - __getitem__(idx): the sample stored at ``idx``

    EXAMPLE:
        >>> class MyDataset(Dataset):
        ...     def __len__(self): return 100
        ...     def __getitem__(self, idx): return idx
        >>> dataset = MyDataset()
        >>> len(dataset)
        100
        >>> dataset[42]
        42
    """

    ### BEGIN SOLUTION
    @abstractmethod
    def __len__(self) -> int:
        """
        Number of samples in the dataset.

        Required by every subclass; enables len(dataset) calls and
        batch-count arithmetic in the DataLoader.
        """
        ...

    @abstractmethod
    def __getitem__(self, idx: int):
        """
        Sample stored at ``idx`` (0 <= idx < len(self)).

        The concrete return format — a (data, label) tuple, a lone
        tensor, etc. — is up to the subclass.
        """
        ...
    ### END SOLUTION
178
-
179
-
180
- # %% nbgrader={"grade": true, "grade_id": "test-dataset", "locked": true, "points": 10}
181
def test_unit_dataset():
    """🔬 Test Dataset abstract base class."""
    print("🔬 Unit Test: Dataset Abstract Base Class...")

    # The abstract base must refuse direct instantiation.
    try:
        ds = Dataset()
        assert False, "Should not be able to instantiate abstract Dataset"
    except TypeError:
        print("✅ Dataset is properly abstract")

    # A concrete subclass implementing both hooks behaves normally.
    class StubDataset(Dataset):
        def __init__(self, size):
            self.size = size

        def __len__(self):
            return self.size

        def __getitem__(self, idx):
            return f"item_{idx}"

    ds = StubDataset(10)
    assert len(ds) == 10
    assert ds[0] == "item_0"
    assert ds[9] == "item_9"

    print("✅ Dataset interface works correctly!")

if __name__ == "__main__":
    test_unit_dataset()
212
-
213
-
214
- # %% [markdown]
215
- """
216
- ## 🏗️ TensorDataset - When Data Lives in Memory
217
-
218
- Now let's implement TensorDataset, the most common dataset type for when your data is already loaded into tensors. This is perfect for datasets like MNIST where you can fit everything in memory.
219
-
220
- ### Understanding TensorDataset Structure
221
-
222
- TensorDataset takes multiple tensors and aligns them by their first dimension (the sample dimension):
223
-
224
- ```
225
- Input Tensors (aligned by first dimension):
226
- Features Tensor Labels Tensor Metadata Tensor
227
- ┌─────────────────┐ ┌───────────────┐ ┌─────────────────┐
228
- │ [1.2, 3.4, 5.6] │ │ 0 (cat) │ │ "image_001.jpg" │ ← Sample 0
229
- │ [2.1, 4.3, 6.5] │ │ 1 (dog) │ │ "image_002.jpg" │ ← Sample 1
230
- │ [3.0, 5.2, 7.4] │ │ 0 (cat) │ │ "image_003.jpg" │ ← Sample 2
231
- │ ... │ │ ... │ │ ... │
232
- └─────────────────┘ └───────────────┘ └─────────────────┘
233
- (N, 3) (N,) (N,)
234
-
235
- Dataset Access:
236
- dataset[1] → (Tensor([2.1, 4.3, 6.5]), Tensor(1), "image_002.jpg")
237
- ```
238
-
239
- ### Why TensorDataset is Powerful
240
-
241
- **Memory Locality**: All data is pre-loaded and stored contiguously in memory, enabling fast access patterns.
242
-
243
- **Vectorized Operations**: Since everything is already tensors, no conversion overhead during training.
244
-
245
- **Supervised Learning Perfect**: Naturally handles (features, labels) pairs, plus any additional metadata.
246
-
247
- **Batch-Friendly**: When DataLoader needs a batch, it can slice multiple samples efficiently.
248
-
249
- ### Real-World Usage Patterns
250
-
251
- ```
252
- # Computer Vision
253
- images = Tensor(shape=(50000, 32, 32, 3)) # CIFAR-10 images
254
- labels = Tensor(shape=(50000,)) # Class labels 0-9
255
- dataset = TensorDataset(images, labels)
256
-
257
- # Natural Language Processing
258
- token_ids = Tensor(shape=(10000, 512)) # Tokenized sentences
259
- labels = Tensor(shape=(10000,)) # Sentiment labels
260
- dataset = TensorDataset(token_ids, labels)
261
-
262
- # Time Series
263
- sequences = Tensor(shape=(1000, 100, 5)) # 100 timesteps, 5 features
264
- targets = Tensor(shape=(1000, 10)) # 10-step ahead prediction
265
- dataset = TensorDataset(sequences, targets)
266
- ```
267
-
268
- The key insight: TensorDataset transforms "arrays of data" into "a dataset that serves samples".
269
- """
270
-
271
- # %% nbgrader={"grade": false, "grade_id": "tensordataset-implementation", "solution": true}
272
- #| export
273
class TensorDataset(Dataset):
    """
    Dataset backed by in-memory tensors.

    Wraps any number of tensors that are aligned along their first
    dimension; sample ``i`` is the tuple of slice ``i`` taken from each
    wrapped tensor. Ideal for supervised (features, labels) data that
    fits in memory.

    EXAMPLE:
        >>> features = Tensor([[1, 2], [3, 4], [5, 6]])
        >>> labels = Tensor([0, 1, 0])
        >>> ds = TensorDataset(features, labels)
        >>> len(ds)
        3
        >>> ds[1]   # (Tensor([3, 4]), Tensor(1))
    """

    def __init__(self, *tensors):
        """
        Store the tensors and verify they are sample-aligned.

        Args:
            *tensors: One or more Tensor objects whose first dimensions
                all have the same size.

        Raises:
            ValueError: if any tensor's first dimension disagrees with
                the first tensor's.
        """
        ### BEGIN SOLUTION
        assert len(tensors) > 0, "Must provide at least one tensor"

        self.tensors = tensors

        # Every tensor must agree with tensor 0 on the sample count.
        expected = len(tensors[0].data)
        for position, candidate in enumerate(tensors):
            if len(candidate.data) != expected:
                raise ValueError(
                    f"All tensors must have same size in first dimension. "
                    f"Tensor 0: {expected}, Tensor {position}: {len(candidate.data)}"
                )
        ### END SOLUTION

    def __len__(self) -> int:
        """Return the sample count (size of the shared first dimension)."""
        ### BEGIN SOLUTION
        return len(self.tensors[0].data)
        ### END SOLUTION

    def __getitem__(self, idx: int) -> Tuple[Tensor, ...]:
        """
        Return sample ``idx`` as a tuple of per-tensor slices.

        Args:
            idx: Sample index, 0 <= idx < len(self).

        Returns:
            Tuple containing tensor[idx] for each wrapped tensor.

        Raises:
            IndexError: if ``idx`` is negative or >= len(self).
        """
        ### BEGIN SOLUTION
        if not 0 <= idx < len(self):
            raise IndexError(f"Index {idx} out of range for dataset of size {len(self)}")

        # One fresh Tensor per wrapped tensor, sliced at the same index.
        return tuple(Tensor(t.data[idx]) for t in self.tensors)
        ### END SOLUTION
382
-
383
-
384
- # %% nbgrader={"grade": true, "grade_id": "test-tensordataset", "locked": true, "points": 15}
385
def test_unit_tensordataset():
    """🔬 Test TensorDataset implementation."""
    print("🔬 Unit Test: TensorDataset...")

    # Three (feature, label) pairs held in two aligned tensors.
    features = Tensor([[1, 2], [3, 4], [5, 6]])
    labels = Tensor([0, 1, 0])
    dataset = TensorDataset(features, labels)

    # Length reflects the shared first dimension.
    assert len(dataset) == 3, f"Expected length 3, got {len(dataset)}"

    # Indexing returns a tuple of per-tensor slices.
    first = dataset[0]
    assert len(first) == 2, "Should return tuple with 2 tensors"
    assert np.array_equal(first[0].data, [1, 2]), f"Wrong features: {first[0].data}"
    assert first[1].data == 0, f"Wrong label: {first[1].data}"

    second = dataset[1]
    assert np.array_equal(second[1].data, 1), f"Wrong label at index 1: {second[1].data}"

    # Out-of-bounds access must raise IndexError.
    try:
        dataset[10]
        assert False, "Should raise IndexError for out of bounds access"
    except IndexError:
        pass

    # Tensors whose first dimensions disagree must be rejected.
    try:
        TensorDataset(Tensor([[1, 2], [3, 4]]), Tensor([0, 1, 0]))
        assert False, "Should raise error for mismatched tensor sizes"
    except ValueError:
        pass

    print("✅ TensorDataset works correctly!")

if __name__ == "__main__":
    test_unit_tensordataset()
427
-
428
-
429
- # %% [markdown]
430
- """
431
- ## 🏗️ DataLoader - The Batch Factory
432
-
433
- Now we build the DataLoader, the component that transforms individual dataset samples into the batches that neural networks crave. This is where data loading becomes a systems challenge.
434
-
435
- ### Understanding Batching: From Samples to Tensors
436
-
437
- DataLoader performs a crucial transformation - it collects individual samples and stacks them into batch tensors:
438
-
439
- ```
440
- Step 1: Individual Samples from Dataset
441
- dataset[0] → (features: [1, 2, 3], label: 0)
442
- dataset[1] → (features: [4, 5, 6], label: 1)
443
- dataset[2] → (features: [7, 8, 9], label: 0)
444
- dataset[3] → (features: [2, 3, 4], label: 1)
445
-
446
- Step 2: DataLoader Groups into Batch (batch_size=2)
447
- Batch 1:
448
- features: [[1, 2, 3], ← Stacked into shape (2, 3)
449
- [4, 5, 6]]
450
- labels: [0, 1] ← Stacked into shape (2,)
451
-
452
- Batch 2:
453
- features: [[7, 8, 9], ← Stacked into shape (2, 3)
454
- [2, 3, 4]]
455
- labels: [0, 1] ← Stacked into shape (2,)
456
- ```
457
-
458
- ### The Shuffling Process
459
-
460
- Shuffling randomizes which samples appear in which batches, crucial for good training:
461
-
462
- ```
463
- Without Shuffling (epoch 1): With Shuffling (epoch 1):
464
- Batch 1: [sample 0, sample 1] Batch 1: [sample 2, sample 0]
465
- Batch 2: [sample 2, sample 3] Batch 2: [sample 3, sample 1]
466
- Batch 3: [sample 4, sample 5] Batch 3: [sample 5, sample 4]
467
-
468
- Without Shuffling (epoch 2): With Shuffling (epoch 2):
469
- Batch 1: [sample 0, sample 1] ✗ Batch 1: [sample 1, sample 4] ✓
470
- Batch 2: [sample 2, sample 3] ✗ Batch 2: [sample 0, sample 5] ✓
471
- Batch 3: [sample 4, sample 5] ✗ Batch 3: [sample 2, sample 3] ✓
472
-
473
- (Same every epoch = overfitting!) (Different combinations = better learning!)
474
- ```
475
-
476
- ### DataLoader as a Systems Component
477
-
478
- **Memory Management**: DataLoader only holds one batch in memory at a time, not the entire dataset.
479
-
480
- **Iteration Interface**: Provides Python iterator protocol so training loops can use `for batch in dataloader:`.
481
-
482
- **Collation Strategy**: Automatically stacks tensors from individual samples into batch tensors.
483
-
484
- **Performance Critical**: This is often the bottleneck in training pipelines - loading and preparing data can be slower than the forward pass!
485
-
486
- ### The DataLoader Algorithm
487
-
488
- ```
489
- 1. Create indices list: [0, 1, 2, ..., dataset_length-1]
490
- 2. If shuffle=True: randomly shuffle the indices
491
- 3. Group indices into chunks of batch_size
492
- 4. For each chunk:
493
- a. Retrieve samples: [dataset[i] for i in chunk]
494
- b. Collate samples: stack individual tensors into batch tensors
495
- c. Yield the batch tensor tuple
496
- ```
497
-
498
- This transforms the dataset from "access one sample" to "iterate through batches" - exactly what training loops need.
499
- """
500
-
501
- # %% nbgrader={"grade": false, "grade_id": "dataloader-implementation", "solution": true}
502
- #| export
503
class DataLoader:
    """
    Batched (optionally shuffled) iteration over a Dataset.

    Wraps a dataset so training loops can write ``for batch in loader:``
    and receive tuples of stacked batch tensors. Only one batch is
    materialized at a time, keeping memory usage independent of the
    dataset size.

    EXAMPLE:
        >>> ds = TensorDataset(Tensor([[1, 2], [3, 4], [5, 6]]), Tensor([0, 1, 0]))
        >>> loader = DataLoader(ds, batch_size=2, shuffle=True)
        >>> for features_batch, labels_batch in loader:
        ...     ...  # features_batch shape (<=2, 2), labels_batch shape (<=2,)
    """

    def __init__(self, dataset: Dataset, batch_size: int, shuffle: bool = False):
        """
        Create DataLoader for batched iteration.

        Args:
            dataset: Dataset to draw samples from.
            batch_size: Number of samples per batch.
            shuffle: Whether to re-randomize sample order each epoch.
        """
        ### BEGIN SOLUTION
        self.dataset = dataset
        self.batch_size = batch_size
        self.shuffle = shuffle
        ### END SOLUTION

    def __len__(self) -> int:
        """Return the number of batches per epoch, counting a trailing partial batch."""
        ### BEGIN SOLUTION
        # Ceiling division via negation: -(-a // b) == ceil(a / b) for b > 0.
        return -(-len(self.dataset) // self.batch_size)
        ### END SOLUTION

    def __iter__(self) -> Iterator:
        """Yield collated batches, reshuffling the index order each epoch if enabled."""
        ### BEGIN SOLUTION
        order = list(range(len(self.dataset)))

        if self.shuffle:
            random.shuffle(order)

        step = self.batch_size
        for start in range(0, len(order), step):
            # Fetch the samples for this slice of the (possibly shuffled) order.
            samples = [self.dataset[j] for j in order[start:start + step]]
            yield self._collate_batch(samples)
        ### END SOLUTION

    def _collate_batch(self, batch: List[Tuple[Tensor, ...]]) -> Tuple[Tensor, ...]:
        """
        Stack a list of per-sample tuples into a tuple of batch tensors.

        Args:
            batch: List of sample tuples as returned by the dataset.

        Returns:
            Tuple of tensors, each with a new leading batch dimension;
            an empty tuple for an empty batch.
        """
        ### BEGIN SOLUTION
        if not batch:
            return ()

        # Samples are homogeneous tuples; stack position-by-position.
        per_sample = len(batch[0])
        return tuple(
            Tensor(np.stack([sample[pos].data for sample in batch], axis=0))
            for pos in range(per_sample)
        )
        ### END SOLUTION
656
-
657
-
658
- # %% [markdown]
659
- """
660
- ## 🏗️ Data Augmentation - Preventing Overfitting Through Variety
661
-
662
- Data augmentation is one of the most effective techniques for improving model generalization. By applying random transformations during training, we artificially expand the dataset and force the model to learn robust, invariant features.
663
-
664
- ### Why Augmentation Matters
665
-
666
- ```
667
- Without Augmentation: With Augmentation:
668
- Model sees exact same images Model sees varied versions
669
- every epoch every epoch
670
-
671
- Cat photo #247 Cat #247 (original)
672
- Cat photo #247 Cat #247 (flipped)
673
- Cat photo #247 Cat #247 (cropped left)
674
- Cat photo #247 Cat #247 (cropped right)
675
- ↓ ↓
676
- Model memorizes position Model learns "cat-ness"
677
- Overfits to training set Generalizes to new cats
678
- ```
679
-
680
- ### Common Augmentation Strategies
681
-
682
- For CIFAR-10 and similar image datasets:
683
-
684
- ```
685
- RandomHorizontalFlip (50% probability):
686
- ┌──────────┐ ┌──────────┐
687
- │ 🐱 → │ → │ ← 🐱 │
688
- │ │ │ │
689
- └──────────┘ └──────────┘
690
- Cars, cats, dogs look similar when flipped!
691
-
692
- RandomCrop with Padding:
693
- ┌──────────┐ ┌────────────┐ ┌──────────┐
694
- │ 🐱 │ → │░░░░░░░░░░░░│ → │ 🐱 │
695
- │ │ │░░ 🐱 ░│ │ │
696
- └──────────┘ │░░░░░░░░░░░░│ └──────────┘
697
- Original Pad edges Random crop
698
- (with zeros) (back to 32×32)
699
- ```
700
-
701
- ### Training vs Evaluation
702
-
703
- **Critical**: Augmentation applies ONLY during training!
704
-
705
- ```
706
- Training: Evaluation:
707
- ┌─────────────────┐ ┌─────────────────┐
708
- │ Original Image │ │ Original Image │
709
- │ ↓ │ │ ↓ │
710
- │ Random Flip │ │ (no transforms) │
711
- │ ↓ │ │ ↓ │
712
- │ Random Crop │ │ Direct to Model │
713
- │ ↓ │ └─────────────────┘
714
- │ To Model │
715
- └─────────────────┘
716
- ```
717
-
718
- Why? During evaluation, we want consistent, reproducible predictions. Augmentation during test would add randomness to predictions, making them unreliable.
719
- """
720
-
721
- # %% nbgrader={"grade": false, "grade_id": "augmentation-transforms", "solution": true}
722
-
723
- #| export
724
-
725
- class RandomHorizontalFlip:
726
- """
727
- Randomly flip images horizontally with given probability.
728
-
729
- A simple but effective augmentation for most image datasets.
730
- Flipping is appropriate when horizontal orientation doesn't change class
731
- (cats, dogs, cars - not digits or text!).
732
-
733
- Args:
734
- p: Probability of flipping (default: 0.5)
735
- """
736
-
737
- def __init__(self, p=0.5):
738
- """
739
- Initialize RandomHorizontalFlip.
740
-
741
- TODO: Store flip probability
742
-
743
- APPROACH:
744
- 1. Validate probability is in range [0, 1]
745
- 2. Store p as instance variable
746
-
747
- EXAMPLE:
748
- >>> flip = RandomHorizontalFlip(p=0.5) # 50% chance to flip
749
-
750
- HINT: Raise ValueError if p is outside valid range
751
- """
752
- ### BEGIN SOLUTION
753
- if not 0.0 <= p <= 1.0:
754
- raise ValueError(f"Probability must be between 0 and 1, got {p}")
755
- self.p = p
756
- ### END SOLUTION
757
-
758
- def __call__(self, x):
759
- """
760
- Apply random horizontal flip to input.
761
-
762
- TODO: Implement random horizontal flip
763
-
764
- APPROACH:
765
- 1. Generate random number in [0, 1)
766
- 2. If random < p, flip horizontally
767
- 3. Otherwise, return unchanged
768
-
769
- Args:
770
- x: Input array with shape (..., H, W) or (..., H, W, C)
771
- Flips along the last-1 axis (width dimension)
772
-
773
- Returns:
774
- Flipped or unchanged array (same shape as input)
775
-
776
- EXAMPLE:
777
- >>> flip = RandomHorizontalFlip(0.5)
778
- >>> img = np.array([[1, 2, 3], [4, 5, 6]]) # 2x3 image
779
- >>> # 50% chance output is [[3, 2, 1], [6, 5, 4]]
780
-
781
- HINT: Use np.flip(x, axis=-1) to flip along width axis
782
- """
783
- ### BEGIN SOLUTION
784
- if np.random.random() < self.p:
785
- # Flip along the width axis (last axis for HW format, second-to-last for HWC)
786
- # Using axis=-1 works for both (..., H, W) and (..., H, W, C)
787
- if isinstance(x, Tensor):
788
- return Tensor(np.flip(x.data, axis=-1).copy())
789
- else:
790
- return np.flip(x, axis=-1).copy()
791
- return x
792
- ### END SOLUTION
793
-
794
- #| export
795
-
796
- class RandomCrop:
797
- """
798
- Randomly crop image after padding.
799
-
800
- This is the standard augmentation for CIFAR-10:
801
- 1. Pad image by `padding` pixels on each side
802
- 2. Randomly crop back to original size
803
-
804
- This simulates small translations in the image, forcing the model
805
- to recognize objects regardless of their exact position.
806
-
807
- Args:
808
- size: Output crop size (int for square, or tuple (H, W))
809
- padding: Pixels to pad on each side before cropping (default: 4)
810
- """
811
-
812
- def __init__(self, size, padding=4):
813
- """
814
- Initialize RandomCrop.
815
-
816
- TODO: Store crop parameters
817
-
818
- APPROACH:
819
- 1. Convert size to tuple if it's an int (for square crops)
820
- 2. Store size and padding as instance variables
821
-
822
- EXAMPLE:
823
- >>> crop = RandomCrop(32, padding=4) # CIFAR-10 standard
824
- >>> # Pads to 40x40, then crops back to 32x32
825
-
826
- HINT: Handle both int and tuple sizes for flexibility
827
- """
828
- ### BEGIN SOLUTION
829
- if isinstance(size, int):
830
- self.size = (size, size)
831
- else:
832
- self.size = size
833
- self.padding = padding
834
- ### END SOLUTION
835
-
836
- def __call__(self, x):
837
- """
838
- Apply random crop after padding.
839
-
840
- TODO: Implement random crop with padding
841
-
842
- APPROACH:
843
- 1. Add zero-padding to all sides
844
- 2. Choose random top-left corner for crop
845
- 3. Extract crop of target size
846
-
847
- Args:
848
- x: Input image with shape (C, H, W) or (H, W) or (H, W, C)
849
- Assumes spatial dimensions are H, W
850
-
851
- Returns:
852
- Cropped image with target size
853
-
854
- EXAMPLE:
855
- >>> crop = RandomCrop(32, padding=4)
856
- >>> img = np.random.randn(3, 32, 32) # CIFAR-10 format (C, H, W)
857
- >>> out = crop(img)
858
- >>> print(out.shape) # (3, 32, 32)
859
-
860
- HINTS:
861
- - Use np.pad for adding zeros
862
- - Handle both (C, H, W) and (H, W) formats
863
- - Random offsets should be in [0, 2*padding]
864
- """
865
- ### BEGIN SOLUTION
866
- is_tensor = isinstance(x, Tensor)
867
- data = x.data if is_tensor else x
868
-
869
- target_h, target_w = self.size
870
-
871
- # Determine image format and dimensions
872
- if len(data.shape) == 2:
873
- # (H, W) format
874
- h, w = data.shape
875
- padded = np.pad(data, self.padding, mode='constant', constant_values=0)
876
-
877
- # Random crop position
878
- top = np.random.randint(0, 2 * self.padding + h - target_h + 1)
879
- left = np.random.randint(0, 2 * self.padding + w - target_w + 1)
880
-
881
- cropped = padded[top:top + target_h, left:left + target_w]
882
-
883
- elif len(data.shape) == 3:
884
- if data.shape[0] <= 4: # Likely (C, H, W) format
885
- c, h, w = data.shape
886
- # Pad only spatial dimensions
887
- padded = np.pad(data,
888
- ((0, 0), (self.padding, self.padding), (self.padding, self.padding)),
889
- mode='constant', constant_values=0)
890
-
891
- # Random crop position
892
- top = np.random.randint(0, 2 * self.padding + 1)
893
- left = np.random.randint(0, 2 * self.padding + 1)
894
-
895
- cropped = padded[:, top:top + target_h, left:left + target_w]
896
- else: # Likely (H, W, C) format
897
- h, w, c = data.shape
898
- padded = np.pad(data,
899
- ((self.padding, self.padding), (self.padding, self.padding), (0, 0)),
900
- mode='constant', constant_values=0)
901
-
902
- top = np.random.randint(0, 2 * self.padding + 1)
903
- left = np.random.randint(0, 2 * self.padding + 1)
904
-
905
- cropped = padded[top:top + target_h, left:left + target_w, :]
906
- else:
907
- raise ValueError(f"Expected 2D or 3D input, got shape {data.shape}")
908
-
909
- return Tensor(cropped) if is_tensor else cropped
910
- ### END SOLUTION
911
-
912
- #| export
913
-
914
- class Compose:
915
- """
916
- Compose multiple transforms into a pipeline.
917
-
918
- Applies transforms in sequence, passing output of each
919
- as input to the next.
920
-
921
- Args:
922
- transforms: List of transform callables
923
- """
924
-
925
- def __init__(self, transforms):
926
- """
927
- Initialize Compose with list of transforms.
928
-
929
- EXAMPLE:
930
- >>> transforms = Compose([
931
- ... RandomHorizontalFlip(0.5),
932
- ... RandomCrop(32, padding=4)
933
- ... ])
934
- """
935
- self.transforms = transforms
936
-
937
- def __call__(self, x):
938
- """Apply all transforms in sequence."""
939
- for transform in self.transforms:
940
- x = transform(x)
941
- return x
942
-
943
-
944
- # %% [markdown]
945
- """
946
- ### 🧪 Unit Test: Data Augmentation Transforms
947
- This test validates our augmentation implementations.
948
- **What we're testing**: RandomHorizontalFlip, RandomCrop, Compose pipeline
949
- **Why it matters**: Augmentation is critical for training models that generalize
950
- **Expected**: Correct shapes and appropriate randomness
951
- """
952
-
953
- # %% nbgrader={"grade": true, "grade_id": "test-augmentation", "locked": true, "points": 10}
954
-
955
-
956
- def test_unit_augmentation():
957
- """🔬 Test data augmentation transforms."""
958
- print("🔬 Unit Test: Data Augmentation...")
959
-
960
- # Test 1: RandomHorizontalFlip
961
- print(" Testing RandomHorizontalFlip...")
962
- flip = RandomHorizontalFlip(p=1.0) # Always flip for deterministic test
963
-
964
- img = np.array([[1, 2, 3], [4, 5, 6]]) # 2x3 image
965
- flipped = flip(img)
966
- expected = np.array([[3, 2, 1], [6, 5, 4]])
967
- assert np.array_equal(flipped, expected), f"Flip failed: {flipped} vs {expected}"
968
-
969
- # Test never flip
970
- no_flip = RandomHorizontalFlip(p=0.0)
971
- unchanged = no_flip(img)
972
- assert np.array_equal(unchanged, img), "p=0 should never flip"
973
-
974
- # Test 2: RandomCrop shape preservation
975
- print(" Testing RandomCrop...")
976
- crop = RandomCrop(32, padding=4)
977
-
978
- # Test with (C, H, W) format (CIFAR-10 style)
979
- img_chw = np.random.randn(3, 32, 32)
980
- cropped = crop(img_chw)
981
- assert cropped.shape == (3, 32, 32), f"CHW crop shape wrong: {cropped.shape}"
982
-
983
- # Test with (H, W) format
984
- img_hw = np.random.randn(28, 28)
985
- crop_hw = RandomCrop(28, padding=4)
986
- cropped_hw = crop_hw(img_hw)
987
- assert cropped_hw.shape == (28, 28), f"HW crop shape wrong: {cropped_hw.shape}"
988
-
989
- # Test 3: Compose pipeline
990
- print(" Testing Compose...")
991
- transforms = Compose([
992
- RandomHorizontalFlip(p=0.5),
993
- RandomCrop(32, padding=4)
994
- ])
995
-
996
- img = np.random.randn(3, 32, 32)
997
- augmented = transforms(img)
998
- assert augmented.shape == (3, 32, 32), f"Compose output shape wrong: {augmented.shape}"
999
-
1000
- # Test 4: Transforms work with Tensor
1001
- print(" Testing Tensor compatibility...")
1002
- tensor_img = Tensor(np.random.randn(3, 32, 32))
1003
-
1004
- flip_result = RandomHorizontalFlip(p=1.0)(tensor_img)
1005
- assert isinstance(flip_result, Tensor), "Flip should return Tensor when given Tensor"
1006
-
1007
- crop_result = RandomCrop(32, padding=4)(tensor_img)
1008
- assert isinstance(crop_result, Tensor), "Crop should return Tensor when given Tensor"
1009
-
1010
- # Test 5: Randomness verification
1011
- print(" Testing randomness...")
1012
- flip_random = RandomHorizontalFlip(p=0.5)
1013
-
1014
- # Run many times and check we get both outcomes
1015
- flips = 0
1016
- no_flips = 0
1017
- test_img = np.array([[1, 2]])
1018
-
1019
- for _ in range(100):
1020
- result = flip_random(test_img)
1021
- if np.array_equal(result, np.array([[2, 1]])):
1022
- flips += 1
1023
- else:
1024
- no_flips += 1
1025
-
1026
- # With p=0.5, we should get roughly 50/50 (allow for randomness)
1027
- assert flips > 20 and no_flips > 20, f"Flip randomness seems broken: {flips} flips, {no_flips} no-flips"
1028
-
1029
- print("✅ Data Augmentation works correctly!")
1030
-
1031
- if __name__ == "__main__":
1032
- test_unit_augmentation()
1033
-
1034
- # %% nbgrader={"grade": true, "grade_id": "test-dataloader", "locked": true, "points": 20}
1035
- def test_unit_dataloader():
1036
- """🔬 Test DataLoader implementation."""
1037
- print("🔬 Unit Test: DataLoader...")
1038
-
1039
- # Create test dataset
1040
- features = Tensor([[1, 2], [3, 4], [5, 6], [7, 8], [9, 10]]) # 5 samples
1041
- labels = Tensor([0, 1, 0, 1, 0])
1042
- dataset = TensorDataset(features, labels)
1043
-
1044
- # Test basic batching (no shuffle)
1045
- loader = DataLoader(dataset, batch_size=2, shuffle=False)
1046
-
1047
- # Test length calculation
1048
- assert len(loader) == 3, f"Expected 3 batches, got {len(loader)}" # ceil(5/2) = 3
1049
-
1050
- batches = list(loader)
1051
- assert len(batches) == 3, f"Expected 3 batches, got {len(batches)}"
1052
-
1053
- # Test first batch
1054
- batch_features, batch_labels = batches[0]
1055
- assert batch_features.data.shape == (2, 2), f"Wrong batch features shape: {batch_features.data.shape}"
1056
- assert batch_labels.data.shape == (2,), f"Wrong batch labels shape: {batch_labels.data.shape}"
1057
-
1058
- # Test last batch (should have 1 sample)
1059
- batch_features, batch_labels = batches[2]
1060
- assert batch_features.data.shape == (1, 2), f"Wrong last batch features shape: {batch_features.data.shape}"
1061
- assert batch_labels.data.shape == (1,), f"Wrong last batch labels shape: {batch_labels.data.shape}"
1062
-
1063
- # Test that data is preserved
1064
- assert np.array_equal(batches[0][0].data[0], [1, 2]), "First sample should be [1,2]"
1065
- assert batches[0][1].data[0] == 0, "First label should be 0"
1066
-
1067
- # Test shuffling produces different order
1068
- loader_shuffle = DataLoader(dataset, batch_size=5, shuffle=True)
1069
- loader_no_shuffle = DataLoader(dataset, batch_size=5, shuffle=False)
1070
-
1071
- batch_shuffle = list(loader_shuffle)[0]
1072
- batch_no_shuffle = list(loader_no_shuffle)[0]
1073
-
1074
- # Note: This might occasionally fail due to random chance, but very unlikely
1075
- # We'll just test that both contain all the original data
1076
- shuffle_features = set(tuple(row) for row in batch_shuffle[0].data)
1077
- no_shuffle_features = set(tuple(row) for row in batch_no_shuffle[0].data)
1078
- expected_features = {(1, 2), (3, 4), (5, 6), (7, 8), (9, 10)}
1079
-
1080
- assert shuffle_features == expected_features, "Shuffle should preserve all data"
1081
- assert no_shuffle_features == expected_features, "No shuffle should preserve all data"
1082
-
1083
- print("✅ DataLoader works correctly!")
1084
-
1085
- if __name__ == "__main__":
1086
- test_unit_dataloader()
1087
-
1088
-
1089
- # %% nbgrader={"grade": true, "grade_id": "test-dataloader-deterministic", "locked": true, "points": 5}
1090
- def test_unit_dataloader_deterministic():
1091
- """🔬 Test DataLoader deterministic shuffling with fixed seed."""
1092
- print("🔬 Unit Test: DataLoader Deterministic Shuffling...")
1093
-
1094
- # Create test dataset
1095
- features = Tensor([[1, 2], [3, 4], [5, 6], [7, 8]])
1096
- labels = Tensor([0, 1, 0, 1])
1097
- dataset = TensorDataset(features, labels)
1098
-
1099
- # Test that same seed produces same shuffle
1100
- random.seed(42)
1101
- loader1 = DataLoader(dataset, batch_size=2, shuffle=True)
1102
- batches1 = list(loader1)
1103
-
1104
- random.seed(42)
1105
- loader2 = DataLoader(dataset, batch_size=2, shuffle=True)
1106
- batches2 = list(loader2)
1107
-
1108
- # Should produce identical batches with same seed
1109
- for i, (batch1, batch2) in enumerate(zip(batches1, batches2)):
1110
- assert np.array_equal(batch1[0].data, batch2[0].data), \
1111
- f"Batch {i} features should be identical with same seed"
1112
- assert np.array_equal(batch1[1].data, batch2[1].data), \
1113
- f"Batch {i} labels should be identical with same seed"
1114
-
1115
- # Test that different seeds produce different shuffles
1116
- random.seed(42)
1117
- loader3 = DataLoader(dataset, batch_size=2, shuffle=True)
1118
- batches3 = list(loader3)
1119
-
1120
- random.seed(123) # Different seed
1121
- loader4 = DataLoader(dataset, batch_size=2, shuffle=True)
1122
- batches4 = list(loader4)
1123
-
1124
- # Should produce different batches with different seeds (very likely)
1125
- different = False
1126
- for batch3, batch4 in zip(batches3, batches4):
1127
- if not np.array_equal(batch3[0].data, batch4[0].data):
1128
- different = True
1129
- break
1130
-
1131
- assert different, "Different seeds should produce different shuffles"
1132
-
1133
- print("✅ Deterministic shuffling works correctly!")
1134
-
1135
- if __name__ == "__main__":
1136
- test_unit_dataloader_deterministic()
1137
-
1138
-
1139
- # %% [markdown]
1140
- """
1141
- ## 🔧 Working with Real Datasets
1142
-
1143
- Now that you've built the DataLoader abstraction, you're ready to use it with real data!
1144
-
1145
- ### Using Real Datasets: The TinyTorch Approach
1146
-
1147
- TinyTorch separates **mechanics** (this module) from **application** (examples/milestones):
1148
-
1149
- ```
1150
- Module 05 (DataLoader) Examples & Milestones
1151
- ┌──────────────────────┐ ┌────────────────────────┐
1152
- │ Dataset abstraction │ │ Real MNIST digits │
1153
- │ TensorDataset impl │ ───> │ CIFAR-10 images │
1154
- │ DataLoader batching │ │ Custom datasets │
1155
- │ Shuffle & iteration │ │ Download utilities │
1156
- └──────────────────────┘ └────────────────────────┘
1157
- (Learn mechanics) (Apply to real data)
1158
- ```
1159
-
1160
- ### Understanding Image Data
1161
-
1162
- **What does image data actually look like?**
1163
-
1164
- Images are just 2D arrays of numbers (pixels). Here are actual 8×8 handwritten digits:
1165
-
1166
- ```
1167
- Digit "5" (8×8): Digit "3" (8×8): Digit "8" (8×8):
1168
- 0 0 12 13 5 0 0 0 0 0 11 12 0 0 0 0 0 0 10 14 8 1 0 0
1169
- 0 0 13 15 10 0 0 0 0 2 16 16 16 7 0 0 0 0 16 15 15 9 0 0
1170
- 0 3 15 13 16 7 0 0 0 0 8 16 8 0 0 0 0 0 15 5 5 13 0 0
1171
- 0 8 13 6 15 4 0 0 0 0 0 12 13 0 0 0 0 1 16 5 5 13 0 0
1172
- 0 0 0 6 16 5 0 0 0 0 1 16 15 9 0 0 0 6 16 16 16 16 1 0
1173
- 0 0 5 15 16 9 0 0 0 0 14 16 16 16 7 0 1 16 3 1 1 15 1 0
1174
- 0 0 9 16 9 0 0 0 0 5 16 8 8 16 0 0 0 9 16 16 16 15 0 0
1175
- 0 0 0 0 0 0 0 0 0 3 16 16 16 12 0 0 0 0 0 0 0 0 0 0
1176
-
1177
- Visual representation:
1178
- ░█████░ ░█████░ ░█████░
1179
- ░█░░░█░ ░░░░░█░ █░░░░█░
1180
- ░░░░█░░ ░░███░░ ░█████░
1181
- ░░░█░░░ ░░░░█░░ █░░░░█░
1182
- ░░█░░░░ ░█████░ ░█████░
1183
- ```
1184
-
1185
- **Shape transformations in DataLoader:**
1186
-
1187
- ```
1188
- Individual Sample (from Dataset):
1189
- image: (8, 8) ← Single 8×8 image
1190
- label: scalar ← Single digit (0-9)
1191
-
1192
- After DataLoader batching (batch_size=32):
1193
- images: (32, 8, 8) ← Stack of 32 images
1194
- labels: (32,) ← Array of 32 labels
1195
-
1196
- This is what your model sees during training!
1197
- ```
1198
-
1199
- ### Quick Start with Real Data
1200
-
1201
- **Tiny Datasets (ships with TinyTorch):**
1202
- ```python
1203
- # 8×8 handwritten digits - instant, no downloads!
1204
- import numpy as np
1205
- data = np.load('datasets/tiny/digits_8x8.npz')
1206
- images = Tensor(data['images']) # (1797, 8, 8)
1207
- labels = Tensor(data['labels']) # (1797,)
1208
-
1209
- dataset = TensorDataset(images, labels)
1210
- loader = DataLoader(dataset, batch_size=32, shuffle=True)
1211
-
1212
- # Each batch contains real digit images!
1213
- for batch_images, batch_labels in loader:
1214
- # batch_images: (32, 8, 8) - 32 digit images
1215
- # batch_labels: (32,) - their labels (0-9)
1216
- break
1217
- ```
1218
-
1219
- **Full Datasets (for serious training):**
1220
- ```python
1221
- # See milestones/03_mlp_revival_1986/ for MNIST download (28×28 images)
1222
- # See milestones/04_cnn_revolution_1998/ for CIFAR-10 download (32×32×3 images)
1223
- ```
1224
-
1225
- ### What You've Accomplished
1226
-
1227
- You've built the **data loading infrastructure** that powers all modern ML:
1228
- - ✅ Dataset abstraction (universal interface)
1229
- - ✅ TensorDataset (in-memory efficiency)
1230
- - ✅ DataLoader (batching, shuffling, iteration)
1231
- - ✅ Data Augmentation (RandomHorizontalFlip, RandomCrop, Compose)
1232
-
1233
- **Next steps:** Apply your DataLoader and augmentation to real datasets in the milestones!
1234
-
1235
- **Real-world connection:** You've implemented the same patterns as:
1236
- - PyTorch's `torch.utils.data.DataLoader`
1237
- - PyTorch's `torchvision.transforms`
1238
- - TensorFlow's `tf.data.Dataset`
1239
- - Production ML pipelines everywhere
1240
- """
1241
-
1242
-
1243
- # %% [markdown]
1244
- """
1245
- ## 📊 Systems Analysis - Data Pipeline Performance
1246
-
1247
- **Note:** This section provides performance analysis tools for understanding DataLoader behavior. The analysis functions are defined below but not run automatically. To explore performance characteristics, uncomment and run `analyze_dataloader_performance()` or `analyze_memory_usage()` manually.
1248
-
1249
- Now let's understand data pipeline performance like production ML engineers. Understanding where time and memory go is crucial for building systems that scale.
1250
-
1251
- ### The Performance Question: Where Does Time Go?
1252
-
1253
- In a typical training step, time is split between data loading and computation:
1254
-
1255
- ```
1256
- Training Step Breakdown:
1257
- ┌─────────────────────────────────────────────────────────────┐
1258
- │ Data Loading │ Forward Pass │ Backward Pass │
1259
- │ ████████████ │ ███████ │ ████████ │
1260
- │ 40ms │ 25ms │ 35ms │
1261
- └─────────────────────────────────────────────────────────────┘
1262
- 100ms total per step
1263
-
1264
- Bottleneck Analysis:
1265
- - If data loading > forward+backward: "Data starved" (CPU bottleneck)
1266
- - If forward+backward > data loading: "Compute bound" (GPU bottleneck)
1267
- - Ideal: Data loading ≈ computation time (balanced pipeline)
1268
- ```
1269
-
1270
- ### Memory Scaling: The Batch Size Trade-off
1271
-
1272
- Batch size creates a fundamental trade-off in memory vs efficiency:
1273
-
1274
- ```
1275
- Batch Size Impact:
1276
-
1277
- Small Batches (batch_size=8):
1278
- ┌─────────────────────────────────────────┐
1279
- │ Memory: 8 × 28 × 28 × 4 bytes = 25KB │ ← Low memory
1280
- │ Overhead: High (many small batches) │ ← High overhead
1281
- │ GPU Util: Poor (underutilized) │ ← Poor efficiency
1282
- └─────────────────────────────────────────┘
1283
-
1284
- Large Batches (batch_size=512):
1285
- ┌─────────────────────────────────────────┐
1286
- │ Memory: 512 × 28 × 28 × 4 bytes = 1.6MB │ ← Higher memory
1287
- │ Overhead: Low (fewer large batches) │ ← Lower overhead
1288
- │ GPU Util: Good (well utilized) │ ← Better efficiency
1289
- └─────────────────────────────────────────┘
1290
- ```
1291
-
1292
- ### Shuffling Overhead Analysis
1293
-
1294
- Shuffling seems simple, but let's measure its real cost:
1295
-
1296
- ```
1297
- Shuffle Operation Breakdown:
1298
-
1299
- 1. Index Generation: O(n) - create [0, 1, 2, ..., n-1]
1300
- 2. Shuffle Operation: O(n) - randomize the indices
1301
- 3. Sample Access: O(1) per sample - dataset[shuffled_idx]
1302
-
1303
- Memory Impact:
1304
- - No Shuffle: 0 extra memory (sequential access)
1305
- - With Shuffle: 8 bytes × dataset_size (store indices)
1306
-
1307
- For 50,000 samples: 8 × 50,000 = 400KB extra memory
1308
- ```
1309
-
1310
- The key insight: shuffling overhead is typically negligible compared to the actual data loading and tensor operations.
1311
-
1312
- ### Pipeline Bottleneck Identification
1313
-
1314
- We'll measure three critical metrics:
1315
-
1316
- 1. **Throughput**: Samples processed per second
1317
- 2. **Memory Usage**: Peak memory during batch loading
1318
- 3. **Overhead**: Time spent on data vs computation
1319
-
1320
- These measurements will reveal whether our pipeline is CPU-bound (slow data loading) or compute-bound (slow model).
1321
- """
1322
-
1323
- # %% nbgrader={"grade": false, "grade_id": "systems-analysis", "solution": true}
1324
- def analyze_dataloader_performance():
1325
- """📊 Analyze DataLoader performance characteristics."""
1326
- print("📊 Analyzing DataLoader Performance...")
1327
-
1328
- # Create test dataset of varying sizes
1329
- sizes = [1000, 5000, 10000]
1330
- batch_sizes = [16, 64, 256]
1331
-
1332
- print("\n🔍 Batch Size vs Loading Time:")
1333
-
1334
- for size in sizes:
1335
- # Create synthetic dataset
1336
- features = Tensor(np.random.randn(size, 100)) # 100 features
1337
- labels = Tensor(np.random.randint(0, 10, size))
1338
- dataset = TensorDataset(features, labels)
1339
-
1340
- print(f"\nDataset size: {size} samples")
1341
-
1342
- for batch_size in batch_sizes:
1343
- # Time data loading
1344
- loader = DataLoader(dataset, batch_size=batch_size, shuffle=False)
1345
-
1346
- start_time = time.time()
1347
- batch_count = 0
1348
- for batch in loader:
1349
- batch_count += 1
1350
- end_time = time.time()
1351
-
1352
- elapsed = end_time - start_time
1353
- throughput = size / elapsed if elapsed > 0 else float('inf')
1354
-
1355
- print(f" Batch size {batch_size:3d}: {elapsed:.3f}s ({throughput:,.0f} samples/sec)")
1356
-
1357
- # Analyze shuffle overhead
1358
- print("\n🔄 Shuffle Overhead Analysis:")
1359
-
1360
- dataset_size = 10000
1361
- features = Tensor(np.random.randn(dataset_size, 50))
1362
- labels = Tensor(np.random.randint(0, 5, dataset_size))
1363
- dataset = TensorDataset(features, labels)
1364
-
1365
- batch_size = 64
1366
-
1367
- # No shuffle
1368
- loader_no_shuffle = DataLoader(dataset, batch_size=batch_size, shuffle=False)
1369
- start_time = time.time()
1370
- batches_no_shuffle = list(loader_no_shuffle)
1371
- time_no_shuffle = time.time() - start_time
1372
-
1373
- # With shuffle
1374
- loader_shuffle = DataLoader(dataset, batch_size=batch_size, shuffle=True)
1375
- start_time = time.time()
1376
- batches_shuffle = list(loader_shuffle)
1377
- time_shuffle = time.time() - start_time
1378
-
1379
- shuffle_overhead = ((time_shuffle - time_no_shuffle) / time_no_shuffle) * 100
1380
-
1381
- print(f" No shuffle: {time_no_shuffle:.3f}s")
1382
- print(f" With shuffle: {time_shuffle:.3f}s")
1383
- print(f" Shuffle overhead: {shuffle_overhead:.1f}%")
1384
-
1385
- print("\n💡 Key Insights:")
1386
- print("• Larger batch sizes reduce per-sample overhead")
1387
- print("• Shuffle adds minimal overhead for reasonable dataset sizes")
1388
- print("• Memory usage scales linearly with batch size")
1389
- print("🚀 Production tip: Balance batch size with GPU memory limits")
1390
-
1391
-
1392
- def analyze_memory_usage():
1393
- """📊 Analyze memory usage patterns in data loading."""
1394
- print("\n📊 Analyzing Memory Usage Patterns...")
1395
-
1396
- # Memory usage estimation
1397
- def estimate_memory_mb(batch_size, feature_size, dtype_bytes=4):
1398
- """Estimate memory usage for a batch."""
1399
- return (batch_size * feature_size * dtype_bytes) / (1024 * 1024)
1400
-
1401
- print("\n💾 Memory Usage by Batch Configuration:")
1402
-
1403
- feature_sizes = [784, 3072, 50176] # MNIST, CIFAR-10, ImageNet-like
1404
- feature_names = ["MNIST (28×28)", "CIFAR-10 (32×32×3)", "ImageNet (224×224×1)"]
1405
- batch_sizes = [1, 32, 128, 512]
1406
-
1407
- for feature_size, name in zip(feature_sizes, feature_names):
1408
- print(f"\n{name}:")
1409
- for batch_size in batch_sizes:
1410
- memory_mb = estimate_memory_mb(batch_size, feature_size)
1411
- print(f" Batch {batch_size:3d}: {memory_mb:6.1f} MB")
1412
-
1413
- print("\n🎯 Memory Trade-offs:")
1414
- print("• Larger batches: More memory, better GPU utilization")
1415
- print("• Smaller batches: Less memory, more noisy gradients")
1416
- print("• Sweet spot: Usually 32-128 depending on model size")
1417
-
1418
- # Demonstrate actual memory usage with our tensors
1419
- print("\n🔬 Actual Tensor Memory Usage:")
1420
-
1421
- # Create different sized tensors
1422
- tensor_small = Tensor(np.random.randn(32, 784)) # Small batch
1423
- tensor_large = Tensor(np.random.randn(512, 784)) # Large batch
1424
-
1425
- # Measure actual memory (data array + object overhead)
1426
- small_bytes = tensor_small.data.nbytes
1427
- large_bytes = tensor_large.data.nbytes
1428
-
1429
- # Also measure Python object overhead
1430
- small_total = sys.getsizeof(tensor_small.data) + sys.getsizeof(tensor_small)
1431
- large_total = sys.getsizeof(tensor_large.data) + sys.getsizeof(tensor_large)
1432
-
1433
- print(f" Small batch (32×784):")
1434
- print(f" - Data only: {small_bytes / 1024:.1f} KB")
1435
- print(f" - With object overhead: {small_total / 1024:.1f} KB")
1436
- print(f" Large batch (512×784):")
1437
- print(f" - Data only: {large_bytes / 1024:.1f} KB")
1438
- print(f" - With object overhead: {large_total / 1024:.1f} KB")
1439
- print(f" Ratio: {large_bytes / small_bytes:.1f}× (data scales linearly)")
1440
-
1441
- print("\n🎯 Memory Optimization Tips:")
1442
- print("• Object overhead becomes negligible with larger batches")
1443
- print("• Use float32 instead of float64 to halve memory usage")
1444
- print("• Consider gradient accumulation for effective larger batches")
1445
-
1446
-
1447
- def analyze_collation_overhead():
1448
- """📊 Analyze the cost of collating samples into batches."""
1449
- print("\n📊 Analyzing Collation Overhead...")
1450
-
1451
- # Test different batch sizes to see collation cost
1452
- dataset_size = 1000
1453
- feature_size = 100
1454
- features = Tensor(np.random.randn(dataset_size, feature_size))
1455
- labels = Tensor(np.random.randint(0, 10, dataset_size))
1456
- dataset = TensorDataset(features, labels)
1457
-
1458
- print("\n⚡ Collation Time by Batch Size:")
1459
-
1460
- for batch_size in [8, 32, 128, 512]:
1461
- loader = DataLoader(dataset, batch_size=batch_size, shuffle=False)
1462
-
1463
- start_time = time.time()
1464
- for batch in loader:
1465
- pass # Just iterate, measuring collation overhead
1466
- total_time = time.time() - start_time
1467
-
1468
- batches = len(loader)
1469
- time_per_batch = (total_time / batches) * 1000 # Convert to ms
1470
-
1471
- print(f" Batch size {batch_size:3d}: {time_per_batch:.2f}ms per batch ({batches} batches total)")
1472
-
1473
- print("\n💡 Collation Insights:")
1474
- print("• Larger batches take longer to collate (more np.stack operations)")
1475
- print("• But fewer large batches are more efficient than many small ones")
1476
- print("• Optimal: Balance between batch size and iteration overhead")
1477
-
1478
-
1479
- # %% [markdown]
1480
- """
1481
- ## ⚠️ Common Pitfalls and Best Practices
1482
-
1483
- Before we move to integration testing, let's cover common mistakes students and practitioners make with data loading:
1484
-
1485
- ### ⚠️ Common Mistakes to Avoid
1486
-
1487
- **1. Forgetting to Shuffle Training Data**
1488
- ```python
1489
- # ❌ WRONG - No shuffling means same batches every epoch
1490
- train_loader = DataLoader(train_dataset, batch_size=32, shuffle=False)
1491
-
1492
- # ✅ CORRECT - Shuffle for training, but not for validation
1493
- train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
1494
- val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False)
1495
- ```
1496
- **Why it matters:** Without shuffling, your model sees the same batch combinations every epoch, leading to overfitting to batch-specific patterns rather than general patterns.
1497
-
1498
- **2. Batch Size Too Large (Out of Memory)**
1499
- ```python
1500
- # ❌ WRONG - Batch size might exceed GPU memory
1501
- loader = DataLoader(dataset, batch_size=1024) # Might cause OOM!
1502
-
1503
- # ✅ CORRECT - Start small and increase gradually
1504
- loader = DataLoader(dataset, batch_size=32) # Safe starting point
1505
- # Monitor GPU memory, then try 64, 128, etc.
1506
- ```
1507
- **Why it matters:** Batch size directly determines peak memory usage. Too large = crash. Too small = slow training.
1508
-
1509
- **3. Improper Train/Validation Split**
1510
- ```python
1511
- # ❌ WRONG - Validation data leaking into training
1512
- all_data = dataset
1513
- train_loader = DataLoader(all_data, shuffle=True) # No split!
1514
-
1515
- # ✅ CORRECT - Separate train and validation
1516
- train_size = int(0.8 * len(dataset))
1517
- train_data = dataset[:train_size]
1518
- val_data = dataset[train_size:]
1519
- train_loader = DataLoader(train_data, shuffle=True)
1520
- val_loader = DataLoader(val_data, shuffle=False)
1521
- ```
1522
- **Why it matters:** Using the same data for training and validation gives falsely optimistic performance metrics.
1523
-
1524
- **4. Not Handling Uneven Batches**
1525
- ```python
1526
- # Dataset with 1000 samples, batch_size=128
1527
- # Creates: [128, 128, 128, 128, 128, 128, 128, 104] samples per batch
1528
- # Your model must handle variable batch sizes!
1529
-
1530
- # Example: Don't assume batch_size in forward pass
1531
- def forward(self, x):
1532
- batch_size = x.shape[0] # ✅ Get actual batch size
1533
- # Don't hardcode: batch_size = 128 # ❌ Breaks on last batch
1534
- ```
1535
-
1536
- ### 🚀 Best Practices for Production
1537
-
1538
- **1. Batch Size Selection Strategy**
1539
- ```
1540
- Start with: 32 (almost always works)
1541
-
1542
- Monitor GPU memory usage
1543
-
1544
- If memory < 80%: double to 64
1545
- If memory > 90%: keep at 32
1546
-
1547
- Repeat until you find the sweet spot (usually 32-256)
1548
- ```
1549
-
1550
- **2. Data Augmentation Placement**
1551
- - **Option A:** In Dataset's `__getitem__` (random crop, flip, etc.)
1552
- - **Option B:** After DataLoader in training loop (batch-level operations)
1553
- - **Rule:** Image-level augmentation in Dataset, batch-level in loop
1554
-
1555
- **3. Shuffling Strategy**
1556
- - **Training:** Always shuffle (`shuffle=True`)
1557
- - **Validation:** Never shuffle (`shuffle=False`)
1558
- - **Testing:** Never shuffle (`shuffle=False`)
1559
- - **Reason:** Validation/test need reproducible metrics
1560
-
1561
- **4. Memory-Constrained Training**
1562
- ```python
1563
- # Technique: Gradient Accumulation (effective larger batch)
1564
- effective_batch_size = 128
1565
- actual_batch_size = 32
1566
- accumulation_steps = effective_batch_size // actual_batch_size # = 4
1567
-
1568
- loader = DataLoader(dataset, batch_size=32) # Fits in memory
1569
- # In training loop: accumulate 4 batches before optimizer step
1570
- # Result: Same as batch_size=128 but uses less memory!
1571
- ```
1572
-
1573
- These patterns will save you hours of debugging and help you build robust training pipelines!
1574
- """
1575
-
1576
- # %% [markdown]
1577
- """
1578
- ## 🔧 Integration Testing
1579
-
1580
- Let's test how our DataLoader integrates with a complete training workflow, simulating real ML pipeline usage.
1581
- """
1582
-
1583
# %% nbgrader={"grade": false, "grade_id": "integration-test", "solution": true}
def test_training_integration():
    """🔬 Test DataLoader integration with training workflow.

    Builds a synthetic classification dataset, splits it 80/20 into
    train/validation, wraps both in DataLoaders, and runs one simulated
    training epoch checking batch shapes and total sample coverage.
    """
    print("🔬 Integration Test: Training Workflow...")

    # Create a realistic dataset
    num_samples = 1000
    num_features = 20
    num_classes = 5

    # Synthetic classification data
    features = Tensor(np.random.randn(num_samples, num_features))
    labels = Tensor(np.random.randint(0, num_classes, num_samples))

    dataset = TensorDataset(features, labels)

    # Create train/val splits (80% train, remainder validation).
    train_size = int(0.8 * len(dataset))

    # Direct array slicing replaces the previous per-sample unpack/restack
    # (a Python loop over dataset[i] followed by np.stack), which made a
    # redundant copy of every sample. Slicing yields identical values.
    train_dataset = TensorDataset(
        Tensor(features.data[:train_size]),
        Tensor(labels.data[:train_size]),
    )
    val_dataset = TensorDataset(
        Tensor(features.data[train_size:]),
        Tensor(labels.data[train_size:]),
    )

    # Create DataLoaders: shuffle training only; keep validation deterministic.
    batch_size = 32
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

    print(f"📊 Dataset splits:")
    print(f"   Training: {len(train_dataset)} samples, {len(train_loader)} batches")
    print(f"   Validation: {len(val_dataset)} samples, {len(val_loader)} batches")

    # Simulate training loop
    print("\n🏃 Simulated Training Loop:")

    epoch_samples = 0
    batch_count = 0

    for batch_idx, (batch_features, batch_labels) in enumerate(train_loader):
        batch_count += 1
        epoch_samples += len(batch_features.data)

        # Simulate forward pass (just check shapes)
        assert batch_features.data.shape[0] <= batch_size, "Batch size exceeded"
        assert batch_features.data.shape[1] == num_features, "Wrong feature count"
        assert len(batch_labels.data) == len(batch_features.data), "Mismatched batch sizes"

        if batch_idx < 3:  # Show first few batches
            print(f"   Batch {batch_idx + 1}: {batch_features.data.shape[0]} samples")

    print(f"   Total: {batch_count} batches, {epoch_samples} samples processed")

    # Every training sample must be seen exactly once per epoch.
    assert epoch_samples == len(train_dataset), f"Expected {len(train_dataset)}, processed {epoch_samples}"

    print("✅ Training integration works correctly!")

if __name__ == "__main__":
    test_training_integration()
1656
-
1657
- # %% [markdown]
1658
- """
1659
- ## 🧪 Module Integration Test
1660
-
1661
- Final validation that everything works together correctly.
1662
- """
1663
-
1664
# %%
def test_module():
    """🧪 Module Test: Complete Integration

    Comprehensive test of entire module functionality.

    This final test runs before module summary to ensure:
    - All unit tests pass
    - Functions work together correctly
    - Module is ready for integration with TinyTorch
    """
    print("🧪 RUNNING MODULE INTEGRATION TEST")
    print("=" * 50)

    # Stage 1: every unit test must pass in isolation.
    print("Running unit tests...")
    unit_tests = (
        test_unit_dataset,
        test_unit_tensordataset,
        test_unit_dataloader,
        test_unit_dataloader_deterministic,
        test_unit_augmentation,
    )
    for unit_test in unit_tests:
        unit_test()

    print("\nRunning integration scenarios...")

    # Stage 2: full training-loop workflow.
    test_training_integration()

    # Stage 3: augmentation pipeline feeding a DataLoader.
    print("🔬 Integration Test: Augmentation with DataLoader...")

    # Compose a training-style augmentation pipeline.
    train_transforms = Compose([
        RandomHorizontalFlip(0.5),
        RandomCrop(8, padding=2)  # Small images for test
    ])

    # Simulate CIFAR-style images (C, H, W)
    images = np.random.randn(100, 3, 8, 8)
    labels = np.random.randint(0, 10, 100)

    # Augment sample-by-sample, exactly as a Dataset's __getitem__ would.
    augmented_images = np.array([train_transforms(img) for img in images])

    dataset = TensorDataset(Tensor(augmented_images), Tensor(labels))
    loader = DataLoader(dataset, batch_size=16, shuffle=True)

    seen_batches = 0
    for batch_x, batch_y in loader:
        # Spatial shape must survive the augmentation + collation round trip.
        assert batch_x.shape[1:] == (3, 8, 8), f"Augmented batch shape wrong: {batch_x.shape}"
        seen_batches += 1

    assert seen_batches > 0, "DataLoader should produce batches"
    print("✅ Augmentation + DataLoader integration works!")

    print("\n" + "=" * 50)
    print("🎉 ALL TESTS PASSED! Module ready for export.")
    print("Run: tito module complete 08")
1721
-
1722
- # %% [markdown]
1723
- """
1724
- ## 🤔 ML Systems Thinking
1725
-
1726
- Now that you've implemented DataLoader, let's explore the critical systems trade-offs that affect real training pipelines. Understanding these decisions will help you build efficient ML systems in production.
1727
-
1728
- ### Question 1: The Batch Size Dilemma
1729
-
1730
- You're training a ResNet-50 on ImageNet. Your GPU has 16GB memory. Consider these batch size choices:
1731
-
1732
- **Option A: batch_size=256**
1733
- - Peak memory: 14GB (near limit)
1734
- - Training time: 12 hours
1735
- - Final accuracy: 76.2%
1736
-
1737
- **Option B: batch_size=32**
1738
- - Peak memory: 4GB (plenty of headroom)
1739
- - Training time: 18 hours
1740
- - Final accuracy: 75.1%
1741
-
1742
- **Which would you choose and why?** Consider:
1743
- - What happens if Option A occasionally spikes to 17GB during certain layers?
1744
- - How does batch size affect gradient noise and convergence?
1745
- - What's the real cost difference between 12 and 18 hours?
1746
- - Could you use gradient accumulation to get benefits of both?
1747
-
1748
- **Systems insight**: Batch size creates a three-way trade-off between memory usage, training speed, and model convergence. The "right" answer depends on whether you're memory-constrained, time-constrained, or accuracy-constrained.
1749
-
1750
- ### Question 2: To Shuffle or Not to Shuffle?
1751
-
1752
- You're training on a medical dataset where samples are ordered by patient (first 1000 samples = Patient A, next 1000 = Patient B, etc.). Consider these scenarios:
1753
-
1754
- **Scenario 1: Training with shuffle=True**
1755
- ```
1756
- Epoch 1 batches: [Patient B, Patient C, Patient A, Patient D...]
1757
- Epoch 2 batches: [Patient D, Patient A, Patient C, Patient B...]
1758
- ```
1759
-
1760
- **Scenario 2: Training with shuffle=False**
1761
- ```
1762
- Epoch 1 batches: [Patient A, Patient A, Patient A, Patient B...]
1763
- Epoch 2 batches: [Patient A, Patient A, Patient A, Patient B...]
1764
- ```
1765
-
1766
- **What happens in Scenario 2?**
1767
- - The model sees 30+ batches of only Patient A's data first
1768
- - It might overfit to Patient A's specific characteristics
1769
- - Early batches update weights strongly toward Patient A's patterns
1770
- - This order-dependent bias is related to "catastrophic forgetting": early patient-specific updates crowd out general patterns
1771
-
1772
- **Your DataLoader's shuffle prevents this by mixing patients in every batch!**
1773
-
1774
- **Systems insight**: Shuffling isn't just about randomness—it's about ensuring the model sees representative samples in every batch, preventing order-dependent biases.
1775
-
1776
- ### Question 3: Data Loading Bottlenecks
1777
-
1778
- Your training loop reports these timings per batch:
1779
-
1780
- ```
1781
- Data loading: 45ms
1782
- Forward pass: 30ms
1783
- Backward pass: 35ms
1784
- Optimizer step: 10ms
1785
- Total: 120ms
1786
- ```
1787
-
1788
- **Where's the bottleneck?** Data loading takes 37.5% of the time!
1789
-
1790
- **What's causing it?**
1791
- - Disk I/O: Reading images from storage
1792
- - Decompression: JPEG/PNG decoding
1793
- - Augmentation: Random crops, flips, color jitter
1794
- - Collation: Stacking individual samples into batches
1795
-
1796
- **How to fix it:**
1797
-
1798
- **Option 1: Prefetch next batch during computation**
1799
- ```python
1800
- # While GPU computes current batch, CPU loads next batch
1801
- DataLoader(..., num_workers=4) # PyTorch feature
1802
- ```
1803
- Result: Data loading and compute overlap, ~30% speedup
1804
-
1805
- **Option 2: Cache decoded images in memory**
1806
- ```python
1807
- # Decode once, reuse across epochs
1808
- cached_dataset = [decode_image(path) for path in paths]
1809
- ```
1810
- Result: Eliminate repeated decode overhead
1811
-
1812
- **Option 3: Use faster image formats**
1813
- - Replace JPEG (slow decode) with WebP (fast decode)
1814
- - Or pre-convert to NumPy .npy files (fastest)
1815
-
1816
- **In your implementation:** You used TensorDataset with pre-loaded tensors, avoiding I/O entirely! This is why research code often loads MNIST/CIFAR-10 fully into memory.
1817
-
1818
- **Systems insight**: Data loading is often the hidden bottleneck in training. Profile first, optimize second.
1819
-
1820
- ### Question 4: Memory Explosion with Large Datasets
1821
-
1822
- You're training on 100GB of high-resolution medical scans. Your DataLoader code:
1823
-
1824
- ```python
1825
- # ❌ This tries to load ALL data into memory!
1826
- all_images = Tensor(np.load('100gb_scans.npy'))
1827
- dataset = TensorDataset(all_images, labels)
1828
- loader = DataLoader(dataset, batch_size=32)
1829
- ```
1830
-
1831
- **Problem:** This crashes immediately (OOM) because you're loading 100GB into RAM before training even starts!
1832
-
1833
- **Solution: Lazy Loading Dataset**
1834
- ```python
1835
- class LazyImageDataset(Dataset):
1836
- def __init__(self, image_paths, labels):
1837
- self.image_paths = image_paths # Just store paths (tiny memory)
1838
- self.labels = labels
1839
-
1840
- def __len__(self):
1841
- return len(self.image_paths)
1842
-
1843
- def __getitem__(self, idx):
1844
- # Load image ONLY when requested (lazy)
1845
- image = load_image(self.image_paths[idx])
1846
- return Tensor(image), Tensor(self.labels[idx])
1847
-
1848
- # Memory usage: Only 32 images × batch_size at a time!
1849
- dataset = LazyImageDataset(paths, labels)
1850
- loader = DataLoader(dataset, batch_size=32)
1851
- ```
1852
-
1853
- **Memory comparison:**
1854
- - TensorDataset: 100GB (all data loaded upfront)
1855
- - LazyImageDataset: ~500MB (only current batch + buffer)
1856
-
1857
- **Your TensorDataset is perfect for small datasets (MNIST, CIFAR) but won't scale to ImageNet!**
1858
-
1859
- **Systems insight**: For large datasets, load data on-demand rather than upfront. Your DataLoader's `__getitem__` is called only when needed, enabling lazy loading patterns.
1860
-
1861
- ### Question 5: The Shuffle Memory Trap
1862
-
1863
- You implement shuffling like this:
1864
-
1865
- ```python
1866
- def __iter__(self):
1867
- # ❌ This loads ALL data into memory for shuffling!
1868
- all_samples = [self.dataset[i] for i in range(len(self.dataset))]
1869
- random.shuffle(all_samples)
1870
-
1871
- for i in range(0, len(all_samples), self.batch_size):
1872
- yield self._collate_batch(all_samples[i:i + self.batch_size])
1873
- ```
1874
-
1875
- **For a 50GB dataset, this requires 50GB RAM just to shuffle!**
1876
-
1877
- **Your implementation is smarter:**
1878
- ```python
1879
- def __iter__(self):
1880
- # ✅ Only shuffle INDICES (tiny memory footprint)
1881
- indices = list(range(len(self.dataset))) # Just integers!
1882
- random.shuffle(indices) # Shuffles integers, not data
1883
-
1884
- for i in range(0, len(indices), self.batch_size):
1885
- batch_indices = indices[i:i + self.batch_size]
1886
- batch = [self.dataset[idx] for idx in batch_indices] # Load only batch
1887
- yield self._collate_batch(batch)
1888
- ```
1889
-
1890
- **Memory usage:**
1891
- - Bad shuffle: 50GB (all samples in memory)
1892
- - Your shuffle: ~400MB (50M indices × 8 bytes each) — a tiny fraction of the 50GB of data
1893
-
1894
- **Why this matters:** You can shuffle 100 million samples using just 800MB of RAM!
1895
-
1896
- **Systems insight**: Shuffle indices, not data. This is a classic systems pattern—operate on lightweight proxies (indices) rather than expensive objects (actual data).
1897
-
1898
- ### The Big Picture: Data Pipeline Design Patterns
1899
-
1900
- Your DataLoader implements three fundamental patterns:
1901
-
1902
- **1. Iterator Protocol** (memory efficiency)
1903
- ```python
1904
- for batch in loader: # Loads one batch at a time, not all batches
1905
- train_step(batch) # Previous batch memory is freed
1906
- ```
1907
-
1908
- **2. Lazy Evaluation** (on-demand computation)
1909
- ```python
1910
- dataset[42] # Computed only when requested, not upfront
1911
- ```
1912
-
1913
- **3. Separation of Concerns** (modularity)
1914
- ```python
1915
- Dataset: HOW to access individual samples
1916
- DataLoader: HOW to group samples into batches
1917
- Training: WHAT to do with batches
1918
- ```
1919
-
1920
- These patterns are why PyTorch's DataLoader scales from 1,000 samples (your laptop) to 1 billion samples (Google's TPU pods) using the same API!
1921
- """
1922
-
1923
# %%
def demo_dataloader():
    """🎯 See your DataLoader batch data correctly."""
    print("🎯 AHA MOMENT: DataLoader Batches Your Data")
    print("=" * 45)

    # Build a small synthetic dataset: 100 samples of 64 features each.
    inputs = Tensor(np.random.randn(100, 64))
    targets = Tensor(np.arange(100))
    dataset = TensorDataset(inputs, targets)

    # Wrap it in a DataLoader that shuffles and batches.
    loader = DataLoader(dataset, batch_size=32, shuffle=True)

    print(f"Dataset: {len(dataset)} samples")
    print(f"Batch size: 32")
    print(f"Number of batches: {len(loader)}")

    # Walk one epoch and report each batch's actual size
    # (the last batch may be smaller than 32).
    print("\nBatches:")
    for idx, (batch_x, batch_y) in enumerate(loader):
        print(f"  Batch {idx+1}: {batch_x.shape[0]} samples, shape {batch_x.shape}")

    print("\n✨ Your DataLoader organizes data for efficient training!")
1946
-
1947
# %%
# Script entry point: run the full module test, then the demo.
if __name__ == "__main__":
    test_module()
    print("\n")
    demo_dataloader()
1952
-
1953
- # %% [markdown]
1954
- """
1955
- ## 🚀 MODULE SUMMARY: DataLoader
1956
-
1957
- Congratulations! You've built a complete data loading pipeline for ML training!
1958
-
1959
- ### Key Accomplishments
1960
- - Built Dataset abstraction and TensorDataset implementation with proper tensor alignment
1961
- - Created DataLoader with batching, shuffling, and memory-efficient iteration
1962
- - Analyzed data pipeline performance and discovered memory/speed trade-offs
1963
- - Learned how to apply DataLoader to real datasets (see examples/milestones)
1964
- - All tests pass ✅ (validated by `test_module()`)
1965
-
1966
- ### Systems Insights Discovered
1967
- - **Batch size directly impacts memory usage and training throughput**
1968
- - **Shuffling adds minimal overhead but prevents overfitting patterns**
1969
- - **Data loading can become a bottleneck without proper optimization**
1970
- - **Memory usage scales linearly with batch size and feature dimensions**
1971
-
1972
- ### Ready for Next Steps
1973
- Your DataLoader implementation enables efficient training of CNNs and larger models with proper data pipeline management.
1974
- Export with: `tito export 05_dataloader`
1975
-
1976
- **Apply your knowledge:**
1977
- - Milestone 03: Train MLP on real MNIST digits
1978
- - Milestone 04: Train CNN on CIFAR-10 images
1979
-
1980
- **Then continue with:** Module 09 (Convolutions) for Conv2d layers!
1981
-
1982
- ### Real-World Connection
1983
- You've implemented the same patterns used in:
1984
- - **PyTorch's DataLoader**: Same interface design for batching and shuffling
1985
- - **TensorFlow's Dataset API**: Similar abstraction for data pipeline optimization
1986
- - **Production ML**: Essential for handling large-scale training efficiently
1987
- - **Research**: Standard foundation for all deep learning experiments
1988
-
1989
- Your data loading pipeline is now ready to power the CNN training in Module 09!
1990
- """