faizan committed on
Commit
e77a25a
·
1 Parent(s): 1c6a6f3

fix: resolve all 468 ruff linting errors (code quality enforcement complete)

Browse files

- Added ruff.toml config with E402 ignore for sys.path test pattern
- Fixed 3 E501 line length violations manually
- Auto-fixed 468 W293/W291 whitespace errors
- Codebase now passes all linting checks (0 errors)
- Ready for deployment (Phase 3)

app.py CHANGED
@@ -19,21 +19,21 @@ print(f"Model loaded on {classifier.device}")
19
  def predict_digit(image):
20
  """
21
  Predict digit from user-drawn image.
22
-
23
  Args:
24
  image: numpy array from Gradio Sketchpad (H, W, 3) or (H, W)
25
-
26
  Returns:
27
  Tuple of (predicted_digit, confidence_text, probability_dict)
28
  """
29
  if image is None:
30
  return "Please draw a digit", "", {}
31
-
32
  # Handle different image formats from Gradio
33
  if isinstance(image, dict):
34
  # Sketchpad returns dict with 'composite' key
35
  image = image.get('composite', image)
36
-
37
  # Convert to PIL Image
38
  if isinstance(image, np.ndarray):
39
  # If RGB, convert to grayscale
@@ -43,29 +43,29 @@ def predict_digit(image):
43
  image = image[:, :, 3] # Use alpha channel
44
  else: # RGB
45
  image = np.mean(image, axis=2).astype(np.uint8)
46
-
47
  # Ensure values are in [0, 255]
48
  if image.max() <= 1.0:
49
  image = (image * 255).astype(np.uint8)
50
-
51
  pil_image = Image.fromarray(image.astype(np.uint8), mode='L')
52
  else:
53
  pil_image = image
54
-
55
  # Get prediction
56
  result = classifier.predict(pil_image)
57
-
58
  # Format output
59
  digit = result['digit']
60
  confidence = result['confidence']
61
  probabilities = result['probabilities']
62
-
63
  # Create confidence text
64
  confidence_text = f"Confidence: {confidence*100:.1f}%"
65
-
66
  # Create probability dictionary for bar chart
67
  prob_dict = {str(i): prob for i, prob in enumerate(probabilities)}
68
-
69
  return digit, confidence_text, prob_dict
70
 
71
 
@@ -92,15 +92,15 @@ with gr.Blocks(css=custom_css, title="MNIST Digit Classifier") as demo:
92
  gr.Markdown(
93
  """
94
  # 🔢 Handwritten Digit Classifier
95
-
96
  Draw a digit (0-9) in the box below and the AI will predict
97
  what it is!
98
-
99
  This model uses a Convolutional Neural Network (CNN) trained on
100
  the MNIST dataset with **99.17% accuracy** on 10,000 test images.
101
  """
102
  )
103
-
104
  with gr.Row():
105
  with gr.Column(scale=1):
106
  # Sketchpad for drawing
@@ -116,12 +116,16 @@ with gr.Blocks(css=custom_css, title="MNIST Digit Classifier") as demo:
116
  height=280,
117
  width=280
118
  )
119
-
120
  # Buttons
121
  with gr.Row():
122
- predict_btn = gr.Button("🔍 Predict", variant="primary", scale=2)
123
- clear_btn = gr.ClearButton(components=[input_image], value="🗑️ Clear", scale=1)
124
-
 
 
 
 
125
  with gr.Column(scale=1):
126
  # Prediction output
127
  output_digit = gr.Textbox(
@@ -132,7 +136,7 @@ with gr.Blocks(css=custom_css, title="MNIST Digit Classifier") as demo:
132
  max_lines=1,
133
  interactive=False
134
  )
135
-
136
  output_confidence = gr.Textbox(
137
  label="Confidence",
138
  placeholder="",
@@ -141,13 +145,13 @@ with gr.Blocks(css=custom_css, title="MNIST Digit Classifier") as demo:
141
  max_lines=1,
142
  interactive=False
143
  )
144
-
145
  # Probability distribution
146
  output_probs = gr.Label(
147
  label="Probability Distribution",
148
  num_top_classes=10
149
  )
150
-
151
  # Example images section
152
  gr.Markdown("### 📝 Try these examples:")
153
  gr.Examples(
@@ -157,7 +161,7 @@ with gr.Blocks(css=custom_css, title="MNIST Digit Classifier") as demo:
157
  inputs=input_image,
158
  label="Example digits"
159
  )
160
-
161
  # Model info
162
  gr.Markdown(
163
  """
@@ -168,7 +172,7 @@ with gr.Blocks(css=custom_css, title="MNIST Digit Classifier") as demo:
168
  - **Training**: MNIST dataset (60,000 images)
169
  - **Test Accuracy**: 99.17%
170
  - **Framework**: PyTorch 2.0.1
171
-
172
  ### 💡 Tips for best results:
173
  - Draw the digit large and centered
174
  - Use a thick brush stroke
@@ -176,14 +180,14 @@ with gr.Blocks(css=custom_css, title="MNIST Digit Classifier") as demo:
176
  - Make sure the digit is clear and recognizable
177
  """
178
  )
179
-
180
  # Connect events
181
  predict_btn.click(
182
  fn=predict_digit,
183
  inputs=input_image,
184
  outputs=[output_digit, output_confidence, output_probs]
185
  )
186
-
187
  # Also predict on sketchpad change (real-time prediction)
188
  input_image.change(
189
  fn=predict_digit,
 
19
  def predict_digit(image):
20
  """
21
  Predict digit from user-drawn image.
22
+
23
  Args:
24
  image: numpy array from Gradio Sketchpad (H, W, 3) or (H, W)
25
+
26
  Returns:
27
  Tuple of (predicted_digit, confidence_text, probability_dict)
28
  """
29
  if image is None:
30
  return "Please draw a digit", "", {}
31
+
32
  # Handle different image formats from Gradio
33
  if isinstance(image, dict):
34
  # Sketchpad returns dict with 'composite' key
35
  image = image.get('composite', image)
36
+
37
  # Convert to PIL Image
38
  if isinstance(image, np.ndarray):
39
  # If RGB, convert to grayscale
 
43
  image = image[:, :, 3] # Use alpha channel
44
  else: # RGB
45
  image = np.mean(image, axis=2).astype(np.uint8)
46
+
47
  # Ensure values are in [0, 255]
48
  if image.max() <= 1.0:
49
  image = (image * 255).astype(np.uint8)
50
+
51
  pil_image = Image.fromarray(image.astype(np.uint8), mode='L')
52
  else:
53
  pil_image = image
54
+
55
  # Get prediction
56
  result = classifier.predict(pil_image)
57
+
58
  # Format output
59
  digit = result['digit']
60
  confidence = result['confidence']
61
  probabilities = result['probabilities']
62
+
63
  # Create confidence text
64
  confidence_text = f"Confidence: {confidence*100:.1f}%"
65
+
66
  # Create probability dictionary for bar chart
67
  prob_dict = {str(i): prob for i, prob in enumerate(probabilities)}
68
+
69
  return digit, confidence_text, prob_dict
70
 
71
 
 
92
  gr.Markdown(
93
  """
94
  # 🔢 Handwritten Digit Classifier
95
+
96
  Draw a digit (0-9) in the box below and the AI will predict
97
  what it is!
98
+
99
  This model uses a Convolutional Neural Network (CNN) trained on
100
  the MNIST dataset with **99.17% accuracy** on 10,000 test images.
101
  """
102
  )
103
+
104
  with gr.Row():
105
  with gr.Column(scale=1):
106
  # Sketchpad for drawing
 
116
  height=280,
117
  width=280
118
  )
119
+
120
  # Buttons
121
  with gr.Row():
122
+ predict_btn = gr.Button(
123
+ "🔍 Predict", variant="primary", scale=2
124
+ )
125
+ clear_btn = gr.ClearButton(
126
+ components=[input_image], value="🗑️ Clear", scale=1
127
+ )
128
+
129
  with gr.Column(scale=1):
130
  # Prediction output
131
  output_digit = gr.Textbox(
 
136
  max_lines=1,
137
  interactive=False
138
  )
139
+
140
  output_confidence = gr.Textbox(
141
  label="Confidence",
142
  placeholder="",
 
145
  max_lines=1,
146
  interactive=False
147
  )
148
+
149
  # Probability distribution
150
  output_probs = gr.Label(
151
  label="Probability Distribution",
152
  num_top_classes=10
153
  )
154
+
155
  # Example images section
156
  gr.Markdown("### 📝 Try these examples:")
157
  gr.Examples(
 
161
  inputs=input_image,
162
  label="Example digits"
163
  )
164
+
165
  # Model info
166
  gr.Markdown(
167
  """
 
172
  - **Training**: MNIST dataset (60,000 images)
173
  - **Test Accuracy**: 99.17%
174
  - **Framework**: PyTorch 2.0.1
175
+
176
  ### 💡 Tips for best results:
177
  - Draw the digit large and centered
178
  - Use a thick brush stroke
 
180
  - Make sure the digit is clear and recognizable
181
  """
182
  )
183
+
184
  # Connect events
185
  predict_btn.click(
186
  fn=predict_digit,
187
  inputs=input_image,
188
  outputs=[output_digit, output_confidence, output_probs]
189
  )
190
+
191
  # Also predict on sketchpad change (real-time prediction)
192
  input_image.change(
193
  fn=predict_digit,
ruff.toml ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Ruff configuration for MNIST project
2
+
3
+ select = ["E", "F", "W"]
4
+ ignore = []
5
+
6
+ # Line length
7
+ line-length = 88
8
+
9
+ [per-file-ignores]
10
+ # Ignore E402 (module level import not at top) for scripts that modify sys.path
11
+ "scripts/test_*.py" = ["E402"]
12
+ "scripts/train_*.py" = ["E402"]
13
+ "scripts/inference.py" = ["E402"]
scripts/augmentation.py CHANGED
@@ -11,10 +11,10 @@ These augmentations are applied on-the-fly during training for infinite variatio
11
  Usage:
12
  from scripts.augmentation import get_train_augmentation, get_val_augmentation
13
  from scripts.preprocessing import MnistDataset
14
-
15
  # Training with augmentation
16
  train_dataset = MnistDataset(x_train, y_train, transform=get_train_augmentation())
17
-
18
  # Validation/test without augmentation
19
  val_dataset = MnistDataset(x_val, y_val, transform=get_val_augmentation())
20
  """
@@ -26,14 +26,14 @@ import torch
26
  def get_train_augmentation():
27
  """
28
  Get augmentation pipeline for training data.
29
-
30
  Applies realistic transformations that preserve digit readability:
31
  - Rotation: ±15° (typical handwriting angle variation)
32
  - Translation: ±10% (off-center digits)
33
  - Scaling: 90-110% (size variation)
34
-
35
  Note: Normalization happens in MnistDataset, not here.
36
-
37
  Returns:
38
  torchvision.transforms.Compose: Composition of augmentation transforms
39
  """
@@ -44,7 +44,7 @@ def get_train_augmentation():
44
  interpolation=transforms.InterpolationMode.BILINEAR,
45
  fill=0 # Fill with black (background)
46
  ),
47
-
48
  # Random translation and scaling (no additional rotation)
49
  transforms.RandomAffine(
50
  degrees=0, # No rotation here (already done above)
@@ -59,10 +59,10 @@ def get_train_augmentation():
59
  def get_val_augmentation():
60
  """
61
  Get augmentation pipeline for validation/test data.
62
-
63
  No augmentation is applied - returns identity transform.
64
  This ensures fair evaluation on original unmodified data.
65
-
66
  Returns:
67
  None (no transforms)
68
  """
@@ -72,12 +72,12 @@ def get_val_augmentation():
72
  def get_mild_augmentation():
73
  """
74
  Get milder augmentation pipeline (conservative settings).
75
-
76
  Use this if standard augmentation is too aggressive:
77
  - Rotation: ±10° (reduced from ±15°)
78
  - Translation: ±5% (reduced from ±10%)
79
  - Scaling: 95-105% (reduced from 90-110%)
80
-
81
  Returns:
82
  torchvision.transforms.Compose: Mild augmentation transforms
83
  """
@@ -100,13 +100,13 @@ def get_mild_augmentation():
100
  def get_aggressive_augmentation():
101
  """
102
  Get aggressive augmentation pipeline (stronger settings).
103
-
104
  Use with caution - may distort digits beyond recognition:
105
  - Rotation: ±20°
106
  - Translation: ±15%
107
  - Scaling: 80-120%
108
  - Elastic deformation (optional, commented out)
109
-
110
  Returns:
111
  torchvision.transforms.Compose: Aggressive augmentation transforms
112
  """
@@ -131,14 +131,14 @@ def get_aggressive_augmentation():
131
  def visualize_augmentations(image: torch.Tensor, transform, num_samples: int = 9):
132
  """
133
  Apply augmentation multiple times to visualize variations.
134
-
135
  Useful for debugging and understanding augmentation effects.
136
-
137
  Args:
138
  image: Single image tensor (1, 28, 28)
139
  transform: Augmentation transform to apply
140
  num_samples: Number of augmented versions to generate
141
-
142
  Returns:
143
  list: List of augmented image tensors
144
  """
@@ -149,7 +149,7 @@ def visualize_augmentations(image: torch.Tensor, transform, num_samples: int = 9
149
  else:
150
  aug_img = image
151
  augmented_images.append(aug_img)
152
-
153
  return augmented_images
154
 
155
 
@@ -165,10 +165,10 @@ AUGMENTATION_PRESETS = {
165
  def get_augmentation_by_name(preset_name: str = 'standard'):
166
  """
167
  Get augmentation pipeline by preset name.
168
-
169
  Args:
170
  preset_name: One of ['none', 'mild', 'standard', 'aggressive']
171
-
172
  Returns:
173
  Augmentation transform or None
174
  """
@@ -177,6 +177,6 @@ def get_augmentation_by_name(preset_name: str = 'standard'):
177
  f"Unknown preset '{preset_name}'. "
178
  f"Choose from: {list(AUGMENTATION_PRESETS.keys())}"
179
  )
180
-
181
  preset = AUGMENTATION_PRESETS[preset_name]
182
  return preset() if callable(preset) else preset
 
11
  Usage:
12
  from scripts.augmentation import get_train_augmentation, get_val_augmentation
13
  from scripts.preprocessing import MnistDataset
14
+
15
  # Training with augmentation
16
  train_dataset = MnistDataset(x_train, y_train, transform=get_train_augmentation())
17
+
18
  # Validation/test without augmentation
19
  val_dataset = MnistDataset(x_val, y_val, transform=get_val_augmentation())
20
  """
 
26
  def get_train_augmentation():
27
  """
28
  Get augmentation pipeline for training data.
29
+
30
  Applies realistic transformations that preserve digit readability:
31
  - Rotation: ±15° (typical handwriting angle variation)
32
  - Translation: ±10% (off-center digits)
33
  - Scaling: 90-110% (size variation)
34
+
35
  Note: Normalization happens in MnistDataset, not here.
36
+
37
  Returns:
38
  torchvision.transforms.Compose: Composition of augmentation transforms
39
  """
 
44
  interpolation=transforms.InterpolationMode.BILINEAR,
45
  fill=0 # Fill with black (background)
46
  ),
47
+
48
  # Random translation and scaling (no additional rotation)
49
  transforms.RandomAffine(
50
  degrees=0, # No rotation here (already done above)
 
59
  def get_val_augmentation():
60
  """
61
  Get augmentation pipeline for validation/test data.
62
+
63
  No augmentation is applied - returns identity transform.
64
  This ensures fair evaluation on original unmodified data.
65
+
66
  Returns:
67
  None (no transforms)
68
  """
 
72
  def get_mild_augmentation():
73
  """
74
  Get milder augmentation pipeline (conservative settings).
75
+
76
  Use this if standard augmentation is too aggressive:
77
  - Rotation: ±10° (reduced from ±15°)
78
  - Translation: ±5% (reduced from ±10%)
79
  - Scaling: 95-105% (reduced from 90-110%)
80
+
81
  Returns:
82
  torchvision.transforms.Compose: Mild augmentation transforms
83
  """
 
100
  def get_aggressive_augmentation():
101
  """
102
  Get aggressive augmentation pipeline (stronger settings).
103
+
104
  Use with caution - may distort digits beyond recognition:
105
  - Rotation: ±20°
106
  - Translation: ±15%
107
  - Scaling: 80-120%
108
  - Elastic deformation (optional, commented out)
109
+
110
  Returns:
111
  torchvision.transforms.Compose: Aggressive augmentation transforms
112
  """
 
131
  def visualize_augmentations(image: torch.Tensor, transform, num_samples: int = 9):
132
  """
133
  Apply augmentation multiple times to visualize variations.
134
+
135
  Useful for debugging and understanding augmentation effects.
136
+
137
  Args:
138
  image: Single image tensor (1, 28, 28)
139
  transform: Augmentation transform to apply
140
  num_samples: Number of augmented versions to generate
141
+
142
  Returns:
143
  list: List of augmented image tensors
144
  """
 
149
  else:
150
  aug_img = image
151
  augmented_images.append(aug_img)
152
+
153
  return augmented_images
154
 
155
 
 
165
  def get_augmentation_by_name(preset_name: str = 'standard'):
166
  """
167
  Get augmentation pipeline by preset name.
168
+
169
  Args:
170
  preset_name: One of ['none', 'mild', 'standard', 'aggressive']
171
+
172
  Returns:
173
  Augmentation transform or None
174
  """
 
177
  f"Unknown preset '{preset_name}'. "
178
  f"Choose from: {list(AUGMENTATION_PRESETS.keys())}"
179
  )
180
+
181
  preset = AUGMENTATION_PRESETS[preset_name]
182
  return preset() if callable(preset) else preset
scripts/data_loader.py CHANGED
@@ -15,17 +15,17 @@ from numpy.typing import NDArray
15
  class MnistDataloader:
16
  """
17
  Load MNIST handwritten digit dataset from IDX binary files.
18
-
19
  The MNIST dataset uses a custom IDX binary format with magic numbers
20
  to identify image (2051) and label (2049) files.
21
-
22
  Attributes:
23
  training_images_filepath: Path to training images IDX file
24
  training_labels_filepath: Path to training labels IDX file
25
  test_images_filepath: Path to test images IDX file
26
  test_labels_filepath: Path to test labels IDX file
27
  """
28
-
29
  def __init__(
30
  self,
31
  training_images_filepath: str,
@@ -35,13 +35,13 @@ class MnistDataloader:
35
  ) -> None:
36
  """
37
  Initialize MNIST data loader with file paths.
38
-
39
  Args:
40
  training_images_filepath: Path to training images (.idx3-ubyte)
41
  training_labels_filepath: Path to training labels (.idx1-ubyte)
42
  test_images_filepath: Path to test images (.idx3-ubyte)
43
  test_labels_filepath: Path to test labels (.idx1-ubyte)
44
-
45
  Raises:
46
  FileNotFoundError: If any of the specified files don't exist
47
  """
@@ -49,7 +49,7 @@ class MnistDataloader:
49
  self.training_labels_filepath = training_labels_filepath
50
  self.test_images_filepath = test_images_filepath
51
  self.test_labels_filepath = test_labels_filepath
52
-
53
  # Verify files exist
54
  for filepath in [
55
  training_images_filepath,
@@ -59,7 +59,7 @@ class MnistDataloader:
59
  ]:
60
  if not Path(filepath).exists():
61
  raise FileNotFoundError(f"MNIST data file not found: {filepath}")
62
-
63
  def read_images_labels(
64
  self,
65
  images_filepath: str,
@@ -67,16 +67,16 @@ class MnistDataloader:
67
  ) -> Tuple[List[NDArray[np.uint8]], List[int]]:
68
  """
69
  Read images and labels from IDX binary files.
70
-
71
  Args:
72
  images_filepath: Path to images IDX file
73
  labels_filepath: Path to labels IDX file
74
-
75
  Returns:
76
  Tuple of (images, labels) where:
77
  - images: List of 28x28 numpy arrays (uint8)
78
  - labels: List of integer labels (0-9)
79
-
80
  Raises:
81
  ValueError: If magic numbers don't match expected values
82
  """
@@ -90,7 +90,7 @@ class MnistDataloader:
90
  f'Expected 2049, got {magic}'
91
  )
92
  labels = array("B", file.read())
93
-
94
  # Read images
95
  with open(images_filepath, 'rb') as file:
96
  magic, size, rows, cols = struct.unpack(">IIII", file.read(16))
@@ -100,12 +100,12 @@ class MnistDataloader:
100
  f'Expected 2051, got {magic}'
101
  )
102
  image_data = array("B", file.read())
103
-
104
  # Convert to list of 28x28 arrays
105
  images = []
106
  for i in range(size):
107
  images.append([0] * rows * cols)
108
-
109
  for i in range(size):
110
  img = np.array(
111
  image_data[i * rows * cols:(i + 1) * rows * cols],
@@ -113,23 +113,23 @@ class MnistDataloader:
113
  )
114
  img = img.reshape(rows, cols)
115
  images[i][:] = img
116
-
117
  return images, list(labels)
118
-
119
  def load_data(self) -> Tuple[
120
  Tuple[List[NDArray[np.uint8]], List[int]],
121
  Tuple[List[NDArray[np.uint8]], List[int]]
122
  ]:
123
  """
124
  Load complete MNIST dataset (training and test sets).
125
-
126
  Returns:
127
  Tuple of ((x_train, y_train), (x_test, y_test)) where:
128
  - x_train: 60,000 training images (28x28 uint8 arrays)
129
  - y_train: 60,000 training labels (0-9)
130
  - x_test: 10,000 test images (28x28 uint8 arrays)
131
  - y_test: 10,000 test labels (0-9)
132
-
133
  Example:
134
  >>> loader = MnistDataloader(
135
  ... 'data/raw/train-images.idx3-ubyte',
 
15
  class MnistDataloader:
16
  """
17
  Load MNIST handwritten digit dataset from IDX binary files.
18
+
19
  The MNIST dataset uses a custom IDX binary format with magic numbers
20
  to identify image (2051) and label (2049) files.
21
+
22
  Attributes:
23
  training_images_filepath: Path to training images IDX file
24
  training_labels_filepath: Path to training labels IDX file
25
  test_images_filepath: Path to test images IDX file
26
  test_labels_filepath: Path to test labels IDX file
27
  """
28
+
29
  def __init__(
30
  self,
31
  training_images_filepath: str,
 
35
  ) -> None:
36
  """
37
  Initialize MNIST data loader with file paths.
38
+
39
  Args:
40
  training_images_filepath: Path to training images (.idx3-ubyte)
41
  training_labels_filepath: Path to training labels (.idx1-ubyte)
42
  test_images_filepath: Path to test images (.idx3-ubyte)
43
  test_labels_filepath: Path to test labels (.idx1-ubyte)
44
+
45
  Raises:
46
  FileNotFoundError: If any of the specified files don't exist
47
  """
 
49
  self.training_labels_filepath = training_labels_filepath
50
  self.test_images_filepath = test_images_filepath
51
  self.test_labels_filepath = test_labels_filepath
52
+
53
  # Verify files exist
54
  for filepath in [
55
  training_images_filepath,
 
59
  ]:
60
  if not Path(filepath).exists():
61
  raise FileNotFoundError(f"MNIST data file not found: {filepath}")
62
+
63
  def read_images_labels(
64
  self,
65
  images_filepath: str,
 
67
  ) -> Tuple[List[NDArray[np.uint8]], List[int]]:
68
  """
69
  Read images and labels from IDX binary files.
70
+
71
  Args:
72
  images_filepath: Path to images IDX file
73
  labels_filepath: Path to labels IDX file
74
+
75
  Returns:
76
  Tuple of (images, labels) where:
77
  - images: List of 28x28 numpy arrays (uint8)
78
  - labels: List of integer labels (0-9)
79
+
80
  Raises:
81
  ValueError: If magic numbers don't match expected values
82
  """
 
90
  f'Expected 2049, got {magic}'
91
  )
92
  labels = array("B", file.read())
93
+
94
  # Read images
95
  with open(images_filepath, 'rb') as file:
96
  magic, size, rows, cols = struct.unpack(">IIII", file.read(16))
 
100
  f'Expected 2051, got {magic}'
101
  )
102
  image_data = array("B", file.read())
103
+
104
  # Convert to list of 28x28 arrays
105
  images = []
106
  for i in range(size):
107
  images.append([0] * rows * cols)
108
+
109
  for i in range(size):
110
  img = np.array(
111
  image_data[i * rows * cols:(i + 1) * rows * cols],
 
113
  )
114
  img = img.reshape(rows, cols)
115
  images[i][:] = img
116
+
117
  return images, list(labels)
118
+
119
  def load_data(self) -> Tuple[
120
  Tuple[List[NDArray[np.uint8]], List[int]],
121
  Tuple[List[NDArray[np.uint8]], List[int]]
122
  ]:
123
  """
124
  Load complete MNIST dataset (training and test sets).
125
+
126
  Returns:
127
  Tuple of ((x_train, y_train), (x_test, y_test)) where:
128
  - x_train: 60,000 training images (28x28 uint8 arrays)
129
  - y_train: 60,000 training labels (0-9)
130
  - x_test: 10,000 test images (28x28 uint8 arrays)
131
  - y_test: 10,000 test labels (0-9)
132
+
133
  Example:
134
  >>> loader = MnistDataloader(
135
  ... 'data/raw/train-images.idx3-ubyte',
scripts/data_quality.py CHANGED
@@ -10,9 +10,9 @@ This module provides functions to systematically check MNIST dataset quality:
10
 
11
  Usage:
12
  from scripts.data_quality import generate_quality_report
13
-
14
  report = generate_quality_report(
15
- (x_train, y_train),
16
  (x_test, y_test)
17
  )
18
  """
@@ -24,16 +24,16 @@ from collections import Counter
24
 
25
 
26
  def check_missing_values(
27
- images: List[NDArray[np.uint8]],
28
  labels: List[int]
29
  ) -> Dict[str, Any]:
30
  """
31
  Check for NaN or missing values in images and labels.
32
-
33
  Args:
34
  images: List of image arrays (each 28x28)
35
  labels: List of integer labels (0-9)
36
-
37
  Returns:
38
  dict: Contains 'has_missing_values', 'missing_count', 'details'
39
  """
@@ -43,12 +43,12 @@ def check_missing_values(
43
  img_array = np.array(img)
44
  if np.isnan(img_array).any():
45
  images_with_nan.append(idx)
46
-
47
  # Check labels for None
48
  labels_with_none = [idx for idx, label in enumerate(labels) if label is None]
49
-
50
  has_missing = len(images_with_nan) > 0 or len(labels_with_none) > 0
51
-
52
  return {
53
  'has_missing_values': has_missing,
54
  'missing_count': len(images_with_nan) + len(labels_with_none),
@@ -66,25 +66,25 @@ def check_missing_values(
66
  def check_outliers(images: List[NDArray[np.uint8]]) -> Dict[str, Any]:
67
  """
68
  Identify pixels outside valid range [0, 255] for uint8 images.
69
-
70
  Args:
71
  images: List of image arrays (each 28x28)
72
-
73
  Returns:
74
  dict: Contains 'has_outliers', 'outlier_count', 'pixel_range', 'details'
75
  """
76
  outlier_images = []
77
  pixel_min = 255
78
  pixel_max = 0
79
-
80
  for idx, img in enumerate(images):
81
  img_array = np.array(img)
82
  img_min = img_array.min()
83
  img_max = img_array.max()
84
-
85
  pixel_min = min(pixel_min, img_min)
86
  pixel_max = max(pixel_max, img_max)
87
-
88
  # Check for values outside [0, 255]
89
  if img_min < 0 or img_max > 255:
90
  outlier_images.append({
@@ -92,7 +92,7 @@ def check_outliers(images: List[NDArray[np.uint8]]) -> Dict[str, Any]:
92
  'min': int(img_min),
93
  'max': int(img_max)
94
  })
95
-
96
  return {
97
  'has_outliers': len(outlier_images) > 0,
98
  'outlier_count': len(outlier_images),
@@ -109,44 +109,44 @@ def check_outliers(images: List[NDArray[np.uint8]]) -> Dict[str, Any]:
109
  def check_class_balance(labels: List[int]) -> Dict[str, Any]:
110
  """
111
  Compute samples per class and calculate imbalance ratio.
112
-
113
  Imbalance ratio = max_count / min_count
114
  A ratio < 1.2 indicates good balance (< 20% difference)
115
-
116
  Args:
117
  labels: List of integer labels (0-9)
118
-
119
  Returns:
120
  dict: Contains 'is_balanced', 'imbalance_ratio', 'class_counts', 'details'
121
  """
122
  class_counts = Counter(labels)
123
-
124
  # Ensure all 10 digits present
125
  for digit in range(10):
126
  if digit not in class_counts:
127
  class_counts[digit] = 0
128
-
129
  counts = list(class_counts.values())
130
  max_count = max(counts)
131
  min_count = min(counts) if min(counts) > 0 else 1 # Avoid division by zero
132
-
133
  imbalance_ratio = max_count / min_count
134
  is_balanced = imbalance_ratio < 1.2 # Less than 20% difference
135
-
136
  # Per-class percentages
137
  total = len(labels)
138
  class_percentages = {
139
- digit: (count / total) * 100
140
  for digit, count in class_counts.items()
141
  }
142
-
143
  return {
144
  'is_balanced': is_balanced,
145
  'imbalance_ratio': round(imbalance_ratio, 3),
146
  'threshold': 1.2,
147
  'class_counts': dict(sorted(class_counts.items())),
148
  'class_percentages': {
149
- k: round(v, 2)
150
  for k, v in sorted(class_percentages.items())
151
  },
152
  'details': {
@@ -162,16 +162,16 @@ def check_class_balance(labels: List[int]) -> Dict[str, Any]:
162
  def check_image_dimensions(images: List[NDArray[np.uint8]]) -> Dict[str, Any]:
163
  """
164
  Verify all images are 28x28 as expected for MNIST.
165
-
166
  Args:
167
  images: List of image arrays
168
-
169
  Returns:
170
  dict: Contains 'all_correct_shape', 'expected_shape', 'invalid_count', 'details'
171
  """
172
  expected_shape = (28, 28)
173
  invalid_images = []
174
-
175
  for idx, img in enumerate(images):
176
  img_array = np.array(img)
177
  if img_array.shape != expected_shape:
@@ -179,7 +179,7 @@ def check_image_dimensions(images: List[NDArray[np.uint8]]) -> Dict[str, Any]:
179
  'index': idx,
180
  'shape': img_array.shape
181
  })
182
-
183
  return {
184
  'all_correct_shape': len(invalid_images) == 0,
185
  'expected_shape': expected_shape,
@@ -194,16 +194,16 @@ def check_image_dimensions(images: List[NDArray[np.uint8]]) -> Dict[str, Any]:
194
  def check_label_validity(labels: List[int]) -> Dict[str, Any]:
195
  """
196
  Verify all labels are valid integers in range [0, 9].
197
-
198
  Args:
199
  labels: List of labels
200
-
201
  Returns:
202
  dict: Contains 'all_valid', 'invalid_count', 'unique_labels', 'details'
203
  """
204
  valid_range = set(range(10))
205
  invalid_labels = []
206
-
207
  for idx, label in enumerate(labels):
208
  if not isinstance(label, int) or label not in valid_range:
209
  invalid_labels.append({
@@ -211,9 +211,9 @@ def check_label_validity(labels: List[int]) -> Dict[str, Any]:
211
  'value': label,
212
  'type': type(label).__name__
213
  })
214
-
215
  unique_labels = sorted(set(labels))
216
-
217
  return {
218
  'all_valid': len(invalid_labels) == 0,
219
  'expected_range': [0, 9],
@@ -232,17 +232,17 @@ def generate_quality_report(
232
  ) -> Dict[str, Any]:
233
  """
234
  Run all quality checks on training and test sets.
235
-
236
  Args:
237
  train_data: Tuple of (train_images, train_labels)
238
  test_data: Tuple of (test_images, test_labels)
239
-
240
  Returns:
241
  dict: Comprehensive quality report with all check results
242
  """
243
  x_train, y_train = train_data
244
  x_test, y_test = test_data
245
-
246
  report = {
247
  'dataset_info': {
248
  'train_samples': len(x_train),
@@ -264,7 +264,7 @@ def generate_quality_report(
264
  'label_validity': check_label_validity(y_test)
265
  }
266
  }
267
-
268
  # Overall quality assessment
269
  all_checks_pass = (
270
  not report['training_set']['missing_values']['has_missing_values'] and
@@ -276,7 +276,7 @@ def generate_quality_report(
276
  report['test_set']['image_dimensions']['all_correct_shape'] and
277
  report['test_set']['label_validity']['all_valid']
278
  )
279
-
280
  report['summary'] = {
281
  'all_checks_pass': all_checks_pass,
282
  'quality_rating': 'EXCELLENT' if all_checks_pass else 'ISSUES_FOUND',
@@ -284,22 +284,22 @@ def generate_quality_report(
284
  'test_balanced': report['test_set']['class_balance']['is_balanced'],
285
  'recommendations': _generate_recommendations(report)
286
  }
287
-
288
  return report
289
 
290
 
291
  def _generate_recommendations(report: Dict[str, Any]) -> List[str]:
292
  """
293
  Generate recommendations based on quality check results.
294
-
295
  Args:
296
  report: Quality report dictionary
297
-
298
  Returns:
299
  list: List of recommendation strings
300
  """
301
  recommendations = []
302
-
303
  # Check missing values
304
  if report['training_set']['missing_values']['has_missing_values']:
305
  recommendations.append(
@@ -309,7 +309,7 @@ def _generate_recommendations(report: Dict[str, Any]) -> List[str]:
309
  recommendations.append(
310
  "Remove or impute samples with missing values in test set"
311
  )
312
-
313
  # Check outliers
314
  if report['training_set']['outliers']['has_outliers']:
315
  recommendations.append(
@@ -319,7 +319,7 @@ def _generate_recommendations(report: Dict[str, Any]) -> List[str]:
319
  recommendations.append(
320
  "Clip or remove test images with pixel values outside [0, 255]"
321
  )
322
-
323
  # Check class balance
324
  train_imbalance = report['training_set']['class_balance']['imbalance_ratio']
325
  if train_imbalance >= 1.5:
@@ -332,7 +332,7 @@ def _generate_recommendations(report: Dict[str, Any]) -> List[str]:
332
  f"Minor class imbalance detected (ratio: {train_imbalance:.2f}). "
333
  "Monitor per-class performance during training."
334
  )
335
-
336
  # Check dimensions
337
  if not report['training_set']['image_dimensions']['all_correct_shape']:
338
  recommendations.append(
@@ -342,7 +342,7 @@ def _generate_recommendations(report: Dict[str, Any]) -> List[str]:
342
  recommendations.append(
343
  "Resize or remove test images with incorrect dimensions"
344
  )
345
-
346
  # Check labels
347
  if not report['training_set']['label_validity']['all_valid']:
348
  recommendations.append(
@@ -352,20 +352,20 @@ def _generate_recommendations(report: Dict[str, Any]) -> List[str]:
352
  recommendations.append(
353
  "Remove or correct test samples with invalid labels"
354
  )
355
-
356
  # If all checks pass
357
  if not recommendations:
358
  recommendations.append(
359
  "Dataset is high quality - proceed with preprocessing and normalization"
360
  )
361
-
362
  return recommendations
363
 
364
 
365
  def print_quality_summary(report: Dict[str, Any]) -> None:
366
  """
367
  Print a human-readable summary of the quality report.
368
-
369
  Args:
370
  report: Quality report dictionary from generate_quality_report()
371
  """
@@ -373,7 +373,7 @@ def print_quality_summary(report: Dict[str, Any]) -> None:
373
  print("MNIST DATASET QUALITY REPORT")
374
  print("=" * 60)
375
  print()
376
-
377
  # Dataset info
378
  info = report['dataset_info']
379
  print("Dataset Size:")
@@ -381,7 +381,7 @@ def print_quality_summary(report: Dict[str, Any]) -> None:
381
  print(f" Test: {info['test_samples']:,} samples")
382
  print(f" Total: {info['total_samples']:,} samples")
383
  print()
384
-
385
  # Training set checks
386
  print("Training Set Quality Checks:")
387
  train = report['training_set']
@@ -394,7 +394,7 @@ def print_quality_summary(report: Dict[str, Any]) -> None:
394
  train['class_balance']['is_balanced']
395
  )
396
  print()
397
-
398
  # Test set checks
399
  print("Test Set Quality Checks:")
400
  test = report['test_set']
@@ -407,14 +407,14 @@ def print_quality_summary(report: Dict[str, Any]) -> None:
407
  test['class_balance']['is_balanced']
408
  )
409
  print()
410
-
411
  # Overall summary
412
  summary = report['summary']
413
  print("=" * 60)
414
  print(f"Overall Quality: {summary['quality_rating']}")
415
  print("=" * 60)
416
  print()
417
-
418
  # Recommendations
419
  print("Recommendations:")
420
  for i, rec in enumerate(summary['recommendations'], 1):
 
10
 
11
  Usage:
12
  from scripts.data_quality import generate_quality_report
13
+
14
  report = generate_quality_report(
15
+ (x_train, y_train),
16
  (x_test, y_test)
17
  )
18
  """
 
24
 
25
 
26
  def check_missing_values(
27
+ images: List[NDArray[np.uint8]],
28
  labels: List[int]
29
  ) -> Dict[str, Any]:
30
  """
31
  Check for NaN or missing values in images and labels.
32
+
33
  Args:
34
  images: List of image arrays (each 28x28)
35
  labels: List of integer labels (0-9)
36
+
37
  Returns:
38
  dict: Contains 'has_missing_values', 'missing_count', 'details'
39
  """
 
43
  img_array = np.array(img)
44
  if np.isnan(img_array).any():
45
  images_with_nan.append(idx)
46
+
47
  # Check labels for None
48
  labels_with_none = [idx for idx, label in enumerate(labels) if label is None]
49
+
50
  has_missing = len(images_with_nan) > 0 or len(labels_with_none) > 0
51
+
52
  return {
53
  'has_missing_values': has_missing,
54
  'missing_count': len(images_with_nan) + len(labels_with_none),
 
66
  def check_outliers(images: List[NDArray[np.uint8]]) -> Dict[str, Any]:
67
  """
68
  Identify pixels outside valid range [0, 255] for uint8 images.
69
+
70
  Args:
71
  images: List of image arrays (each 28x28)
72
+
73
  Returns:
74
  dict: Contains 'has_outliers', 'outlier_count', 'pixel_range', 'details'
75
  """
76
  outlier_images = []
77
  pixel_min = 255
78
  pixel_max = 0
79
+
80
  for idx, img in enumerate(images):
81
  img_array = np.array(img)
82
  img_min = img_array.min()
83
  img_max = img_array.max()
84
+
85
  pixel_min = min(pixel_min, img_min)
86
  pixel_max = max(pixel_max, img_max)
87
+
88
  # Check for values outside [0, 255]
89
  if img_min < 0 or img_max > 255:
90
  outlier_images.append({
 
92
  'min': int(img_min),
93
  'max': int(img_max)
94
  })
95
+
96
  return {
97
  'has_outliers': len(outlier_images) > 0,
98
  'outlier_count': len(outlier_images),
 
109
  def check_class_balance(labels: List[int]) -> Dict[str, Any]:
110
  """
111
  Compute samples per class and calculate imbalance ratio.
112
+
113
  Imbalance ratio = max_count / min_count
114
  A ratio < 1.2 indicates good balance (< 20% difference)
115
+
116
  Args:
117
  labels: List of integer labels (0-9)
118
+
119
  Returns:
120
  dict: Contains 'is_balanced', 'imbalance_ratio', 'class_counts', 'details'
121
  """
122
  class_counts = Counter(labels)
123
+
124
  # Ensure all 10 digits present
125
  for digit in range(10):
126
  if digit not in class_counts:
127
  class_counts[digit] = 0
128
+
129
  counts = list(class_counts.values())
130
  max_count = max(counts)
131
  min_count = min(counts) if min(counts) > 0 else 1 # Avoid division by zero
132
+
133
  imbalance_ratio = max_count / min_count
134
  is_balanced = imbalance_ratio < 1.2 # Less than 20% difference
135
+
136
  # Per-class percentages
137
  total = len(labels)
138
  class_percentages = {
139
+ digit: (count / total) * 100
140
  for digit, count in class_counts.items()
141
  }
142
+
143
  return {
144
  'is_balanced': is_balanced,
145
  'imbalance_ratio': round(imbalance_ratio, 3),
146
  'threshold': 1.2,
147
  'class_counts': dict(sorted(class_counts.items())),
148
  'class_percentages': {
149
+ k: round(v, 2)
150
  for k, v in sorted(class_percentages.items())
151
  },
152
  'details': {
 
162
  def check_image_dimensions(images: List[NDArray[np.uint8]]) -> Dict[str, Any]:
163
  """
164
  Verify all images are 28x28 as expected for MNIST.
165
+
166
  Args:
167
  images: List of image arrays
168
+
169
  Returns:
170
  dict: Contains 'all_correct_shape', 'expected_shape', 'invalid_count', 'details'
171
  """
172
  expected_shape = (28, 28)
173
  invalid_images = []
174
+
175
  for idx, img in enumerate(images):
176
  img_array = np.array(img)
177
  if img_array.shape != expected_shape:
 
179
  'index': idx,
180
  'shape': img_array.shape
181
  })
182
+
183
  return {
184
  'all_correct_shape': len(invalid_images) == 0,
185
  'expected_shape': expected_shape,
 
194
  def check_label_validity(labels: List[int]) -> Dict[str, Any]:
195
  """
196
  Verify all labels are valid integers in range [0, 9].
197
+
198
  Args:
199
  labels: List of labels
200
+
201
  Returns:
202
  dict: Contains 'all_valid', 'invalid_count', 'unique_labels', 'details'
203
  """
204
  valid_range = set(range(10))
205
  invalid_labels = []
206
+
207
  for idx, label in enumerate(labels):
208
  if not isinstance(label, int) or label not in valid_range:
209
  invalid_labels.append({
 
211
  'value': label,
212
  'type': type(label).__name__
213
  })
214
+
215
  unique_labels = sorted(set(labels))
216
+
217
  return {
218
  'all_valid': len(invalid_labels) == 0,
219
  'expected_range': [0, 9],
 
232
  ) -> Dict[str, Any]:
233
  """
234
  Run all quality checks on training and test sets.
235
+
236
  Args:
237
  train_data: Tuple of (train_images, train_labels)
238
  test_data: Tuple of (test_images, test_labels)
239
+
240
  Returns:
241
  dict: Comprehensive quality report with all check results
242
  """
243
  x_train, y_train = train_data
244
  x_test, y_test = test_data
245
+
246
  report = {
247
  'dataset_info': {
248
  'train_samples': len(x_train),
 
264
  'label_validity': check_label_validity(y_test)
265
  }
266
  }
267
+
268
  # Overall quality assessment
269
  all_checks_pass = (
270
  not report['training_set']['missing_values']['has_missing_values'] and
 
276
  report['test_set']['image_dimensions']['all_correct_shape'] and
277
  report['test_set']['label_validity']['all_valid']
278
  )
279
+
280
  report['summary'] = {
281
  'all_checks_pass': all_checks_pass,
282
  'quality_rating': 'EXCELLENT' if all_checks_pass else 'ISSUES_FOUND',
 
284
  'test_balanced': report['test_set']['class_balance']['is_balanced'],
285
  'recommendations': _generate_recommendations(report)
286
  }
287
+
288
  return report
289
 
290
 
291
  def _generate_recommendations(report: Dict[str, Any]) -> List[str]:
292
  """
293
  Generate recommendations based on quality check results.
294
+
295
  Args:
296
  report: Quality report dictionary
297
+
298
  Returns:
299
  list: List of recommendation strings
300
  """
301
  recommendations = []
302
+
303
  # Check missing values
304
  if report['training_set']['missing_values']['has_missing_values']:
305
  recommendations.append(
 
309
  recommendations.append(
310
  "Remove or impute samples with missing values in test set"
311
  )
312
+
313
  # Check outliers
314
  if report['training_set']['outliers']['has_outliers']:
315
  recommendations.append(
 
319
  recommendations.append(
320
  "Clip or remove test images with pixel values outside [0, 255]"
321
  )
322
+
323
  # Check class balance
324
  train_imbalance = report['training_set']['class_balance']['imbalance_ratio']
325
  if train_imbalance >= 1.5:
 
332
  f"Minor class imbalance detected (ratio: {train_imbalance:.2f}). "
333
  "Monitor per-class performance during training."
334
  )
335
+
336
  # Check dimensions
337
  if not report['training_set']['image_dimensions']['all_correct_shape']:
338
  recommendations.append(
 
342
  recommendations.append(
343
  "Resize or remove test images with incorrect dimensions"
344
  )
345
+
346
  # Check labels
347
  if not report['training_set']['label_validity']['all_valid']:
348
  recommendations.append(
 
352
  recommendations.append(
353
  "Remove or correct test samples with invalid labels"
354
  )
355
+
356
  # If all checks pass
357
  if not recommendations:
358
  recommendations.append(
359
  "Dataset is high quality - proceed with preprocessing and normalization"
360
  )
361
+
362
  return recommendations
363
 
364
 
365
  def print_quality_summary(report: Dict[str, Any]) -> None:
366
  """
367
  Print a human-readable summary of the quality report.
368
+
369
  Args:
370
  report: Quality report dictionary from generate_quality_report()
371
  """
 
373
  print("MNIST DATASET QUALITY REPORT")
374
  print("=" * 60)
375
  print()
376
+
377
  # Dataset info
378
  info = report['dataset_info']
379
  print("Dataset Size:")
 
381
  print(f" Test: {info['test_samples']:,} samples")
382
  print(f" Total: {info['total_samples']:,} samples")
383
  print()
384
+
385
  # Training set checks
386
  print("Training Set Quality Checks:")
387
  train = report['training_set']
 
394
  train['class_balance']['is_balanced']
395
  )
396
  print()
397
+
398
  # Test set checks
399
  print("Test Set Quality Checks:")
400
  test = report['test_set']
 
407
  test['class_balance']['is_balanced']
408
  )
409
  print()
410
+
411
  # Overall summary
412
  summary = report['summary']
413
  print("=" * 60)
414
  print(f"Overall Quality: {summary['quality_rating']}")
415
  print("=" * 60)
416
  print()
417
+
418
  # Recommendations
419
  print("Recommendations:")
420
  for i, rec in enumerate(summary['recommendations'], 1):
scripts/inference.py CHANGED
@@ -14,103 +14,103 @@ from typing import Union, Dict
14
 
15
  class DigitClassifier:
16
  """Production inference wrapper for MNIST digit classifier."""
17
-
18
  def __init__(self, model_path: str, device: str = None):
19
  """
20
  Initialize the digit classifier.
21
-
22
  Args:
23
  model_path: Path to model checkpoint (.pt file)
24
- device: Device to run inference on ('cuda' or 'cpu').
25
  If None, auto-detects CUDA availability.
26
  """
27
  if device is None:
28
  self.device = 'cuda' if torch.cuda.is_available() else 'cpu'
29
  else:
30
  self.device = device
31
-
32
  self.model_path = Path(model_path)
33
  if not self.model_path.exists():
34
  raise FileNotFoundError(f"Model not found at {model_path}")
35
-
36
  self.model = self._load_model()
37
  self.model.eval()
38
-
39
  # Normalization values (same as training)
40
  self.mean = 0.1307
41
  self.std = 0.3081
42
-
43
  def _load_model(self) -> torch.nn.Module:
44
  """Load model from checkpoint."""
45
  from scripts.models import BaselineCNN
46
-
47
  model = BaselineCNN()
48
-
49
  # Load checkpoint
50
  checkpoint = torch.load(self.model_path, map_location=self.device)
51
-
52
  # Handle different checkpoint formats
53
  if isinstance(checkpoint, dict) and 'model_state_dict' in checkpoint:
54
  model.load_state_dict(checkpoint['model_state_dict'])
55
  else:
56
  model.load_state_dict(checkpoint)
57
-
58
  return model.to(self.device)
59
-
60
  def preprocess(self, image: Union[Image.Image, np.ndarray]) -> torch.Tensor:
61
  """
62
  Preprocess image for model input.
63
-
64
  Handles:
65
  - RGB to grayscale conversion
66
  - Resizing to 28x28
67
  - Normalization
68
  - Inversion if needed (white digit on black background)
69
-
70
  Args:
71
  image: PIL Image or numpy array
72
-
73
  Returns:
74
  Preprocessed tensor of shape (1, 1, 28, 28)
75
  """
76
  # Convert numpy array to PIL Image if needed
77
  if isinstance(image, np.ndarray):
78
  image = Image.fromarray(image)
79
-
80
  # Convert to grayscale if RGB
81
  if image.mode != 'L':
82
  image = image.convert('L')
83
-
84
  # Resize to 28x28 if needed
85
  if image.size != (28, 28):
86
  image = image.resize((28, 28), Image.Resampling.LANCZOS)
87
-
88
  # Convert to numpy array
89
  img_array = np.array(image).astype(np.float32)
90
-
91
  # Normalize to [0, 1]
92
  img_array = img_array / 255.0
93
-
94
  # Check if inversion is needed (MNIST is white digit on black background)
95
  # If most pixels are bright, it's likely a black digit on white background
96
  if img_array.mean() > 0.5:
97
  img_array = 1.0 - img_array
98
-
99
  # Apply normalization (same as training)
100
  img_array = (img_array - self.mean) / self.std
101
-
102
  # Convert to tensor and add batch and channel dimensions
103
  img_tensor = torch.tensor(img_array).unsqueeze(0).unsqueeze(0)
104
-
105
  return img_tensor.to(self.device)
106
-
107
  def predict(self, image: Union[Image.Image, np.ndarray]) -> Dict:
108
  """
109
  Predict digit from image.
110
-
111
  Args:
112
  image: PIL Image or numpy array containing digit
113
-
114
  Returns:
115
  Dictionary with:
116
  - digit: Predicted digit (0-9)
@@ -118,25 +118,25 @@ class DigitClassifier:
118
  - probabilities: List of probabilities for each digit
119
  """
120
  img_tensor = self.preprocess(image)
121
-
122
  with torch.no_grad():
123
  outputs = self.model(img_tensor)
124
  probabilities = torch.softmax(outputs, dim=1)[0]
125
  confidence, predicted = torch.max(probabilities, dim=0)
126
-
127
  return {
128
  'digit': int(predicted.item()),
129
  'confidence': float(confidence.item()),
130
  'probabilities': probabilities.cpu().numpy().tolist()
131
  }
132
-
133
  def predict_batch(self, images: list) -> list:
134
  """
135
  Predict digits for a batch of images.
136
-
137
  Args:
138
  images: List of PIL Images or numpy arrays
139
-
140
  Returns:
141
  List of prediction dictionaries
142
  """
@@ -147,23 +147,23 @@ def test_inference():
147
  """Test inference module with sample images."""
148
  import sys
149
  from pathlib import Path
150
-
151
  # Add project root to path
152
  project_root = Path(__file__).parent.parent
153
  sys.path.insert(0, str(project_root))
154
-
155
  from scripts.data_loader import MnistDataloader
156
-
157
  print("Testing Inference Module")
158
  print("=" * 50)
159
-
160
  # Check if model exists
161
  model_path = project_root / 'models' / 'best_model.pt'
162
  if not model_path.exists():
163
  print(f"Error: Model not found at {model_path}")
164
  print("Please train a model first.")
165
  return
166
-
167
  # Load MNIST test data
168
  data_path = project_root / 'data' / 'raw'
169
  loader = MnistDataloader(
@@ -173,66 +173,70 @@ def test_inference():
173
  test_labels_filepath=str(data_path / 't10k-labels.idx1-ubyte')
174
  )
175
  _, (x_test, y_test) = loader.load_data()
176
-
177
  # Initialize classifier
178
  print(f"\n1. Loading model from: {model_path}")
179
  classifier = DigitClassifier(str(model_path))
180
  print(f" Device: {classifier.device}")
181
-
182
  # Test on a few images
183
  print("\n2. Testing predictions on 10 random test images:")
184
  print("-" * 50)
185
-
186
  indices = np.random.choice(len(x_test), 10, replace=False)
187
  correct = 0
188
-
189
  for i, idx in enumerate(indices, 1):
190
  image = x_test[idx]
191
  true_label = y_test[idx]
192
-
193
  # Convert list to numpy array if needed
194
  if isinstance(image, list):
195
  image = np.array(image)
196
-
197
  # Convert to PIL Image
198
  img = Image.fromarray(image.astype(np.uint8), mode='L')
199
-
200
  # Predict
201
  result = classifier.predict(img)
202
-
203
  is_correct = result['digit'] == true_label
204
  correct += is_correct
205
-
206
  print(f" Image {i}: True={true_label}, Pred={result['digit']}, "
207
  f"Conf={result['confidence']:.4f} {'✓' if is_correct else '✗'}")
208
-
209
  accuracy = correct / len(indices) * 100
210
  print(f"\nAccuracy on {len(indices)} samples: {accuracy:.1f}%")
211
-
212
  # Test edge cases
213
  print("\n3. Testing edge cases:")
214
  print("-" * 50)
215
-
216
  # Blank image
217
  blank = np.zeros((28, 28), dtype=np.uint8)
218
  blank_img = Image.fromarray(blank, mode='L')
219
  result = classifier.predict(blank_img)
220
  print(f" Blank image: Pred={result['digit']}, Conf={result['confidence']:.4f}")
221
-
222
  # All white image
223
  white = np.ones((28, 28), dtype=np.uint8) * 255
224
  white_img = Image.fromarray(white, mode='L')
225
  result = classifier.predict(white_img)
226
  print(f" White image: Pred={result['digit']}, Conf={result['confidence']:.4f}")
227
-
228
  # Different size image
229
  test_img = x_test[0]
230
  if isinstance(test_img, list):
231
  test_img = np.array(test_img)
232
- large = Image.fromarray(test_img.astype(np.uint8), mode='L').resize((56, 56))
 
233
  result = classifier.predict(large)
234
- print(f" Resized image (56x56): Pred={result['digit']}, Conf={result['confidence']:.4f}")
235
-
 
 
 
236
  print("\n✓ Inference module test complete!")
237
 
238
 
 
14
 
15
  class DigitClassifier:
16
  """Production inference wrapper for MNIST digit classifier."""
17
+
18
  def __init__(self, model_path: str, device: str = None):
19
  """
20
  Initialize the digit classifier.
21
+
22
  Args:
23
  model_path: Path to model checkpoint (.pt file)
24
+ device: Device to run inference on ('cuda' or 'cpu').
25
  If None, auto-detects CUDA availability.
26
  """
27
  if device is None:
28
  self.device = 'cuda' if torch.cuda.is_available() else 'cpu'
29
  else:
30
  self.device = device
31
+
32
  self.model_path = Path(model_path)
33
  if not self.model_path.exists():
34
  raise FileNotFoundError(f"Model not found at {model_path}")
35
+
36
  self.model = self._load_model()
37
  self.model.eval()
38
+
39
  # Normalization values (same as training)
40
  self.mean = 0.1307
41
  self.std = 0.3081
42
+
43
  def _load_model(self) -> torch.nn.Module:
44
  """Load model from checkpoint."""
45
  from scripts.models import BaselineCNN
46
+
47
  model = BaselineCNN()
48
+
49
  # Load checkpoint
50
  checkpoint = torch.load(self.model_path, map_location=self.device)
51
+
52
  # Handle different checkpoint formats
53
  if isinstance(checkpoint, dict) and 'model_state_dict' in checkpoint:
54
  model.load_state_dict(checkpoint['model_state_dict'])
55
  else:
56
  model.load_state_dict(checkpoint)
57
+
58
  return model.to(self.device)
59
+
60
  def preprocess(self, image: Union[Image.Image, np.ndarray]) -> torch.Tensor:
61
  """
62
  Preprocess image for model input.
63
+
64
  Handles:
65
  - RGB to grayscale conversion
66
  - Resizing to 28x28
67
  - Normalization
68
  - Inversion if needed (white digit on black background)
69
+
70
  Args:
71
  image: PIL Image or numpy array
72
+
73
  Returns:
74
  Preprocessed tensor of shape (1, 1, 28, 28)
75
  """
76
  # Convert numpy array to PIL Image if needed
77
  if isinstance(image, np.ndarray):
78
  image = Image.fromarray(image)
79
+
80
  # Convert to grayscale if RGB
81
  if image.mode != 'L':
82
  image = image.convert('L')
83
+
84
  # Resize to 28x28 if needed
85
  if image.size != (28, 28):
86
  image = image.resize((28, 28), Image.Resampling.LANCZOS)
87
+
88
  # Convert to numpy array
89
  img_array = np.array(image).astype(np.float32)
90
+
91
  # Normalize to [0, 1]
92
  img_array = img_array / 255.0
93
+
94
  # Check if inversion is needed (MNIST is white digit on black background)
95
  # If most pixels are bright, it's likely a black digit on white background
96
  if img_array.mean() > 0.5:
97
  img_array = 1.0 - img_array
98
+
99
  # Apply normalization (same as training)
100
  img_array = (img_array - self.mean) / self.std
101
+
102
  # Convert to tensor and add batch and channel dimensions
103
  img_tensor = torch.tensor(img_array).unsqueeze(0).unsqueeze(0)
104
+
105
  return img_tensor.to(self.device)
106
+
107
  def predict(self, image: Union[Image.Image, np.ndarray]) -> Dict:
108
  """
109
  Predict digit from image.
110
+
111
  Args:
112
  image: PIL Image or numpy array containing digit
113
+
114
  Returns:
115
  Dictionary with:
116
  - digit: Predicted digit (0-9)
 
118
  - probabilities: List of probabilities for each digit
119
  """
120
  img_tensor = self.preprocess(image)
121
+
122
  with torch.no_grad():
123
  outputs = self.model(img_tensor)
124
  probabilities = torch.softmax(outputs, dim=1)[0]
125
  confidence, predicted = torch.max(probabilities, dim=0)
126
+
127
  return {
128
  'digit': int(predicted.item()),
129
  'confidence': float(confidence.item()),
130
  'probabilities': probabilities.cpu().numpy().tolist()
131
  }
132
+
133
  def predict_batch(self, images: list) -> list:
134
  """
135
  Predict digits for a batch of images.
136
+
137
  Args:
138
  images: List of PIL Images or numpy arrays
139
+
140
  Returns:
141
  List of prediction dictionaries
142
  """
 
147
  """Test inference module with sample images."""
148
  import sys
149
  from pathlib import Path
150
+
151
  # Add project root to path
152
  project_root = Path(__file__).parent.parent
153
  sys.path.insert(0, str(project_root))
154
+
155
  from scripts.data_loader import MnistDataloader
156
+
157
  print("Testing Inference Module")
158
  print("=" * 50)
159
+
160
  # Check if model exists
161
  model_path = project_root / 'models' / 'best_model.pt'
162
  if not model_path.exists():
163
  print(f"Error: Model not found at {model_path}")
164
  print("Please train a model first.")
165
  return
166
+
167
  # Load MNIST test data
168
  data_path = project_root / 'data' / 'raw'
169
  loader = MnistDataloader(
 
173
  test_labels_filepath=str(data_path / 't10k-labels.idx1-ubyte')
174
  )
175
  _, (x_test, y_test) = loader.load_data()
176
+
177
  # Initialize classifier
178
  print(f"\n1. Loading model from: {model_path}")
179
  classifier = DigitClassifier(str(model_path))
180
  print(f" Device: {classifier.device}")
181
+
182
  # Test on a few images
183
  print("\n2. Testing predictions on 10 random test images:")
184
  print("-" * 50)
185
+
186
  indices = np.random.choice(len(x_test), 10, replace=False)
187
  correct = 0
188
+
189
  for i, idx in enumerate(indices, 1):
190
  image = x_test[idx]
191
  true_label = y_test[idx]
192
+
193
  # Convert list to numpy array if needed
194
  if isinstance(image, list):
195
  image = np.array(image)
196
+
197
  # Convert to PIL Image
198
  img = Image.fromarray(image.astype(np.uint8), mode='L')
199
+
200
  # Predict
201
  result = classifier.predict(img)
202
+
203
  is_correct = result['digit'] == true_label
204
  correct += is_correct
205
+
206
  print(f" Image {i}: True={true_label}, Pred={result['digit']}, "
207
  f"Conf={result['confidence']:.4f} {'✓' if is_correct else '✗'}")
208
+
209
  accuracy = correct / len(indices) * 100
210
  print(f"\nAccuracy on {len(indices)} samples: {accuracy:.1f}%")
211
+
212
  # Test edge cases
213
  print("\n3. Testing edge cases:")
214
  print("-" * 50)
215
+
216
  # Blank image
217
  blank = np.zeros((28, 28), dtype=np.uint8)
218
  blank_img = Image.fromarray(blank, mode='L')
219
  result = classifier.predict(blank_img)
220
  print(f" Blank image: Pred={result['digit']}, Conf={result['confidence']:.4f}")
221
+
222
  # All white image
223
  white = np.ones((28, 28), dtype=np.uint8) * 255
224
  white_img = Image.fromarray(white, mode='L')
225
  result = classifier.predict(white_img)
226
  print(f" White image: Pred={result['digit']}, Conf={result['confidence']:.4f}")
227
+
228
  # Different size image
229
  test_img = x_test[0]
230
  if isinstance(test_img, list):
231
  test_img = np.array(test_img)
232
+ large = Image.fromarray(test_img.astype(np.uint8), mode='L')
233
+ large = large.resize((56, 56))
234
  result = classifier.predict(large)
235
+ print(
236
+ f" Resized image (56x56): "
237
+ f"Pred={result['digit']}, Conf={result['confidence']:.4f}"
238
+ )
239
+
240
  print("\n✓ Inference module test complete!")
241
 
242
 
scripts/mlflow_setup.py CHANGED
@@ -24,20 +24,20 @@ def setup_mlflow(
24
  ) -> str:
25
  """
26
  Setup MLflow tracking with best practices.
27
-
28
  Args:
29
  experiment_name: Name of the experiment
30
  tracking_uri: MLflow tracking URI (default: local ./mlruns)
31
-
32
  Returns:
33
  experiment_id: MLflow experiment ID
34
  """
35
  # Set tracking URI
36
  if tracking_uri is None:
37
  tracking_uri = MLFLOW_TRACKING_URI
38
-
39
  mlflow.set_tracking_uri(tracking_uri)
40
-
41
  # Create or get experiment
42
  try:
43
  experiment = mlflow.get_experiment_by_name(experiment_name)
@@ -55,34 +55,36 @@ def setup_mlflow(
55
  except Exception as e:
56
  print(f"Warning: Could not create experiment: {e}")
57
  experiment_id = "0" # Default experiment
58
-
59
  mlflow.set_experiment(experiment_name)
60
-
61
  print(f"MLflow tracking URI: {tracking_uri}")
62
  print(f"Experiment: {experiment_name} (ID: {experiment_id})")
63
-
64
  return experiment_id
65
 
66
 
67
  def log_model_params(model: Any, prefix: str = "model") -> Dict[str, Any]:
68
  """
69
  Log model parameters to MLflow.
70
-
71
  Args:
72
  model: PyTorch model
73
  prefix: Prefix for parameter names
74
-
75
  Returns:
76
  Dictionary of logged parameters
77
  """
78
  from scripts.models import count_parameters
79
-
80
  params = {
81
  f"{prefix}_name": model.__class__.__name__,
82
  f"{prefix}_total_params": count_parameters(model),
83
- f"{prefix}_trainable_params": sum(p.numel() for p in model.parameters() if p.requires_grad)
 
 
84
  }
85
-
86
  mlflow.log_params(params)
87
  return params
88
 
@@ -90,7 +92,7 @@ def log_model_params(model: Any, prefix: str = "model") -> Dict[str, Any]:
90
  def log_training_config(config: Dict[str, Any]) -> None:
91
  """
92
  Log training configuration to MLflow.
93
-
94
  Args:
95
  config: Dictionary of training hyperparameters
96
  """
@@ -102,7 +104,7 @@ def log_training_config(config: Dict[str, Any]) -> None:
102
  flat_config[f"{key}_{subkey}"] = subvalue
103
  else:
104
  flat_config[key] = value
105
-
106
  mlflow.log_params(flat_config)
107
 
108
 
@@ -115,7 +117,7 @@ def log_data_info(
115
  ) -> None:
116
  """
117
  Log dataset information to MLflow.
118
-
119
  Args:
120
  train_size: Number of training samples
121
  val_size: Number of validation samples
@@ -135,13 +137,13 @@ def log_data_info(
135
  def log_system_info() -> Dict[str, Any]:
136
  """
137
  Log system information to MLflow.
138
-
139
  Returns:
140
  Dictionary of system information
141
  """
142
  import torch
143
  import platform
144
-
145
  system_info = {
146
  "system_platform": platform.system(),
147
  "system_python_version": platform.python_version(),
@@ -152,11 +154,11 @@ def log_system_info() -> Dict[str, Any]:
152
  ),
153
  "system_device": "cuda" if torch.cuda.is_available() else "cpu"
154
  }
155
-
156
  if torch.cuda.is_available():
157
  system_info["system_gpu_name"] = torch.cuda.get_device_name(0)
158
  system_info["system_gpu_count"] = torch.cuda.device_count()
159
-
160
  mlflow.log_params(system_info)
161
  return system_info
162
 
@@ -164,7 +166,7 @@ def log_system_info() -> Dict[str, Any]:
164
  def log_metrics_epoch(metrics: Dict[str, float], step: int) -> None:
165
  """
166
  Log metrics for a specific epoch.
167
-
168
  Args:
169
  metrics: Dictionary of metric names and values
170
  step: Epoch number
@@ -175,7 +177,7 @@ def log_metrics_epoch(metrics: Dict[str, float], step: int) -> None:
175
  def log_artifact_path(path: str, artifact_path: Optional[str] = None) -> None:
176
  """
177
  Log a file or directory as an artifact.
178
-
179
  Args:
180
  path: Path to file or directory
181
  artifact_path: Optional artifact path in MLflow
@@ -194,14 +196,14 @@ def log_model_to_registry(
194
  ) -> None:
195
  """
196
  Log model to MLflow with model registry integration.
197
-
198
  Args:
199
  model: PyTorch model
200
  model_name: Name for the model artifact
201
  artifact_path: Artifact path in MLflow
202
  registered_model_name: Name for model registry (optional)
203
  """
204
-
205
  # Log model
206
  mlflow.pytorch.log_model(
207
  pytorch_model=model,
@@ -216,11 +218,11 @@ def get_or_create_run(
216
  ) -> mlflow.ActiveRun:
217
  """
218
  Get existing run or create a new one.
219
-
220
  Args:
221
  run_name: Name for the run
222
  tags: Tags for the run
223
-
224
  Returns:
225
  MLflow active run context
226
  """
@@ -236,10 +238,10 @@ def test_mlflow_setup():
236
  """Test MLflow setup and basic logging."""
237
  print("Testing MLflow Setup")
238
  print("=" * 50)
239
-
240
  # Setup MLflow
241
  setup_mlflow("test-experiment")
242
-
243
  # Test logging
244
  with mlflow.start_run(run_name="test-run"):
245
  # Log parameters
@@ -248,7 +250,7 @@ def test_mlflow_setup():
248
  "batch_size": 64,
249
  "epochs": 10
250
  })
251
-
252
  # Log metrics
253
  for epoch in range(3):
254
  mlflow.log_metrics({
@@ -257,13 +259,13 @@ def test_mlflow_setup():
257
  "train_accuracy": 0.8 + epoch * 0.05,
258
  "val_accuracy": 0.75 + epoch * 0.05
259
  }, step=epoch)
260
-
261
  # Log system info
262
  system_info = log_system_info()
263
  print("\nSystem Info:")
264
  for key, value in system_info.items():
265
  print(f" {key}: {value}")
266
-
267
  print("\n✓ MLflow test complete!")
268
  print(f"View results at: mlflow ui --backend-store-uri {MLFLOW_TRACKING_URI}")
269
 
 
24
  ) -> str:
25
  """
26
  Setup MLflow tracking with best practices.
27
+
28
  Args:
29
  experiment_name: Name of the experiment
30
  tracking_uri: MLflow tracking URI (default: local ./mlruns)
31
+
32
  Returns:
33
  experiment_id: MLflow experiment ID
34
  """
35
  # Set tracking URI
36
  if tracking_uri is None:
37
  tracking_uri = MLFLOW_TRACKING_URI
38
+
39
  mlflow.set_tracking_uri(tracking_uri)
40
+
41
  # Create or get experiment
42
  try:
43
  experiment = mlflow.get_experiment_by_name(experiment_name)
 
55
  except Exception as e:
56
  print(f"Warning: Could not create experiment: {e}")
57
  experiment_id = "0" # Default experiment
58
+
59
  mlflow.set_experiment(experiment_name)
60
+
61
  print(f"MLflow tracking URI: {tracking_uri}")
62
  print(f"Experiment: {experiment_name} (ID: {experiment_id})")
63
+
64
  return experiment_id
65
 
66
 
67
  def log_model_params(model: Any, prefix: str = "model") -> Dict[str, Any]:
68
  """
69
  Log model parameters to MLflow.
70
+
71
  Args:
72
  model: PyTorch model
73
  prefix: Prefix for parameter names
74
+
75
  Returns:
76
  Dictionary of logged parameters
77
  """
78
  from scripts.models import count_parameters
79
+
80
  params = {
81
  f"{prefix}_name": model.__class__.__name__,
82
  f"{prefix}_total_params": count_parameters(model),
83
+ f"{prefix}_trainable_params": sum(
84
+ p.numel() for p in model.parameters() if p.requires_grad
85
+ )
86
  }
87
+
88
  mlflow.log_params(params)
89
  return params
90
 
 
92
  def log_training_config(config: Dict[str, Any]) -> None:
93
  """
94
  Log training configuration to MLflow.
95
+
96
  Args:
97
  config: Dictionary of training hyperparameters
98
  """
 
104
  flat_config[f"{key}_{subkey}"] = subvalue
105
  else:
106
  flat_config[key] = value
107
+
108
  mlflow.log_params(flat_config)
109
 
110
 
 
117
  ) -> None:
118
  """
119
  Log dataset information to MLflow.
120
+
121
  Args:
122
  train_size: Number of training samples
123
  val_size: Number of validation samples
 
137
  def log_system_info() -> Dict[str, Any]:
138
  """
139
  Log system information to MLflow.
140
+
141
  Returns:
142
  Dictionary of system information
143
  """
144
  import torch
145
  import platform
146
+
147
  system_info = {
148
  "system_platform": platform.system(),
149
  "system_python_version": platform.python_version(),
 
154
  ),
155
  "system_device": "cuda" if torch.cuda.is_available() else "cpu"
156
  }
157
+
158
  if torch.cuda.is_available():
159
  system_info["system_gpu_name"] = torch.cuda.get_device_name(0)
160
  system_info["system_gpu_count"] = torch.cuda.device_count()
161
+
162
  mlflow.log_params(system_info)
163
  return system_info
164
 
 
166
  def log_metrics_epoch(metrics: Dict[str, float], step: int) -> None:
167
  """
168
  Log metrics for a specific epoch.
169
+
170
  Args:
171
  metrics: Dictionary of metric names and values
172
  step: Epoch number
 
177
  def log_artifact_path(path: str, artifact_path: Optional[str] = None) -> None:
178
  """
179
  Log a file or directory as an artifact.
180
+
181
  Args:
182
  path: Path to file or directory
183
  artifact_path: Optional artifact path in MLflow
 
196
  ) -> None:
197
  """
198
  Log model to MLflow with model registry integration.
199
+
200
  Args:
201
  model: PyTorch model
202
  model_name: Name for the model artifact
203
  artifact_path: Artifact path in MLflow
204
  registered_model_name: Name for model registry (optional)
205
  """
206
+
207
  # Log model
208
  mlflow.pytorch.log_model(
209
  pytorch_model=model,
 
218
  ) -> mlflow.ActiveRun:
219
  """
220
  Get existing run or create a new one.
221
+
222
  Args:
223
  run_name: Name for the run
224
  tags: Tags for the run
225
+
226
  Returns:
227
  MLflow active run context
228
  """
 
238
  """Test MLflow setup and basic logging."""
239
  print("Testing MLflow Setup")
240
  print("=" * 50)
241
+
242
  # Setup MLflow
243
  setup_mlflow("test-experiment")
244
+
245
  # Test logging
246
  with mlflow.start_run(run_name="test-run"):
247
  # Log parameters
 
250
  "batch_size": 64,
251
  "epochs": 10
252
  })
253
+
254
  # Log metrics
255
  for epoch in range(3):
256
  mlflow.log_metrics({
 
259
  "train_accuracy": 0.8 + epoch * 0.05,
260
  "val_accuracy": 0.75 + epoch * 0.05
261
  }, step=epoch)
262
+
263
  # Log system info
264
  system_info = log_system_info()
265
  print("\nSystem Info:")
266
  for key, value in system_info.items():
267
  print(f" {key}: {value}")
268
+
269
  print("\n✓ MLflow test complete!")
270
  print(f"View results at: mlflow ui --backend-store-uri {MLFLOW_TRACKING_URI}")
271
 
scripts/models.py CHANGED
@@ -8,7 +8,7 @@ This module provides CNN models for digit recognition:
8
 
9
  Usage:
10
  from scripts.models import BaselineCNN
11
-
12
  model = BaselineCNN()
13
  output = model(images) # (batch, 10) logits
14
  """
@@ -22,7 +22,7 @@ from typing import Tuple
22
  class BaselineCNN(nn.Module):
23
  """
24
  Baseline CNN for MNIST classification.
25
-
26
  Architecture:
27
  Input: (batch, 1, 28, 28)
28
  Conv1: 1 -> 32 channels, 3x3 kernel, padding=1
@@ -32,28 +32,28 @@ class BaselineCNN(nn.Module):
32
  Flatten -> (batch, 3136)
33
  FC1: 3136 -> 128, ReLU, Dropout(0.5)
34
  FC2: 128 -> 10 (output logits)
35
-
36
  Design Rationale:
37
  - 2 conv layers: Balance between simplicity and capacity
38
  - 32->64 filters: Standard progression, proven effective
39
  - Dropout 0.5: Prevent overfitting on small dataset
40
  - No batch norm: Keep baseline simple
41
-
42
  Expected Performance:
43
  - Parameters: ~110k
44
  - Test accuracy: 98-99%
45
  - Training time: ~5-10 min on GPU
46
  """
47
-
48
  def __init__(self, dropout_rate: float = 0.5):
49
  """
50
  Initialize baseline CNN.
51
-
52
  Args:
53
  dropout_rate: Dropout probability (default 0.5)
54
  """
55
  super(BaselineCNN, self).__init__()
56
-
57
  # Convolutional layers
58
  self.conv1 = nn.Conv2d(
59
  in_channels=1,
@@ -67,25 +67,25 @@ class BaselineCNN(nn.Module):
67
  kernel_size=3,
68
  padding=1
69
  )
70
-
71
  # Pooling layer (shared)
72
  self.pool = nn.MaxPool2d(kernel_size=2, stride=2)
73
-
74
  # Fully connected layers
75
  # After two pooling layers: 28->14->7, so 64*7*7 = 3136
76
  self.fc1 = nn.Linear(64 * 7 * 7, 128)
77
  self.fc2 = nn.Linear(128, 10)
78
-
79
  # Dropout for regularization
80
  self.dropout = nn.Dropout(p=dropout_rate)
81
-
82
  def forward(self, x: torch.Tensor) -> torch.Tensor:
83
  """
84
  Forward pass.
85
-
86
  Args:
87
  x: Input tensor of shape (batch, 1, 28, 28)
88
-
89
  Returns:
90
  Output logits of shape (batch, 10)
91
  """
@@ -93,28 +93,28 @@ class BaselineCNN(nn.Module):
93
  x = self.conv1(x) # (batch, 32, 28, 28)
94
  x = F.relu(x)
95
  x = self.pool(x) # (batch, 32, 14, 14)
96
-
97
  # Conv block 2: Conv -> ReLU -> Pool
98
  x = self.conv2(x) # (batch, 64, 14, 14)
99
  x = F.relu(x)
100
  x = self.pool(x) # (batch, 64, 7, 7)
101
-
102
  # Flatten
103
  x = x.view(-1, 64 * 7 * 7) # (batch, 3136)
104
-
105
  # Fully connected layers
106
  x = self.fc1(x) # (batch, 128)
107
  x = F.relu(x)
108
  x = self.dropout(x)
109
  x = self.fc2(x) # (batch, 10)
110
-
111
  return x
112
 
113
 
114
  class ImprovedCNN(nn.Module):
115
  """
116
  Enhanced CNN with batch normalization and deeper architecture.
117
-
118
  Architecture:
119
  Conv1: 1 -> 32, BatchNorm, ReLU, MaxPool
120
  Conv2: 32 -> 64, BatchNorm, ReLU, MaxPool
@@ -122,49 +122,49 @@ class ImprovedCNN(nn.Module):
122
  Flatten
123
  FC1: 128*3*3 -> 256, BatchNorm, ReLU, Dropout(0.5)
124
  FC2: 256 -> 10
125
-
126
  Expected Performance:
127
  - Parameters: ~200k
128
  - Test accuracy: 99%+
129
  - Converges faster than baseline
130
  """
131
-
132
  def __init__(self, dropout_rate: float = 0.5):
133
  """
134
  Initialize improved CNN.
135
-
136
  Args:
137
  dropout_rate: Dropout probability (default 0.5)
138
  """
139
  super(ImprovedCNN, self).__init__()
140
-
141
  # Convolutional layers with batch normalization
142
  self.conv1 = nn.Conv2d(1, 32, kernel_size=3, padding=1)
143
  self.bn1 = nn.BatchNorm2d(32)
144
-
145
  self.conv2 = nn.Conv2d(32, 64, kernel_size=3, padding=1)
146
  self.bn2 = nn.BatchNorm2d(64)
147
-
148
  self.conv3 = nn.Conv2d(64, 128, kernel_size=3, padding=1)
149
  self.bn3 = nn.BatchNorm2d(128)
150
-
151
  self.pool = nn.MaxPool2d(kernel_size=2, stride=2)
152
-
153
  # Fully connected layers
154
  # After three pooling layers: 28->14->7->3, so 128*3*3 = 1152
155
  self.fc1 = nn.Linear(128 * 3 * 3, 256)
156
  self.bn_fc = nn.BatchNorm1d(256)
157
  self.fc2 = nn.Linear(256, 10)
158
-
159
  self.dropout = nn.Dropout(p=dropout_rate)
160
-
161
  def forward(self, x: torch.Tensor) -> torch.Tensor:
162
  """
163
  Forward pass.
164
-
165
  Args:
166
  x: Input tensor of shape (batch, 1, 28, 28)
167
-
168
  Returns:
169
  Output logits of shape (batch, 10)
170
  """
@@ -173,39 +173,39 @@ class ImprovedCNN(nn.Module):
173
  x = self.bn1(x)
174
  x = F.relu(x)
175
  x = self.pool(x) # (batch, 32, 14, 14)
176
-
177
  # Conv block 2
178
  x = self.conv2(x)
179
  x = self.bn2(x)
180
  x = F.relu(x)
181
  x = self.pool(x) # (batch, 64, 7, 7)
182
-
183
  # Conv block 3
184
  x = self.conv3(x)
185
  x = self.bn3(x)
186
  x = F.relu(x)
187
  x = self.pool(x) # (batch, 128, 3, 3)
188
-
189
  # Flatten
190
  x = x.view(-1, 128 * 3 * 3)
191
-
192
  # Fully connected layers
193
  x = self.fc1(x)
194
  x = self.bn_fc(x)
195
  x = F.relu(x)
196
  x = self.dropout(x)
197
  x = self.fc2(x)
198
-
199
  return x
200
 
201
 
202
  def count_parameters(model: nn.Module) -> Tuple[int, int]:
203
  """
204
  Count total and trainable parameters in model.
205
-
206
  Args:
207
  model: PyTorch model
208
-
209
  Returns:
210
  Tuple of (total_params, trainable_params)
211
  """
@@ -214,19 +214,21 @@ def count_parameters(model: nn.Module) -> Tuple[int, int]:
214
  return total_params, trainable_params
215
 
216
 
217
- def get_model_summary(model: nn.Module, input_size: Tuple[int, ...] = (1, 1, 28, 28)) -> str:
 
 
218
  """
219
  Generate model architecture summary.
220
-
221
  Args:
222
  model: PyTorch model
223
  input_size: Input tensor size (batch, channels, height, width)
224
-
225
  Returns:
226
  Formatted string with model summary
227
  """
228
  total_params, trainable_params = count_parameters(model)
229
-
230
  summary = []
231
  summary.append("=" * 60)
232
  summary.append(f"Model: {model.__class__.__name__}")
@@ -234,47 +236,49 @@ def get_model_summary(model: nn.Module, input_size: Tuple[int, ...] = (1, 1, 28,
234
  summary.append(f"Input size: {input_size}")
235
  summary.append(f"Total parameters: {total_params:,}")
236
  summary.append(f"Trainable parameters: {trainable_params:,}")
237
- summary.append(f"Model size (MB): {total_params * 4 / (1024**2):.2f}") # Assuming float32
 
 
238
  summary.append("=" * 60)
239
-
240
  return "\n".join(summary)
241
 
242
 
243
  def test_model(model: nn.Module, device: str = 'cpu') -> bool:
244
  """
245
  Test model with dummy input.
246
-
247
  Args:
248
  model: PyTorch model
249
  device: Device to run on ('cpu' or 'cuda')
250
-
251
  Returns:
252
  True if test passes, False otherwise
253
  """
254
  try:
255
  model = model.to(device)
256
  model.eval()
257
-
258
  # Create dummy input
259
  dummy_input = torch.randn(4, 1, 28, 28).to(device)
260
-
261
  # Forward pass
262
  with torch.no_grad():
263
  output = model(dummy_input)
264
-
265
  # Check output shape
266
  assert output.shape == (4, 10), f"Expected shape (4, 10), got {output.shape}"
267
-
268
  # Check output is finite
269
  assert torch.isfinite(output).all(), "Output contains NaN or Inf"
270
-
271
  print("✓ Model test passed")
272
  print(f" Input shape: {dummy_input.shape}")
273
  print(f" Output shape: {output.shape}")
274
  print(f" Output range: [{output.min():.4f}, {output.max():.4f}]")
275
-
276
  return True
277
-
278
  except Exception as e:
279
  print(f"✗ Model test failed: {e}")
280
  return False
@@ -284,23 +288,23 @@ if __name__ == "__main__":
284
  """Test model instantiation and forward pass."""
285
  print("Testing BaselineCNN:")
286
  print()
287
-
288
  # Create model
289
  model = BaselineCNN()
290
  print(get_model_summary(model))
291
  print()
292
-
293
  # Test forward pass
294
  test_model(model)
295
  print()
296
-
297
  # Test improved model
298
  print("=" * 60)
299
  print("Testing ImprovedCNN:")
300
  print()
301
-
302
  model_improved = ImprovedCNN()
303
  print(get_model_summary(model_improved))
304
  print()
305
-
306
  test_model(model_improved)
 
8
 
9
  Usage:
10
  from scripts.models import BaselineCNN
11
+
12
  model = BaselineCNN()
13
  output = model(images) # (batch, 10) logits
14
  """
 
22
  class BaselineCNN(nn.Module):
23
  """
24
  Baseline CNN for MNIST classification.
25
+
26
  Architecture:
27
  Input: (batch, 1, 28, 28)
28
  Conv1: 1 -> 32 channels, 3x3 kernel, padding=1
 
32
  Flatten -> (batch, 3136)
33
  FC1: 3136 -> 128, ReLU, Dropout(0.5)
34
  FC2: 128 -> 10 (output logits)
35
+
36
  Design Rationale:
37
  - 2 conv layers: Balance between simplicity and capacity
38
  - 32->64 filters: Standard progression, proven effective
39
  - Dropout 0.5: Prevent overfitting on small dataset
40
  - No batch norm: Keep baseline simple
41
+
42
  Expected Performance:
43
  - Parameters: ~110k
44
  - Test accuracy: 98-99%
45
  - Training time: ~5-10 min on GPU
46
  """
47
+
48
  def __init__(self, dropout_rate: float = 0.5):
49
  """
50
  Initialize baseline CNN.
51
+
52
  Args:
53
  dropout_rate: Dropout probability (default 0.5)
54
  """
55
  super(BaselineCNN, self).__init__()
56
+
57
  # Convolutional layers
58
  self.conv1 = nn.Conv2d(
59
  in_channels=1,
 
67
  kernel_size=3,
68
  padding=1
69
  )
70
+
71
  # Pooling layer (shared)
72
  self.pool = nn.MaxPool2d(kernel_size=2, stride=2)
73
+
74
  # Fully connected layers
75
  # After two pooling layers: 28->14->7, so 64*7*7 = 3136
76
  self.fc1 = nn.Linear(64 * 7 * 7, 128)
77
  self.fc2 = nn.Linear(128, 10)
78
+
79
  # Dropout for regularization
80
  self.dropout = nn.Dropout(p=dropout_rate)
81
+
82
  def forward(self, x: torch.Tensor) -> torch.Tensor:
83
  """
84
  Forward pass.
85
+
86
  Args:
87
  x: Input tensor of shape (batch, 1, 28, 28)
88
+
89
  Returns:
90
  Output logits of shape (batch, 10)
91
  """
 
93
  x = self.conv1(x) # (batch, 32, 28, 28)
94
  x = F.relu(x)
95
  x = self.pool(x) # (batch, 32, 14, 14)
96
+
97
  # Conv block 2: Conv -> ReLU -> Pool
98
  x = self.conv2(x) # (batch, 64, 14, 14)
99
  x = F.relu(x)
100
  x = self.pool(x) # (batch, 64, 7, 7)
101
+
102
  # Flatten
103
  x = x.view(-1, 64 * 7 * 7) # (batch, 3136)
104
+
105
  # Fully connected layers
106
  x = self.fc1(x) # (batch, 128)
107
  x = F.relu(x)
108
  x = self.dropout(x)
109
  x = self.fc2(x) # (batch, 10)
110
+
111
  return x
112
 
113
 
114
  class ImprovedCNN(nn.Module):
115
  """
116
  Enhanced CNN with batch normalization and deeper architecture.
117
+
118
  Architecture:
119
  Conv1: 1 -> 32, BatchNorm, ReLU, MaxPool
120
  Conv2: 32 -> 64, BatchNorm, ReLU, MaxPool
 
122
  Flatten
123
  FC1: 128*3*3 -> 256, BatchNorm, ReLU, Dropout(0.5)
124
  FC2: 256 -> 10
125
+
126
  Expected Performance:
127
  - Parameters: ~200k
128
  - Test accuracy: 99%+
129
  - Converges faster than baseline
130
  """
131
+
132
  def __init__(self, dropout_rate: float = 0.5):
133
  """
134
  Initialize improved CNN.
135
+
136
  Args:
137
  dropout_rate: Dropout probability (default 0.5)
138
  """
139
  super(ImprovedCNN, self).__init__()
140
+
141
  # Convolutional layers with batch normalization
142
  self.conv1 = nn.Conv2d(1, 32, kernel_size=3, padding=1)
143
  self.bn1 = nn.BatchNorm2d(32)
144
+
145
  self.conv2 = nn.Conv2d(32, 64, kernel_size=3, padding=1)
146
  self.bn2 = nn.BatchNorm2d(64)
147
+
148
  self.conv3 = nn.Conv2d(64, 128, kernel_size=3, padding=1)
149
  self.bn3 = nn.BatchNorm2d(128)
150
+
151
  self.pool = nn.MaxPool2d(kernel_size=2, stride=2)
152
+
153
  # Fully connected layers
154
  # After three pooling layers: 28->14->7->3, so 128*3*3 = 1152
155
  self.fc1 = nn.Linear(128 * 3 * 3, 256)
156
  self.bn_fc = nn.BatchNorm1d(256)
157
  self.fc2 = nn.Linear(256, 10)
158
+
159
  self.dropout = nn.Dropout(p=dropout_rate)
160
+
161
  def forward(self, x: torch.Tensor) -> torch.Tensor:
162
  """
163
  Forward pass.
164
+
165
  Args:
166
  x: Input tensor of shape (batch, 1, 28, 28)
167
+
168
  Returns:
169
  Output logits of shape (batch, 10)
170
  """
 
173
  x = self.bn1(x)
174
  x = F.relu(x)
175
  x = self.pool(x) # (batch, 32, 14, 14)
176
+
177
  # Conv block 2
178
  x = self.conv2(x)
179
  x = self.bn2(x)
180
  x = F.relu(x)
181
  x = self.pool(x) # (batch, 64, 7, 7)
182
+
183
  # Conv block 3
184
  x = self.conv3(x)
185
  x = self.bn3(x)
186
  x = F.relu(x)
187
  x = self.pool(x) # (batch, 128, 3, 3)
188
+
189
  # Flatten
190
  x = x.view(-1, 128 * 3 * 3)
191
+
192
  # Fully connected layers
193
  x = self.fc1(x)
194
  x = self.bn_fc(x)
195
  x = F.relu(x)
196
  x = self.dropout(x)
197
  x = self.fc2(x)
198
+
199
  return x
200
 
201
 
202
  def count_parameters(model: nn.Module) -> Tuple[int, int]:
203
  """
204
  Count total and trainable parameters in model.
205
+
206
  Args:
207
  model: PyTorch model
208
+
209
  Returns:
210
  Tuple of (total_params, trainable_params)
211
  """
 
214
  return total_params, trainable_params
215
 
216
 
217
+ def get_model_summary(
218
+ model: nn.Module, input_size: Tuple[int, ...] = (1, 1, 28, 28)
219
+ ) -> str:
220
  """
221
  Generate model architecture summary.
222
+
223
  Args:
224
  model: PyTorch model
225
  input_size: Input tensor size (batch, channels, height, width)
226
+
227
  Returns:
228
  Formatted string with model summary
229
  """
230
  total_params, trainable_params = count_parameters(model)
231
+
232
  summary = []
233
  summary.append("=" * 60)
234
  summary.append(f"Model: {model.__class__.__name__}")
 
236
  summary.append(f"Input size: {input_size}")
237
  summary.append(f"Total parameters: {total_params:,}")
238
  summary.append(f"Trainable parameters: {trainable_params:,}")
239
+ # Assuming float32
240
+ model_size_mb = total_params * 4 / (1024**2)
241
+ summary.append(f"Model size (MB): {model_size_mb:.2f}")
242
  summary.append("=" * 60)
243
+
244
  return "\n".join(summary)
245
 
246
 
247
  def test_model(model: nn.Module, device: str = 'cpu') -> bool:
248
  """
249
  Test model with dummy input.
250
+
251
  Args:
252
  model: PyTorch model
253
  device: Device to run on ('cpu' or 'cuda')
254
+
255
  Returns:
256
  True if test passes, False otherwise
257
  """
258
  try:
259
  model = model.to(device)
260
  model.eval()
261
+
262
  # Create dummy input
263
  dummy_input = torch.randn(4, 1, 28, 28).to(device)
264
+
265
  # Forward pass
266
  with torch.no_grad():
267
  output = model(dummy_input)
268
+
269
  # Check output shape
270
  assert output.shape == (4, 10), f"Expected shape (4, 10), got {output.shape}"
271
+
272
  # Check output is finite
273
  assert torch.isfinite(output).all(), "Output contains NaN or Inf"
274
+
275
  print("✓ Model test passed")
276
  print(f" Input shape: {dummy_input.shape}")
277
  print(f" Output shape: {output.shape}")
278
  print(f" Output range: [{output.min():.4f}, {output.max():.4f}]")
279
+
280
  return True
281
+
282
  except Exception as e:
283
  print(f"✗ Model test failed: {e}")
284
  return False
 
288
  """Test model instantiation and forward pass."""
289
  print("Testing BaselineCNN:")
290
  print()
291
+
292
  # Create model
293
  model = BaselineCNN()
294
  print(get_model_summary(model))
295
  print()
296
+
297
  # Test forward pass
298
  test_model(model)
299
  print()
300
+
301
  # Test improved model
302
  print("=" * 60)
303
  print("Testing ImprovedCNN:")
304
  print()
305
+
306
  model_improved = ImprovedCNN()
307
  print(get_model_summary(model_improved))
308
  print()
309
+
310
  test_model(model_improved)
scripts/preprocessing.py CHANGED
@@ -9,7 +9,7 @@ This module provides PyTorch Dataset and DataLoader setup for MNIST:
9
 
10
  Usage:
11
  from scripts.preprocessing import MnistDataset, create_dataloaders
12
-
13
  train_dataset = MnistDataset(x_train, y_train, transform=None)
14
  train_loader, val_loader = create_dataloaders(
15
  train_dataset, val_dataset, batch_size=64
@@ -26,10 +26,10 @@ from torch.utils.data import Dataset, DataLoader
26
  class MnistDataset(Dataset):
27
  """
28
  PyTorch Dataset for MNIST images.
29
-
30
  Handles normalization and conversion to tensors suitable for CNN training.
31
  """
32
-
33
  def __init__(
34
  self,
35
  images: List[NDArray[np.uint8]],
@@ -38,7 +38,7 @@ class MnistDataset(Dataset):
38
  ):
39
  """
40
  Initialize MNIST dataset.
41
-
42
  Args:
43
  images: List of 28x28 numpy arrays with pixel values [0, 255]
44
  labels: List of integer labels (0-9)
@@ -47,22 +47,22 @@ class MnistDataset(Dataset):
47
  self.images = images
48
  self.labels = labels
49
  self.transform = transform
50
-
51
  # Validate inputs
52
  assert len(images) == len(labels), \
53
  f"Mismatch: {len(images)} images but {len(labels)} labels"
54
-
55
  def __len__(self) -> int:
56
  """Return number of samples in dataset."""
57
  return len(self.images)
58
-
59
  def __getitem__(self, idx: int) -> Tuple[torch.Tensor, torch.Tensor]:
60
  """
61
  Get a single sample.
62
-
63
  Args:
64
  idx: Index of sample to retrieve
65
-
66
  Returns:
67
  Tuple of (image_tensor, label_tensor)
68
  - image_tensor: Shape (1, 28, 28), dtype float32, range [0, 1]
@@ -71,18 +71,18 @@ class MnistDataset(Dataset):
71
  # Get image and label
72
  image = np.array(self.images[idx])
73
  label = self.labels[idx]
74
-
75
  # Normalize to [0, 1]
76
  image = image.astype(np.float32) / 255.0
77
-
78
  # Convert to tensor and add channel dimension: (28, 28) -> (1, 28, 28)
79
  image = torch.tensor(image, dtype=torch.float32).unsqueeze(0)
80
  label = torch.tensor(label, dtype=torch.long)
81
-
82
  # Apply transforms if provided (e.g., augmentation)
83
  if self.transform:
84
  image = self.transform(image)
85
-
86
  return image, label
87
 
88
 
@@ -95,14 +95,14 @@ def create_dataloaders(
95
  ) -> Tuple[DataLoader, DataLoader]:
96
  """
97
  Create DataLoader instances for training and validation.
98
-
99
  Args:
100
  train_dataset: Training dataset
101
  val_dataset: Validation dataset
102
  batch_size: Number of samples per batch
103
  num_workers: Number of worker processes for data loading
104
  shuffle_train: Whether to shuffle training data
105
-
106
  Returns:
107
  Tuple of (train_loader, val_loader)
108
  """
@@ -113,7 +113,7 @@ def create_dataloaders(
113
  num_workers=num_workers,
114
  pin_memory=True # Faster GPU transfer
115
  )
116
-
117
  val_loader = DataLoader(
118
  val_dataset,
119
  batch_size=batch_size,
@@ -121,7 +121,7 @@ def create_dataloaders(
121
  num_workers=num_workers,
122
  pin_memory=True
123
  )
124
-
125
  return train_loader, val_loader
126
 
127
 
@@ -132,12 +132,12 @@ def create_test_dataloader(
132
  ) -> DataLoader:
133
  """
134
  Create DataLoader for test set.
135
-
136
  Args:
137
  test_dataset: Test dataset
138
  batch_size: Number of samples per batch
139
  num_workers: Number of worker processes for data loading
140
-
141
  Returns:
142
  Test DataLoader
143
  """
@@ -148,7 +148,7 @@ def create_test_dataloader(
148
  num_workers=num_workers,
149
  pin_memory=True
150
  )
151
-
152
  return test_loader
153
 
154
 
@@ -163,70 +163,70 @@ def split_train_val(
163
  ]:
164
  """
165
  Split training data into train and validation sets.
166
-
167
  Uses stratified sampling to maintain class balance.
168
-
169
  Args:
170
  images: List of training images
171
  labels: List of training labels
172
  val_split: Fraction of data to use for validation (0.15 = 15%)
173
  random_seed: Random seed for reproducibility
174
-
175
  Returns:
176
  Tuple of ((train_images, train_labels), (val_images, val_labels))
177
  """
178
  from collections import defaultdict
179
-
180
  # Group indices by class for stratified split
181
  class_indices = defaultdict(list)
182
  for idx, label in enumerate(labels):
183
  class_indices[label].append(idx)
184
-
185
  # Set random seed
186
  np.random.seed(random_seed)
187
-
188
  train_indices = []
189
  val_indices = []
190
-
191
  # Split each class separately
192
  for class_label, indices in class_indices.items():
193
  indices = np.array(indices)
194
  np.random.shuffle(indices)
195
-
196
  split_point = int(len(indices) * (1 - val_split))
197
  train_indices.extend(indices[:split_point])
198
  val_indices.extend(indices[split_point:])
199
-
200
  # Shuffle combined indices
201
  np.random.shuffle(train_indices)
202
  np.random.shuffle(val_indices)
203
-
204
  # Extract images and labels
205
  train_images = [images[i] for i in train_indices]
206
  train_labels = [labels[i] for i in train_indices]
207
  val_images = [images[i] for i in val_indices]
208
  val_labels = [labels[i] for i in val_indices]
209
-
210
  return (train_images, train_labels), (val_images, val_labels)
211
 
212
 
213
  def get_dataset_statistics(dataset: MnistDataset) -> dict:
214
  """
215
  Compute statistics for a dataset (useful for debugging).
216
-
217
  Args:
218
  dataset: MnistDataset instance
219
-
220
  Returns:
221
  Dictionary with statistics
222
  """
223
  # Sample first image to check preprocessing
224
  sample_img, sample_label = dataset[0]
225
-
226
  # Count labels
227
  from collections import Counter
228
  label_counts = Counter([dataset[i][1].item() for i in range(len(dataset))])
229
-
230
  return {
231
  'num_samples': len(dataset),
232
  'sample_image_shape': tuple(sample_img.shape),
 
9
 
10
  Usage:
11
  from scripts.preprocessing import MnistDataset, create_dataloaders
12
+
13
  train_dataset = MnistDataset(x_train, y_train, transform=None)
14
  train_loader, val_loader = create_dataloaders(
15
  train_dataset, val_dataset, batch_size=64
 
26
  class MnistDataset(Dataset):
27
  """
28
  PyTorch Dataset for MNIST images.
29
+
30
  Handles normalization and conversion to tensors suitable for CNN training.
31
  """
32
+
33
  def __init__(
34
  self,
35
  images: List[NDArray[np.uint8]],
 
38
  ):
39
  """
40
  Initialize MNIST dataset.
41
+
42
  Args:
43
  images: List of 28x28 numpy arrays with pixel values [0, 255]
44
  labels: List of integer labels (0-9)
 
47
  self.images = images
48
  self.labels = labels
49
  self.transform = transform
50
+
51
  # Validate inputs
52
  assert len(images) == len(labels), \
53
  f"Mismatch: {len(images)} images but {len(labels)} labels"
54
+
55
  def __len__(self) -> int:
56
  """Return number of samples in dataset."""
57
  return len(self.images)
58
+
59
  def __getitem__(self, idx: int) -> Tuple[torch.Tensor, torch.Tensor]:
60
  """
61
  Get a single sample.
62
+
63
  Args:
64
  idx: Index of sample to retrieve
65
+
66
  Returns:
67
  Tuple of (image_tensor, label_tensor)
68
  - image_tensor: Shape (1, 28, 28), dtype float32, range [0, 1]
 
71
  # Get image and label
72
  image = np.array(self.images[idx])
73
  label = self.labels[idx]
74
+
75
  # Normalize to [0, 1]
76
  image = image.astype(np.float32) / 255.0
77
+
78
  # Convert to tensor and add channel dimension: (28, 28) -> (1, 28, 28)
79
  image = torch.tensor(image, dtype=torch.float32).unsqueeze(0)
80
  label = torch.tensor(label, dtype=torch.long)
81
+
82
  # Apply transforms if provided (e.g., augmentation)
83
  if self.transform:
84
  image = self.transform(image)
85
+
86
  return image, label
87
 
88
 
 
95
  ) -> Tuple[DataLoader, DataLoader]:
96
  """
97
  Create DataLoader instances for training and validation.
98
+
99
  Args:
100
  train_dataset: Training dataset
101
  val_dataset: Validation dataset
102
  batch_size: Number of samples per batch
103
  num_workers: Number of worker processes for data loading
104
  shuffle_train: Whether to shuffle training data
105
+
106
  Returns:
107
  Tuple of (train_loader, val_loader)
108
  """
 
113
  num_workers=num_workers,
114
  pin_memory=True # Faster GPU transfer
115
  )
116
+
117
  val_loader = DataLoader(
118
  val_dataset,
119
  batch_size=batch_size,
 
121
  num_workers=num_workers,
122
  pin_memory=True
123
  )
124
+
125
  return train_loader, val_loader
126
 
127
 
 
132
  ) -> DataLoader:
133
  """
134
  Create DataLoader for test set.
135
+
136
  Args:
137
  test_dataset: Test dataset
138
  batch_size: Number of samples per batch
139
  num_workers: Number of worker processes for data loading
140
+
141
  Returns:
142
  Test DataLoader
143
  """
 
148
  num_workers=num_workers,
149
  pin_memory=True
150
  )
151
+
152
  return test_loader
153
 
154
 
 
163
  ]:
164
  """
165
  Split training data into train and validation sets.
166
+
167
  Uses stratified sampling to maintain class balance.
168
+
169
  Args:
170
  images: List of training images
171
  labels: List of training labels
172
  val_split: Fraction of data to use for validation (0.15 = 15%)
173
  random_seed: Random seed for reproducibility
174
+
175
  Returns:
176
  Tuple of ((train_images, train_labels), (val_images, val_labels))
177
  """
178
  from collections import defaultdict
179
+
180
  # Group indices by class for stratified split
181
  class_indices = defaultdict(list)
182
  for idx, label in enumerate(labels):
183
  class_indices[label].append(idx)
184
+
185
  # Set random seed
186
  np.random.seed(random_seed)
187
+
188
  train_indices = []
189
  val_indices = []
190
+
191
  # Split each class separately
192
  for class_label, indices in class_indices.items():
193
  indices = np.array(indices)
194
  np.random.shuffle(indices)
195
+
196
  split_point = int(len(indices) * (1 - val_split))
197
  train_indices.extend(indices[:split_point])
198
  val_indices.extend(indices[split_point:])
199
+
200
  # Shuffle combined indices
201
  np.random.shuffle(train_indices)
202
  np.random.shuffle(val_indices)
203
+
204
  # Extract images and labels
205
  train_images = [images[i] for i in train_indices]
206
  train_labels = [labels[i] for i in train_indices]
207
  val_images = [images[i] for i in val_indices]
208
  val_labels = [labels[i] for i in val_indices]
209
+
210
  return (train_images, train_labels), (val_images, val_labels)
211
 
212
 
213
  def get_dataset_statistics(dataset: MnistDataset) -> dict:
214
  """
215
  Compute statistics for a dataset (useful for debugging).
216
+
217
  Args:
218
  dataset: MnistDataset instance
219
+
220
  Returns:
221
  Dictionary with statistics
222
  """
223
  # Sample first image to check preprocessing
224
  sample_img, sample_label = dataset[0]
225
+
226
  # Count labels
227
  from collections import Counter
228
  label_counts = Counter([dataset[i][1].item() for i in range(len(dataset))])
229
+
230
  return {
231
  'num_samples': len(dataset),
232
  'sample_image_shape': tuple(sample_img.shape),
scripts/test_data_loader.py CHANGED
@@ -19,24 +19,24 @@ def test_data_loader():
19
  """Test MNIST data loader with actual files."""
20
  print("Testing MNIST Data Loader...")
21
  print("-" * 50)
22
-
23
  # Note: Files need to be uncompressed first
24
  # If .gz files exist, uncompress with: gunzip data/raw/*.gz
25
-
26
  base_path = Path(__file__).parent.parent / "data" / "raw"
27
-
28
  # Try to find uncompressed files
29
  train_images = base_path / "train-images.idx3-ubyte"
30
  train_labels = base_path / "train-labels.idx1-ubyte"
31
  test_images = base_path / "t10k-images.idx3-ubyte"
32
  test_labels = base_path / "t10k-labels.idx1-ubyte"
33
-
34
  # Check if files exist
35
  missing_files = []
36
  for filepath in [train_images, train_labels, test_images, test_labels]:
37
  if not filepath.exists():
38
  missing_files.append(str(filepath))
39
-
40
  if missing_files:
41
  print("⚠️ Missing uncompressed data files:")
42
  for f in missing_files:
@@ -44,7 +44,7 @@ def test_data_loader():
44
  print("\nTo uncompress .gz files, run:")
45
  print(" cd data/raw && gunzip *.gz")
46
  return False
47
-
48
  try:
49
  # Initialize loader
50
  loader = MnistDataloader(
@@ -54,37 +54,37 @@ def test_data_loader():
54
  str(test_labels)
55
  )
56
  print("✓ Loader initialized successfully")
57
-
58
  # Load data
59
  print("\nLoading MNIST dataset...")
60
  (x_train, y_train), (x_test, y_test) = loader.load_data()
61
-
62
  # Verify shapes
63
  print(f"\n✓ Training set: {len(x_train):,} images, {len(y_train):,} labels")
64
  print(f"✓ Test set: {len(x_test):,} images, {len(y_test):,} labels")
65
-
66
  # Convert first image to numpy array to check
67
  import numpy as np
68
  first_img = np.array(x_train[0])
69
  print(f"\n✓ Image shape: {first_img.shape}")
70
  print(f"✓ Image dtype: {first_img.dtype}")
71
  print(f"✓ Label type: {type(y_train[0])}")
72
-
73
  # Verify label range
74
  unique_labels = set(y_train + y_test)
75
  print(f"\n✓ Unique labels: {sorted(unique_labels)}")
76
-
77
  # Verify pixel value range (convert to numpy for analysis)
78
  sample_images = [np.array(img) for img in x_train[:100]]
79
  max_val = max(img.max() for img in sample_images)
80
  min_val = min(img.min() for img in sample_images)
81
  print(f"✓ Pixel value range (sample): [{min_val}, {max_val}]")
82
-
83
  print("\n" + "=" * 50)
84
  print("✅ All tests passed!")
85
  print("=" * 50)
86
  return True
87
-
88
  except Exception as e:
89
  print(f"\n❌ Error: {e}")
90
  import traceback
 
19
  """Test MNIST data loader with actual files."""
20
  print("Testing MNIST Data Loader...")
21
  print("-" * 50)
22
+
23
  # Note: Files need to be uncompressed first
24
  # If .gz files exist, uncompress with: gunzip data/raw/*.gz
25
+
26
  base_path = Path(__file__).parent.parent / "data" / "raw"
27
+
28
  # Try to find uncompressed files
29
  train_images = base_path / "train-images.idx3-ubyte"
30
  train_labels = base_path / "train-labels.idx1-ubyte"
31
  test_images = base_path / "t10k-images.idx3-ubyte"
32
  test_labels = base_path / "t10k-labels.idx1-ubyte"
33
+
34
  # Check if files exist
35
  missing_files = []
36
  for filepath in [train_images, train_labels, test_images, test_labels]:
37
  if not filepath.exists():
38
  missing_files.append(str(filepath))
39
+
40
  if missing_files:
41
  print("⚠️ Missing uncompressed data files:")
42
  for f in missing_files:
 
44
  print("\nTo uncompress .gz files, run:")
45
  print(" cd data/raw && gunzip *.gz")
46
  return False
47
+
48
  try:
49
  # Initialize loader
50
  loader = MnistDataloader(
 
54
  str(test_labels)
55
  )
56
  print("✓ Loader initialized successfully")
57
+
58
  # Load data
59
  print("\nLoading MNIST dataset...")
60
  (x_train, y_train), (x_test, y_test) = loader.load_data()
61
+
62
  # Verify shapes
63
  print(f"\n✓ Training set: {len(x_train):,} images, {len(y_train):,} labels")
64
  print(f"✓ Test set: {len(x_test):,} images, {len(y_test):,} labels")
65
+
66
  # Convert first image to numpy array to check
67
  import numpy as np
68
  first_img = np.array(x_train[0])
69
  print(f"\n✓ Image shape: {first_img.shape}")
70
  print(f"✓ Image dtype: {first_img.dtype}")
71
  print(f"✓ Label type: {type(y_train[0])}")
72
+
73
  # Verify label range
74
  unique_labels = set(y_train + y_test)
75
  print(f"\n✓ Unique labels: {sorted(unique_labels)}")
76
+
77
  # Verify pixel value range (convert to numpy for analysis)
78
  sample_images = [np.array(img) for img in x_train[:100]]
79
  max_val = max(img.max() for img in sample_images)
80
  min_val = min(img.min() for img in sample_images)
81
  print(f"✓ Pixel value range (sample): [{min_val}, {max_val}]")
82
+
83
  print("\n" + "=" * 50)
84
  print("✅ All tests passed!")
85
  print("=" * 50)
86
  return True
87
+
88
  except Exception as e:
89
  print(f"\n❌ Error: {e}")
90
  import traceback
scripts/test_data_quality.py CHANGED
@@ -21,32 +21,32 @@ def main():
21
  """Run quality checks and save report."""
22
  print("Loading MNIST dataset...")
23
  data_path = project_root / "data" / "raw"
24
-
25
  loader = MnistDataloader(
26
  str(data_path / "train-images.idx3-ubyte"),
27
  str(data_path / "train-labels.idx1-ubyte"),
28
  str(data_path / "t10k-images.idx3-ubyte"),
29
  str(data_path / "t10k-labels.idx1-ubyte")
30
  )
31
-
32
  (x_train, y_train), (x_test, y_test) = loader.load_data()
33
  print("✓ Dataset loaded\n")
34
-
35
  # Generate quality report
36
  print("Running quality checks...")
37
  report = generate_quality_report((x_train, y_train), (x_test, y_test))
38
  print("✓ Quality checks complete\n")
39
-
40
  # Print summary
41
  print_quality_summary(report)
42
-
43
  # Save report as JSON
44
  output_path = project_root / "data" / "quality_report.json"
45
  with open(output_path, 'w') as f:
46
  json.dump(report, f, indent=2)
47
-
48
  print(f"✓ Quality report saved to: {output_path}")
49
-
50
  return 0 if report['summary']['all_checks_pass'] else 1
51
 
52
 
 
21
  """Run quality checks and save report."""
22
  print("Loading MNIST dataset...")
23
  data_path = project_root / "data" / "raw"
24
+
25
  loader = MnistDataloader(
26
  str(data_path / "train-images.idx3-ubyte"),
27
  str(data_path / "train-labels.idx1-ubyte"),
28
  str(data_path / "t10k-images.idx3-ubyte"),
29
  str(data_path / "t10k-labels.idx1-ubyte")
30
  )
31
+
32
  (x_train, y_train), (x_test, y_test) = loader.load_data()
33
  print("✓ Dataset loaded\n")
34
+
35
  # Generate quality report
36
  print("Running quality checks...")
37
  report = generate_quality_report((x_train, y_train), (x_test, y_test))
38
  print("✓ Quality checks complete\n")
39
+
40
  # Print summary
41
  print_quality_summary(report)
42
+
43
  # Save report as JSON
44
  output_path = project_root / "data" / "quality_report.json"
45
  with open(output_path, 'w') as f:
46
  json.dump(report, f, indent=2)
47
+
48
  print(f"✓ Quality report saved to: {output_path}")
49
+
50
  return 0 if report['summary']['all_checks_pass'] else 1
51
 
52
 
scripts/test_preprocessing.py CHANGED
@@ -32,7 +32,7 @@ def test_dataset():
32
  print("=" * 60)
33
  print("TEST 1: MnistDataset Initialization and Indexing")
34
  print("=" * 60)
35
-
36
  # Load data
37
  data_path = project_root / "data" / "raw"
38
  loader = MnistDataloader(
@@ -42,11 +42,11 @@ def test_dataset():
42
  str(data_path / "t10k-labels.idx1-ubyte")
43
  )
44
  (x_train, y_train), (x_test, y_test) = loader.load_data()
45
-
46
  # Create dataset (small subset for testing)
47
  dataset = MnistDataset(x_train[:1000], y_train[:1000])
48
  print(f"✓ Dataset created with {len(dataset)} samples")
49
-
50
  # Test __getitem__
51
  image, label = dataset[0]
52
  print("✓ Retrieved sample 0")
@@ -54,7 +54,7 @@ def test_dataset():
54
  print(f" Image dtype: {image.dtype}")
55
  print(f" Image range: [{image.min():.4f}, {image.max():.4f}]")
56
  print(f" Label: {label.item()} (dtype: {label.dtype})")
57
-
58
  # Verify normalization
59
  assert image.shape == (1, 28, 28), f"Wrong shape: {image.shape}"
60
  assert image.dtype == torch.float32, f"Wrong dtype: {image.dtype}"
@@ -63,7 +63,7 @@ def test_dataset():
63
  assert label.dtype == torch.long, f"Label wrong dtype: {label.dtype}"
64
  print("✓ All assertions passed")
65
  print()
66
-
67
  return dataset
68
 
69
 
@@ -72,10 +72,10 @@ def test_dataloader(dataset):
72
  print("=" * 60)
73
  print("TEST 2: DataLoader Batching")
74
  print("=" * 60)
75
-
76
  loader = torch.utils.data.DataLoader(dataset, batch_size=32, shuffle=True)
77
  print("✓ DataLoader created (batch_size=32)")
78
-
79
  # Get first batch
80
  images, labels = next(iter(loader))
81
  print("✓ Retrieved first batch")
@@ -83,7 +83,7 @@ def test_dataloader(dataset):
83
  print(f" Batch labels shape: {labels.shape}")
84
  print(f" Images dtype: {images.dtype}")
85
  print(f" Labels dtype: {labels.dtype}")
86
-
87
  # Verify batch dimensions
88
  assert images.shape == (32, 1, 28, 28), f"Wrong batch shape: {images.shape}"
89
  assert labels.shape == (32,), f"Wrong labels shape: {labels.shape}"
@@ -98,7 +98,7 @@ def test_train_val_split():
98
  print("=" * 60)
99
  print("TEST 3: Train/Validation Split")
100
  print("=" * 60)
101
-
102
  # Load data
103
  data_path = project_root / "data" / "raw"
104
  loader = MnistDataloader(
@@ -108,36 +108,38 @@ def test_train_val_split():
108
  str(data_path / "t10k-labels.idx1-ubyte")
109
  )
110
  (x_train, y_train), _ = loader.load_data()
111
-
112
  # Split
113
  (x_train_split, y_train_split), (x_val, y_val) = split_train_val(
114
  x_train, y_train, val_split=0.15, random_seed=42
115
  )
116
-
117
  print("✓ Split completed")
118
  print(f" Original training: {len(x_train):,} samples")
119
- print(f" New training: {len(x_train_split):,} samples ({len(x_train_split)/len(x_train)*100:.1f}%)")
120
- print(f" Validation: {len(x_val):,} samples ({len(x_val)/len(x_train)*100:.1f}%)")
121
-
 
 
122
  # Verify split ratio
123
  expected_val_size = int(len(x_train) * 0.15)
124
  assert abs(len(x_val) - expected_val_size) < 100, "Split ratio incorrect"
125
  assert len(x_train_split) + len(x_val) == len(x_train), "Data loss during split"
126
  print("✓ Split ratio correct")
127
-
128
  # Check stratification (class balance)
129
  from collections import Counter
130
  train_counts = Counter(y_train_split)
131
  val_counts = Counter(y_val)
132
-
133
  print("\n Class distribution in training set:")
134
  for digit in range(10):
135
  print(f" Digit {digit}: {train_counts[digit]:>5,} samples")
136
-
137
  print("\n Class distribution in validation set:")
138
  for digit in range(10):
139
  print(f" Digit {digit}: {val_counts[digit]:>4,} samples")
140
-
141
  # Verify each class is present in both sets
142
  assert all(train_counts[i] > 0 for i in range(10)), "Missing class in train"
143
  assert all(val_counts[i] > 0 for i in range(10)), "Missing class in validation"
@@ -150,7 +152,7 @@ def test_full_pipeline():
150
  print("=" * 60)
151
  print("TEST 4: Full Pipeline")
152
  print("=" * 60)
153
-
154
  # Load data
155
  data_path = project_root / "data" / "raw"
156
  loader = MnistDataloader(
@@ -161,19 +163,19 @@ def test_full_pipeline():
161
  )
162
  (x_train, y_train), (x_test, y_test) = loader.load_data()
163
  print("✓ Data loaded")
164
-
165
  # Split train/val
166
  (x_train_split, y_train_split), (x_val, y_val) = split_train_val(
167
  x_train, y_train, val_split=0.15
168
  )
169
  print("✓ Train/val split completed")
170
-
171
  # Create datasets
172
  train_dataset = MnistDataset(x_train_split, y_train_split)
173
  val_dataset = MnistDataset(x_val, y_val)
174
  test_dataset = MnistDataset(x_test, y_test)
175
  print("✓ Datasets created")
176
-
177
  # Get statistics
178
  train_stats = get_dataset_statistics(train_dataset)
179
  print("\n Training dataset statistics:")
@@ -182,7 +184,7 @@ def test_full_pipeline():
182
  print(f" Image dtype: {train_stats['sample_image_dtype']}")
183
  print(f" Image range: {train_stats['sample_image_range']}")
184
  print(f" Label dtype: {train_stats['sample_label_dtype']}")
185
-
186
  # Create dataloaders
187
  train_loader, val_loader = create_dataloaders(
188
  train_dataset, val_dataset, batch_size=64, num_workers=0
@@ -192,12 +194,12 @@ def test_full_pipeline():
192
  print(f" Training batches: {len(train_loader)}")
193
  print(f" Validation batches: {len(val_loader)}")
194
  print(f" Test batches: {len(test_loader)}")
195
-
196
  # Test iteration
197
  train_batch = next(iter(train_loader))
198
  val_batch = next(iter(val_loader))
199
  test_batch = next(iter(test_loader))
200
-
201
  print("\n✓ Successfully iterated through all loaders")
202
  print(f" Train batch shapes: {train_batch[0].shape}, {train_batch[1].shape}")
203
  print(f" Val batch shapes: {val_batch[0].shape}, {val_batch[1].shape}")
@@ -209,19 +211,19 @@ def main():
209
  """Run all tests."""
210
  print("\nTesting MNIST Preprocessing Pipeline")
211
  print()
212
-
213
  try:
214
  dataset = test_dataset()
215
  test_dataloader(dataset)
216
  test_train_val_split()
217
  test_full_pipeline()
218
-
219
  print("=" * 60)
220
  print("✅ ALL TESTS PASSED")
221
  print("=" * 60)
222
  print("\nPreprocessing pipeline is ready for model training!")
223
  return 0
224
-
225
  except Exception as e:
226
  print(f"\n❌ TEST FAILED: {e}")
227
  import traceback
 
32
  print("=" * 60)
33
  print("TEST 1: MnistDataset Initialization and Indexing")
34
  print("=" * 60)
35
+
36
  # Load data
37
  data_path = project_root / "data" / "raw"
38
  loader = MnistDataloader(
 
42
  str(data_path / "t10k-labels.idx1-ubyte")
43
  )
44
  (x_train, y_train), (x_test, y_test) = loader.load_data()
45
+
46
  # Create dataset (small subset for testing)
47
  dataset = MnistDataset(x_train[:1000], y_train[:1000])
48
  print(f"✓ Dataset created with {len(dataset)} samples")
49
+
50
  # Test __getitem__
51
  image, label = dataset[0]
52
  print("✓ Retrieved sample 0")
 
54
  print(f" Image dtype: {image.dtype}")
55
  print(f" Image range: [{image.min():.4f}, {image.max():.4f}]")
56
  print(f" Label: {label.item()} (dtype: {label.dtype})")
57
+
58
  # Verify normalization
59
  assert image.shape == (1, 28, 28), f"Wrong shape: {image.shape}"
60
  assert image.dtype == torch.float32, f"Wrong dtype: {image.dtype}"
 
63
  assert label.dtype == torch.long, f"Label wrong dtype: {label.dtype}"
64
  print("✓ All assertions passed")
65
  print()
66
+
67
  return dataset
68
 
69
 
 
72
  print("=" * 60)
73
  print("TEST 2: DataLoader Batching")
74
  print("=" * 60)
75
+
76
  loader = torch.utils.data.DataLoader(dataset, batch_size=32, shuffle=True)
77
  print("✓ DataLoader created (batch_size=32)")
78
+
79
  # Get first batch
80
  images, labels = next(iter(loader))
81
  print("✓ Retrieved first batch")
 
83
  print(f" Batch labels shape: {labels.shape}")
84
  print(f" Images dtype: {images.dtype}")
85
  print(f" Labels dtype: {labels.dtype}")
86
+
87
  # Verify batch dimensions
88
  assert images.shape == (32, 1, 28, 28), f"Wrong batch shape: {images.shape}"
89
  assert labels.shape == (32,), f"Wrong labels shape: {labels.shape}"
 
98
  print("=" * 60)
99
  print("TEST 3: Train/Validation Split")
100
  print("=" * 60)
101
+
102
  # Load data
103
  data_path = project_root / "data" / "raw"
104
  loader = MnistDataloader(
 
108
  str(data_path / "t10k-labels.idx1-ubyte")
109
  )
110
  (x_train, y_train), _ = loader.load_data()
111
+
112
  # Split
113
  (x_train_split, y_train_split), (x_val, y_val) = split_train_val(
114
  x_train, y_train, val_split=0.15, random_seed=42
115
  )
116
+
117
  print("✓ Split completed")
118
  print(f" Original training: {len(x_train):,} samples")
119
+ train_pct = len(x_train_split) / len(x_train) * 100
120
+ print(f" New training: {len(x_train_split):,} samples ({train_pct:.1f}%)")
121
+ val_pct = len(x_val) / len(x_train) * 100
122
+ print(f" Validation: {len(x_val):,} samples ({val_pct:.1f}%)")
123
+
124
  # Verify split ratio
125
  expected_val_size = int(len(x_train) * 0.15)
126
  assert abs(len(x_val) - expected_val_size) < 100, "Split ratio incorrect"
127
  assert len(x_train_split) + len(x_val) == len(x_train), "Data loss during split"
128
  print("✓ Split ratio correct")
129
+
130
  # Check stratification (class balance)
131
  from collections import Counter
132
  train_counts = Counter(y_train_split)
133
  val_counts = Counter(y_val)
134
+
135
  print("\n Class distribution in training set:")
136
  for digit in range(10):
137
  print(f" Digit {digit}: {train_counts[digit]:>5,} samples")
138
+
139
  print("\n Class distribution in validation set:")
140
  for digit in range(10):
141
  print(f" Digit {digit}: {val_counts[digit]:>4,} samples")
142
+
143
  # Verify each class is present in both sets
144
  assert all(train_counts[i] > 0 for i in range(10)), "Missing class in train"
145
  assert all(val_counts[i] > 0 for i in range(10)), "Missing class in validation"
 
152
  print("=" * 60)
153
  print("TEST 4: Full Pipeline")
154
  print("=" * 60)
155
+
156
  # Load data
157
  data_path = project_root / "data" / "raw"
158
  loader = MnistDataloader(
 
163
  )
164
  (x_train, y_train), (x_test, y_test) = loader.load_data()
165
  print("✓ Data loaded")
166
+
167
  # Split train/val
168
  (x_train_split, y_train_split), (x_val, y_val) = split_train_val(
169
  x_train, y_train, val_split=0.15
170
  )
171
  print("✓ Train/val split completed")
172
+
173
  # Create datasets
174
  train_dataset = MnistDataset(x_train_split, y_train_split)
175
  val_dataset = MnistDataset(x_val, y_val)
176
  test_dataset = MnistDataset(x_test, y_test)
177
  print("✓ Datasets created")
178
+
179
  # Get statistics
180
  train_stats = get_dataset_statistics(train_dataset)
181
  print("\n Training dataset statistics:")
 
184
  print(f" Image dtype: {train_stats['sample_image_dtype']}")
185
  print(f" Image range: {train_stats['sample_image_range']}")
186
  print(f" Label dtype: {train_stats['sample_label_dtype']}")
187
+
188
  # Create dataloaders
189
  train_loader, val_loader = create_dataloaders(
190
  train_dataset, val_dataset, batch_size=64, num_workers=0
 
194
  print(f" Training batches: {len(train_loader)}")
195
  print(f" Validation batches: {len(val_loader)}")
196
  print(f" Test batches: {len(test_loader)}")
197
+
198
  # Test iteration
199
  train_batch = next(iter(train_loader))
200
  val_batch = next(iter(val_loader))
201
  test_batch = next(iter(test_loader))
202
+
203
  print("\n✓ Successfully iterated through all loaders")
204
  print(f" Train batch shapes: {train_batch[0].shape}, {train_batch[1].shape}")
205
  print(f" Val batch shapes: {val_batch[0].shape}, {val_batch[1].shape}")
 
211
  """Run all tests."""
212
  print("\nTesting MNIST Preprocessing Pipeline")
213
  print()
214
+
215
  try:
216
  dataset = test_dataset()
217
  test_dataloader(dataset)
218
  test_train_val_split()
219
  test_full_pipeline()
220
+
221
  print("=" * 60)
222
  print("✅ ALL TESTS PASSED")
223
  print("=" * 60)
224
  print("\nPreprocessing pipeline is ready for model training!")
225
  return 0
226
+
227
  except Exception as e:
228
  print(f"\n❌ TEST FAILED: {e}")
229
  import traceback
scripts/test_train.py CHANGED
@@ -28,7 +28,7 @@ def main():
28
  print("Testing Training Pipeline")
29
  print("=" * 60)
30
  print()
31
-
32
  # Load data (small subset for quick test)
33
  print("1. Loading MNIST data...")
34
  data_path = project_root / "data" / "raw"
@@ -41,7 +41,7 @@ def main():
41
  (x_train, y_train), (x_test, y_test) = loader.load_data()
42
  print(f"✓ Loaded {len(x_train):,} training samples")
43
  print()
44
-
45
  # Use small subset for quick test (1000 samples)
46
  print("2. Creating train/val split...")
47
  (x_train_split, y_train_split), (x_val, y_val) = split_train_val(
@@ -50,13 +50,14 @@ def main():
50
  print(f"✓ Train: {len(x_train_split)} samples")
51
  print(f"✓ Val: {len(x_val)} samples")
52
  print()
53
-
54
  # Create datasets and loaders
55
  print("3. Creating datasets and loaders...")
56
  train_dataset = MnistDataset(x_train_split, y_train_split, transform=None)
57
  val_dataset = MnistDataset(x_val, y_val, transform=None)
58
- test_dataset = MnistDataset(x_test[:200], y_test[:200], transform=None) # Small test set
59
-
 
60
  train_loader, val_loader = create_dataloaders(
61
  train_dataset, val_dataset, batch_size=32, num_workers=0
62
  )
@@ -67,7 +68,7 @@ def main():
67
  print(f"✓ Val batches: {len(val_loader)}")
68
  print(f"✓ Test batches: {len(test_loader)}")
69
  print()
70
-
71
  # Create model
72
  print("4. Creating model...")
73
  model = BaselineCNN()
@@ -75,7 +76,7 @@ def main():
75
  print(f"✓ Model: {model.__class__.__name__}")
76
  print(f"✓ Device: {device}")
77
  print()
78
-
79
  # Train model (short run for testing)
80
  print("5. Training model (3 epochs for testing)...")
81
  print("-" * 60)
@@ -93,30 +94,30 @@ def main():
93
  )
94
  print("-" * 60)
95
  print()
96
-
97
  # Check checkpoints exist
98
  print("6. Verifying checkpoints...")
99
  best_model_path = project_root / "models" / "best_model.pt"
100
  last_model_path = project_root / "models" / "last_model.pt"
101
-
102
  assert best_model_path.exists(), "Best model checkpoint not found"
103
  assert last_model_path.exists(), "Last model checkpoint not found"
104
  print("✓ Best model saved")
105
  print("✓ Last model saved")
106
  print()
107
-
108
  # Save history
109
  print("7. Saving training history...")
110
  history_path = project_root / "experiments" / "test_training_history.json"
111
  save_training_history(history, str(history_path))
112
  print()
113
-
114
  # Evaluate on test set
115
  print("8. Evaluating on test set...")
116
  results = evaluate_model(model, test_loader, device=device)
117
  print(f"✓ Test Accuracy: {results['accuracy']:.2f}%")
118
  print()
119
-
120
  # Print per-class metrics
121
  print("Per-class metrics:")
122
  report = results['classification_report']
@@ -128,7 +129,7 @@ def main():
128
  f"Recall={metrics['recall']:.3f}, "
129
  f"F1={metrics['f1-score']:.3f}")
130
  print()
131
-
132
  # Summary
133
  print("=" * 60)
134
  print("✅ ALL TESTS PASSED")
@@ -138,7 +139,7 @@ def main():
138
  print(f"Test accuracy: {results['accuracy']:.2f}%")
139
  print("\nNote: These are quick test results with limited data.")
140
  print("For full training, use complete dataset and more epochs.")
141
-
142
  return 0
143
 
144
 
 
28
  print("Testing Training Pipeline")
29
  print("=" * 60)
30
  print()
31
+
32
  # Load data (small subset for quick test)
33
  print("1. Loading MNIST data...")
34
  data_path = project_root / "data" / "raw"
 
41
  (x_train, y_train), (x_test, y_test) = loader.load_data()
42
  print(f"✓ Loaded {len(x_train):,} training samples")
43
  print()
44
+
45
  # Use small subset for quick test (1000 samples)
46
  print("2. Creating train/val split...")
47
  (x_train_split, y_train_split), (x_val, y_val) = split_train_val(
 
50
  print(f"✓ Train: {len(x_train_split)} samples")
51
  print(f"✓ Val: {len(x_val)} samples")
52
  print()
53
+
54
  # Create datasets and loaders
55
  print("3. Creating datasets and loaders...")
56
  train_dataset = MnistDataset(x_train_split, y_train_split, transform=None)
57
  val_dataset = MnistDataset(x_val, y_val, transform=None)
58
+ # Small test set for quick validation
59
+ test_dataset = MnistDataset(x_test[:200], y_test[:200], transform=None)
60
+
61
  train_loader, val_loader = create_dataloaders(
62
  train_dataset, val_dataset, batch_size=32, num_workers=0
63
  )
 
68
  print(f"✓ Val batches: {len(val_loader)}")
69
  print(f"✓ Test batches: {len(test_loader)}")
70
  print()
71
+
72
  # Create model
73
  print("4. Creating model...")
74
  model = BaselineCNN()
 
76
  print(f"✓ Model: {model.__class__.__name__}")
77
  print(f"✓ Device: {device}")
78
  print()
79
+
80
  # Train model (short run for testing)
81
  print("5. Training model (3 epochs for testing)...")
82
  print("-" * 60)
 
94
  )
95
  print("-" * 60)
96
  print()
97
+
98
  # Check checkpoints exist
99
  print("6. Verifying checkpoints...")
100
  best_model_path = project_root / "models" / "best_model.pt"
101
  last_model_path = project_root / "models" / "last_model.pt"
102
+
103
  assert best_model_path.exists(), "Best model checkpoint not found"
104
  assert last_model_path.exists(), "Last model checkpoint not found"
105
  print("✓ Best model saved")
106
  print("✓ Last model saved")
107
  print()
108
+
109
  # Save history
110
  print("7. Saving training history...")
111
  history_path = project_root / "experiments" / "test_training_history.json"
112
  save_training_history(history, str(history_path))
113
  print()
114
+
115
  # Evaluate on test set
116
  print("8. Evaluating on test set...")
117
  results = evaluate_model(model, test_loader, device=device)
118
  print(f"✓ Test Accuracy: {results['accuracy']:.2f}%")
119
  print()
120
+
121
  # Print per-class metrics
122
  print("Per-class metrics:")
123
  report = results['classification_report']
 
129
  f"Recall={metrics['recall']:.3f}, "
130
  f"F1={metrics['f1-score']:.3f}")
131
  print()
132
+
133
  # Summary
134
  print("=" * 60)
135
  print("✅ ALL TESTS PASSED")
 
139
  print(f"Test accuracy: {results['accuracy']:.2f}%")
140
  print("\nNote: These are quick test results with limited data.")
141
  print("For full training, use complete dataset and more epochs.")
142
+
143
  return 0
144
 
145
 
scripts/train.py CHANGED
@@ -12,7 +12,7 @@ Supports MLflow experiment tracking for reproducibility.
12
  Usage:
13
  from scripts.train import train_model
14
  from scripts.models import BaselineCNN
15
-
16
  model = BaselineCNN()
17
  history = train_model(
18
  model, train_loader, val_loader,
@@ -39,14 +39,14 @@ def train_epoch(
39
  ) -> Dict[str, float]:
40
  """
41
  Train model for one epoch.
42
-
43
  Args:
44
  model: PyTorch model
45
  train_loader: Training data loader
46
  criterion: Loss function
47
  optimizer: Optimizer
48
  device: Device to train on ('cpu' or 'cuda')
49
-
50
  Returns:
51
  Dictionary with 'loss' and 'accuracy' metrics
52
  """
@@ -54,25 +54,25 @@ def train_epoch(
54
  total_loss = 0.0
55
  correct = 0
56
  total = 0
57
-
58
  for images, labels in train_loader:
59
  images, labels = images.to(device), labels.to(device)
60
-
61
  # Forward pass
62
  optimizer.zero_grad()
63
  outputs = model(images)
64
  loss = criterion(outputs, labels)
65
-
66
  # Backward pass
67
  loss.backward()
68
  optimizer.step()
69
-
70
  # Track metrics
71
  total_loss += loss.item()
72
  _, predicted = outputs.max(1)
73
  correct += predicted.eq(labels).sum().item()
74
  total += labels.size(0)
75
-
76
  return {
77
  'loss': total_loss / len(train_loader),
78
  'accuracy': 100.0 * correct / total
@@ -87,13 +87,13 @@ def validate(
87
  ) -> Dict[str, float]:
88
  """
89
  Evaluate model on validation/test set.
90
-
91
  Args:
92
  model: PyTorch model
93
  val_loader: Validation data loader
94
  criterion: Loss function
95
  device: Device to evaluate on
96
-
97
  Returns:
98
  Dictionary with 'loss' and 'accuracy' metrics
99
  """
@@ -101,21 +101,21 @@ def validate(
101
  total_loss = 0.0
102
  correct = 0
103
  total = 0
104
-
105
  with torch.no_grad():
106
  for images, labels in val_loader:
107
  images, labels = images.to(device), labels.to(device)
108
-
109
  # Forward pass
110
  outputs = model(images)
111
  loss = criterion(outputs, labels)
112
-
113
  # Track metrics
114
  total_loss += loss.item()
115
  _, predicted = outputs.max(1)
116
  correct += predicted.eq(labels).sum().item()
117
  total += labels.size(0)
118
-
119
  return {
120
  'loss': total_loss / len(val_loader),
121
  'accuracy': 100.0 * correct / total
@@ -136,7 +136,7 @@ def train_model(
136
  ) -> Dict[str, List[float]]:
137
  """
138
  Train model with early stopping and checkpointing.
139
-
140
  Args:
141
  model: PyTorch model
142
  train_loader: Training data loader
@@ -148,7 +148,7 @@ def train_model(
148
  device: Device to train on (auto-detect if None)
149
  use_scheduler: Whether to use learning rate scheduler
150
  verbose: Print training progress
151
-
152
  Returns:
153
  Dictionary with training history (losses and accuracies)
154
  """
@@ -156,30 +156,30 @@ def train_model(
156
  if device is None:
157
  device = 'cuda' if torch.cuda.is_available() else 'cpu'
158
  model = model.to(device)
159
-
160
  if verbose:
161
  print(f"Training on device: {device}")
162
  print(f"Model: {model.__class__.__name__}")
163
  print(f"Parameters: {sum(p.numel() for p in model.parameters()):,}")
164
  print()
165
-
166
  # Setup training components
167
  criterion = nn.CrossEntropyLoss()
168
  optimizer = optim.Adam(model.parameters(), lr=learning_rate)
169
-
170
  # Learning rate scheduler
171
  scheduler = None
172
  if use_scheduler:
173
  scheduler = optim.lr_scheduler.ReduceLROnPlateau(
174
  optimizer, mode='min', patience=3, factor=0.5, verbose=verbose
175
  )
176
-
177
  # Setup checkpointing
178
  checkpoint_path = Path(checkpoint_dir)
179
  checkpoint_path.mkdir(parents=True, exist_ok=True)
180
  best_model_path = checkpoint_path / 'best_model.pt'
181
  last_model_path = checkpoint_path / 'last_model.pt'
182
-
183
  # Training history
184
  history = {
185
  'train_loss': [],
@@ -188,26 +188,26 @@ def train_model(
188
  'val_accuracy': [],
189
  'learning_rate': []
190
  }
191
-
192
  # Early stopping setup
193
  best_val_loss = float('inf')
194
  epochs_without_improvement = 0
195
-
196
  # Training loop
197
  for epoch in range(num_epochs):
198
  # Train
199
  train_metrics = train_epoch(model, train_loader, criterion, optimizer, device)
200
-
201
  # Validate
202
  val_metrics = validate(model, val_loader, criterion, device)
203
-
204
  # Update history
205
  history['train_loss'].append(train_metrics['loss'])
206
  history['train_accuracy'].append(train_metrics['accuracy'])
207
  history['val_loss'].append(val_metrics['loss'])
208
  history['val_accuracy'].append(val_metrics['accuracy'])
209
  history['learning_rate'].append(optimizer.param_groups[0]['lr'])
210
-
211
  # Print progress
212
  if verbose:
213
  print(f"Epoch {epoch+1}/{num_epochs}")
@@ -217,11 +217,11 @@ def train_model(
217
  f"Val Acc: {val_metrics['accuracy']:.2f}%")
218
  print(f" LR: {optimizer.param_groups[0]['lr']:.6f}")
219
  print()
220
-
221
  # Learning rate scheduling
222
  if scheduler is not None:
223
  scheduler.step(val_metrics['loss'])
224
-
225
  # Save best model
226
  if val_metrics['loss'] < best_val_loss:
227
  best_val_loss = val_metrics['loss']
@@ -238,14 +238,14 @@ def train_model(
238
  print()
239
  else:
240
  epochs_without_improvement += 1
241
-
242
  # Early stopping
243
  if epochs_without_improvement >= patience:
244
  if verbose:
245
  print(f"Early stopping triggered after {epoch+1} epochs")
246
  print(f"Best validation loss: {best_val_loss:.4f}")
247
  break
248
-
249
  # Save last model
250
  torch.save({
251
  'epoch': epoch,
@@ -254,12 +254,12 @@ def train_model(
254
  'val_loss': val_metrics['loss'],
255
  'val_accuracy': val_metrics['accuracy']
256
  }, last_model_path)
257
-
258
  if verbose:
259
  print("Training complete!")
260
  print(f"Best validation loss: {best_val_loss:.4f}")
261
  print(f"Final validation accuracy: {history['val_accuracy'][-1]:.2f}%")
262
-
263
  return history
264
 
265
 
@@ -271,13 +271,13 @@ def evaluate_model(
271
  ) -> Dict:
272
  """
273
  Comprehensive model evaluation with per-class metrics.
274
-
275
  Args:
276
  model: Trained PyTorch model
277
  test_loader: Test data loader
278
  device: Device to evaluate on
279
  class_names: List of class names (default: digits 0-9)
280
-
281
  Returns:
282
  Dictionary with metrics, predictions, and confusion matrix
283
  """
@@ -285,42 +285,42 @@ def evaluate_model(
285
  device = 'cuda' if torch.cuda.is_available() else 'cpu'
286
  model = model.to(device)
287
  model.eval()
288
-
289
  if class_names is None:
290
  class_names = [str(i) for i in range(10)]
291
-
292
  all_preds = []
293
  all_labels = []
294
  all_probs = []
295
-
296
  with torch.no_grad():
297
  for images, labels in test_loader:
298
  images = images.to(device)
299
  outputs = model(images)
300
  probs = torch.softmax(outputs, dim=1)
301
  _, predicted = outputs.max(1)
302
-
303
  all_preds.extend(predicted.cpu().numpy())
304
  all_labels.extend(labels.numpy())
305
  all_probs.extend(probs.cpu().numpy())
306
-
307
  all_preds = np.array(all_preds)
308
  all_labels = np.array(all_labels)
309
  all_probs = np.array(all_probs)
310
-
311
  # Overall metrics
312
  accuracy = 100.0 * (all_preds == all_labels).sum() / len(all_labels)
313
-
314
  # Classification report
315
  report = classification_report(
316
  all_labels, all_preds,
317
  target_names=class_names,
318
  output_dict=True
319
  )
320
-
321
  # Confusion matrix
322
  conf_matrix = confusion_matrix(all_labels, all_preds)
323
-
324
  return {
325
  'accuracy': accuracy,
326
  'classification_report': report,
@@ -334,7 +334,7 @@ def evaluate_model(
334
  def save_training_history(history: Dict, filepath: str) -> None:
335
  """
336
  Save training history to JSON file.
337
-
338
  Args:
339
  history: Training history dictionary
340
  filepath: Path to save JSON file
@@ -348,11 +348,11 @@ def save_training_history(history: Dict, filepath: str) -> None:
348
  def load_checkpoint(checkpoint_path: str, model: nn.Module) -> Tuple[nn.Module, Dict]:
349
  """
350
  Load model from checkpoint.
351
-
352
  Args:
353
  checkpoint_path: Path to checkpoint file
354
  model: Model instance (for loading state dict)
355
-
356
  Returns:
357
  Tuple of (loaded_model, checkpoint_dict)
358
  """
 
12
  Usage:
13
  from scripts.train import train_model
14
  from scripts.models import BaselineCNN
15
+
16
  model = BaselineCNN()
17
  history = train_model(
18
  model, train_loader, val_loader,
 
39
  ) -> Dict[str, float]:
40
  """
41
  Train model for one epoch.
42
+
43
  Args:
44
  model: PyTorch model
45
  train_loader: Training data loader
46
  criterion: Loss function
47
  optimizer: Optimizer
48
  device: Device to train on ('cpu' or 'cuda')
49
+
50
  Returns:
51
  Dictionary with 'loss' and 'accuracy' metrics
52
  """
 
54
  total_loss = 0.0
55
  correct = 0
56
  total = 0
57
+
58
  for images, labels in train_loader:
59
  images, labels = images.to(device), labels.to(device)
60
+
61
  # Forward pass
62
  optimizer.zero_grad()
63
  outputs = model(images)
64
  loss = criterion(outputs, labels)
65
+
66
  # Backward pass
67
  loss.backward()
68
  optimizer.step()
69
+
70
  # Track metrics
71
  total_loss += loss.item()
72
  _, predicted = outputs.max(1)
73
  correct += predicted.eq(labels).sum().item()
74
  total += labels.size(0)
75
+
76
  return {
77
  'loss': total_loss / len(train_loader),
78
  'accuracy': 100.0 * correct / total
 
87
  ) -> Dict[str, float]:
88
  """
89
  Evaluate model on validation/test set.
90
+
91
  Args:
92
  model: PyTorch model
93
  val_loader: Validation data loader
94
  criterion: Loss function
95
  device: Device to evaluate on
96
+
97
  Returns:
98
  Dictionary with 'loss' and 'accuracy' metrics
99
  """
 
101
  total_loss = 0.0
102
  correct = 0
103
  total = 0
104
+
105
  with torch.no_grad():
106
  for images, labels in val_loader:
107
  images, labels = images.to(device), labels.to(device)
108
+
109
  # Forward pass
110
  outputs = model(images)
111
  loss = criterion(outputs, labels)
112
+
113
  # Track metrics
114
  total_loss += loss.item()
115
  _, predicted = outputs.max(1)
116
  correct += predicted.eq(labels).sum().item()
117
  total += labels.size(0)
118
+
119
  return {
120
  'loss': total_loss / len(val_loader),
121
  'accuracy': 100.0 * correct / total
 
136
  ) -> Dict[str, List[float]]:
137
  """
138
  Train model with early stopping and checkpointing.
139
+
140
  Args:
141
  model: PyTorch model
142
  train_loader: Training data loader
 
148
  device: Device to train on (auto-detect if None)
149
  use_scheduler: Whether to use learning rate scheduler
150
  verbose: Print training progress
151
+
152
  Returns:
153
  Dictionary with training history (losses and accuracies)
154
  """
 
156
  if device is None:
157
  device = 'cuda' if torch.cuda.is_available() else 'cpu'
158
  model = model.to(device)
159
+
160
  if verbose:
161
  print(f"Training on device: {device}")
162
  print(f"Model: {model.__class__.__name__}")
163
  print(f"Parameters: {sum(p.numel() for p in model.parameters()):,}")
164
  print()
165
+
166
  # Setup training components
167
  criterion = nn.CrossEntropyLoss()
168
  optimizer = optim.Adam(model.parameters(), lr=learning_rate)
169
+
170
  # Learning rate scheduler
171
  scheduler = None
172
  if use_scheduler:
173
  scheduler = optim.lr_scheduler.ReduceLROnPlateau(
174
  optimizer, mode='min', patience=3, factor=0.5, verbose=verbose
175
  )
176
+
177
  # Setup checkpointing
178
  checkpoint_path = Path(checkpoint_dir)
179
  checkpoint_path.mkdir(parents=True, exist_ok=True)
180
  best_model_path = checkpoint_path / 'best_model.pt'
181
  last_model_path = checkpoint_path / 'last_model.pt'
182
+
183
  # Training history
184
  history = {
185
  'train_loss': [],
 
188
  'val_accuracy': [],
189
  'learning_rate': []
190
  }
191
+
192
  # Early stopping setup
193
  best_val_loss = float('inf')
194
  epochs_without_improvement = 0
195
+
196
  # Training loop
197
  for epoch in range(num_epochs):
198
  # Train
199
  train_metrics = train_epoch(model, train_loader, criterion, optimizer, device)
200
+
201
  # Validate
202
  val_metrics = validate(model, val_loader, criterion, device)
203
+
204
  # Update history
205
  history['train_loss'].append(train_metrics['loss'])
206
  history['train_accuracy'].append(train_metrics['accuracy'])
207
  history['val_loss'].append(val_metrics['loss'])
208
  history['val_accuracy'].append(val_metrics['accuracy'])
209
  history['learning_rate'].append(optimizer.param_groups[0]['lr'])
210
+
211
  # Print progress
212
  if verbose:
213
  print(f"Epoch {epoch+1}/{num_epochs}")
 
217
  f"Val Acc: {val_metrics['accuracy']:.2f}%")
218
  print(f" LR: {optimizer.param_groups[0]['lr']:.6f}")
219
  print()
220
+
221
  # Learning rate scheduling
222
  if scheduler is not None:
223
  scheduler.step(val_metrics['loss'])
224
+
225
  # Save best model
226
  if val_metrics['loss'] < best_val_loss:
227
  best_val_loss = val_metrics['loss']
 
238
  print()
239
  else:
240
  epochs_without_improvement += 1
241
+
242
  # Early stopping
243
  if epochs_without_improvement >= patience:
244
  if verbose:
245
  print(f"Early stopping triggered after {epoch+1} epochs")
246
  print(f"Best validation loss: {best_val_loss:.4f}")
247
  break
248
+
249
  # Save last model
250
  torch.save({
251
  'epoch': epoch,
 
254
  'val_loss': val_metrics['loss'],
255
  'val_accuracy': val_metrics['accuracy']
256
  }, last_model_path)
257
+
258
  if verbose:
259
  print("Training complete!")
260
  print(f"Best validation loss: {best_val_loss:.4f}")
261
  print(f"Final validation accuracy: {history['val_accuracy'][-1]:.2f}%")
262
+
263
  return history
264
 
265
 
 
271
  ) -> Dict:
272
  """
273
  Comprehensive model evaluation with per-class metrics.
274
+
275
  Args:
276
  model: Trained PyTorch model
277
  test_loader: Test data loader
278
  device: Device to evaluate on
279
  class_names: List of class names (default: digits 0-9)
280
+
281
  Returns:
282
  Dictionary with metrics, predictions, and confusion matrix
283
  """
 
285
  device = 'cuda' if torch.cuda.is_available() else 'cpu'
286
  model = model.to(device)
287
  model.eval()
288
+
289
  if class_names is None:
290
  class_names = [str(i) for i in range(10)]
291
+
292
  all_preds = []
293
  all_labels = []
294
  all_probs = []
295
+
296
  with torch.no_grad():
297
  for images, labels in test_loader:
298
  images = images.to(device)
299
  outputs = model(images)
300
  probs = torch.softmax(outputs, dim=1)
301
  _, predicted = outputs.max(1)
302
+
303
  all_preds.extend(predicted.cpu().numpy())
304
  all_labels.extend(labels.numpy())
305
  all_probs.extend(probs.cpu().numpy())
306
+
307
  all_preds = np.array(all_preds)
308
  all_labels = np.array(all_labels)
309
  all_probs = np.array(all_probs)
310
+
311
  # Overall metrics
312
  accuracy = 100.0 * (all_preds == all_labels).sum() / len(all_labels)
313
+
314
  # Classification report
315
  report = classification_report(
316
  all_labels, all_preds,
317
  target_names=class_names,
318
  output_dict=True
319
  )
320
+
321
  # Confusion matrix
322
  conf_matrix = confusion_matrix(all_labels, all_preds)
323
+
324
  return {
325
  'accuracy': accuracy,
326
  'classification_report': report,
 
334
  def save_training_history(history: Dict, filepath: str) -> None:
335
  """
336
  Save training history to JSON file.
337
+
338
  Args:
339
  history: Training history dictionary
340
  filepath: Path to save JSON file
 
348
  def load_checkpoint(checkpoint_path: str, model: nn.Module) -> Tuple[nn.Module, Dict]:
349
  """
350
  Load model from checkpoint.
351
+
352
  Args:
353
  checkpoint_path: Path to checkpoint file
354
  model: Model instance (for loading state dict)
355
+
356
  Returns:
357
  Tuple of (loaded_model, checkpoint_dict)
358
  """
scripts/train_baseline.py CHANGED
@@ -34,9 +34,9 @@ from scripts.train import train_model, evaluate_model, save_training_history
34
  def plot_training_history(history: dict, save_path: str):
35
  """Plot and save training history curves."""
36
  fig, axes = plt.subplots(2, 2, figsize=(12, 10))
37
-
38
  epochs = range(1, len(history['train_loss']) + 1)
39
-
40
  # Loss curves
41
  axes[0, 0].plot(epochs, history['train_loss'], 'b-', label='Train Loss')
42
  axes[0, 0].plot(epochs, history['val_loss'], 'r-', label='Val Loss')
@@ -45,7 +45,7 @@ def plot_training_history(history: dict, save_path: str):
45
  axes[0, 0].set_title('Training and Validation Loss')
46
  axes[0, 0].legend()
47
  axes[0, 0].grid(True, alpha=0.3)
48
-
49
  # Accuracy curves
50
  axes[0, 1].plot(epochs, history['train_accuracy'], 'b-', label='Train Acc')
51
  axes[0, 1].plot(epochs, history['val_accuracy'], 'r-', label='Val Acc')
@@ -54,7 +54,7 @@ def plot_training_history(history: dict, save_path: str):
54
  axes[0, 1].set_title('Training and Validation Accuracy')
55
  axes[0, 1].legend()
56
  axes[0, 1].grid(True, alpha=0.3)
57
-
58
  # Learning rate
59
  axes[1, 0].plot(epochs, history['learning_rate'], 'g-')
60
  axes[1, 0].set_xlabel('Epoch')
@@ -62,7 +62,7 @@ def plot_training_history(history: dict, save_path: str):
62
  axes[1, 0].set_title('Learning Rate Schedule')
63
  axes[1, 0].set_yscale('log')
64
  axes[1, 0].grid(True, alpha=0.3)
65
-
66
  # Loss difference (overfitting indicator)
67
  loss_diff = np.array(history['val_loss']) - np.array(history['train_loss'])
68
  axes[1, 1].plot(epochs, loss_diff, 'm-')
@@ -71,7 +71,7 @@ def plot_training_history(history: dict, save_path: str):
71
  axes[1, 1].set_ylabel('Val Loss - Train Loss')
72
  axes[1, 1].set_title('Overfitting Indicator (positive = overfitting)')
73
  axes[1, 1].grid(True, alpha=0.3)
74
-
75
  plt.tight_layout()
76
  plt.savefig(save_path, dpi=300, bbox_inches='tight')
77
  print(f"Training curves saved to {save_path}")
@@ -81,10 +81,10 @@ def plot_training_history(history: dict, save_path: str):
81
  def plot_confusion_matrix(conf_matrix: np.ndarray, save_path: str):
82
  """Plot and save confusion matrix."""
83
  fig, ax = plt.subplots(figsize=(10, 8))
84
-
85
  im = ax.imshow(conf_matrix, interpolation='nearest', cmap=plt.cm.Blues)
86
  ax.figure.colorbar(im, ax=ax)
87
-
88
  # Labels
89
  classes = list(range(10))
90
  ax.set(xticks=np.arange(conf_matrix.shape[1]),
@@ -93,10 +93,10 @@ def plot_confusion_matrix(conf_matrix: np.ndarray, save_path: str):
93
  title='Confusion Matrix - MNIST Digit Classification',
94
  ylabel='True Label',
95
  xlabel='Predicted Label')
96
-
97
  # Rotate the tick labels
98
  plt.setp(ax.get_xticklabels(), rotation=45, ha="right", rotation_mode="anchor")
99
-
100
  # Add text annotations
101
  thresh = conf_matrix.max() / 2.
102
  for i in range(conf_matrix.shape[0]):
@@ -104,7 +104,7 @@ def plot_confusion_matrix(conf_matrix: np.ndarray, save_path: str):
104
  ax.text(j, i, format(conf_matrix[i, j], 'd'),
105
  ha="center", va="center",
106
  color="white" if conf_matrix[i, j] > thresh else "black")
107
-
108
  plt.tight_layout()
109
  plt.savefig(save_path, dpi=300, bbox_inches='tight')
110
  print(f"Confusion matrix saved to {save_path}")
@@ -112,14 +112,30 @@ def plot_confusion_matrix(conf_matrix: np.ndarray, save_path: str):
112
 
113
 
114
  def main():
115
- parser = argparse.ArgumentParser(description='Train baseline CNN on MNIST')
116
- parser.add_argument('--augment', action='store_true', help='Use data augmentation')
117
- parser.add_argument('--epochs', type=int, default=20, help='Number of epochs (default: 20)')
118
- parser.add_argument('--lr', type=float, default=0.001, help='Learning rate (default: 0.001)')
119
- parser.add_argument('--batch-size', type=int, default=64, help='Batch size (default: 64)')
120
- parser.add_argument('--patience', type=int, default=5, help='Early stopping patience (default: 5)')
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
121
  args = parser.parse_args()
122
-
123
  print("=" * 60)
124
  print("MNIST CNN Training - Baseline Model")
125
  print("=" * 60)
@@ -130,7 +146,7 @@ def main():
130
  print(f" Augmentation: {'Yes' if args.augment else 'No'}")
131
  print(f" Early Stopping Patience: {args.patience}")
132
  print()
133
-
134
  # 1. Load data
135
  print("1. Loading MNIST dataset...")
136
  data_path = project_root / "data" / "raw"
@@ -144,7 +160,7 @@ def main():
144
  print(f"✓ Loaded {len(x_train):,} training samples")
145
  print(f"✓ Loaded {len(x_test):,} test samples")
146
  print()
147
-
148
  # 2. Train/val split
149
  print("2. Creating train/validation split...")
150
  (x_train_split, y_train_split), (x_val, y_val) = split_train_val(
@@ -154,15 +170,15 @@ def main():
154
  print(f"✓ Validation: {len(x_val):,} samples")
155
  print(f"✓ Test: {len(x_test):,} samples")
156
  print()
157
-
158
  # 3. Create datasets with optional augmentation
159
  print("3. Creating datasets...")
160
  augmentation = get_train_augmentation() if args.augment else None
161
-
162
  train_dataset = MnistDataset(x_train_split, y_train_split, transform=augmentation)
163
  val_dataset = MnistDataset(x_val, y_val, transform=None)
164
  test_dataset = MnistDataset(x_test, y_test, transform=None)
165
-
166
  train_loader, val_loader = create_dataloaders(
167
  train_dataset, val_dataset, batch_size=args.batch_size, num_workers=2
168
  )
@@ -173,13 +189,13 @@ def main():
173
  print(f"✓ Val batches: {len(val_loader)}")
174
  print(f"✓ Test batches: {len(test_loader)}")
175
  print()
176
-
177
  # 4. Create model
178
  print("4. Creating model...")
179
  model = BaselineCNN()
180
  print(get_model_summary(model))
181
  print()
182
-
183
  # 5. Train model
184
  print("5. Training model...")
185
  print("-" * 60)
@@ -197,54 +213,69 @@ def main():
197
  )
198
  print("-" * 60)
199
  print()
200
-
201
  # 6. Load best model and evaluate
202
  print("6. Evaluating best model on test set...")
203
  checkpoint = torch.load('models/best_model.pt', map_location='cpu')
204
  model.load_state_dict(checkpoint['model_state_dict'])
205
-
206
  device = 'cuda' if torch.cuda.is_available() else 'cpu'
207
  results = evaluate_model(model, test_loader, device=device)
208
-
209
  print(f"✓ Test Accuracy: {results['accuracy']:.2f}%")
210
  print()
211
-
212
  # 7. Print detailed metrics
213
  print("7. Per-class metrics:")
214
  print("-" * 60)
215
  report = results['classification_report']
216
- print(f"{'Digit':<8} {'Precision':<12} {'Recall':<12} {'F1-Score':<12} {'Support':<10}")
 
 
 
217
  print("-" * 60)
218
  for digit in range(10):
219
  if str(digit) in report:
220
  metrics = report[str(digit)]
221
- print(f"{digit:<8} {metrics['precision']:<12.3f} {metrics['recall']:<12.3f} "
222
- f"{metrics['f1-score']:<12.3f} {metrics['support']:<10}")
223
-
 
 
 
224
  print("-" * 60)
225
- print(f"{'Accuracy':<8} {' ':<12} {' ':<12} {report['accuracy']:<12.3f} {report['macro avg']['support']:<10}")
226
- print(f"{'Macro Avg':<8} {report['macro avg']['precision']:<12.3f} "
227
- f"{report['macro avg']['recall']:<12.3f} {report['macro avg']['f1-score']:<12.3f} "
228
- f"{report['macro avg']['support']:<10}")
 
 
 
 
 
 
 
 
 
229
  print()
230
-
231
  # 8. Save results
232
  print("8. Saving results...")
233
  results_dir = project_root / "results"
234
  results_dir.mkdir(exist_ok=True)
235
-
236
  # Save history
237
  history_path = results_dir / "baseline_training_history.json"
238
  save_training_history(history, str(history_path))
239
-
240
  # Plot training curves
241
  curves_path = results_dir / "baseline_training_curves.png"
242
  plot_training_history(history, str(curves_path))
243
-
244
  # Plot confusion matrix
245
  conf_matrix_path = results_dir / "baseline_confusion_matrix.png"
246
  plot_confusion_matrix(results['confusion_matrix'], str(conf_matrix_path))
247
-
248
  # Save evaluation metrics
249
  metrics_path = results_dir / "baseline_metrics.json"
250
  # Convert numpy arrays to lists for JSON serialization
@@ -269,7 +300,7 @@ def main():
269
  json.dump(metrics_data, f, indent=2)
270
  print(f"Evaluation metrics saved to {metrics_path}")
271
  print()
272
-
273
  # 9. Summary
274
  print("=" * 60)
275
  print("✅ TRAINING COMPLETE")
@@ -285,7 +316,7 @@ def main():
285
  print(f" - Training curves: {curves_path}")
286
  print(f" - Confusion matrix: {conf_matrix_path}")
287
  print(f" - Metrics: {metrics_path}")
288
-
289
  return 0
290
 
291
 
 
34
  def plot_training_history(history: dict, save_path: str):
35
  """Plot and save training history curves."""
36
  fig, axes = plt.subplots(2, 2, figsize=(12, 10))
37
+
38
  epochs = range(1, len(history['train_loss']) + 1)
39
+
40
  # Loss curves
41
  axes[0, 0].plot(epochs, history['train_loss'], 'b-', label='Train Loss')
42
  axes[0, 0].plot(epochs, history['val_loss'], 'r-', label='Val Loss')
 
45
  axes[0, 0].set_title('Training and Validation Loss')
46
  axes[0, 0].legend()
47
  axes[0, 0].grid(True, alpha=0.3)
48
+
49
  # Accuracy curves
50
  axes[0, 1].plot(epochs, history['train_accuracy'], 'b-', label='Train Acc')
51
  axes[0, 1].plot(epochs, history['val_accuracy'], 'r-', label='Val Acc')
 
54
  axes[0, 1].set_title('Training and Validation Accuracy')
55
  axes[0, 1].legend()
56
  axes[0, 1].grid(True, alpha=0.3)
57
+
58
  # Learning rate
59
  axes[1, 0].plot(epochs, history['learning_rate'], 'g-')
60
  axes[1, 0].set_xlabel('Epoch')
 
62
  axes[1, 0].set_title('Learning Rate Schedule')
63
  axes[1, 0].set_yscale('log')
64
  axes[1, 0].grid(True, alpha=0.3)
65
+
66
  # Loss difference (overfitting indicator)
67
  loss_diff = np.array(history['val_loss']) - np.array(history['train_loss'])
68
  axes[1, 1].plot(epochs, loss_diff, 'm-')
 
71
  axes[1, 1].set_ylabel('Val Loss - Train Loss')
72
  axes[1, 1].set_title('Overfitting Indicator (positive = overfitting)')
73
  axes[1, 1].grid(True, alpha=0.3)
74
+
75
  plt.tight_layout()
76
  plt.savefig(save_path, dpi=300, bbox_inches='tight')
77
  print(f"Training curves saved to {save_path}")
 
81
  def plot_confusion_matrix(conf_matrix: np.ndarray, save_path: str):
82
  """Plot and save confusion matrix."""
83
  fig, ax = plt.subplots(figsize=(10, 8))
84
+
85
  im = ax.imshow(conf_matrix, interpolation='nearest', cmap=plt.cm.Blues)
86
  ax.figure.colorbar(im, ax=ax)
87
+
88
  # Labels
89
  classes = list(range(10))
90
  ax.set(xticks=np.arange(conf_matrix.shape[1]),
 
93
  title='Confusion Matrix - MNIST Digit Classification',
94
  ylabel='True Label',
95
  xlabel='Predicted Label')
96
+
97
  # Rotate the tick labels
98
  plt.setp(ax.get_xticklabels(), rotation=45, ha="right", rotation_mode="anchor")
99
+
100
  # Add text annotations
101
  thresh = conf_matrix.max() / 2.
102
  for i in range(conf_matrix.shape[0]):
 
104
  ax.text(j, i, format(conf_matrix[i, j], 'd'),
105
  ha="center", va="center",
106
  color="white" if conf_matrix[i, j] > thresh else "black")
107
+
108
  plt.tight_layout()
109
  plt.savefig(save_path, dpi=300, bbox_inches='tight')
110
  print(f"Confusion matrix saved to {save_path}")
 
112
 
113
 
114
  def main():
115
+ parser = argparse.ArgumentParser(
116
+ description='Train baseline CNN on MNIST'
117
+ )
118
+ parser.add_argument(
119
+ '--augment', action='store_true', help='Use data augmentation'
120
+ )
121
+ parser.add_argument(
122
+ '--epochs', type=int, default=20,
123
+ help='Number of epochs (default: 20)'
124
+ )
125
+ parser.add_argument(
126
+ '--lr', type=float, default=0.001,
127
+ help='Learning rate (default: 0.001)'
128
+ )
129
+ parser.add_argument(
130
+ '--batch-size', type=int, default=64,
131
+ help='Batch size (default: 64)'
132
+ )
133
+ parser.add_argument(
134
+ '--patience', type=int, default=5,
135
+ help='Early stopping patience (default: 5)'
136
+ )
137
  args = parser.parse_args()
138
+
139
  print("=" * 60)
140
  print("MNIST CNN Training - Baseline Model")
141
  print("=" * 60)
 
146
  print(f" Augmentation: {'Yes' if args.augment else 'No'}")
147
  print(f" Early Stopping Patience: {args.patience}")
148
  print()
149
+
150
  # 1. Load data
151
  print("1. Loading MNIST dataset...")
152
  data_path = project_root / "data" / "raw"
 
160
  print(f"✓ Loaded {len(x_train):,} training samples")
161
  print(f"✓ Loaded {len(x_test):,} test samples")
162
  print()
163
+
164
  # 2. Train/val split
165
  print("2. Creating train/validation split...")
166
  (x_train_split, y_train_split), (x_val, y_val) = split_train_val(
 
170
  print(f"✓ Validation: {len(x_val):,} samples")
171
  print(f"✓ Test: {len(x_test):,} samples")
172
  print()
173
+
174
  # 3. Create datasets with optional augmentation
175
  print("3. Creating datasets...")
176
  augmentation = get_train_augmentation() if args.augment else None
177
+
178
  train_dataset = MnistDataset(x_train_split, y_train_split, transform=augmentation)
179
  val_dataset = MnistDataset(x_val, y_val, transform=None)
180
  test_dataset = MnistDataset(x_test, y_test, transform=None)
181
+
182
  train_loader, val_loader = create_dataloaders(
183
  train_dataset, val_dataset, batch_size=args.batch_size, num_workers=2
184
  )
 
189
  print(f"✓ Val batches: {len(val_loader)}")
190
  print(f"✓ Test batches: {len(test_loader)}")
191
  print()
192
+
193
  # 4. Create model
194
  print("4. Creating model...")
195
  model = BaselineCNN()
196
  print(get_model_summary(model))
197
  print()
198
+
199
  # 5. Train model
200
  print("5. Training model...")
201
  print("-" * 60)
 
213
  )
214
  print("-" * 60)
215
  print()
216
+
217
  # 6. Load best model and evaluate
218
  print("6. Evaluating best model on test set...")
219
  checkpoint = torch.load('models/best_model.pt', map_location='cpu')
220
  model.load_state_dict(checkpoint['model_state_dict'])
221
+
222
  device = 'cuda' if torch.cuda.is_available() else 'cpu'
223
  results = evaluate_model(model, test_loader, device=device)
224
+
225
  print(f"✓ Test Accuracy: {results['accuracy']:.2f}%")
226
  print()
227
+
228
  # 7. Print detailed metrics
229
  print("7. Per-class metrics:")
230
  print("-" * 60)
231
  report = results['classification_report']
232
+ print(
233
+ f"{'Digit':<8} {'Precision':<12} {'Recall':<12} "
234
+ f"{'F1-Score':<12} {'Support':<10}"
235
+ )
236
  print("-" * 60)
237
  for digit in range(10):
238
  if str(digit) in report:
239
  metrics = report[str(digit)]
240
+ print(
241
+ f"{digit:<8} {metrics['precision']:<12.3f} "
242
+ f"{metrics['recall']:<12.3f} "
243
+ f"{metrics['f1-score']:<12.3f} {metrics['support']:<10}"
244
+ )
245
+
246
  print("-" * 60)
247
+ acc_line = (
248
+ f"{'Accuracy':<8} {' ':<12} {' ':<12} "
249
+ f"{report['accuracy']:<12.3f} "
250
+ f"{report['macro avg']['support']:<10}"
251
+ )
252
+ print(acc_line)
253
+ macro_line = (
254
+ f"{'Macro Avg':<8} {report['macro avg']['precision']:<12.3f} "
255
+ f"{report['macro avg']['recall']:<12.3f} "
256
+ f"{report['macro avg']['f1-score']:<12.3f} "
257
+ f"{report['macro avg']['support']:<10}"
258
+ )
259
+ print(macro_line)
260
  print()
261
+
262
  # 8. Save results
263
  print("8. Saving results...")
264
  results_dir = project_root / "results"
265
  results_dir.mkdir(exist_ok=True)
266
+
267
  # Save history
268
  history_path = results_dir / "baseline_training_history.json"
269
  save_training_history(history, str(history_path))
270
+
271
  # Plot training curves
272
  curves_path = results_dir / "baseline_training_curves.png"
273
  plot_training_history(history, str(curves_path))
274
+
275
  # Plot confusion matrix
276
  conf_matrix_path = results_dir / "baseline_confusion_matrix.png"
277
  plot_confusion_matrix(results['confusion_matrix'], str(conf_matrix_path))
278
+
279
  # Save evaluation metrics
280
  metrics_path = results_dir / "baseline_metrics.json"
281
  # Convert numpy arrays to lists for JSON serialization
 
300
  json.dump(metrics_data, f, indent=2)
301
  print(f"Evaluation metrics saved to {metrics_path}")
302
  print()
303
+
304
  # 9. Summary
305
  print("=" * 60)
306
  print("✅ TRAINING COMPLETE")
 
316
  print(f" - Training curves: {curves_path}")
317
  print(f" - Confusion matrix: {conf_matrix_path}")
318
  print(f" - Metrics: {metrics_path}")
319
+
320
  return 0
321
 
322
 
scripts/train_with_mlflow.py CHANGED
@@ -50,7 +50,7 @@ def train_with_mlflow(
50
  ) -> dict:
51
  """
52
  Train model with full MLflow tracking.
53
-
54
  Args:
55
  model: PyTorch model to train
56
  train_loader: Training data loader
@@ -58,23 +58,23 @@ def train_with_mlflow(
58
  test_loader: Test data loader
59
  config: Training configuration dictionary
60
  run_name: Optional name for MLflow run
61
-
62
  Returns:
63
  Training history dictionary
64
  """
65
  device = config['device']
66
  num_epochs = config['num_epochs']
67
  learning_rate = config['learning_rate']
68
-
69
  # Setup MLflow
70
  setup_mlflow("mnist-digit-classification")
71
-
72
  # Start MLflow run
73
  with mlflow.start_run(run_name=run_name):
74
  print("\n" + "="*70)
75
  print(f"MLflow Run ID: {mlflow.active_run().info.run_id}")
76
  print("="*70 + "\n")
77
-
78
  # Log all configuration
79
  print("Logging configuration to MLflow...")
80
  log_training_config(config)
@@ -87,7 +87,7 @@ def train_with_mlflow(
87
  augmentation=config.get('augmentation', False)
88
  )
89
  log_system_info()
90
-
91
  # Log model architecture as text
92
  total_params, trainable_params = count_parameters(model)
93
  model_summary = f"""
@@ -100,14 +100,14 @@ Architecture:
100
  {str(model)}
101
  """
102
  mlflow.log_text(model_summary, "model_architecture.txt")
103
-
104
  # Setup training
105
  criterion = nn.CrossEntropyLoss()
106
  optimizer = optim.Adam(model.parameters(), lr=learning_rate)
107
  scheduler = optim.lr_scheduler.ReduceLROnPlateau(
108
  optimizer, mode='min', patience=3, factor=0.5, verbose=True
109
  )
110
-
111
  # Training history
112
  history = {
113
  'train_loss': [],
@@ -116,37 +116,39 @@ Architecture:
116
  'val_accuracy': [],
117
  'learning_rate': []
118
  }
119
-
120
  best_val_loss = float('inf')
121
  patience = 5
122
  patience_counter = 0
123
-
124
  print(f"\nStarting training for {num_epochs} epochs...")
125
  print(f"Device: {device}")
126
  total_p, _ = count_parameters(model)
127
  print(f"Model: {model.__class__.__name__} ({total_p:,} parameters)")
128
  print("-" * 70)
129
-
130
  for epoch in range(num_epochs):
131
  # Train
132
- train_metrics = train_epoch(model, train_loader, criterion, optimizer, device)
133
-
 
 
134
  # Validate
135
  val_metrics = validate(model, val_loader, criterion, device)
136
-
137
  # Get current learning rate
138
  current_lr = optimizer.param_groups[0]['lr']
139
-
140
  # Update scheduler
141
  scheduler.step(val_metrics['loss'])
142
-
143
  # Save history
144
  history['train_loss'].append(train_metrics['loss'])
145
  history['train_accuracy'].append(train_metrics['accuracy'])
146
  history['val_loss'].append(val_metrics['loss'])
147
  history['val_accuracy'].append(val_metrics['accuracy'])
148
  history['learning_rate'].append(current_lr)
149
-
150
  # Log metrics to MLflow
151
  mlflow_metrics = {
152
  'train_loss': train_metrics['loss'],
@@ -157,19 +159,23 @@ Architecture:
157
  'epoch': epoch + 1
158
  }
159
  log_metrics_epoch(mlflow_metrics, step=epoch)
160
-
161
  # Print progress
162
- print(f"Epoch {epoch+1}/{num_epochs} | "
163
- f"Train Loss: {train_metrics['loss']:.4f} ({train_metrics['accuracy']:.2f}%) | "
164
- f"Val Loss: {val_metrics['loss']:.4f} ({val_metrics['accuracy']:.2f}%) | "
165
- f"LR: {current_lr:.6f}")
166
-
 
 
 
 
167
  # Save best model
168
  if val_metrics['loss'] < best_val_loss:
169
  best_val_loss = val_metrics['loss']
170
  best_epoch = epoch + 1
171
  patience_counter = 0
172
-
173
  # Save checkpoint
174
  checkpoint_path = project_root / 'models' / 'best_model_mlflow.pt'
175
  torch.save({
@@ -180,9 +186,9 @@ Architecture:
180
  'val_loss': val_metrics['loss'],
181
  'val_accuracy': val_metrics['accuracy'],
182
  }, checkpoint_path)
183
-
184
  print(f" → New best model! (Val Loss: {best_val_loss:.4f})")
185
-
186
  # Log model to MLflow
187
  mlflow.pytorch.log_model(
188
  model,
@@ -191,18 +197,18 @@ Architecture:
191
  )
192
  else:
193
  patience_counter += 1
194
-
195
  # Early stopping
196
  if patience_counter >= patience:
197
  print(f"\nEarly stopping triggered after {epoch+1} epochs")
198
  mlflow.log_param("early_stopped", True)
199
  mlflow.log_param("early_stop_epoch", epoch + 1)
200
  break
201
-
202
  print("-" * 70)
203
  print("\nTraining complete!")
204
  print(f"Best epoch: {best_epoch} (Val Loss: {best_val_loss:.4f})")
205
-
206
  # Log best metrics
207
  mlflow.log_metrics({
208
  'best_epoch': best_epoch,
@@ -210,24 +216,24 @@ Architecture:
210
  'final_train_loss': history['train_loss'][-1],
211
  'final_val_loss': history['val_loss'][-1]
212
  })
213
-
214
  # Evaluate on test set
215
  print("\nEvaluating on test set...")
216
  test_metrics = evaluate_model(model, test_loader, device)
217
-
218
  test_accuracy = test_metrics['accuracy']
219
  test_report = test_metrics['classification_report']
220
-
221
  # Extract macro average metrics
222
  test_precision = test_report['macro avg']['precision']
223
  test_recall = test_report['macro avg']['recall']
224
  test_f1_score = test_report['macro avg']['f1-score']
225
-
226
  print(f"Test Accuracy: {test_accuracy:.2f}%")
227
  print(f"Test Precision: {test_precision:.4f}")
228
  print(f"Test Recall: {test_recall:.4f}")
229
  print(f"Test F1-Score: {test_f1_score:.4f}")
230
-
231
  # Log test metrics to MLflow
232
  mlflow.log_metrics({
233
  'test_accuracy': test_accuracy,
@@ -235,16 +241,16 @@ Architecture:
235
  'test_recall': test_recall,
236
  'test_f1_score': test_f1_score
237
  })
238
-
239
  # Save and log artifacts
240
  print("\nSaving artifacts...")
241
-
242
  # Save history
243
  history_path = project_root / 'results' / 'mlflow_training_history.json'
244
  history_path.parent.mkdir(exist_ok=True)
245
  save_training_history(history, history_path)
246
  log_artifact_path(str(history_path))
247
-
248
  # Save test metrics
249
  metrics_to_save = {
250
  'test_accuracy': test_accuracy,
@@ -258,43 +264,63 @@ Architecture:
258
  with open(metrics_path, 'w') as f:
259
  json.dump(metrics_to_save, f, indent=2)
260
  log_artifact_path(str(metrics_path))
261
-
262
  # Save model checkpoint
263
  log_artifact_path(str(project_root / 'models' / 'best_model_mlflow.pt'))
264
-
265
  # Log confusion matrix as JSON
266
  conf_matrix_dict = {
267
  f"row_{i}": test_metrics['confusion_matrix'][i].tolist()
268
  for i in range(len(test_metrics['confusion_matrix']))
269
  }
270
  mlflow.log_dict(conf_matrix_dict, "confusion_matrix.json")
271
-
272
  # Log classification report
273
  mlflow.log_dict(test_report, "classification_report.json")
274
-
275
  print("\n✓ All artifacts logged to MLflow")
276
  print("View results: mlflow ui --backend-store-uri file:./mlruns")
277
-
278
  return history
279
 
280
 
281
  def main():
282
- parser = argparse.ArgumentParser(description='Train MNIST CNN with MLflow tracking')
283
- parser.add_argument('--epochs', type=int, default=20, help='Number of epochs (default: 20)')
284
- parser.add_argument('--lr', type=float, default=0.001, help='Learning rate (default: 0.001)')
285
- parser.add_argument('--batch-size', type=int, default=64, help='Batch size (default: 64)')
286
- parser.add_argument('--augment', action='store_true', help='Use data augmentation')
287
- parser.add_argument('--run-name', type=str, default=None, help='MLflow run name')
288
- parser.add_argument('--seed', type=int, default=42, help='Random seed (default: 42)')
289
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
290
  args = parser.parse_args()
291
-
292
  # Set random seeds
293
  torch.manual_seed(args.seed)
294
  np.random.seed(args.seed)
295
  if torch.cuda.is_available():
296
  torch.cuda.manual_seed(args.seed)
297
-
298
  # Configuration
299
  config = {
300
  'num_epochs': args.epochs,
@@ -307,10 +333,10 @@ def main():
307
  'scheduler': 'ReduceLROnPlateau',
308
  'early_stopping_patience': 5
309
  }
310
-
311
  print("Training Configuration:")
312
  print(json.dumps(config, indent=2))
313
-
314
  # Load MNIST data
315
  print("\nLoading MNIST data...")
316
  data_path = project_root / 'data' / 'raw'
@@ -321,18 +347,18 @@ def main():
321
  test_labels_filepath=str(data_path / 't10k-labels.idx1-ubyte')
322
  )
323
  (x_train, y_train), (x_test, y_test) = loader.load_data()
324
-
325
  # Split train/val
326
  (x_train_split, y_train_split), (x_val, y_val) = split_train_val(
327
  x_train, y_train, val_split=0.15, random_seed=args.seed
328
  )
329
-
330
  # Create datasets with optional augmentation
331
  augmentation = get_train_augmentation() if args.augment else None
332
  train_dataset = MnistDataset(x_train_split, y_train_split, transform=augmentation)
333
  val_dataset = MnistDataset(x_val, y_val, transform=None)
334
  test_dataset = MnistDataset(x_test, y_test, transform=None)
335
-
336
  # Create data loaders
337
  train_loader, val_loader = create_dataloaders(
338
  train_dataset, val_dataset, batch_size=args.batch_size, num_workers=2
@@ -340,20 +366,20 @@ def main():
340
  test_loader = torch.utils.data.DataLoader(
341
  test_dataset, batch_size=args.batch_size, shuffle=False, num_workers=2
342
  )
343
-
344
  print(f"Train: {len(train_loader.dataset)} samples")
345
  print(f"Val: {len(val_loader.dataset)} samples")
346
  print(f"Test: {len(test_loader.dataset)} samples")
347
-
348
  # Create model
349
  model = BaselineCNN().to(config['device'])
350
-
351
  # Train with MLflow
352
  train_with_mlflow(
353
  model, train_loader, val_loader, test_loader,
354
  config, run_name=args.run_name
355
  )
356
-
357
  print("\n" + "="*70)
358
  print("Training complete! View MLflow dashboard:")
359
  print(" ./scripts/launch_mlflow_ui.sh")
 
50
  ) -> dict:
51
  """
52
  Train model with full MLflow tracking.
53
+
54
  Args:
55
  model: PyTorch model to train
56
  train_loader: Training data loader
 
58
  test_loader: Test data loader
59
  config: Training configuration dictionary
60
  run_name: Optional name for MLflow run
61
+
62
  Returns:
63
  Training history dictionary
64
  """
65
  device = config['device']
66
  num_epochs = config['num_epochs']
67
  learning_rate = config['learning_rate']
68
+
69
  # Setup MLflow
70
  setup_mlflow("mnist-digit-classification")
71
+
72
  # Start MLflow run
73
  with mlflow.start_run(run_name=run_name):
74
  print("\n" + "="*70)
75
  print(f"MLflow Run ID: {mlflow.active_run().info.run_id}")
76
  print("="*70 + "\n")
77
+
78
  # Log all configuration
79
  print("Logging configuration to MLflow...")
80
  log_training_config(config)
 
87
  augmentation=config.get('augmentation', False)
88
  )
89
  log_system_info()
90
+
91
  # Log model architecture as text
92
  total_params, trainable_params = count_parameters(model)
93
  model_summary = f"""
 
100
  {str(model)}
101
  """
102
  mlflow.log_text(model_summary, "model_architecture.txt")
103
+
104
  # Setup training
105
  criterion = nn.CrossEntropyLoss()
106
  optimizer = optim.Adam(model.parameters(), lr=learning_rate)
107
  scheduler = optim.lr_scheduler.ReduceLROnPlateau(
108
  optimizer, mode='min', patience=3, factor=0.5, verbose=True
109
  )
110
+
111
  # Training history
112
  history = {
113
  'train_loss': [],
 
116
  'val_accuracy': [],
117
  'learning_rate': []
118
  }
119
+
120
  best_val_loss = float('inf')
121
  patience = 5
122
  patience_counter = 0
123
+
124
  print(f"\nStarting training for {num_epochs} epochs...")
125
  print(f"Device: {device}")
126
  total_p, _ = count_parameters(model)
127
  print(f"Model: {model.__class__.__name__} ({total_p:,} parameters)")
128
  print("-" * 70)
129
+
130
  for epoch in range(num_epochs):
131
  # Train
132
+ train_metrics = train_epoch(
133
+ model, train_loader, criterion, optimizer, device
134
+ )
135
+
136
  # Validate
137
  val_metrics = validate(model, val_loader, criterion, device)
138
+
139
  # Get current learning rate
140
  current_lr = optimizer.param_groups[0]['lr']
141
+
142
  # Update scheduler
143
  scheduler.step(val_metrics['loss'])
144
+
145
  # Save history
146
  history['train_loss'].append(train_metrics['loss'])
147
  history['train_accuracy'].append(train_metrics['accuracy'])
148
  history['val_loss'].append(val_metrics['loss'])
149
  history['val_accuracy'].append(val_metrics['accuracy'])
150
  history['learning_rate'].append(current_lr)
151
+
152
  # Log metrics to MLflow
153
  mlflow_metrics = {
154
  'train_loss': train_metrics['loss'],
 
159
  'epoch': epoch + 1
160
  }
161
  log_metrics_epoch(mlflow_metrics, step=epoch)
162
+
163
  # Print progress
164
+ print(
165
+ f"Epoch {epoch+1}/{num_epochs} | "
166
+ f"Train Loss: {train_metrics['loss']:.4f} "
167
+ f"({train_metrics['accuracy']:.2f}%) | "
168
+ f"Val Loss: {val_metrics['loss']:.4f} "
169
+ f"({val_metrics['accuracy']:.2f}%) | "
170
+ f"LR: {current_lr:.6f}"
171
+ )
172
+
173
  # Save best model
174
  if val_metrics['loss'] < best_val_loss:
175
  best_val_loss = val_metrics['loss']
176
  best_epoch = epoch + 1
177
  patience_counter = 0
178
+
179
  # Save checkpoint
180
  checkpoint_path = project_root / 'models' / 'best_model_mlflow.pt'
181
  torch.save({
 
186
  'val_loss': val_metrics['loss'],
187
  'val_accuracy': val_metrics['accuracy'],
188
  }, checkpoint_path)
189
+
190
  print(f" → New best model! (Val Loss: {best_val_loss:.4f})")
191
+
192
  # Log model to MLflow
193
  mlflow.pytorch.log_model(
194
  model,
 
197
  )
198
  else:
199
  patience_counter += 1
200
+
201
  # Early stopping
202
  if patience_counter >= patience:
203
  print(f"\nEarly stopping triggered after {epoch+1} epochs")
204
  mlflow.log_param("early_stopped", True)
205
  mlflow.log_param("early_stop_epoch", epoch + 1)
206
  break
207
+
208
  print("-" * 70)
209
  print("\nTraining complete!")
210
  print(f"Best epoch: {best_epoch} (Val Loss: {best_val_loss:.4f})")
211
+
212
  # Log best metrics
213
  mlflow.log_metrics({
214
  'best_epoch': best_epoch,
 
216
  'final_train_loss': history['train_loss'][-1],
217
  'final_val_loss': history['val_loss'][-1]
218
  })
219
+
220
  # Evaluate on test set
221
  print("\nEvaluating on test set...")
222
  test_metrics = evaluate_model(model, test_loader, device)
223
+
224
  test_accuracy = test_metrics['accuracy']
225
  test_report = test_metrics['classification_report']
226
+
227
  # Extract macro average metrics
228
  test_precision = test_report['macro avg']['precision']
229
  test_recall = test_report['macro avg']['recall']
230
  test_f1_score = test_report['macro avg']['f1-score']
231
+
232
  print(f"Test Accuracy: {test_accuracy:.2f}%")
233
  print(f"Test Precision: {test_precision:.4f}")
234
  print(f"Test Recall: {test_recall:.4f}")
235
  print(f"Test F1-Score: {test_f1_score:.4f}")
236
+
237
  # Log test metrics to MLflow
238
  mlflow.log_metrics({
239
  'test_accuracy': test_accuracy,
 
241
  'test_recall': test_recall,
242
  'test_f1_score': test_f1_score
243
  })
244
+
245
  # Save and log artifacts
246
  print("\nSaving artifacts...")
247
+
248
  # Save history
249
  history_path = project_root / 'results' / 'mlflow_training_history.json'
250
  history_path.parent.mkdir(exist_ok=True)
251
  save_training_history(history, history_path)
252
  log_artifact_path(str(history_path))
253
+
254
  # Save test metrics
255
  metrics_to_save = {
256
  'test_accuracy': test_accuracy,
 
264
  with open(metrics_path, 'w') as f:
265
  json.dump(metrics_to_save, f, indent=2)
266
  log_artifact_path(str(metrics_path))
267
+
268
  # Save model checkpoint
269
  log_artifact_path(str(project_root / 'models' / 'best_model_mlflow.pt'))
270
+
271
  # Log confusion matrix as JSON
272
  conf_matrix_dict = {
273
  f"row_{i}": test_metrics['confusion_matrix'][i].tolist()
274
  for i in range(len(test_metrics['confusion_matrix']))
275
  }
276
  mlflow.log_dict(conf_matrix_dict, "confusion_matrix.json")
277
+
278
  # Log classification report
279
  mlflow.log_dict(test_report, "classification_report.json")
280
+
281
  print("\n✓ All artifacts logged to MLflow")
282
  print("View results: mlflow ui --backend-store-uri file:./mlruns")
283
+
284
  return history
285
 
286
 
287
  def main():
288
+ parser = argparse.ArgumentParser(
289
+ description='Train MNIST CNN with MLflow tracking'
290
+ )
291
+ parser.add_argument(
292
+ '--epochs', type=int, default=20,
293
+ help='Number of epochs (default: 20)'
294
+ )
295
+ parser.add_argument(
296
+ '--lr', type=float, default=0.001,
297
+ help='Learning rate (default: 0.001)'
298
+ )
299
+ parser.add_argument(
300
+ '--batch-size', type=int, default=64,
301
+ help='Batch size (default: 64)'
302
+ )
303
+ parser.add_argument(
304
+ '--augment', action='store_true',
305
+ help='Use data augmentation'
306
+ )
307
+ parser.add_argument(
308
+ '--run-name', type=str, default=None,
309
+ help='MLflow run name'
310
+ )
311
+ parser.add_argument(
312
+ '--seed', type=int, default=42,
313
+ help='Random seed (default: 42)'
314
+ )
315
+
316
  args = parser.parse_args()
317
+
318
  # Set random seeds
319
  torch.manual_seed(args.seed)
320
  np.random.seed(args.seed)
321
  if torch.cuda.is_available():
322
  torch.cuda.manual_seed(args.seed)
323
+
324
  # Configuration
325
  config = {
326
  'num_epochs': args.epochs,
 
333
  'scheduler': 'ReduceLROnPlateau',
334
  'early_stopping_patience': 5
335
  }
336
+
337
  print("Training Configuration:")
338
  print(json.dumps(config, indent=2))
339
+
340
  # Load MNIST data
341
  print("\nLoading MNIST data...")
342
  data_path = project_root / 'data' / 'raw'
 
347
  test_labels_filepath=str(data_path / 't10k-labels.idx1-ubyte')
348
  )
349
  (x_train, y_train), (x_test, y_test) = loader.load_data()
350
+
351
  # Split train/val
352
  (x_train_split, y_train_split), (x_val, y_val) = split_train_val(
353
  x_train, y_train, val_split=0.15, random_seed=args.seed
354
  )
355
+
356
  # Create datasets with optional augmentation
357
  augmentation = get_train_augmentation() if args.augment else None
358
  train_dataset = MnistDataset(x_train_split, y_train_split, transform=augmentation)
359
  val_dataset = MnistDataset(x_val, y_val, transform=None)
360
  test_dataset = MnistDataset(x_test, y_test, transform=None)
361
+
362
  # Create data loaders
363
  train_loader, val_loader = create_dataloaders(
364
  train_dataset, val_dataset, batch_size=args.batch_size, num_workers=2
 
366
  test_loader = torch.utils.data.DataLoader(
367
  test_dataset, batch_size=args.batch_size, shuffle=False, num_workers=2
368
  )
369
+
370
  print(f"Train: {len(train_loader.dataset)} samples")
371
  print(f"Val: {len(val_loader.dataset)} samples")
372
  print(f"Test: {len(test_loader.dataset)} samples")
373
+
374
  # Create model
375
  model = BaselineCNN().to(config['device'])
376
+
377
  # Train with MLflow
378
  train_with_mlflow(
379
  model, train_loader, val_loader, test_loader,
380
  config, run_name=args.run_name
381
  )
382
+
383
  print("\n" + "="*70)
384
  print("Training complete! View MLflow dashboard:")
385
  print(" ./scripts/launch_mlflow_ui.sh")