Commit 9727e5e (1 parent: 8a42349)
minor fixes for src/dataset
Files changed:
- src/data/dataset.py   +79 -40
- src/main.py           +30 -0
- src/main/parser.py    +43 -62
- src/main/trainer.py   +6 -3
- src/utils.py          +13 -4
src/data/dataset.py
CHANGED
@@ -1,19 +1,41 @@
-"""Module for loading and processing .mat files containing channel estimates for PyTorch.
+"""Module for loading and processing .mat files containing channel estimates for PyTorch.
+
+This module expects .mat files with a specific naming convention and internal structure:
+
+File Naming Convention:
+    Files must follow the pattern: {file_number}_SNR-{snr}_DS-{delay_spread}_DOP-{doppler}_N-{pilot_freq}_{channel_type}.mat
+
+    Example: 1_SNR-20_DS-50_DOP-500_N-3_TDL-A.mat
+    - file_number: Sequential file identifier
+    - SNR: Signal-to-Noise Ratio in dB
+    - DS: Delay Spread
+    - DOP: Maximum Doppler Shift
+    - N: Pilot placement frequency
+    - channel_type: Channel model type (e.g., TDL-A)
+
+File Content Structure:
+    Each .mat file must contain a variable 'H' with shape [subcarriers, symbols, 3]:
+    - H[:, :, 0]: Ground truth channel (complex values)
+    - H[:, :, 1]: LS channel estimate with zeros for non-pilot positions
+    - H[:, :, 2]: Unused (reserved for future use)
+
+The dataset extracts pilot values from the LS estimates and provides metadata from the filename
+for adaptive channel estimation models.
+"""
 from pathlib import Path
 from typing import Callable, List, Optional, Tuple, Union
 
 import scipy.io as sio
 import torch
 from torch.utils.data import Dataset, DataLoader
+from pydantic import BaseModel, Field
 
 from src.utils import extract_values
 
 __all__ = ['MatDataset', 'get_test_dataloaders']
 
 
-class PilotDimensions:
+class PilotDimensions(BaseModel):
     """Container for pilot signal dimensions.
 
     Stores and validates the dimensions of pilot signals used in channel estimation.
@@ -22,17 +44,8 @@ class PilotDimensions:
         num_subcarriers: Number of subcarriers in the pilot signal
         num_ofdm_symbols: Number of OFDM symbols in the pilot signal
     """
-    num_subcarriers: int
-    num_ofdm_symbols: int
-
-    def __post_init__(self):
-        """Validate dimensions after initialization.
-
-        Raises:
-            ValueError: If either dimension is not a positive integer
-        """
-        if self.num_subcarriers <= 0 or self.num_ofdm_symbols <= 0:
-            raise ValueError("Pilot dimensions must be positive integers")
+    num_subcarriers: int = Field(..., gt=0, description="Number of subcarriers in the pilot signal")
+    num_ofdm_symbols: int = Field(..., gt=0, description="Number of OFDM symbols in the pilot signal")
 
     def as_tuple(self) -> Tuple[int, int]:
         """Return dimensions as a tuple.
@@ -48,6 +61,20 @@ class MatDataset(Dataset):
 
     Processes .mat files containing channel estimation data and converts them into
     PyTorch complex tensors for channel estimation tasks.
+
+    Expected File Format:
+        - Files must be named according to the pattern:
+          {file_number}_SNR-{snr}_DS-{delay_spread}_DOP-{doppler}_N-{pilot_freq}_{channel_type}.mat
+        - Each .mat file must contain a variable 'H' with shape [subcarriers, symbols, 3]
+          - H[:, :, 0]: Ground truth channel (complex values)
+          - H[:, :, 1]: LS channel estimate with zeros for non-pilot positions
+          - H[:, :, 2]: Bilinear interpolated LS channel estimate
+
+    Returns:
+        For each sample, returns a tuple of:
+        - Pilot LS channel estimate (complex tensor, shape [pilot_subcarriers, pilot_symbols])
+        - Ground truth channel estimate (complex tensor, shape [ofdm_subcarriers, ofdm_symbols])
+        - Metadata tuple: (file_number, snr, delay_spread, doppler, pilot_freq, channel_type)
     """
 
     def __init__(
@@ -59,16 +86,16 @@ class MatDataset(Dataset):
         """Initialize the MatDataset.
 
         Args:
-            data_dir: Path to the directory containing the dataset.
+            data_dir: Path to the directory containing the dataset (should contain .mat files).
             pilot_dims: Dimensions of pilot data as [num_subcarriers, num_ofdm_symbols].
             transform: Optional transformation to apply to samples.
 
         Raises:
-            ValueError: If pilot dimensions are invalid.
             FileNotFoundError: If data_dir doesn't exist.
+            ValueError: If no .mat files are found in data_dir.
         """
         self.data_dir = Path(data_dir)
-        self.pilot_dims = PilotDimensions(
+        self.pilot_dims = PilotDimensions(num_subcarriers=pilot_dims[0], num_ofdm_symbols=pilot_dims[1])
         self.transform = transform
 
         if not self.data_dir.exists():
@@ -88,7 +115,6 @@
 
     def _process_channel_data(
         self,
-        h_ideal: torch.Tensor,
         mat_data: dict
     ) -> Tuple[torch.Tensor, torch.Tensor]:
         """Process channel data and extract pilot values from LS estimates.
@@ -97,8 +123,7 @@
         returning complex-valued tensors for both estimate and ground truth.
 
         Args:
-            mat_data: Loaded .mat file data
+            mat_data: Loaded .mat file data containing 'H' variable
 
         Returns:
             Tuple of (pilot LS estimate, ground truth channel)
@@ -107,6 +132,9 @@
             ValueError: If the data format is unexpected or processing fails
         """
         try:
+            # Extract ground truth channel
+            h_ideal = torch.tensor(mat_data['H'][:, :, 0], dtype=torch.cfloat)
+
             # Extract LS channel estimate with zero entries
             hzero_ls = torch.tensor(mat_data['H'][:, :, 1], dtype=torch.cfloat)
 
@@ -143,9 +171,10 @@
 
         Returns:
             Tuple containing:
-            - Pilot LS channel estimate (complex tensor)
-            - Ground truth channel estimate (complex tensor)
-            - Metadata
+            - Pilot LS channel estimate (complex tensor, shape [pilot_subcarriers, pilot_symbols])
+            - Ground truth channel estimate (complex tensor, shape [ofdm_subcarriers, ofdm_symbols])
+            - Metadata tuple: (file_number, snr, delay_spread, doppler, pilot_freq, channel_type)
+              All metadata values are torch.Tensor except channel_type which is a list
 
         Raises:
             ValueError: If file format is invalid or processing fails.
@@ -155,16 +184,12 @@
             raise IndexError(f"Index {idx} out of range for dataset of size {len(self)}")
 
         try:
-            # Load .mat file
             mat_data = sio.loadmat(self.file_list[idx])
             if 'H' not in mat_data or mat_data['H'].shape[-1] < 3:
                 raise ValueError("Invalid .mat file format: missing required data")
 
-            # Extract ground truth channel
-            h_ideal = torch.tensor(mat_data['H'][:, :, 0], dtype=torch.cfloat)
-
             # Process channel data to extract pilot estimates
-            h_est, h_ideal = self._process_channel_data(
+            h_est, h_ideal = self._process_channel_data(mat_data)
 
             # Extract metadata from filename
             meta_data = extract_values(self.file_list[idx].name)
@@ -184,7 +209,8 @@
 
 def get_test_dataloaders(
     dataset_dir: Union[str, Path],
+    pilot_dims: List[int],
+    batch_size: int
 ) -> List[Tuple[str, DataLoader]]:
     """Create DataLoaders for each subdirectory in the dataset directory.
 
@@ -192,26 +218,39 @@ def get_test_dataloaders(
     all subdirectories in the specified dataset directory, useful for testing
     across multiple test conditions or scenarios.
 
+    Expected Directory Structure:
+        dataset_dir/
+        ├── DS_50/            # Delay Spread = 50
+        │   ├── 1_SNR-20_DS-50_DOP-500_N-3_TDL-A.mat
+        │   ├── 2_SNR-20_DS-50_DOP-500_N-3_TDL-A.mat
+        │   └── ...
+        ├── DS_100/           # Delay Spread = 100
+        │   ├── 1_SNR-20_DS-100_DOP-500_N-3_TDL-A.mat
+        │   └── ...
+        ├── SNR_10/           # SNR = 10 dB
+        │   ├── 1_SNR-10_DS-50_DOP-500_N-3_TDL-A.mat
+        │   └── ...
+        └── ...
+
+    Each subdirectory should contain .mat files with the naming convention:
+        {file_number}_SNR-{snr}_DS-{delay_spread}_DOP-{doppler}_N-{pilot_freq}_{channel_type}.mat
+
     Args:
         dataset_dir: Path to main directory containing dataset subdirectories
-        - batch_size: Number of samples per batch
+        pilot_dims: List of [num_subcarriers, num_ofdm_symbols] for pilot dimensions
+        batch_size: Number of samples per batch
 
     Returns:
         List of tuples containing (subdirectory_name, corresponding_dataloader)
 
     Raises:
        FileNotFoundError: If dataset_dir doesn't exist
-        ValueError: If
+        ValueError: If no valid subdirectories are found
    """
    dataset_dir = Path(dataset_dir)
    if not dataset_dir.exists():
        raise FileNotFoundError(f"Dataset directory not found: {dataset_dir}")

-    if not isinstance(params, dict) or "pilot_dims" not in params or "batch_size" not in params:
-        raise ValueError("params must be a dict containing 'pilot_dims' and 'batch_size'")
-
    subdirs = [d for d in dataset_dir.iterdir() if d.is_dir()]
    if not subdirs:
        raise ValueError(f"No subdirectories found in {dataset_dir}")
@@ -221,7 +260,7 @@ def get_test_dataloaders(
            subdir.name,
            MatDataset(
                subdir,
+                pilot_dims
            )
        )
        for subdir in subdirs
@@ -230,8 +269,8 @@
    return [
        (name, DataLoader(
            dataset,
-            batch_size=
-            shuffle=False,
+            batch_size=batch_size,
+            shuffle=False,  # no shuffling for testing
            num_workers=0
        ))
        for name, dataset in test_datasets
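Note on the new get_test_dataloaders signature: the old params dict is replaced by explicit pilot_dims and batch_size arguments. A minimal usage sketch of the changed API follows; the paths and pilot dimensions here are illustrative placeholders, not values taken from the repository.

    from torch.utils.data import DataLoader

    from src.data.dataset import MatDataset, get_test_dataloaders

    # Hypothetical pilot grid: 120 pilot subcarriers x 4 pilot OFDM symbols.
    pilot_dims = [120, 4]

    # A single directory of .mat files (hypothetical path) loads as one dataset.
    train_ds = MatDataset("data/train_set", pilot_dims)
    train_loader = DataLoader(train_ds, batch_size=64, shuffle=True)

    # One DataLoader per test-condition subdirectory, e.g. data/test_set/DS_test_set/DS_50, DS_100, ...
    test_loaders = get_test_dataloaders("data/test_set/DS_test_set", pilot_dims, batch_size=64)
    for name, loader in test_loaders:
        h_pilot, h_ideal, meta = next(iter(loader))
        # h_pilot: complex pilot LS estimate, h_ideal: complex ground-truth channel
        print(name, h_pilot.shape, h_ideal.shape)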
src/main.py
CHANGED
@@ -5,6 +5,36 @@ Main entry point for OFDM channel estimation model training.
 This script provides the command-line interface for training OFDM channel estimation
 models. It loads configuration files, parses command-line arguments, and initiates
 the training process.
+
+Dataset Requirements:
+    The training script expects datasets with the following structure:
+
+    Training/Validation Sets:
+        Directory containing .mat files with naming convention:
+        {file_number}_SNR-{snr}_DS-{delay_spread}_DOP-{doppler}_N-{pilot_freq}_{channel_type}.mat
+
+        Example: 1_SNR-20_DS-50_DOP-500_N-3_TDL-A.mat
+
+    Test Sets:
+        Directory with subdirectories for different test conditions:
+        test_set/
+        ├── DS_test_set/      # Delay Spread tests
+        │   ├── DS_50/
+        │   ├── DS_100/
+        │   └── ...
+        ├── SNR_test_set/     # SNR tests
+        │   ├── SNR_10/
+        │   ├── SNR_20/
+        │   └── ...
+        └── MDS_test_set/     # Multi-Doppler tests
+            ├── DOP_200/
+            ├── DOP_400/
+            └── ...
+
+    Each .mat file must contain variable 'H' with shape [subcarriers, symbols, 3]:
+    - H[:, :, 0]: Ground truth channel
+    - H[:, :, 1]: LS channel estimate with zeros for non-pilot positions
+    - H[:, :, 2]: Unused (reserved)
 """
 
 import logging
src/main/parser.py
CHANGED
@@ -7,10 +7,11 @@ their types, default values, and validation rules to ensure proper configuration
 of training runs.
 """
 
-from dataclasses import dataclass
 from pathlib import Path
 import argparse
 from enum import Enum
+from pydantic import BaseModel, Field, model_validator
+from typing import Self
 
 
 class LossType(Enum):
@@ -20,8 +21,7 @@ class LossType(Enum):
     HUBER = "huber"
 
 
-class TrainingArguments:
+class TrainingArguments(BaseModel):
     """Container for OFDM model training arguments.
 
     Stores, validates, and provides access to all parameters needed for
@@ -57,45 +57,34 @@ class TrainingArguments:
     """
 
     # Model Configuration
-    model_name: str
-    system_config_path: Path
-    model_config_path: Path
+    model_name: str = Field(..., description="Model type to train")
+    system_config_path: Path = Field(..., description="Path to OFDM system configuration file")
+    model_config_path: Path = Field(..., description="Path to model configuration file")
 
     # Dataset Paths
-    train_set: Path
-    val_set: Path
-    test_set: Path
+    train_set: Path = Field(..., description="Training dataset folder path")
+    val_set: Path = Field(..., description="Validation dataset folder path")
+    test_set: Path = Field(..., description="Test dataset folder path")
 
     # Experiment Settings
-    exp_id: str
-    python_log_level: str = "INFO"
-    tensorboard_log_dir: Path = Path("runs")
+    exp_id: str = Field(..., description="Experiment identifier for log folder naming")
+    python_log_level: str = Field(default="INFO", description="Logger level for python logging module")
+    tensorboard_log_dir: Path = Field(default=Path("runs"), description="Directory for tensorboard logs")
 
     # Training Hyperparameters
-    batch_size: int = 64
-    lr: float = 1e-3
-    max_epoch: int = 10
-    patience: int = 3
-    loss_type: LossType = LossType.MSE
-    return_type: str = "complex"
+    batch_size: int = Field(default=64, gt=0, description="Training batch size")
+    lr: float = Field(default=1e-3, gt=0, description="Initial learning rate")
+    max_epoch: int = Field(default=10, gt=0, description="Maximum number of training epochs")
+    patience: int = Field(default=3, gt=0, description="Early stopping patience (epochs)")
+    loss_type: LossType = Field(default=LossType.MSE, description="Loss function type")
+    return_type: str = Field(default="complex", description="Type of data to return from dataset")
 
     # Hardware & Evaluation
-    cuda: int = 0
-    test_every_n: int = 10
+    cuda: int = Field(default=0, ge=0, description="CUDA device index (0 for single GPU)")
+    test_every_n: int = Field(default=10, gt=0, description="Test model every N epochs")
 
-        Runs multiple validation checks on the provided arguments to ensure
-        they are consistent and valid for training.
-
-        Raises:
-            ValueError: If any validation check fails
-        """
-        self._validate_paths()
-        self._validate_numeric_args()
-
-    def _validate_paths(self) -> None:
+    @model_validator(mode='after')
+    def validate_paths(self) -> Self:
         """Validate path-related arguments.
 
         Checks that the config files exist and have the correct extension.
@@ -115,33 +104,7 @@ class TrainingArguments:
         if not self.model_config_path.suffix == '.yaml':
             raise ValueError(f"Model config file must be a .yaml file: {self.model_config_path}")
 
-        """Validate numeric arguments.
-
-        Ensures that all numeric parameters have appropriate values:
-        - test_every_n, max_epoch, patience, batch_size, lr must be positive
-        - cuda must be non-negative
-
-        Raises:
-            ValueError: If any numeric argument has an invalid value
-        """
-        if self.test_every_n <= 0:
-            raise ValueError(f"test_every_n must be positive, got: {self.test_every_n}")
-
-        if self.max_epoch <= 0:
-            raise ValueError(f"max_epoch must be positive, got: {self.max_epoch}")
-
-        if self.patience <= 0:
-            raise ValueError(f"patience must be positive, got: {self.patience}")
-
-        if self.batch_size <= 0:
-            raise ValueError(f"batch_size must be positive, got: {self.batch_size}")
-
-        if self.cuda < 0:
-            raise ValueError(f"cuda must be non-negative, got: {self.cuda}")
-
-        if self.lr <= 0:
-            raise ValueError(f"lr must be positive, got: {self.lr}")
+        return self
 
 
 def parse_arguments() -> TrainingArguments:
@@ -278,7 +241,25 @@ def parse_arguments() -> TrainingArguments:
    args = parser.parse_args()

    # Convert loss_type string to enum
+    loss_type = LossType(args.loss_type)

    # Create and validate TrainingArguments
-    return TrainingArguments(
+    return TrainingArguments(
+        model_name=args.model_name,
+        system_config_path=args.system_config_path,
+        model_config_path=args.model_config_path,
+        train_set=args.train_set,
+        val_set=args.val_set,
+        test_set=args.test_set,
+        exp_id=args.exp_id,
+        python_log_level=args.python_log_level,
+        tensorboard_log_dir=args.tensorboard_log_dir,
+        batch_size=args.batch_size,
+        lr=args.lr,
+        max_epoch=args.max_epoch,
+        patience=args.patience,
+        loss_type=loss_type,
+        return_type=args.return_type,
+        cuda=args.cuda,
+        test_every_n=args.test_every_n
+    )
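Note: the dataclass-plus-__post_init__ validation is replaced here by pydantic field constraints plus an after-model validator. A standalone sketch of the same pattern follows; the model and field names are illustrative, not the project's.

    from pathlib import Path
    from typing import Self

    from pydantic import BaseModel, Field, ValidationError, model_validator


    class ExampleArgs(BaseModel):
        config_path: Path = Field(..., description="Path to a .yaml config file")
        batch_size: int = Field(default=64, gt=0)   # gt=0 replaces an explicit `if batch_size <= 0: raise ...`
        cuda: int = Field(default=0, ge=0)          # ge=0 replaces the non-negative check

        @model_validator(mode="after")
        def validate_paths(self) -> Self:
            # Cross-field or filesystem checks still live in an explicit validator.
            if self.config_path.suffix != ".yaml":
                raise ValueError(f"Config file must be a .yaml file: {self.config_path}")
            return self


    try:
        ExampleArgs(config_path=Path("conf/model.txt"), batch_size=0)
    except ValidationError as err:
        # Field-constraint violations are collected into a single ValidationError;
        # the after-validator runs only once all field constraints pass.
        print(err)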
src/main/trainer.py
CHANGED
@@ -163,15 +163,18 @@ class ModelTrainer:
         test_loaders = {
             "DS": get_test_dataloaders(
                 self.args.test_set / "DS_test_set",
+                pilot_dims,
+                self.args.batch_size
             ),
             "MDS": get_test_dataloaders(
                 self.args.test_set / "MDS_test_set",
+                pilot_dims,
+                self.args.batch_size
             ),
             "SNR": get_test_dataloaders(
                 self.args.test_set / "SNR_test_set",
+                pilot_dims,
+                self.args.batch_size
             ),
         }
         return train_loader, val_loader, test_loaders
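Note: each entry in test_loaders now maps a test-set family ("DS", "MDS", "SNR") to the list of (subdirectory_name, DataLoader) pairs returned by get_test_dataloaders. A hedged sketch of walking this structure during evaluation; the model call and MSE metric are placeholders, not the trainer's actual evaluation step.

    import torch


    def evaluate_test_loaders(model, test_loaders):
        """Walk {"DS"/"MDS"/"SNR": [(condition, DataLoader), ...]} and report a per-condition metric."""
        results = {}
        for family, loaders in test_loaders.items():
            for condition, loader in loaders:
                total, batches = 0.0, 0
                for h_pilot, h_ideal, _meta in loader:
                    h_hat = model(h_pilot)  # placeholder forward pass
                    total += torch.mean(torch.abs(h_hat - h_ideal) ** 2).item()
                    batches += 1
                results[f"{family}/{condition}"] = total / max(batches, 1)
        return results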
src/utils.py
CHANGED
@@ -70,19 +70,28 @@ def extract_values(file_name):
     Extract channel information from a file name.
 
     Parses file names with format:
-    '{
+    '{file_number}_SNR-{snr}_DS-{delay_spread}_DOP-{doppler}_N-{pilot_freq}_{channel_type}.mat'
+
+    Example:
+        For filename "1_SNR-20_DS-50_DOP-500_N-3_TDL-A.mat":
+        - file_number: 1
+        - snr: 20 (Signal-to-Noise Ratio in dB)
+        - delay_spread: 50 (Delay Spread)
+        - doppler: 500 (Maximum Doppler Shift)
+        - pilot_freq: 3 (Pilot placement frequency)
+        - channel_type: TDL-A (Channel model type)
 
     Args:
         file_name: The file name from which to extract channel information
 
     Returns:
         tuple: A tuple containing:
-        - file_number (torch.Tensor): The file number
-        - snr (torch.Tensor): Signal-to-noise ratio value
+        - file_number (torch.Tensor): The file number (sequential identifier)
+        - snr (torch.Tensor): Signal-to-noise ratio value in dB
         - delay_spread (torch.Tensor): Delay spread value
         - max_doppler_shift (torch.Tensor): Maximum Doppler shift value
         - pilot_placement_frequency (torch.Tensor): Pilot placement frequency
-        - channel_type (list): The channel type
+        - channel_type (list): The channel type (e.g., ['TDL-A'])
 
     Raises:
         ValueError: If the file name does not match the expected pattern
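Note: this hunk only updates the extract_values docstring; the parsing code itself is not part of the diff. For reference, a minimal regex-based sketch of parsing this naming convention; it is an assumption about how such parsing could look, not the repository's actual implementation, and parse_channel_filename is a hypothetical helper name.

    import re

    import torch

    _PATTERN = re.compile(
        r"^(?P<file_number>\d+)_SNR-(?P<snr>-?\d+)_DS-(?P<ds>\d+)"
        r"_DOP-(?P<dop>\d+)_N-(?P<n>\d+)_(?P<channel>.+)\.mat$"
    )


    def parse_channel_filename(file_name: str):
        """Parse '1_SNR-20_DS-50_DOP-500_N-3_TDL-A.mat'-style names (illustrative only)."""
        match = _PATTERN.match(file_name)
        if match is None:
            raise ValueError(f"File name does not match expected pattern: {file_name}")

        def as_tensor(key: str) -> torch.Tensor:
            return torch.tensor(int(match.group(key)))

        return (
            as_tensor("file_number"),
            as_tensor("snr"),
            as_tensor("ds"),
            as_tensor("dop"),
            as_tensor("n"),
            [match.group("channel")],  # channel_type returned as a list, matching the docstring
        )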