Upload pretraining/train_lwm_spectro.py with huggingface_hub

Browse files

Files changed (1)

pretraining/train_lwm_spectro.py +741 -0

pretraining/train_lwm_spectro.py ADDED Viewed

	@@ -0,0 +1,741 @@

+#!/usr/bin/env python3
+# =============================================================================
+# train_lwm_spectro.py - LWM Pretraining with Complex-Valued Spectrogram Support
+# Modified from train_lwm_spectro_no_contrast.py to handle complex spectrograms
+# by separating real and imaginary parts and flattening them (similar to train_lwm.py)
+# =============================================================================
+# =============================================================================
+# 1. IMPORTS AND WARNINGS SETUP
+#    - Load necessary PyTorch modules, utilities, and suppress UserWarnings
+# =============================================================================
+import sys
+import os
+import argparse
+# Add project root to path (Windows compatible)
+project_root = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
+sys.path.insert(0, project_root)
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from torch.utils.data import DataLoader, random_split, TensorDataset
+import torch.optim as optim
+from utils import (generate_spectrograms_and_labels, tokenizer_train,
+                   create_train_dataloader, count_parameters, train_lwm)
+import numpy as np
+import pretrained_model  # Assuming this contains the LWM model definition
+from torch.optim.lr_scheduler import LambdaLR
+from torch.optim import AdamW
+import warnings
+import platform
+import re
+from tqdm import tqdm
+from datetime import datetime
+import concurrent.futures
+import multiprocessing
+from collections import Counter
+from functools import lru_cache
+import json
+SNR_PATTERN = re.compile(r"SNR(-?\d+)dB")
+DOPPLER_MAP = {"static": 0, "pedestrian": 1, "vehicular": 2}
+DOPPLER_INV = {v: k for k, v in DOPPLER_MAP.items()}
+def _parse_snr_and_doppler(path: str) -> tuple[float, int]:
+    snr_db = 0.0
+    doppler_id = 0
+    matches = SNR_PATTERN.findall(path)
+    if matches:
+        try:
+            snr_db = float(matches[-1])
+        except ValueError:
+            snr_db = 0.0
+    normalized_path = os.path.normpath(path)
+    parts = normalized_path.split(os.sep)
+    for part in parts:
+        if part in DOPPLER_MAP:
+            doppler_id = DOPPLER_MAP[part]
+            break
+    return snr_db, doppler_id
+warnings.filterwarnings("ignore", category=UserWarning)  # silence every UserWarning for the rest of the process
+# True on every platform except Windows. NOTE(review): not referenced again in the visible part of this file -- confirm it is still needed.
+USE_TQDM = platform.system() != 'Windows'
+# Derive a conservative worker count from the CPU core count (kept low with memory usage in mind)
+total_cores = multiprocessing.cpu_count()
+if total_cores >= 16:
+    MAX_WORKERS = min(8, total_cores // 2)  # many-core servers: cap at 8 workers
+else:
+    MAX_WORKERS = max(2, total_cores // 2)  # ordinary machines: half the cores, but at least 2
+print(f"🚀 Using {MAX_WORKERS}/{total_cores} CPU cores for parallel processing")
+PRINT_CONVERSION_STATS = os.environ.get("LWM_PRINT_CONVERSION_STATS", "").strip().lower() in {"1", "true", "yes"}  # opt-in debug prints: set LWM_PRINT_CONVERSION_STATS to 1/true/yes
+def convert_complex_to_interleaved(spectrograms):
+    """
+    Convert complex-valued spectrograms to real-imaginary interleaved format.
+    Similar to patch_maker() in train_lwm.py, this function:
+    1. Extracts real and imaginary parts
+    2. Interleaves them along the last dimension
+    Args:
+        spectrograms (np.ndarray): Complex-valued array of shape (n_samples, n_rows, n_cols)
+                                   or (n_samples, 1, n_rows, n_cols)
+    Returns:
+        np.ndarray: Real-valued array with interleaved real/imag parts
+                   Shape: (n_samples, n_rows, n_cols * 2)
+    """
+    # Handle different input shapes
+    if spectrograms.ndim == 4:
+        # Remove channel dimension if present: (n_samples, 1, n_rows, n_cols) -> (n_samples, n_rows, n_cols)
+        spectrograms = spectrograms[:, 0, :, :]
+    # Check if data is complex
+    if np.iscomplexobj(spectrograms):
+        n_samples, n_rows, n_cols = spectrograms.shape
+        # Extract real and imaginary parts
+        flat_real = spectrograms.real
+        flat_imag = spectrograms.imag
+        # Interleave real and imaginary parts along the last axis
+        # Output shape: (n_samples, n_rows, n_cols * 2)
+        interleaved = np.empty((n_samples, n_rows, n_cols * 2), dtype=np.float32)
+        interleaved[:, :, 0::2] = flat_real  # Even indices: real parts
+        interleaved[:, :, 1::2] = flat_imag  # Odd indices: imaginary parts
+        if PRINT_CONVERSION_STATS:
+            print(f"  ℹ️  Converted complex spectrograms: {spectrograms.shape} -> {interleaved.shape}")
+            print(f"      Real part range: [{flat_real.min():.2e}, {flat_real.max():.2e}]")
+            print(f"      Imag part range: [{flat_imag.min():.2e}, {flat_imag.max():.2e}]")
+        return interleaved
+    else:
+        # Already real-valued, just ensure correct shape
+        if spectrograms.ndim == 3:
+            if PRINT_CONVERSION_STATS:
+                print(f"  ℹ️  Data is already real-valued: {spectrograms.shape}")
+            return spectrograms
+        else:
+            raise ValueError(f"Unexpected spectrogram shape: {spectrograms.shape}")
+def process_single_scenario(scenario_info):
+    """단일 시나리오를 처리하는 함수 (멀티프로세싱용)"""
+    scenario_name, spectrogram_path = scenario_info
+    try:
+        # 메모리 효율성을 위해 필요한 데이터만 로드
+        scenario_spectrograms, scenario_labels = generate_spectrograms_and_labels(
+            scenario_name=scenario_name,
+            spectrogram_path=spectrogram_path,
+            cache_path=None,  # 메모리 문제로 캐시 비활성화
+        )
+        # Validate load
+        if scenario_spectrograms is None or (hasattr(scenario_spectrograms, 'size') and scenario_spectrograms.size == 0):
+            print(f"  ⚠️  No data loaded from: {spectrogram_path}")
+            return None
+        # Convert complex spectrograms to interleaved real-imaginary format
+        scenario_spectrograms = convert_complex_to_interleaved(scenario_spectrograms)
+        snr_db, doppler_id = _parse_snr_and_doppler(spectrogram_path)
+        # 데이터 분할 (인덱스만 계산)
+        total_samples = len(scenario_spectrograms)
+        train_size = int(0.8 * total_samples)
+        val_size = total_samples - train_size
+        # 메모리 절약을 위해 numpy array로 유지 (필요할 때만 tensor로 변환)
+        train_data = np.array(scenario_spectrograms[:train_size], dtype=np.float32)
+        val_data = np.array(scenario_spectrograms[train_size:], dtype=np.float32)
+        snr_array = np.full(total_samples, snr_db, dtype=np.float32)
+        doppler_array = np.full(total_samples, doppler_id, dtype=np.int64)
+        train_meta = {
+            'snr_db': snr_array[:train_size],
+            'doppler_id': doppler_array[:train_size],
+        }
+        val_meta = {
+            'snr_db': snr_array[train_size:],
+            'doppler_id': doppler_array[train_size:],
+        }
+        # 불필요한 데이터 즉시 삭제
+        del scenario_spectrograms
+        return {
+            'scenario': scenario_name,
+            'train_data': train_data,
+            'val_data': val_data,
+            'train_meta': train_meta,
+            'val_meta': val_meta,
+            'train_size': len(train_data),
+            'val_size': len(val_data)
+        }
+    except Exception as e:
+        print(f"❌ Error processing scenario {scenario_name}: {e}")
+        import traceback
+        traceback.print_exc()
+        return None
+# (GPU memory monitor import, formerly used for Lambda, has been removed)
+# =============================================================================
+# 2. SCENARIO LIST DEFINITION
+#    - Define the list of scenario names to iterate over for data generation
+# =============================================================================
+# Communication standards this script recognizes; the CLI can restrict a run to a subset
+SUPPORTED_COMM_TYPES = {"LTE", "WiFi", "5G"}
+def _parse_standard_args():
+    parser = argparse.ArgumentParser(add_help=False)
+    parser.add_argument('--standards', nargs='+', choices=SUPPORTED_COMM_TYPES,
+                        help='Specify one or more communication types to include (default: all).')
+    for comm in SUPPORTED_COMM_TYPES:
+        parser.add_argument(f'--{comm}', dest=f'flag_{comm}', action='store_true',
+                            help=f'Include only {comm} data (can be combined).')
+    parser.add_argument('--city', '--cities', dest='cities', nargs='+',
+                        help='Limit scenarios to one or more city prefixes (e.g., "0" or "city_0").')
+    parser.add_argument(
+        '--normalization',
+        choices=('per_sample', 'dataset'),
+        default='per_sample',
+        help='Normalization mode applied during tokenization (default: %(default)s).'
+    )
+    parser.add_argument('--help', action='help')
+    args, remaining = parser.parse_known_args()
+    enabled = set(SUPPORTED_COMM_TYPES)
+    if args.standards:
+        enabled = set(args.standards)
+    else:
+        flagged = {comm for comm in SUPPORTED_COMM_TYPES if getattr(args, f'flag_{comm}', False)}
+        if flagged:
+            enabled = flagged
+    selected_cities: list[str] | None = None
+    if args.cities:
+        selected_cities = []
+        for city_token in args.cities:
+            token = str(city_token).strip()
+            if not token:
+                continue
+            if token.startswith('city_'):
+                selected_cities.append(token)
+            else:
+                selected_cities.append(f'city_{token}')
+        if not selected_cities:
+            selected_cities = None
+    # Return remaining args to allow downstream parsing if needed
+    sys.argv = [sys.argv[0]] + remaining
+    return enabled, selected_cities, args.normalization
+ENABLED_COMM_TYPES, ENABLED_CITY_PREFIXES, NORMALIZATION_MODE = _parse_standard_args()  # parsed once at module load; the call also rewrites sys.argv
+MAX_SCENARIOS = int(os.environ.get("LWM_MAX_SCENARIOS", "0")) or None  # optional cap on the number of scenarios; unset or 0 becomes None (no cap)
+def _extract_scenario_token(file_path):
+    """Derive the base scenario token (without city) from the file path."""
+    normalized_path = os.path.normpath(file_path)
+    parts = normalized_path.split(os.sep)
+    scenario_parts = []
+    for i, part in enumerate(parts):
+        if part in SUPPORTED_COMM_TYPES:
+            trailing = parts[i:i + 5]
+            if trailing:
+                scenario_parts = trailing[:5]
+            break
+    if not scenario_parts:
+        # Fallback for datasets where the communication type is only captured in the filename
+        base_name = os.path.splitext(os.path.basename(file_path))[0]
+        if base_name.startswith('spectrogram_'):
+            tokens = base_name.split('_')[1:]  # drop 'spectrogram'
+            if tokens and tokens[0] in SUPPORTED_COMM_TYPES:
+                scenario_parts = tokens[:5] if len(tokens) >= 5 else tokens
+    return '_'.join(scenario_parts) if scenario_parts else None
+@lru_cache(maxsize=1)
+def _collect_scenario_file_info():
+    import glob
+    scenario_entries = []
+    # New MATLAB receiver pipeline output
+    new_base = os.path.join('ls_data', 'MATLAB', 'receiver_pipeline')
+    if os.path.isdir(new_base):
+        patterns = [os.path.join(new_base, '*', '**', 'spectrogram_*.mat')]
+        for pattern in patterns:
+            for file_path in sorted(glob.glob(pattern, recursive=True)):
+                norm = os.path.normpath(file_path)
+                parts = norm.split(os.sep)
+                # Determine a grouping token similar to city_name; use the standard folder name
+                try:
+                    idx = parts.index('receiver_pipeline')
+                    city_name = parts[idx + 1] if idx + 1 < len(parts) else 'receiver_pipeline'
+                except ValueError:
+                    city_name = 'receiver_pipeline'
+                base_token = _extract_scenario_token(file_path)
+                if not base_token:
+                    continue
+                comm_type = base_token.split('_', 1)[0]
+                if comm_type not in ENABLED_COMM_TYPES:
+                    continue
+                scenario_id = f"{city_name}::{base_token}"
+                scenario_entries.append((scenario_id, file_path, city_name, base_token))
+    # Legacy repo layouts under spectrograms/city_*
+    import glob as _glob
+    for city_dir in sorted(_glob.glob(os.path.join('spectrograms', 'city_*'))):
+        if not os.path.isdir(city_dir):
+            continue
+        city_name = os.path.basename(city_dir)
+        if ENABLED_CITY_PREFIXES:
+            if not any(city_name.startswith(prefix) for prefix in ENABLED_CITY_PREFIXES):
+                continue
+        # Look for complex spectrogram outputs; support both nested and flat layouts
+        candidate_patterns = [
+            os.path.join(city_dir, '**', 'complex_raw', '**', 'spectrogram_*.mat'),
+            os.path.join(city_dir, '**', 'spectrogram_*.mat'),
+        ]
+        city_files = []
+        seen_paths = set()
+        for pattern in candidate_patterns:
+            for file_path in sorted(_glob.glob(pattern, recursive=True)):
+                if not file_path.lower().endswith('.mat'):
+                    continue
+                if file_path in seen_paths:
+                    continue
+                seen_paths.add(file_path)
+                city_files.append(file_path)
+        # Fallback: 512FFT pattern (기존 호환성)
+        if not city_files:
+            pattern = os.path.join(city_dir, '**', '512FFT', '**', 'spectrograms', '*.pkl')
+            city_files = sorted(_glob.glob(pattern, recursive=True))
+        for file_path in city_files:
+            base_token = _extract_scenario_token(file_path)
+            if not base_token:
+                continue
+            comm_type = base_token.split('_', 1)[0]
+            if comm_type not in ENABLED_COMM_TYPES:
+                continue
+            scenario_id = f"{city_name}::{base_token}"
+            scenario_entries.append((scenario_id, file_path, city_name, base_token))
+    if MAX_SCENARIOS:
+        scenario_entries = scenario_entries[:MAX_SCENARIOS]
+    return scenario_entries
+def scenarios_list():
+    scenario_entries = _collect_scenario_file_info()
+    if not scenario_entries:
+        print("⚠️  No spectrogram files found for pretraining.")
+        return np.array([])
+    print(f"Enabled communication types: {sorted(ENABLED_COMM_TYPES)}")
+    if ENABLED_CITY_PREFIXES:
+        print(f"Selected city prefixes: {sorted(ENABLED_CITY_PREFIXES)}")
+    city_counts = Counter(entry[2] for entry in scenario_entries)
+    print("Using scenarios from the following city datasets:")
+    for city_name, count in city_counts.items():
+        print(f"  - {city_name}: {count} files")
+    print(f"Total scenarios selected: {len(scenario_entries)}")
+    return np.array([entry[0] for entry in scenario_entries])
+# =============================================================================
+# 3. SCENARIO PROPERTIES MAPPING
+#    - Map each scenario name to its corresponding properties
+# =============================================================================
+def scenario_prop():
+    scenario_entries = _collect_scenario_file_info()
+    row_column_users = {}
+    for scenario_id, file_path, city_name, _ in scenario_entries:
+        row_column_users[scenario_id] = {
+            'spectrogram_path': file_path,
+            'cache_path': os.path.join('spectrograms', city_name, 'spectrogram_cache_128x128.pkl')
+        }
+    return row_column_users
+# =============================================================================
+# 4. TRAINING PARAMETERS AND HYPERPARAMETERS
+#    - Set training epochs, batch sizes, learning rates, model dimensions, etc.
+# =============================================================================
+EPOCHS = 20  # Increased for better convergence
+# Optimized batch size for A100 GPU (40GB)
+BATCH_SIZE = 16
+VAL_BATCH_SIZE = 16
+WARMUP_EPOCHS = 5  # epochs of linear LR warm-up before the cosine decay starts
+BASE_LR = 5e-4  # learning rate given to AdamW; the schedule scales it
+MIN_LR = 1e-8  # floor reached at the end of the cosine decay
+# Updated for 128x128 complex spectrograms with real-imaginary interleaving
+N_ROWS = 4
+N_COLUMNS = 4
+ELEMENT_LENGTH = N_ROWS * N_COLUMNS * 2  # Complex spectrograms: 2x for real+imaginary interleaving
+D_MODEL = 128
+MAX_LEN = 1025  # (128/4) * (128/4) + 1 = 32 * 32 + 1 = 1024 + 1 for [CLS] token
+# Interleaving keeps the same number of spatial patches (32x32) while doubling patch width
+# so each token covers 4x4 complex bins (real+imag) and sequence length stays at 1025.
+N_LAYERS = 12
+device_idx = 0  # NOTE(review): not referenced in the visible part of this file (the device is chosen later as cuda:0 / mps / cpu)
+WEIGHT_DECAY = 0.05  # AdamW weight decay
+BETA1 = 0.9  # AdamW betas[0]
+BETA2 = 0.999  # AdamW betas[1]
+MASK_PERCENT = 0.6  # passed to tokenizer_train as masking_percent
+N_HEADS = 8
+DROPOUT = 0.1
+print(f"📊 Model configuration for complex spectrograms:")
+print(f"   Patch size: {N_ROWS}x{N_COLUMNS}")
+print(f"   Element length: {ELEMENT_LENGTH} (includes real+imag interleaving)")
+print(f"   Max sequence length: {MAX_LEN}")
+# =============================================================================
+# 5. DATA GENERATION LOOP
+#    - Iterate over scenarios to generate spectrogram samples and labels
+# =============================================================================
+scenarios = scenarios_list()
+scenario_properties = scenario_prop()
+# Per-scenario chunks are accumulated in these lists and concatenated once after the loop
+train_spectrogram_chunks = []
+val_spectrogram_chunks = []
+train_label_chunks = []
+val_label_chunks = []
+train_meta_chunks = []
+val_meta_chunks = []
+print(f"📂 Loading {len(scenarios)} scenarios...")
+# TEMP: Modified to not use cache
+print("⚠️  TEMPORARY FIX: Skipping cache to avoid memory issues")
+cache_path = None  # Disable cache usage. NOTE(review): this variable is not read later in the visible file; process_single_scenario passes cache_path=None itself
+# Build the (scenario_id, file_path) work list; scenarios are handled in a single process (multiprocessing disabled)
+scenario_info_list = []
+missing_props = []
+for scenario in scenarios:
+    props = scenario_properties.get(scenario)
+    if props is None:
+        missing_props.append(scenario)
+        continue
+    scenario_info_list.append((scenario, props["spectrogram_path"]))
+if missing_props:
+    print("⚠️  Missing metadata for the following scenarios; skipping:")
+    for scen in missing_props:
+        print(f"    - {scen}")
+print(f"📂 Loading {len(scenario_info_list)} scenarios using single process...")
+# Process the scenarios one after another in this process
+successful_scenarios = 0
+scenario_results = []  # NOTE(review): never appended to or read in the visible file
+for scenario_info in tqdm(scenario_info_list, desc="Processing scenarios", unit="scenario"):
+    scenario_name = scenario_info[0]
+    try:
+        result = process_single_scenario(scenario_info)
+        if result is not None:
+            # Collect the data (accumulated per scenario)
+            train_spectrogram_chunks.append(result['train_data'])
+            val_spectrogram_chunks.append(result['val_data'])
+            train_label_chunks.append(np.zeros(result['train_size'], dtype=np.int64))  # labels are all-zero placeholders
+            val_label_chunks.append(np.zeros(result['val_size'], dtype=np.int64))
+            train_meta_chunks.append(result['train_meta'])
+            val_meta_chunks.append(result['val_meta'])
+            successful_scenarios += 1
+    except Exception as e:  # backstop only: process_single_scenario already catches Exception and returns None
+        print(f"❌ Scenario {scenario_name} processing failed: {e}")
+print(f"✅ Processing completed! Successful scenarios: {successful_scenarios}/{len(scenario_info_list)}")
+if not train_spectrogram_chunks or not val_spectrogram_chunks:
+    raise ValueError("No spectrogram data collected; check scenario configuration.")
+print("🔄 Collating spectrogram arrays...")
+train_spectrograms = np.concatenate(train_spectrogram_chunks, axis=0).astype(np.float32, copy=False)  # assumes every scenario has the same per-sample shape -- TODO confirm
+val_spectrograms = np.concatenate(val_spectrogram_chunks, axis=0).astype(np.float32, copy=False)
+train_labels = np.concatenate(train_label_chunks, axis=0)
+val_labels = np.concatenate(val_label_chunks, axis=0)
+def _concat_metadata_dicts(dict_list):
+    if not dict_list:
+        return {}
+    keys = dict_list[0].keys()
+    return {k: np.concatenate([d[k] for d in dict_list], axis=0) for k in keys}
+train_metadata = _concat_metadata_dicts(train_meta_chunks)
+val_metadata = _concat_metadata_dicts(val_meta_chunks)
+del train_spectrogram_chunks, val_spectrogram_chunks, train_label_chunks, val_label_chunks  # drop the chunk lists now that the concatenated arrays exist
+del train_meta_chunks, val_meta_chunks
+print(f"Training spectrograms shape: {train_spectrograms.shape}")
+print(f"Validation spectrograms shape: {val_spectrograms.shape}")
+print(f"Memory usage: {train_spectrograms.nbytes + val_spectrograms.nbytes + train_labels.nbytes + val_labels.nbytes:,} bytes")
+train_mean = float(train_spectrograms.mean())  # global statistics come from the training split only
+train_std = float(train_spectrograms.std())
+if abs(train_std) < 1e-6:
+    print("⚠️  Training std near zero, using epsilon for stability")
+    train_std = 1e-6
+dataset_normalization = {'mean': train_mean, 'std': train_std, 'normalization': NORMALIZATION_MODE}  # handed to tokenizer_train and later written to dataset_stats.json
+print(f"Dataset normalization stats -> mean: {train_mean:.4f}, std: {train_std:.4f}")
+# =============================================================================
+# 6. DATA TOKENIZATION
+#    - Tokenize spectrogram matrices into input sequences with masking for pretraining
+# =============================================================================
+# Tokenize training data (masking enabled, fixed seed, statistics from the training split)
+print("🔄 Starting tokenization of training data...")
+preprocessed_train = tokenizer_train(
+    train_spectrograms,
+    max_len=MAX_LEN,
+    masking_percent=MASK_PERCENT,
+    mask=True,
+    seed=42,
+    metadata=train_metadata,
+    dataset_stats=dataset_normalization,
+    normalization=NORMALIZATION_MODE,
+    interleaved=True,
+)
+print("✅ Training data tokenization completed!")
+# Tokenize validation data (with masking for pretraining evaluation); same seed and the same training-split statistics as above
+print("🔄 Starting tokenization of validation data...")
+preprocessed_val = tokenizer_train(
+    val_spectrograms,
+    max_len=MAX_LEN,
+    masking_percent=MASK_PERCENT,
+    mask=True,  # Apply masking for pretraining evaluation
+    seed=42,
+    metadata=val_metadata,
+    dataset_stats=dataset_normalization,
+    normalization=NORMALIZATION_MODE,
+    interleaved=True,
+)
+print("✅ Validation data tokenization completed!")
+# =============================================================================
+# 7. TRAIN/VALIDATION DATA SETUP
+#    - Use pre-split training and validation data
+# =============================================================================
+SEED = 42
+torch.manual_seed(SEED)  # seeds are set after tokenization, which received its own seed=42 argument
+np.random.seed(SEED)
+# Use pre-split data
+train_data = preprocessed_train
+val_data = preprocessed_val
+# =============================================================================
+# 8. DATALOADER CREATION
+#    - Build PyTorch DataLoader objects for batched training and validation
+# =============================================================================
+# The tokenizer output is either a dict (passed to create_train_dataloader) or a single tensor (wrapped below)
+print("🔧 Creating data loaders...")
+if isinstance(train_data, dict):
+    print(f"  Training data format: dict with {len(train_data)} sequence lengths")
+    # Training data with masking
+    train_loaders = create_train_dataloader(train_data, batch_size=BATCH_SIZE, shuffle=True)
+else:
+    print(f"  Training data format: tensor with shape {train_data.shape}")
+    # Training data without masking (fallback): one loader stored under the key 'seq_0'
+    train_dataset = TensorDataset(train_data)
+    train_loaders = {'seq_0': DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)}
+if isinstance(val_data, dict):
+    print(f"  Validation data format: dict with {len(val_data)} sequence lengths")
+    # Validation data with masking
+    val_loaders = create_train_dataloader(val_data, batch_size=VAL_BATCH_SIZE, shuffle=False)
+else:
+    print(f"  Validation data format: tensor with shape {val_data.shape}")
+    # Validation data without masking (fallback): one loader stored under the key 'seq_0'
+    val_dataset = TensorDataset(val_data)
+    val_loaders = {'seq_0': DataLoader(val_dataset, batch_size=VAL_BATCH_SIZE, shuffle=False)}
+print("✅ Data loaders created successfully!")
+# =============================================================================
+# 9. MODEL INITIALIZATION
+#    - Instantiate the LWM transformer model and optionally load pre-trained weights
+#    - Wrap with DataParallel for multi-GPU support
+# =============================================================================
+# Device selection: CUDA first, then MPS (Apple Silicon), then CPU
+print("🔧 Setting up device and GPU configuration...")
+if torch.cuda.is_available():
+    device_count = torch.cuda.device_count()
+    print(f"  CUDA available: {device_count} GPU(s) detected")
+    device = torch.device("cuda:0")
+    # Use every CUDA device that was detected (nothing here is platform-specific)
+    gpu_ids = list(range(device_count))  # 0, 1, 2... auto-detect
+    print(f"  Using CUDA GPUs: {gpu_ids}")
+    # GPU memory status (informational only; a failure is printed and ignored)
+    for i in gpu_ids:
+        try:
+            mem_total = torch.cuda.get_device_properties(i).total_memory / 1024**3
+            mem_allocated = torch.cuda.memory_allocated(i) / 1024**3
+            print(f"    GPU {i}: Total: {mem_total:.1f}GB, Allocated: {mem_allocated:.1f}GB")
+        except Exception as e:
+            print(f"  GPU {i}: Error getting memory info - {e}")
+elif torch.backends.mps.is_available():
+    device = torch.device("mps")
+    gpu_ids = []  # MPS doesn't support DataParallel
+    print("  Using MPS (Apple Silicon GPU)")
+else:
+    device = torch.device("cpu")
+    gpu_ids = []
+    print("  Using CPU")
+print(f"  Final device: {device}")
+print(f"  GPU IDs for DataParallel: {gpu_ids}")
+print("🤖 Initializing LWM model...")
+print(f"  Model parameters: element_length={ELEMENT_LENGTH}, d_model={D_MODEL}, n_layers={N_LAYERS}, max_len={MAX_LEN}, n_heads={N_HEADS}")
+try:
+    model = pretrained_model.lwm(
+        element_length=ELEMENT_LENGTH,  # Complex spectrograms with real-imag interleaving
+        d_model=D_MODEL,
+        n_layers=N_LAYERS,
+        max_len=MAX_LEN,
+        n_heads=N_HEADS,
+        dropout=DROPOUT
+    )
+    print("  ✅ Model created successfully")
+    print(f"  Moving model to device: {device}")
+    # MPS only supports float32, so set dtype
+    if 'mps' in str(device):
+        model = model.to(device).float()
+        print("  ✅ Model moved to MPS device (float32)")
+    else:
+        model = model.to(device)
+        print("  ✅ Model moved to device successfully")
+except Exception as e:
+    print(f"  ❌ Model initialization failed: {e}")
+    import traceback
+    traceback.print_exc()
+    exit(1)  # NOTE(review): the exit() builtin is installed by the site module; sys.exit(1) is the usual choice in scripts
+# Optional: load pre-trained weights (disabled by default; flip load_model to enable)
+load_model = False
+if load_model:
+    model.load_state_dict(torch.load("models/model_checkpoint.pth", map_location=device))  # NOTE(review): torch.load unpickles the file -- only load checkpoints from a trusted source
+    print("Pre-trained model loaded successfully.")
+# Use DataParallel for multi-GPU support (skip for MPS); gpu_ids is empty on MPS and CPU, so the model stays unwrapped there
+if gpu_ids:
+    model = nn.DataParallel(model, device_ids=gpu_ids)
+    print(f"Model loaded successfully on GPU {device.index}")
+else:
+    print(f"Model loaded successfully on {device}")
+n_parameters = count_parameters(model)
+print(f"Number of trainable parameters: {n_parameters:,}")
+# =============================================================================
+# 10. OPTIMIZER AND LEARNING RATE SCHEDULER
+#     - Configure AdamW optimizer and a cosine-with-warmup LR schedule based on total steps
+# =============================================================================
+TOTAL_STEPS = sum(len(loader) for loader in train_loaders.values()) * EPOCHS  # batches per epoch (summed over all train loaders) times epochs
+WARMUP_STEPS = sum(len(loader) for loader in train_loaders.values()) * WARMUP_EPOCHS  # same per-epoch batch count times the warm-up epochs
+optimizer = AdamW(
+    model.parameters(),
+    lr=BASE_LR,
+    betas=(BETA1, BETA2),
+    weight_decay=WEIGHT_DECAY
+)
+def lr_lambda(current_step):
+    if current_step < WARMUP_STEPS:
+        return current_step / WARMUP_STEPS
+    else:
+        scaled_progress = (current_step - WARMUP_STEPS) / (TOTAL_STEPS - WARMUP_STEPS)
+        cosine_decay = 0.5 * (1 + np.cos(np.pi * scaled_progress))
+        return cosine_decay * (BASE_LR - MIN_LR) / BASE_LR + MIN_LR / BASE_LR
+scheduler = LambdaLR(optimizer, lr_lambda=lr_lambda)  # lr_lambda returns a multiplier that LambdaLR applies to the optimizer's initial lr
+# =============================================================================
+# 11. PRE-TRAINING LOOP
+#     - Call the train_lwm utility to run the pre-training epochs, logging metrics and saving models
+# =============================================================================
+# Create timestamp-based save directory (local time, second resolution)
+timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+save_dir = f"models/{timestamp}_complex"
+print(f"📁 Models and logs will be saved to: {save_dir}")
+os.makedirs(save_dir, exist_ok=True)
+stats_path = os.path.join(save_dir, "dataset_stats.json")  # the normalization statistics are stored inside the run directory
+with open(stats_path, 'w') as f:
+    json.dump(dataset_normalization, f, indent=2)
+print(f"📝 Saved dataset stats to {stats_path}")
+comm_selection = sorted(ENABLED_COMM_TYPES) if ENABLED_COMM_TYPES else []
+if comm_selection:
+    comm_suffix = "_" + "-".join(comm_selection)  # e.g. "_5G-LTE"; becomes part of the checkpoint suffix below
+else:
+    comm_suffix = ""
+if comm_selection:
+    print(f"[INFO] Communication standards for this run: {', '.join(comm_selection)}")
+if __name__ == "__main__":  # NOTE(review): only the training call is guarded; all data loading and model setup above runs at import time
+    pretrained_model_output = train_lwm(
+        model,
+        train_loaders,
+        val_loaders,
+        optimizer,
+        scheduler,
+        EPOCHS,
+        device=device,
+        save_dir=save_dir,
+        log_file="training_log.csv",
+        checkpoint_suffix=comm_suffix + "_complex",
+    )
+    print("🎉 Training completed successfully!")