# fea-surrogate/src/training/dataset.py
# Uploaded with huggingface_hub by WolfDavid (commit 8e5ba9e, verified).
"""PyTorch Dataset and DataLoader for structural mechanics data.

Loads Parquet files, applies normalization, and prepares tensors for training.
Stress and deflection targets are kept in log10 space because their values
span many orders of magnitude (stress alone ranges from Pa to GPa).
"""
from pathlib import Path
from typing import Optional
import numpy as np
import pandas as pd
import torch
from torch.utils.data import DataLoader, Dataset
from src.data.schema import SafetyCategory
from src.models.normalization import LogTransformStandardizer
# Candidate numeric input columns for the model. The list order is
# significant: it is the order the normalizer sees the features in, so do not
# reorder entries without refitting the normalizer.
NUMERIC_FEATURES = [
    "length",
    "width",
    "height",
    "inner_radius",
    "outer_radius",
    "thickness",
    "elastic_modulus",
    "poisson_ratio",
    "yield_strength",
    "density",
    "point_load",
    "distributed_load",
    "internal_pressure",
    "pressure",
    "moment_of_inertia",
    "section_modulus",
    "cross_section_area",
]
# Maps each safety-category label (the enum member's ``.value``) to the
# integer class index used as the classification target:
# SAFE -> 0, MARGINAL -> 1, FAILURE -> 2.
SAFETY_CLASS_MAP = {
    category.value: class_index
    for class_index, category in enumerate(
        (SafetyCategory.SAFE, SafetyCategory.MARGINAL, SafetyCategory.FAILURE)
    )
}
class StructuralMechanicsDataset(Dataset):
    """Dataset for structural analysis surrogate training.

    A single Parquet file is read at construction time. The numeric input
    columns are passed through ``normalizer`` and every target is precomputed
    as a tensor, so ``__getitem__`` only has to index.

    Attributes set by ``__init__``:
        df: The DataFrame exactly as read from ``parquet_path``.
        X: The return value of ``normalizer.transform``; it is indexed by
            sample in ``__getitem__`` (presumably a 2-D float tensor --
            confirm against ``LogTransformStandardizer``).
        log_stress, log_deflection, log_yield: float32 tensors holding
            ``log10`` of the ``max_stress``, ``max_deflection`` and
            ``yield_strength`` columns. Non-positive or NaN entries are
            replaced by ``1e-30`` before the logarithm.
        safety_class: int64 tensor of class indices looked up in
            ``SAFETY_CLASS_MAP``.
    """

    def __init__(
        self,
        parquet_path: Path,
        # Quoted so the annotation is not evaluated when the class is defined.
        normalizer: "LogTransformStandardizer",
        fit_normalizer: bool = False,
    ) -> None:
        """Load ``parquet_path`` and build the input and target tensors.

        Args:
            parquet_path: Parquet file providing ``config_id``,
                ``max_stress``, ``max_deflection``, ``yield_strength`` and
                ``safety_category``, plus any subset of ``NUMERIC_FEATURES``.
            normalizer: Normalizer used to transform the input features. It
                is mutated (fitted) only when ``fit_normalizer`` is true.
            fit_normalizer: Fit ``normalizer`` on this file's features before
                transforming them. Meant for the training split only.

        Raises:
            KeyError: If one of the required columns is missing.
            ValueError: If ``safety_category`` holds a value that has no
                entry in ``SAFETY_CLASS_MAP`` (including missing values).
        """
        self.df = pd.read_parquet(parquet_path)

        # Feature columns absent from this file are skipped, not an error.
        features = {
            col: self.df[col].values.astype(np.float64)
            for col in NUMERIC_FEATURES
            if col in self.df.columns
        }
        config_ids = self.df["config_id"].values

        if fit_normalizer:
            normalizer.fit(features, config_ids)
        self.X = normalizer.transform(features, config_ids)

        # Regression targets in log10 space.
        self.log_stress = self._log10_clamped(self.df["max_stress"].values)
        self.log_deflection = self._log10_clamped(self.df["max_deflection"].values)

        # Log yield strength for the physics-loss consistency check. It gets
        # the same clamp as the other targets so that a zero or negative
        # entry cannot inject -inf/NaN into the loss.
        self.log_yield = self._log10_clamped(self.df["yield_strength"].values)

        # Safety category as class index.
        self.safety_class = self._encode_classes(
            self.df["safety_category"], SAFETY_CLASS_MAP
        )

    @staticmethod
    def _log10_clamped(values: np.ndarray, floor: float = 1e-30) -> torch.Tensor:
        """Return ``log10(values)`` as float32, clamping non-positives to ``floor``."""
        # NaN also fails the ``> 0`` comparison and is therefore clamped.
        positive = np.where(values > 0, values, floor)
        return torch.from_numpy(np.log10(positive).astype(np.float32))

    @staticmethod
    def _encode_classes(labels: pd.Series, class_map: dict) -> torch.Tensor:
        """Map ``labels`` through ``class_map`` and return an int64 tensor.

        Raises:
            ValueError: If any label has no entry in ``class_map``. Without
                this check the NaN produced by ``Series.map`` would be cast
                to an arbitrary integer and silently corrupt the targets.
        """
        encoded = labels.map(class_map)
        unmapped = encoded.isna()
        if unmapped.any():
            unknown = labels[unmapped].unique().tolist()
            raise ValueError(
                f"Unrecognised safety_category value(s): {unknown!r}; "
                f"expected one of {list(class_map)!r}"
            )
        return torch.from_numpy(encoded.values.astype(np.int64))

    def __len__(self) -> int:
        """Return the number of rows read from the Parquet file."""
        return len(self.df)

    def __getitem__(self, idx: int) -> tuple[torch.Tensor, dict[str, torch.Tensor]]:
        """Return ``(inputs, targets)`` for sample ``idx``.

        ``targets`` is a dict with the keys ``log_stress``,
        ``log_deflection``, ``log_yield_strength`` and ``safety_class``.
        """
        targets = {
            "log_stress": self.log_stress[idx],
            "log_deflection": self.log_deflection[idx],
            "log_yield_strength": self.log_yield[idx],
            "safety_class": self.safety_class[idx],
        }
        return self.X[idx], targets
def create_dataloaders(
    data_dir: Path,
    normalizer: LogTransformStandardizer,
    batch_size: int = 512,
    num_workers: int = 0,
    pin_memory: bool = True,
) -> tuple[DataLoader, DataLoader, DataLoader]:
    """Create train/val/test dataloaders.

    Fits the normalizer on the training set only (no data leakage); the
    validation and test sets reuse the already-fitted normalizer.

    Args:
        data_dir: Directory containing ``train.parquet``,
            ``validation.parquet`` and ``test.parquet``. A plain string path
            is accepted as well.
        normalizer: Normalizer instance shared by all three datasets. It is
            fitted in place while the training dataset is constructed.
        batch_size: Batch size used by all three loaders.
        num_workers: Number of worker processes for each loader.
        pin_memory: Forwarded to every ``DataLoader``. Defaults to ``True``
            (the previously hard-coded value); pass ``False`` on hosts
            without an accelerator.

    Returns:
        ``(train_loader, val_loader, test_loader)``. Only the training loader
        shuffles and drops the last incomplete batch, so a training set
        smaller than ``batch_size`` yields no batches at all.
    """
    data_dir = Path(data_dir)

    # The training dataset must be built first: constructing it is what fits
    # the normalizer that the other two splits then reuse.
    train_ds = StructuralMechanicsDataset(
        data_dir / "train.parquet",
        normalizer,
        fit_normalizer=True,
    )
    val_ds = StructuralMechanicsDataset(data_dir / "validation.parquet", normalizer)
    test_ds = StructuralMechanicsDataset(data_dir / "test.parquet", normalizer)

    loader_kwargs = {
        "batch_size": batch_size,
        "num_workers": num_workers,
        "pin_memory": pin_memory,
    }
    train_loader = DataLoader(train_ds, shuffle=True, drop_last=True, **loader_kwargs)
    val_loader = DataLoader(val_ds, shuffle=False, **loader_kwargs)
    test_loader = DataLoader(test_ds, shuffle=False, **loader_kwargs)
    return train_loader, val_loader, test_loader