File size: 5,917 Bytes
8e5ba9e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
"""Input normalization for structural mechanics features.

Physical quantities span wildly different scales (Poisson's ratio ~0.3 vs
elastic modulus ~200e9). Strategy: log-transform extensive quantities that
span orders of magnitude, then standardize all features to zero mean,
unit variance. This is critical for stable neural network training.
"""

import json
from pathlib import Path
from typing import Optional

import numpy as np
import torch


# Features that span orders of magnitude and should be log-transformed
LOG_TRANSFORM_FEATURES = {
    "length", "width", "height", "inner_radius", "outer_radius", "thickness",
    "elastic_modulus", "yield_strength", "density",
    "point_load", "distributed_load", "internal_pressure", "pressure",
    "moment_of_inertia", "section_modulus", "cross_section_area",
}

# Features that are already on a reasonable scale (keep linear)
LINEAR_FEATURES = {"poisson_ratio"}

# One-hot encoded categorical feature
CATEGORICAL_FEATURE = "config_id"

# All config IDs in deterministic order (this list defines the one-hot layout)
CONFIG_IDS = [
    "beam_ss_point", "beam_ss_udl",
    "beam_cantilever_point", "beam_cantilever_udl",
    "beam_fixed_point", "beam_fixed_udl",
    "plate_ss_uniform", "plate_fixed_uniform",
    "vessel_cylinder", "vessel_sphere",
]

# Floor applied before log10 so zero-valued optional features stay finite.
_LOG_EPSILON = 1e-30


class LogTransformStandardizer:
    """Two-stage normalization: log-transform then standardize.

    Stage 1 applies log10 to the features listed in ``LOG_TRANSFORM_FEATURES``
    (non-positive values are floored to ``_LOG_EPSILON`` first); stage 2
    shifts/scales every numeric column to zero mean, unit variance using
    statistics computed by :meth:`fit`. The ``config_id`` categorical is
    appended as a one-hot block whose layout is fixed by ``CONFIG_IDS``.

    Stores normalization parameters as model artifacts for reproducible
    inference. All parameters are JSON-serializable for deployment.
    """

    def __init__(self) -> None:
        self.feature_names: list[str] = []  # sorted numeric feature names
        self.means: Optional[np.ndarray] = None  # per-column mean (post-log)
        self.stds: Optional[np.ndarray] = None  # per-column std (post-log); zeros replaced by 1.0
        self.log_mask: Optional[np.ndarray] = None  # bool mask: which columns get log10
        self._fitted = False

    @property
    def input_dim(self) -> int:
        """Total input dimension after one-hot encoding.

        Raises:
            RuntimeError: If accessed before :meth:`fit` (or :meth:`load`).
        """
        if not self._fitted:
            raise RuntimeError("Call fit() before accessing input_dim")
        return len(self.feature_names) + len(CONFIG_IDS)

    def _stack(self, features: dict[str, np.ndarray]) -> np.ndarray:
        """Stack features into an (n_samples, n_features) float64 matrix.

        Columns follow ``self.feature_names`` order. The explicit float64 cast
        matters: with all-integer inputs, column_stack yields an int matrix and
        in-place assignment of log10 results would silently truncate them.
        NaN entries (optional features like inner_radius on beams) become 0.
        """
        matrix = np.column_stack([features[name] for name in self.feature_names])
        matrix = matrix.astype(np.float64, copy=False)
        return np.nan_to_num(matrix, nan=0.0)

    def _log10_selected(self, matrix: np.ndarray) -> np.ndarray:
        """Return a copy of ``matrix`` with log10 applied to masked columns.

        Non-positive entries are floored to ``_LOG_EPSILON`` to avoid log(0).
        """
        out = matrix.copy()
        selected = matrix[:, self.log_mask]
        out[:, self.log_mask] = np.log10(np.where(selected > 0, selected, _LOG_EPSILON))
        return out

    def fit(self, features: dict[str, np.ndarray], config_ids: np.ndarray) -> "LogTransformStandardizer":
        """Compute normalization parameters from training data.

        Args:
            features: Dict mapping feature name to 1D array of values.
                      NaN values are replaced with 0 before transformation.
            config_ids: Array of config_id strings. Accepted for interface
                        symmetry with :meth:`transform`; the one-hot vocabulary
                        is the fixed ``CONFIG_IDS`` list, so no statistics are
                        derived from it.

        Returns:
            ``self``, to allow ``LogTransformStandardizer().fit(...)`` chaining.
        """
        self.feature_names = sorted(features.keys())  # deterministic column order

        # Which columns get the log10 treatment.
        self.log_mask = np.array(
            [name in LOG_TRANSFORM_FEATURES for name in self.feature_names],
            dtype=bool,
        )

        log_matrix = self._log10_selected(self._stack(features))

        # Per-column statistics on the transformed features; constant columns
        # get std=1.0 so standardization never divides by zero.
        self.means = log_matrix.mean(axis=0)
        stds = log_matrix.std(axis=0)
        self.stds = np.where(stds > 0, stds, 1.0)

        self._fitted = True
        return self

    def transform(
        self,
        features: dict[str, np.ndarray],
        config_ids: np.ndarray,
    ) -> torch.Tensor:
        """Transform raw features to normalized tensor.

        Args:
            features: Dict mapping feature name to 1D array of values; must
                      contain every name seen by :meth:`fit`.
            config_ids: Array of config_id strings; IDs not in ``CONFIG_IDS``
                        yield an all-zero one-hot row (deliberately tolerant).

        Returns:
            Tensor of shape (n_samples, input_dim) ready for model input.

        Raises:
            RuntimeError: If called before :meth:`fit` (or :meth:`load`).
        """
        if not self._fitted:
            raise RuntimeError("Call fit() before transform()")

        # Same pipeline as fit(): stack -> log10 -> standardize.
        matrix = self._log10_selected(self._stack(features))
        matrix = (matrix - self.means) / self.stds
        n_samples = matrix.shape[0]

        # One-hot encode config_id; unknown IDs leave the row all zeros.
        config_onehot = np.zeros((n_samples, len(CONFIG_IDS)), dtype=np.float32)
        config_to_idx = {c: i for i, c in enumerate(CONFIG_IDS)}
        for row_idx, cid in enumerate(config_ids):
            if cid in config_to_idx:
                config_onehot[row_idx, config_to_idx[cid]] = 1.0

        # Concatenate: [numeric_features | config_onehot]
        combined = np.concatenate([matrix.astype(np.float32), config_onehot], axis=1)

        return torch.from_numpy(combined)

    def save(self, path: Path) -> None:
        """Save normalization parameters to JSON.

        Raises:
            RuntimeError: If called before fit() — previously this crashed with
                an opaque AttributeError on ``None.tolist()``.
        """
        if not self._fitted:
            raise RuntimeError("Call fit() before save()")
        data = {
            "feature_names": self.feature_names,
            "means": self.means.tolist(),
            "stds": self.stds.tolist(),
            "log_mask": self.log_mask.tolist(),
            "config_ids": CONFIG_IDS,
        }
        path.parent.mkdir(parents=True, exist_ok=True)
        with open(path, "w") as f:
            json.dump(data, f, indent=2)

    @classmethod
    def load(cls, path: Path) -> "LogTransformStandardizer":
        """Load normalization parameters from JSON.

        Raises:
            ValueError: If the artifact's config-id vocabulary differs from the
                current ``CONFIG_IDS`` — a silent mismatch would scramble the
                one-hot layout at inference time. Artifacts without the key
                (pre-dating it) are accepted as-is.
        """
        with open(path) as f:
            data = json.load(f)

        saved_ids = data.get("config_ids", CONFIG_IDS)
        if saved_ids != CONFIG_IDS:
            raise ValueError(
                "Artifact config_ids do not match current CONFIG_IDS; "
                "one-hot encoding layout would be inconsistent"
            )

        obj = cls()
        obj.feature_names = data["feature_names"]
        obj.means = np.array(data["means"])
        obj.stds = np.array(data["stds"])
        obj.log_mask = np.array(data["log_mask"], dtype=bool)
        obj._fitted = True
        return obj