Spaces:
Sleeping
Sleeping
"""Dataset generation pipeline using Latin Hypercube Sampling (LHS).

Generates 100K structural analysis samples across beams, plates, and pressure
vessels using analytical closed-form solutions. LHS provides better coverage of
the design space than random sampling — a deliberate choice from Design of
Experiments (DoE) methodology.

Usage:
    python -m src.data.generate_dataset --config configs/data_generation.yaml
"""
| import argparse | |
| import hashlib | |
| import logging | |
| import uuid | |
| from pathlib import Path | |
| from typing import Any | |
| import numpy as np | |
| import pandas as pd | |
| import yaml | |
| from scipy.stats.qmc import LatinHypercube | |
| from src.data.schema import ProblemFamily, SafetyCategory | |
| from src.data.solvers.beam import BEAM_SOLVERS | |
| from src.data.solvers.plate import PLATE_SOLVERS | |
| from src.data.solvers.vessel import VESSEL_SOLVERS | |
| logger = logging.getLogger(__name__) | |
def _lhs_sample(
    n_samples: int,
    param_ranges: dict[str, dict],
    seed: int,
) -> dict[str, np.ndarray]:
    """Draw a Latin Hypercube design and scale it to physical parameter ranges.

    One LHS dimension is allocated per entry of ``param_ranges`` (in dict
    order). Each unit-interval column is then mapped onto ``[min, max]``.
    Parameters marked ``log_uniform`` are interpolated in log10-space and
    exponentiated, so that quantities spanning several orders of magnitude
    are covered evenly per decade rather than being dominated by the top of
    the range.

    Args:
        n_samples: Number of design points to draw.
        param_ranges: Maps a parameter name to a spec dict holding ``"min"``
            and ``"max"`` and, optionally, ``"distribution"``. Only the value
            ``"log_uniform"`` is special-cased; anything else (or a missing
            key) is treated as uniform.
        seed: Seed forwarded to ``scipy.stats.qmc.LatinHypercube``.

    Returns:
        A dict with the same keys as ``param_ranges``; each value is a 1-D
        array of ``n_samples`` scaled samples.

    Raises:
        ValueError: If a ``log_uniform`` parameter has a bound that is not
            strictly positive (its logarithm would be undefined and the
            samples would silently come out as NaN/inf).
    """
    names = list(param_ranges)
    sampler = LatinHypercube(d=len(names), seed=seed)
    unit_cube = sampler.random(n=n_samples)  # shape: (n_samples, len(names))

    scaled: dict[str, np.ndarray] = {}
    for dim, name in enumerate(names):
        spec = param_ranges[name]
        lo, hi = float(spec["min"]), float(spec["max"])
        fraction = unit_cube[:, dim]

        if spec.get("distribution") == "log_uniform":
            # Written as a negated conjunction so NaN bounds are rejected too.
            if not (lo > 0.0 and hi > 0.0):
                raise ValueError(
                    f"log_uniform parameter {name!r} requires strictly positive "
                    f"bounds, got min={lo}, max={hi}"
                )
            log_lo, log_hi = np.log10(lo), np.log10(hi)
            scaled[name] = 10.0 ** (log_lo + fraction * (log_hi - log_lo))
        else:  # uniform
            scaled[name] = lo + fraction * (hi - lo)
    return scaled
def _generate_beam_samples(
    config: dict,
    global_seed: int,
) -> list[dict[str, Any]]:
    """Build one record per LHS design point for every configured beam case.

    Reads ``config["beam"]`` (keys ``configs``, ``samples_per_config`` and
    ``parameters``). Each entry of ``configs`` gets its own LHS design, seeded
    with ``global_seed`` plus the entry's position, and its own solver looked
    up in ``BEAM_SOLVERS`` by the entry's ``"id"``.

    Args:
        config: Parsed data-generation configuration.
        global_seed: Base seed; the per-configuration seed is derived from it.

    Returns:
        A list of flat record dicts combining the sampled inputs, derived
        rectangular-section properties and the solver outputs. Fields that do
        not apply to beams are ``None`` (geometry) or ``0.0`` (pressures).
    """
    beam_section = config["beam"]
    case_entries = beam_section["configs"]
    n_per_config = beam_section["samples_per_config"]
    ranges = beam_section["parameters"]

    records = []
    for offset, entry in enumerate(case_entries):
        config_id = entry["id"]
        solver = BEAM_SOLVERS[config_id]()
        design = _lhs_sample(n_per_config, ranges, global_seed + offset)
        # Both load magnitudes are sampled, but only the one matching the
        # configuration is applied; the other is forced to zero.
        uses_point_load = "point" in config_id

        for row in range(n_per_config):
            point = {name: float(column[row]) for name, column in design.items()}

            if uses_point_load:
                applied_loads = {
                    "point_load": point["point_load"],
                    "distributed_load": 0.0,
                }
            else:
                applied_loads = {
                    "point_load": 0.0,
                    "distributed_load": point["distributed_load"],
                }

            solver_inputs = {
                key: point[key]
                for key in (
                    "length",
                    "width",
                    "height",
                    "elastic_modulus",
                    "yield_strength",
                )
            }
            solver_inputs.update(applied_loads)
            outcome = solver.solve(solver_inputs)

            breadth = point["width"]
            depth = point["height"]
            record = {
                # identity
                "sample_id": str(uuid.uuid4()),
                "problem_family": ProblemFamily.BEAM.value,
                "config_id": config_id,
                # geometry (radii / thickness are not used for beams)
                "length": point["length"],
                "width": point["width"],
                "height": point["height"],
                "inner_radius": None,
                "outer_radius": None,
                "thickness": None,
                # material
                "elastic_modulus": point["elastic_modulus"],
                "poisson_ratio": point["poisson_ratio"],
                "yield_strength": point["yield_strength"],
                "density": point["density"],
                # loading
                "point_load": applied_loads["point_load"],
                "distributed_load": applied_loads["distributed_load"],
                "internal_pressure": 0.0,
                "pressure": 0.0,
                # rectangular cross-section properties
                "moment_of_inertia": breadth * depth**3 / 12.0,
                "section_modulus": breadth * depth**2 / 6.0,
                "cross_section_area": breadth * depth,
                # solver outputs
                "max_stress": outcome.max_stress,
                "max_deflection": outcome.max_deflection,
                "safety_factor": outcome.safety_factor,
                "safety_category": outcome.safety_category.value,
            }
            records.append(record)

        logger.info(f"Generated {n_per_config} samples for {config_id}")
    return records
def _generate_plate_samples(
    config: dict,
    global_seed: int,
) -> list[dict[str, Any]]:
    """Build one record per LHS design point for every configured plate case.

    Reads ``config["plate"]`` (keys ``configs``, ``samples_per_config`` and
    ``parameters``). Each entry of ``configs`` gets its own LHS design, seeded
    with ``global_seed + 100`` plus the entry's position, and its own solver
    looked up in ``PLATE_SOLVERS`` by the entry's ``"id"``.

    Args:
        config: Parsed data-generation configuration.
        global_seed: Base seed; the per-configuration seed is derived from it.

    Returns:
        A list of flat record dicts. The plate side lengths ``length_a`` and
        ``length_b`` are stored under ``length`` and ``width``; fields that do
        not apply to plates are ``None`` (geometry, section properties) or
        ``0.0`` (loads).
    """
    plate_section = config["plate"]
    case_entries = plate_section["configs"]
    n_per_config = plate_section["samples_per_config"]
    ranges = plate_section["parameters"]

    # Sampled parameters that are forwarded to the solver unchanged.
    solver_keys = (
        "length_a",
        "length_b",
        "thickness",
        "elastic_modulus",
        "poisson_ratio",
        "yield_strength",
        "pressure",
    )

    records = []
    for offset, entry in enumerate(case_entries):
        config_id = entry["id"]
        solver = PLATE_SOLVERS[config_id]()
        design = _lhs_sample(n_per_config, ranges, global_seed + 100 + offset)

        for row in range(n_per_config):
            point = {name: float(column[row]) for name, column in design.items()}
            outcome = solver.solve({key: point[key] for key in solver_keys})

            record = {
                # identity
                "sample_id": str(uuid.uuid4()),
                "problem_family": ProblemFamily.PLATE.value,
                "config_id": config_id,
                # geometry
                "length": point["length_a"],
                "width": point["length_b"],
                "height": None,
                "inner_radius": None,
                "outer_radius": None,
                "thickness": point["thickness"],
                # material
                "elastic_modulus": point["elastic_modulus"],
                "poisson_ratio": point["poisson_ratio"],
                "yield_strength": point["yield_strength"],
                "density": point["density"],
                # loading
                "point_load": 0.0,
                "distributed_load": 0.0,
                "internal_pressure": 0.0,
                "pressure": point["pressure"],
                # beam-only section properties
                "moment_of_inertia": None,
                "section_modulus": None,
                "cross_section_area": None,
                # solver outputs
                "max_stress": outcome.max_stress,
                "max_deflection": outcome.max_deflection,
                "safety_factor": outcome.safety_factor,
                "safety_category": outcome.safety_category.value,
            }
            records.append(record)

        logger.info(f"Generated {n_per_config} samples for {config_id}")
    return records
def _generate_vessel_samples(
    config: dict,
    global_seed: int,
) -> list[dict[str, Any]]:
    """Build one record per LHS design point for every configured vessel case.

    Reads ``config["vessel"]`` (keys ``configs``, ``samples_per_config`` and
    ``parameters``). Each entry of ``configs`` gets its own LHS design, seeded
    with ``global_seed + 200`` plus the entry's position, and its own solver
    looked up in ``VESSEL_SOLVERS`` by the entry's ``"id"``.

    The outer radius is not sampled directly: it is the sampled inner radius
    multiplied by the sampled ``radius_ratio``. Wall thickness is the
    difference between the two radii.

    Args:
        config: Parsed data-generation configuration.
        global_seed: Base seed; the per-configuration seed is derived from it.

    Returns:
        A list of flat record dicts. ``length`` is filled only for the
        ``"vessel_cylinder"`` configuration (and only if a ``length``
        parameter was sampled); other inapplicable fields are ``None`` or
        ``0.0``.
    """
    vessel_section = config["vessel"]
    case_entries = vessel_section["configs"]
    n_per_config = vessel_section["samples_per_config"]
    ranges = vessel_section["parameters"]

    records = []
    for offset, entry in enumerate(case_entries):
        config_id = entry["id"]
        solver = VESSEL_SOLVERS[config_id]()
        design = _lhs_sample(n_per_config, ranges, global_seed + 200 + offset)
        is_cylinder = config_id == "vessel_cylinder"

        for row in range(n_per_config):
            point = {name: float(column[row]) for name, column in design.items()}

            inner = point["inner_radius"]
            outer = inner * point["radius_ratio"]

            solver_inputs = {"inner_radius": inner, "outer_radius": outer}
            for key in (
                "elastic_modulus",
                "poisson_ratio",
                "yield_strength",
                "internal_pressure",
            ):
                solver_inputs[key] = point[key]
            outcome = solver.solve(solver_inputs)

            record = {
                # identity
                "sample_id": str(uuid.uuid4()),
                "problem_family": ProblemFamily.VESSEL.value,
                "config_id": config_id,
                # geometry
                "length": point.get("length") if is_cylinder else None,
                "width": None,
                "height": None,
                "inner_radius": inner,
                "outer_radius": outer,
                "thickness": outer - inner,
                # material
                "elastic_modulus": point["elastic_modulus"],
                "poisson_ratio": point["poisson_ratio"],
                "yield_strength": point["yield_strength"],
                "density": point["density"],
                # loading
                "point_load": 0.0,
                "distributed_load": 0.0,
                "internal_pressure": point["internal_pressure"],
                "pressure": 0.0,
                # beam-only section properties
                "moment_of_inertia": None,
                "section_modulus": None,
                "cross_section_area": None,
                # solver outputs
                "max_stress": outcome.max_stress,
                "max_deflection": outcome.max_deflection,
                "safety_factor": outcome.safety_factor,
                "safety_category": outcome.safety_category.value,
            }
            records.append(record)

        logger.info(f"Generated {n_per_config} samples for {config_id}")
    return records
def generate_dataset(config_path: str) -> pd.DataFrame:
    """Generate the full dataset described by a YAML config file.

    Runs the beam, plate and vessel generators in that order, all with the
    config's ``seed``, and concatenates their records into one DataFrame.

    Args:
        config_path: Path to the YAML data-generation config. It must define
            ``seed`` plus the sections each family generator reads.

    Returns:
        One row per generated sample, beams first, then plates, then vessels.
        If no samples are generated the frame is empty and has no columns.
    """
    # YAML is UTF-8 by convention; do not depend on the platform's locale.
    with open(config_path, encoding="utf-8") as f:
        config = yaml.safe_load(f)

    seed = config["seed"]
    # Global NumPy seeding is kept for backward compatibility. Nothing in this
    # function reads the global RNG (the LHS sampler gets explicit seeds);
    # presumably code elsewhere relies on it. TODO confirm.
    np.random.seed(seed)

    family_generators = (
        ("beam", _generate_beam_samples),
        ("plate", _generate_plate_samples),
        ("vessel", _generate_vessel_samples),
    )
    all_samples: list[dict[str, Any]] = []
    for family, generate in family_generators:
        logger.info("Generating %s samples...", family)
        all_samples.extend(generate(config, seed))

    df = pd.DataFrame(all_samples)
    logger.info("Total samples generated: %d", len(df))
    if "safety_category" in df.columns:
        logger.info(
            "Safety category distribution:\n%s",
            df["safety_category"].value_counts(),
        )
    else:
        # An empty record list yields a frame without columns; indexing it
        # would raise KeyError just for the sake of a log line.
        logger.warning("No samples were generated; check the config.")
    return df
def _stratified_split(
    df: pd.DataFrame,
    train_frac: float,
    val_frac: float,
    seed: int,
    strata: tuple[str, ...] = ("config_id", "safety_category"),
) -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
    """Shuffle ``df`` and cut every stratum into train/validation/test parts."""
    shuffled = df.sample(frac=1.0, random_state=seed).reset_index(drop=True)

    keys = [column for column in strata if column in shuffled.columns]
    if keys:
        groups = shuffled.groupby(keys, sort=False, dropna=False)
        # 0-based position of each row inside its stratum, in shuffled order.
        position = groups.cumcount()
        # Forward position + backward position + 1 == size of the stratum.
        stratum_size = position + groups.cumcount(ascending=False) + 1
    else:
        # No stratification columns available: treat the frame as one stratum,
        # which reduces to a plain shuffled split.
        position = pd.Series(np.arange(len(shuffled)), index=shuffled.index)
        stratum_size = pd.Series(len(shuffled), index=shuffled.index)

    # Truncate like int(): leftover rows of each stratum fall into the test set.
    n_train = (stratum_size * train_frac).astype(int)
    n_val = (stratum_size * val_frac).astype(int)

    in_train = position < n_train
    in_val = (position >= n_train) & (position < n_train + n_val)
    in_test = ~(in_train | in_val)
    return shuffled[in_train], shuffled[in_val], shuffled[in_test]


def split_and_save(df: pd.DataFrame, config_path: str) -> None:
    """Split the dataset with stratification and save the parts as Parquet.

    The rows are shuffled with the config's ``seed`` and then split separately
    inside every ``(config_id, safety_category)`` stratum, so each split keeps
    the configuration and safety-category mix of the full dataset. Per-stratum
    counts are truncated to whole rows, so the test split receives the
    remainders and may be slightly larger than its nominal fraction. If the
    stratification columns are absent, a plain shuffled split is performed.

    Writes ``train.parquet``, ``validation.parquet`` and ``test.parquet`` into
    ``config["output"]["directory"]`` (created if missing).

    Args:
        df: Dataset to split.
        config_path: Path to the YAML config providing ``output.directory``,
            ``splits.train``, ``splits.validation`` and ``seed``.
    """
    # YAML is UTF-8 by convention; do not depend on the platform's locale.
    with open(config_path, encoding="utf-8") as f:
        config = yaml.safe_load(f)

    output_dir = Path(config["output"]["directory"])
    output_dir.mkdir(parents=True, exist_ok=True)

    splits = config["splits"]
    train_df, val_df, test_df = _stratified_split(
        df,
        train_frac=splits["train"],
        val_frac=splits["validation"],
        seed=config["seed"],
    )

    train_df.to_parquet(output_dir / "train.parquet", index=False)
    val_df.to_parquet(output_dir / "validation.parquet", index=False)
    test_df.to_parquet(output_dir / "test.parquet", index=False)

    logger.info(
        "Saved splits: train=%d, val=%d, test=%d",
        len(train_df),
        len(val_df),
        len(test_df),
    )
    logger.info("Output directory: %s", output_dir)
def main() -> None:
    """Command-line entry point: generate the dataset, then split and save it.

    Accepts a single ``--config`` option naming the YAML configuration file
    and configures INFO-level logging before running the pipeline.
    """
    cli = argparse.ArgumentParser(description="Generate structural mechanics dataset")
    cli.add_argument("--config", default="configs/data_generation.yaml")
    config_path = cli.parse_args().config

    logging.basicConfig(
        format="%(asctime)s %(levelname)s %(message)s",
        level=logging.INFO,
    )

    dataset = generate_dataset(config_path)
    split_and_save(dataset, config_path)


if __name__ == "__main__":
    main()