| | """ |
| | Synthetic Walnut Storage Dataset Generator |
| | Simulates Indian storage conditions using Arrhenius-based food chemistry kinetics. |
| | """ |
| |
|
| | import numpy as np |
| | import pandas as pd |
| | from pathlib import Path |
| | import random |
| |
|
# Seed both random sources used in this module (NumPy's global generator and
# the stdlib ``random`` module) so repeated runs produce the same dataset.
np.random.seed(42)
random.seed(42)
| |
|
| | |
# Arrhenius parameters for k = A * exp(-Ea / (R * T)), with T in kelvin.
R = 8.314          # gas constant, J/(mol*K)
Ea = 80_000.0      # activation energy; presumably J/mol to match R -- confirm
A = 1.5e12         # pre-exponential factor; sets the scale of the rate constant

# Peroxide value treated as the rancidity limit: the rancidity sigmoid is
# centred here and the shelf-life estimate is the time needed to reach it.
PV_THRESHOLD = 5.0
| |
|
| | |
# Storage scenarios. Each entry gives the (low, high) bounds used to sample
# daily temperature and humidity, plus the relative weight with which the
# scenario is picked when sequences are generated.
SCENARIOS = {
    "cold_storage": dict(
        temp_range=(2.0, 8.0), hum_range=(40.0, 60.0), weight=0.20,
    ),
    "hill_region": dict(
        temp_range=(5.0, 20.0), hum_range=(35.0, 65.0), weight=0.25,
    ),
    "ambient_warehouse": dict(
        temp_range=(18.0, 32.0), hum_range=(50.0, 75.0), weight=0.30,
    ),
    "hot_transport": dict(
        temp_range=(28.0, 40.0), hum_range=(55.0, 85.0), weight=0.25,
    ),
}

# Parallel lists (same order as SCENARIOS) for weighted random selection.
SCENARIO_NAMES = [*SCENARIOS]
SCENARIO_WEIGHTS = [spec["weight"] for spec in SCENARIOS.values()]
| |
|
| |
|
def sigmoid(x):
    """Return the logistic function ``1 / (1 + exp(-x))`` of *x*."""
    denominator = 1.0 + np.exp(-x)
    return 1.0 / denominator
| |
|
| |
|
def arrhenius_rate(T_celsius: float, humidity: float, moisture: float) -> float:
    """Compute the daily oxidation rate constant via Arrhenius kinetics.

    The temperature term is ``A * exp(-Ea / (R * T))`` with ``T`` converted
    from Celsius to kelvin. It is scaled by two linear multipliers:
    ``1 + 0.015 * (humidity - 40)`` and ``1 + 0.20 * (moisture - 4)``.
    Each multiplier is floored at 0.5, so low humidity or moisture can at
    most halve the rate.
    """
    humidity_gain = max(1.0 + 0.015 * (humidity - 40.0), 0.5)
    moisture_gain = max(1.0 + 0.20 * (moisture - 4.0), 0.5)

    absolute_temp = T_celsius + 273.15
    thermal_rate = A * np.exp(-Ea / (R * absolute_temp))

    return thermal_rate * humidity_gain * moisture_gain
| |
|
| |
|
def generate_sequence(
    seq_len: int,
    scenario: str,
    start_day: int = 0,
) -> list[dict]:
    """
    Generate a single walnut storage sequence.

    Args:
        seq_len: Number of consecutive daily records to produce.
        scenario: Key into ``SCENARIOS`` selecting the temperature and
            humidity ranges. An unknown key raises ``KeyError``.
        start_day: Value of the ``day`` field on the first record; later
            records count up from it.

    Returns:
        A list of ``seq_len`` dicts, one per day, holding the sampled
        conditions (temperature, humidity, moisture, oxygen), the simulated
        chemistry (peroxide value, free fatty acids, hexanal, oxidation
        index) and the targets (rancidity probability, remaining shelf life,
        decay-curve value).

    Note:
        The order of the random draws is part of the behaviour: with a seeded
        generator it determines every value produced, so it must not be
        rearranged.
    """
    scen = SCENARIOS[scenario]
    temp_lo, temp_hi = scen["temp_range"]
    hum_lo, hum_hi = scen["hum_range"]

    # Per-sequence initial chemistry and baseline oxygen level.
    PV0 = np.random.uniform(0.3, 1.2)
    FFA0 = np.random.uniform(0.05, 0.15)
    base_oxy = np.random.uniform(0.18, 0.23)

    # Daily conditions are drawn from a normal centred on the middle of the
    # scenario range, with the range spanning +/- 3 sigma, then clipped.
    # These values are loop-invariant, so compute them once.
    temp_mean = (temp_lo + temp_hi) / 2.0
    temp_sd = (temp_hi - temp_lo) / 6
    hum_mean = (hum_lo + hum_hi) / 2.0
    hum_sd = (hum_hi - hum_lo) / 6

    # Expected time to reach the rancidity threshold under average scenario
    # conditions, assuming exponential growth PV(t) = PV0 * exp(k_avg * t).
    M_avg = np.random.uniform(3.0, 8.0)
    k_avg = arrhenius_rate(temp_mean, hum_mean, M_avg)
    if k_avg > 0 and PV0 < PV_THRESHOLD:
        days_to_rancid = np.log(PV_THRESHOLD / PV0) / k_avg
    else:
        days_to_rancid = 0.0

    records = []
    PV = PV0
    FFA = FFA0

    for i in range(seq_len):
        day = start_day + i

        # Sample today's storage conditions (draw order matters, see Note).
        temp = np.clip(np.random.normal(temp_mean, temp_sd), temp_lo, temp_hi)
        humidity = np.clip(np.random.normal(hum_mean, hum_sd), hum_lo, hum_hi)
        moisture = np.clip(np.random.normal(5.0, 1.2), 3.0, 8.0)
        oxygen = np.clip(np.random.normal(base_oxy, 0.005), 0.18, 0.23)

        k = arrhenius_rate(temp, humidity, moisture)

        # One day of exponential peroxide growth, floored at 0.01.
        PV = PV * np.exp(k)
        PV = max(PV, 0.01)

        # Secondary markers derived from PV, each with additive noise.
        FFA = FFA + 0.002 * k * PV + np.random.normal(0, 0.002)
        hexanal = 0.15 * PV + 0.05 * max(PV - 2.0, 0) ** 1.3 \
            + np.random.normal(0, 0.05)
        oxidation_ix = 0.3 * PV + 0.4 * max(FFA, 0) + np.random.normal(0, 0.05)

        # Noise can push these below zero; clamp after oxidation_ix has
        # consumed the raw FFA value.
        FFA = max(FFA, 0.01)
        hexanal = max(hexanal, 0.0)
        oxidation_ix = max(oxidation_ix, 0.0)

        # Logistic in PV, equal to 0.5 exactly at the threshold.
        rancid_prob = float(sigmoid(PV - PV_THRESHOLD))

        # days_to_rancid is measured from this sequence's first record (where
        # PV starts at PV0), so subtract the days elapsed within the sequence.
        # Subtracting the absolute ``day`` would wrongly shorten the shelf
        # life of every sequence with start_day > 0 even though its PV
        # trajectory still starts fresh.
        shelf_life = float(max(days_to_rancid - i, 0.0))

        # PV rescaled to [0, 1], saturating at PV = 10.
        decay_curve = float(min(PV / 10.0, 1.0))

        records.append({
            "day": day,
            "temperature": round(float(temp), 3),
            "humidity": round(float(humidity), 3),
            "moisture": round(float(moisture), 3),
            "oxygen": round(float(oxygen), 5),
            "peroxide_value": round(float(PV), 4),
            "free_fatty_acids": round(float(FFA), 4),
            "hexanal_level": round(float(hexanal), 4),
            "oxidation_index": round(float(oxidation_ix), 4),
            "rancidity_probability": round(rancid_prob, 6),
            "shelf_life_remaining_days": round(shelf_life, 2),
            "decay_curve_value": round(decay_curve, 6),
        })

    return records
| |
|
| |
|
def generate_dataset(target_sequences: int = 90000) -> pd.DataFrame:
    """Generate the full dataset as one flat table (one row per sequence-day).

    For each sequence a scenario is drawn according to ``SCENARIO_WEIGHTS``,
    a length is drawn from 30-90 days and a starting day from 0-90; the
    resulting daily records are tagged with a ``sequence_id`` and appended.

    Args:
        target_sequences: Number of sequences to generate. Zero (or a
            negative value) yields an empty frame that still has all columns.

    Returns:
        A DataFrame with ``sequence_id`` and ``day`` first, followed by the
        condition, chemistry and target columns.
    """
    cols = [
        "sequence_id", "day",
        "temperature", "humidity", "moisture", "oxygen",
        "peroxide_value", "free_fatty_acids", "hexanal_level", "oxidation_index",
        "rancidity_probability", "shelf_life_remaining_days", "decay_curve_value",
    ]
    all_records = []
    seq_count = 0

    print(f"Generating {target_sequences} sequences …")
    # ``target_sequences // 10`` is 0 for fewer than ten sequences, which
    # would make the modulo below raise ZeroDivisionError; never go below 1.
    log_every = max(1, target_sequences // 10)

    while seq_count < target_sequences:
        scenario = random.choices(SCENARIO_NAMES, weights=SCENARIO_WEIGHTS, k=1)[0]
        seq_len = random.randint(30, 90)
        start_day = random.randint(0, 90)

        records = generate_sequence(seq_len, scenario, start_day)

        # Tag every row with the zero-based index of its sequence.
        seq_id = seq_count
        for rec in records:
            rec["sequence_id"] = seq_id
        all_records.extend(records)

        seq_count += 1
        if seq_count % log_every == 0:
            print(f" {seq_count}/{target_sequences} sequences "
                  f"({len(all_records):,} rows)")

    # Passing ``columns`` fixes the column order and keeps the schema intact
    # when no records were generated (indexing an empty frame by ``cols``
    # would raise KeyError).
    return pd.DataFrame(all_records, columns=cols)
| |
|
| |
|
if __name__ == "__main__":
    # Script entry point: build the dataset, write it to ./data as CSV and
    # print a short summary.
    out_path = Path("data") / "walnut_storage_timeseries.csv"
    out_path.parent.mkdir(exist_ok=True)

    df = generate_dataset(target_sequences=90000)
    df.to_csv(out_path, index=False)

    print(f"\nDataset saved → {out_path}")
    print(f"Total rows : {len(df):,}")
    print(f"Total seqs : {df['sequence_id'].nunique():,}")
    print(df.describe().to_string())
| |
|