clarindasusan committed on
Commit
349d88b
·
verified ·
1 Parent(s): d5dcb2d

Update src/train_model.py

Browse files
Files changed (1) hide show
  1. src/train_model.py +170 -83
src/train_model.py CHANGED
@@ -39,7 +39,7 @@ from src.disaster_predictors import (
39
  FLOOD_FEATURES, CYCLONE_FEATURES, LANDSLIDE_FEATURES, EARTHQUAKE_FEATURES
40
  )
41
 
42
- MODEL_DIR = "models"
43
  SEED = 42
44
  np.random.seed(SEED)
45
  torch.manual_seed(SEED)
@@ -182,6 +182,7 @@ def generate_cyclone_data(n: int = 3000):
182
  def nearest_merge(base_df, aux_df, cols):
183
  tree = cKDTree(aux_df[["latitude", "longitude"]].values)
184
  _, idxs = tree.query(base_df[["latitude", "longitude"]].values)
 
185
  for col in cols:
186
  base_df[col] = aux_df[col].iloc[idxs].values
187
  return base_df
@@ -441,37 +442,168 @@ def generate_landslide_data(n: int = 4000):
441
  return X, y
442
 
443
  def generate_earthquake_data(n: int = 3000):
444
- rng = np.random.default_rng(SEED + 3)
445
-
446
- hist_seism_norm = rng.beta(2, 4, n)
447
- fault_norm = rng.beta(2, 2, n) # Higher = farther from fault
448
- liquef_norm = rng.beta(2, 4, n)
449
- depth_norm = rng.beta(3, 2, n) # Higher = deeper = less damage
450
- stress_norm = rng.beta(2, 3, n)
451
- vuln_norm = rng.beta(2, 3, n)
452
- pop_norm = rng.beta(2, 2, n)
453
- amp_norm = rng.beta(2, 3, n)
454
-
455
- X = np.column_stack([
456
- hist_seism_norm, fault_norm, liquef_norm, depth_norm,
457
- stress_norm, vuln_norm, pop_norm, amp_norm
458
- ])
459
-
460
- risk = (
461
- 0.25 * hist_seism_norm +
462
- 0.20 * (1 - fault_norm) + # Close to fault = more risk
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
463
  0.15 * liquef_norm +
464
- 0.10 * (1 - depth_norm) + # Shallow = more damage
465
- 0.10 * stress_norm +
466
  0.10 * vuln_norm +
467
  0.05 * pop_norm +
468
- 0.05 * amp_norm
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
469
  )
470
 
471
- risk += rng.normal(0, 0.05, n)
472
- y = np.clip(risk, 0.0, 1.0).astype(np.float32)
 
 
 
 
 
473
 
474
- return X.astype(np.float32), y
 
475
 
476
 
477
  DATA_GENERATORS = {
@@ -482,6 +614,7 @@ DATA_GENERATORS = {
482
  }
483
 
484
 
 
485
  # ============================================================================
486
  # TRAINING PIPELINE
487
  # ============================================================================
@@ -514,65 +647,19 @@ def train_disaster_model(disaster_type: str, epochs: int = 200, n_samples: int =
514
  print(f"{'='*60}")
515
 
516
  generator_fn, feature_names = DATA_GENERATORS[disaster_type]
517
- n = n_samples or {"flood": 5000, "cyclone": 3000, "landslide": 4000, "earthquake": 3000}[disaster_type]
 
 
 
 
 
 
 
 
 
518
 
519
- print(f"Loading data (n_samples hint: {n})...")
520
  X, y = generator_fn(n)
521
-
522
- # Train/val/test split
523
- X_trainval, X_test, y_trainval, y_test = train_test_split(X, y, test_size=0.15, random_state=SEED)
524
- X_train, X_val, y_train, y_val = train_test_split(X_trainval, y_trainval, test_size=0.15, random_state=SEED)
525
-
526
- print(f" Train: {len(X_train)} | Val: {len(X_val)} | Test: {len(X_test)}")
527
-
528
- # Tensors
529
- X_train_t = torch.tensor(X_train)
530
- y_train_t = torch.tensor(y_train)
531
- X_val_t = torch.tensor(X_val)
532
- y_val_t = torch.tensor(y_val)
533
- X_test_t = torch.tensor(X_test)
534
- y_test_t = torch.tensor(y_test)
535
-
536
- # Model
537
- n_features = len(feature_names)
538
- model = FuzzyNeuralNetwork(
539
- n_features=n_features,
540
- n_terms=3,
541
- hidden_dims=[64, 32],
542
- dropout=0.2
543
- )
544
-
545
- print(f" Model: FNN with {n_features} inputs, 3 fuzzy terms, 64→32 deep head")
546
- total_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
547
- print(f" Trainable parameters: {total_params:,}")
548
-
549
- # Train
550
- trainer = FNNTrainer(model, lr=1e-3, weight_decay=1e-4)
551
- history = trainer.fit(
552
- X_train_t, y_train_t,
553
- X_val_t, y_val_t,
554
- epochs=epochs, batch_size=64, patience=25
555
- )
556
-
557
- # Evaluate
558
- print("\n Test set evaluation:")
559
- metrics = evaluate_model(model, X_test_t, y_test_t)
560
- for k, v in metrics.items():
561
- print(f" {k}: {v}")
562
-
563
- # Save
564
- os.makedirs(MODEL_DIR, exist_ok=True)
565
- model_path = os.path.join(MODEL_DIR, f"fnn_{disaster_type}_model.pt")
566
- save_model(model, model_path, feature_names)
567
-
568
- # Save feature names as text too
569
- feat_path = os.path.join(MODEL_DIR, "feature_names", f"{disaster_type}_features.txt")
570
- os.makedirs(os.path.dirname(feat_path), exist_ok=True)
571
- with open(feat_path, "w") as f:
572
- f.write("\n".join(feature_names))
573
-
574
- print(f"\n Model saved to: {model_path}")
575
- return metrics
576
 
577
 
578
  def train_all(epochs: int = 200):
 
39
  FLOOD_FEATURES, CYCLONE_FEATURES, LANDSLIDE_FEATURES, EARTHQUAKE_FEATURES
40
  )
41
 
42
+ MODEL_DIR = os.path.join(BASE_DIR, "models")
43
  SEED = 42
44
  np.random.seed(SEED)
45
  torch.manual_seed(SEED)
 
182
  def nearest_merge(base_df, aux_df, cols):
183
  tree = cKDTree(aux_df[["latitude", "longitude"]].values)
184
  _, idxs = tree.query(base_df[["latitude", "longitude"]].values)
185
+ base_df = base_df.copy()
186
  for col in cols:
187
  base_df[col] = aux_df[col].iloc[idxs].values
188
  return base_df
 
442
  return X, y
443
 
444
  def generate_earthquake_data(n: int = 3000):
445
+ """
446
+ Loads and joins earthquake datasets:
447
+ - earthquake_history.csv (spine + historical_seismicity, focal_depth_km, tectonic_stress_index)
448
+ - fault_lines_earthquake.csv (distance_to_fault_km, seismic_hazard_index)
449
+ - soil_liquefaction.csv (soil_liquefaction_index)
450
+ - vs30_bedrock.csv (bedrock_amplification)
451
+ - building_vulnerability.csv (building_vulnerability)
452
+ - population_earthquake.csv (population_density_norm)
453
+ """
454
+ print("[Earthquake] Using REAL data loader")
455
+
456
+ def nearest_merge(base_df, aux_df, cols,
457
+ base_lat="latitude", base_lon="longitude",
458
+ aux_lat="latitude", aux_lon="longitude"):
459
+ if aux_lat not in aux_df.columns or aux_lon not in aux_df.columns:
460
+ raise ValueError(
461
+ f"nearest_merge: aux_df missing lat/lon. Has: {list(aux_df.columns)}"
462
+ )
463
+ tree = cKDTree(aux_df[[aux_lat, aux_lon]].values)
464
+ _, idxs = tree.query(base_df[[base_lat, base_lon]].values)
465
+ base_df = base_df.copy()
466
+ for col in cols:
467
+ if col not in aux_df.columns:
468
+ raise ValueError(
469
+ f"nearest_merge: '{col}' not in aux_df. "
470
+ f"Has: {list(aux_df.columns)}"
471
+ )
472
+ base_df[col] = aux_df[col].iloc[idxs].values
473
+ return base_df
474
+
475
+ # ── Load ──────────────────────────────────────────────────────────────
476
+ print("[Earthquake] Loading CSVs...")
477
+ history = pd.read_csv(os.path.join(DATA_DIR, "earthquake_history.csv"))
478
+ faults = pd.read_csv(os.path.join(DATA_DIR, "fault_lines_earthquake.csv"))
479
+ liquef = pd.read_csv(os.path.join(DATA_DIR, "soil_liquefaction.csv"))
480
+ vs30 = pd.read_csv(os.path.join(DATA_DIR, "vs30_bedrock.csv"))
481
+ bldg = pd.read_csv(os.path.join(DATA_DIR, "building_vulnerability.csv"))
482
+ pop = pd.read_csv(os.path.join(DATA_DIR, "population_earthquake.csv"))
483
+
484
+ for df in (history, faults, liquef, vs30, bldg, pop):
485
+ df.columns = df.columns.str.lower().str.strip()
486
+
487
+ print(f"[Earthquake] History: {len(history)} rows, cols: {list(history.columns)}")
488
+ print(f"[Earthquake] Faults cols: {list(faults.columns)}")
489
+ print(f"[Earthquake] Liquef cols: {list(liquef.columns)}")
490
+ print(f"[Earthquake] VS30 cols: {list(vs30.columns)}")
491
+ print(f"[Earthquake] Bldg cols: {list(bldg.columns)}")
492
+ print(f"[Earthquake] Pop cols: {list(pop.columns)}")
493
+
494
+ # ── Clean history spine ───────────────────────────────────────────────
495
+ history = history.dropna(subset=["latitude", "longitude"])
496
+ history["date"] = pd.to_datetime(history["date"], errors="coerce")
497
+ history = history.dropna(subset=["date"])
498
+ print(f"[Earthquake] After date clean: {len(history)} rows")
499
+
500
+ if len(history) == 0:
501
+ raise ValueError(
502
+ "earthquake_history has 0 rows after date parsing. "
503
+ f"Sample raw dates: {pd.read_csv(os.path.join(DATA_DIR, 'earthquake_history.csv'))['date'].head().tolist()}"
504
+ )
505
+
506
+ base = history.copy()
507
+
508
+ # ── Fault lines β†’ distance_to_fault_km ───────────────────────────────
509
+ # fault_lines_earthquake already has distance_to_fault_km as a column
510
+ # but we still spatial-join to get the nearest fault's values
511
+ print("[Earthquake] Merging fault lines...")
512
+ base = nearest_merge(base, faults, ["distance_to_fault_km"])
513
+
514
+ # ── Soil liquefaction ─────────────────────────────────────────────────
515
+ print("[Earthquake] Merging soil liquefaction...")
516
+ base = nearest_merge(base, liquef, ["soil_liquefaction_index"])
517
+
518
+ # ── VS30 / bedrock amplification ──────────────────────────────────────
519
+ print("[Earthquake] Merging VS30 bedrock...")
520
+ base = nearest_merge(base, vs30, ["bedrock_amplification"])
521
+
522
+ # ── Building vulnerability ────────────────────────────────────────────
523
+ print("[Earthquake] Merging building vulnerability...")
524
+ base = nearest_merge(base, bldg, ["building_vulnerability"])
525
+
526
+ # ── Population density ────────────────────────────────────────────────
527
+ print("[Earthquake] Merging population...")
528
+ base = nearest_merge(base, pop, ["population_density_norm"])
529
+
530
+ # ── Validate all required columns present ─────────────────────────────
531
+ required = [
532
+ "historical_seismicity", "distance_to_fault_km", "soil_liquefaction_index",
533
+ "focal_depth_km", "tectonic_stress_index", "building_vulnerability",
534
+ "population_density_norm", "bedrock_amplification",
535
+ ]
536
+ missing = [c for c in required if c not in base.columns]
537
+ if missing:
538
+ raise ValueError(
539
+ f"Missing columns after all merges: {missing}\n"
540
+ f"Available: {list(base.columns)}"
541
+ )
542
+
543
+ base = base.dropna(subset=required)
544
+ print(f"[Earthquake] Rows after dropna: {len(base)}")
545
+
546
+ if len(base) < 50:
547
+ raise ValueError(
548
+ f"Only {len(base)} clean rows — check CSV paths and column names"
549
+ )
550
+
551
+ # ── Risk label ────────────────────────────────────────────────────────
552
+ # Use magnitude if available, otherwise derive from features
553
+ if "magnitude" in base.columns:
554
+ base["magnitude"] = pd.to_numeric(base["magnitude"], errors="coerce").fillna(0)
555
+ mag_norm = np.clip((base["magnitude"] - 2.0) / 7.0, 0, 1) # scale 2–9
556
+ else:
557
+ mag_norm = pd.Series(np.zeros(len(base)))
558
+
559
+ depth_norm = np.clip(base["focal_depth_km"] / 700.0, 0, 1)
560
+ fault_norm = np.clip(base["distance_to_fault_km"] / 200.0, 0, 1)
561
+ liquef_norm = np.clip(base["soil_liquefaction_index"], 0, 1)
562
+ vuln_norm = np.clip(base["building_vulnerability"], 0, 1)
563
+ pop_norm = np.clip(base["population_density_norm"], 0, 1)
564
+ amp_norm = np.clip(base["bedrock_amplification"], 0, 1)
565
+ stress_norm = np.clip(base["tectonic_stress_index"], 0, 1)
566
+ seism_norm = np.clip(base["historical_seismicity"], 0, 1)
567
+
568
+ base["risk_score"] = np.clip(
569
+ 0.25 * mag_norm.values +
570
+ 0.20 * (1 - depth_norm) + # shallow = more damage
571
+ 0.15 * (1 - fault_norm) + # close to fault = more risk
572
  0.15 * liquef_norm +
 
 
573
  0.10 * vuln_norm +
574
  0.05 * pop_norm +
575
+ 0.05 * amp_norm +
576
+ 0.05 * seism_norm +
577
+ np.random.normal(0, 0.02, len(base)),
578
+ 0.0, 1.0
579
+ )
580
+
581
+ print(f"[Earthquake] Risk score: mean={base['risk_score'].mean():.3f}, "
582
+ f"std={base['risk_score'].std():.3f}, "
583
+ f">0.5: {(base['risk_score'] > 0.5).sum()} rows")
584
+
585
+ # ── Normalise features ────────────────────────────────────────────────
586
+ features = [
587
+ "historical_seismicity", "distance_to_fault_km", "soil_liquefaction_index",
588
+ "focal_depth_km", "tectonic_stress_index", "building_vulnerability",
589
+ "population_density_norm", "bedrock_amplification",
590
+ ]
591
+
592
+ assert features == list(EARTHQUAKE_FEATURES), (
593
+ f"Feature mismatch!\n train: {features}\n"
594
+ f" predictor: {list(EARTHQUAKE_FEATURES)}"
595
  )
596
 
597
+ from src.disaster_predictors import FEATURE_RANGES
598
+ X = np.zeros((len(base), len(features)), dtype=np.float32)
599
+ for i, feat in enumerate(features):
600
+ lo, hi = FEATURE_RANGES[feat]
601
+ X[:, i] = np.clip(
602
+ (base[feat].values - lo) / (hi - lo + 1e-8), 0, 1
603
+ )
604
 
605
+ y = base["risk_score"].values.astype(np.float32)
606
+ return X, y
607
 
608
 
609
  DATA_GENERATORS = {
 
614
  }
615
 
616
 
617
+
618
  # ============================================================================
619
  # TRAINING PIPELINE
620
  # ============================================================================
 
647
  print(f"{'='*60}")
648
 
649
  generator_fn, feature_names = DATA_GENERATORS[disaster_type]
650
+ n = n_samples or {
651
+ "flood": 5000, "cyclone": 3000,
652
+ "landslide": 4000, "earthquake": 3000
653
+ }[disaster_type]
654
+
655
+ REAL_DATA_GENERATORS = {"flood", "cyclone", "landslide", "earthquake"}
656
+ if disaster_type in REAL_DATA_GENERATORS:
657
+ print(f"Loading real data for {disaster_type}...")
658
+ else:
659
+ print(f"Generating {n} synthetic samples...")
660
 
 
661
  X, y = generator_fn(n)
662
+ # ... rest unchanged
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
663
 
664
 
665
  def train_all(epochs: int = 200):