Buckets:
| """ | |
| Creates realistic dummy data files in resources/ for prototyping the Streamlit app. | |
| Run once: python resources/create_dummy_data.py | |
| These files are overwritten when resources/precompute.py runs against real data. | |
| """ | |
| import json | |
| import numpy as np | |
| import pandas as pd | |
| from pathlib import Path | |
# Directory containing this script; every generated file is written here.
RESOURCES = Path(__file__).parent

# One seeded generator shared by all functions below, so the generated values
# depend on the order in which those functions are called.
RNG = np.random.default_rng(42)

# Centre of each area of interest, consumed as (longitude, latitude).
AOI_CENTERS = {
    "Canada": (-115.0, 57.5),
    "Guinea": (-11.75, 10.0),
    "Amazon Basin": (-55.0, -4.0),
    "Congo Basin DRC": (26.0, -1.0),
    "Indonesia-Malaysia": (113.0, 0.0),
    "Mekong Region": (103.0, 22.0),
    "Cerrado Brazil": (-47.0, -13.0),
}

# Area names, in the insertion order of AOI_CENTERS.
AREAS = list(AOI_CENTERS)

YEARS = [*range(2018, 2025)]  # drift starts 2018 (first diff of 2017 baseline)

# Embedding dimension names "A00" .. "A63".
EMBEDDING_COLS = [f"A{idx:02d}" for idx in range(64)]
def main():
    """Run every dummy-data generator in sequence, then print a confirmation."""
    # Keep this order: the generators draw from the shared RNG, so reordering
    # them changes the values each one produces.
    generators = (
        kpi,
        evaluations,
        feature_importance,
        confusion_matrix,
        embedding_profile,
        drift_by_area_year,
        target_distribution,
        timelapse_sample,
        pca_sample,
    )
    for generate in generators:
        generate()
    print("Dummy data created in resources/")
def kpi():
    """Write the headline KPI figures to ``kpi_summary.json`` in RESOURCES.

    The model metrics mirror the ``xgboost_optuna`` row written by
    ``evaluations()``, and ``baseline_pr_auc`` mirrors the
    ``logistic_regression`` row, so the summary and the evaluation table
    agree with each other.
    """
    pr_auc = 0.4231
    baseline_pr_auc = 0.1912
    data = {
        "total_rows": 803054,
        # Derived so the figure cannot drift from the AREAS list.
        "n_areas": len(AREAS),
        "n_features_engineered": 19,
        "best_model": "xgboost_optuna",
        "pr_auc": pr_auc,
        # Same F1 as the xgboost_optuna row in evaluations().
        "f1": 0.3802,
        "brier": 0.0413,
        "precision": 0.4102,
        "recall": 0.3541,
        "baseline_pr_auc": baseline_pr_auc,
        # Relative PR-AUC gain over the baseline, in percent (121.3).
        "improvement_pct": round(
            (pr_auc - baseline_pr_auc) / baseline_pr_auc * 100, 1
        ),
    }
    (RESOURCES / "kpi_summary.json").write_text(json.dumps(data, indent=2))
def evaluations():
    """Write the per-model evaluation table to ``evaluations_clean.parquet``.

    One row per model: identifiers and description, summary metrics, and
    confusion-matrix counts. All values are hand-written placeholders.
    """
    columns = (
        "model", "display_name", "description",
        "accuracy", "precision", "recall",
        "f1", "pr_auc", "brier",
        "tn", "fp", "fn", "tp",
    )
    records = (
        (
            "logistic_regression",
            "Logistic Regression (Baseline)",
            "The humble baseline. Respect the classic.",
            0.8821, 0.1243, 0.6112,
            0.2072, 0.1912, 0.0721,
            715_000, 85_000, 1_500, 2_354,
        ),
        (
            "random_forest",
            "Random Forest",
            "Solid tree ensemble with balanced class weights.",
            0.9321, 0.2841, 0.5203,
            0.3673, 0.3541, 0.0512,
            741_000, 59_000, 1_842, 2_012,
        ),
        (
            "sgd_optuna",
            "SGD (Optuna Tuned)",
            "Stochastic gradient descent with Bayesian-tuned hyperparameters.",
            0.9011, 0.2013, 0.5841,
            0.2982, 0.2803, 0.0634,
            728_000, 72_000, 1_591, 2_263,
        ),
        (
            "xgboost_optuna",
            "XGBoost (Optuna Tuned) ✓",
            "Winner: 100-trial Optuna search maximizing PR-AUC.",
            0.9612, 0.4102, 0.3541,
            0.3802, 0.4231, 0.0413,
            768_000, 32_000, 2_483, 1_371,
        ),
    )
    table = pd.DataFrame([dict(zip(columns, record)) for record in records])
    table.to_parquet(RESOURCES / "evaluations_clean.parquet", index=False)
def feature_importance():
    """Write a list of feature/importance records to ``feature_importance.json``."""
    # (feature name, importance) pairs, listed from highest to lowest score.
    ranked = (
        ("drift_magnitude", 850),
        ("A03", 420),
        ("A28", 380),
        ("emb_norm", 310),
        ("A51", 290),
        ("A14", 260),
        ("loss_prototype_sim", 240),
        ("A07", 220),
        ("emb_std", 195),
        ("A33", 185),
        ("noloss_prototype_sim", 170),
        ("A42", 155),
        ("drift_z_within_area", 140),
        ("A19", 130),
        ("emb_norm_z", 120),
    )
    payload = [{"feature": name, "importance": score} for name, score in ranked]
    out_path = RESOURCES / "feature_importance.json"
    out_path.write_text(json.dumps(payload, indent=2))
def confusion_matrix():
    """Write the confusion-matrix counts for one model to ``confusion_matrix.json``."""
    counts = dict(tn=768000, fp=32000, fn=2483, tp=1371, model="xgboost_optuna")
    out_path = RESOURCES / "confusion_matrix.json"
    out_path.write_text(json.dumps(counts, indent=2))
def embedding_profile():
    """Write per-dimension mean embedding values to ``mean_embedding_profile.parquet``.

    Emits one row for every (area, year, loss_label, embedding dimension)
    combination. Dimension names and count come from ``EMBEDDING_COLS`` so the
    profile always matches the module-level embedding column list.
    """
    n_dims = len(EMBEDDING_COLS)
    rows = []
    for area in AREAS:
        for year in YEARS:
            for label in (0, 1):
                profile = RNG.normal(0, 0.05, n_dims)
                if label == 1:
                    profile += RNG.normal(0.04, 0.02, n_dims)  # loss pixels drift more
                rows.extend(
                    {
                        "loss_label": label,
                        "name": area,
                        "year": year,
                        "dim": dim,
                        "dim_idx": dim_idx,
                        "mean_value": round(float(value), 6),
                    }
                    for dim_idx, (dim, value) in enumerate(zip(EMBEDDING_COLS, profile))
                )
    pd.DataFrame(rows).to_parquet(RESOURCES / "mean_embedding_profile.parquet", index=False)
def drift_by_area_year():
    """Write drift statistics per (area, year, loss_label) to ``drift_by_area_year.parquet``."""
    # Base mean drift for each label before scaling and jitter.
    base_by_label = {0: 0.22, 1: 0.45}
    records = []
    for area in AREAS:
        # Indonesia-Malaysia gets a 1.4x mean drift for both labels.
        if area == "Indonesia-Malaysia":
            scale = 1.4
        else:
            scale = 1.0
        for year in YEARS:
            for label in (0, 1):
                # Draw order (jitter, std, count) fixes the generated values.
                jittered_mean = base_by_label[label] * scale + RNG.uniform(-0.05, 0.05)
                spread = RNG.uniform(0.05, 0.15)
                sample_count = RNG.integers(500, 3000)
                records.append({
                    "name": area,
                    "year": year,
                    "loss_label": label,
                    "mean_drift": round(float(jittered_mean), 4),
                    "std_drift": round(float(spread), 4),
                    "count": sample_count,
                })
    frame = pd.DataFrame(records)
    frame.to_parquet(RESOURCES / "drift_by_area_year.parquet", index=False)
def target_distribution():
    """Write overall and per-area loss/no-loss counts to ``target_distribution.json``.

    The top-level totals are fixed constants; they are not computed from the
    randomly drawn per-area entries.
    """

    def _area_entry(area):
        # Draw the area's row count first, then its loss rate.
        n_rows = RNG.integers(80_000, 140_000)
        rate = RNG.uniform(0.04, 0.12)
        n_loss = int(n_rows * rate)
        return {
            "name": area,
            "loss": n_loss,
            "no_loss": int(n_rows - n_loss),
            "loss_rate": round(float(rate), 4),
        }

    summary = {
        "total": 803054,
        "loss": 38547,
        "no_loss": 764507,
        "loss_rate": 0.048,
        "per_area": [_area_entry(area) for area in AREAS],
    }
    out_path = RESOURCES / "target_distribution.json"
    out_path.write_text(json.dumps(summary, indent=2))
def timelapse_sample():
    """Write 10,000 random points (5,000 per label) to ``timelapse_sample.parquet``.

    Each point gets a random area, a position within +/-5 degrees longitude and
    +/-3 degrees latitude of that area's centre, a random year, and a drift
    magnitude drawn from a label-dependent range.
    """
    # (label, number of points); loss points are generated first.
    plan = ((1, 5000), (0, 5000))
    # Uniform drift range per label.
    drift_bounds = {1: (0.3, 0.9), 0: (0.05, 0.4)}
    records = []
    for label, n_points in plan:
        low, high = drift_bounds[label]
        for _ in range(n_points):
            # Draw order (area, lon, lat, year, drift) fixes the generated values.
            area = AREAS[RNG.integers(0, len(AREAS))]
            center_lon, center_lat = AOI_CENTERS[area]
            lon = center_lon + RNG.uniform(-5, 5)
            lat = center_lat + RNG.uniform(-3, 3)
            year = int(YEARS[RNG.integers(0, len(YEARS))])
            drift = float(RNG.uniform(low, high))
            records.append({
                "latitude": round(lat, 5),
                "longitude": round(lon, 5),
                "year": year,
                "drift_magnitude": round(drift, 4),
                "loss_label": label,
                "name": area,
            })
    frame = pd.DataFrame(records)
    frame.to_parquet(RESOURCES / "timelapse_sample.parquet", index=False)
def pca_sample():
    """Write 2,000 synthetic 2-D points (1,900 no-loss, 100 loss) to ``pca_sample.parquet``."""
    n_points = 2000
    labels = np.array([0] * 1900 + [1] * 100)
    is_loss = labels == 1

    # Full-length candidate arrays are drawn for both classes on each axis and
    # then selected by label; the draw order fixes the generated values.
    loss_pc1 = RNG.normal(2.5, 1.2, n_points)
    noloss_pc1 = RNG.normal(-0.3, 1.8, n_points)
    loss_pc2 = RNG.normal(1.8, 1.0, n_points)
    noloss_pc2 = RNG.normal(0.1, 1.5, n_points)

    # Area names are assigned round-robin over AREAS.
    n_areas = len(AREAS)
    area_names = [AREAS[idx % n_areas] for idx in range(n_points)]

    frame = pd.DataFrame({
        "pc1": np.where(is_loss, loss_pc1, noloss_pc1),
        "pc2": np.where(is_loss, loss_pc2, noloss_pc2),
        "loss_label": labels,
        "name": area_names,
    })
    frame.to_parquet(RESOURCES / "pca_sample.parquet", index=False)
# Generate the files only when run as a script, not when imported.
if __name__ == "__main__":
    main()
Xet Storage Details
- Size:
- 7.59 kB
- Xet hash:
- 3e082516c93fe8f39ec9162ac00fa123af9a24f20339a4ca4d8ecb7805508f83
·
Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.