tkbarb10's picture
download
raw
7.59 kB
"""
Creates realistic dummy data files in resources/ for prototyping the Streamlit app.
Run once: python resources/create_dummy_data.py
These files are overwritten when resources/precompute.py runs against real data.
"""
import json
import numpy as np
import pandas as pd
from pathlib import Path
# Directory containing this script; every generated file is written here.
RESOURCES = Path(__file__).parent

# One shared generator with a fixed seed (42), used by all random builders.
RNG = np.random.default_rng(42)

# Names of the areas of interest; also the keys of AOI_CENTERS below.
AREAS = [
    "Canada",
    "Guinea",
    "Amazon Basin",
    "Congo Basin DRC",
    "Indonesia-Malaysia",
    "Mekong Region",
    "Cerrado Brazil",
]

# drift starts 2018 (first diff of 2017 baseline)
YEARS = [*range(2018, 2025)]

# Embedding column names "A00" .. "A63".
EMBEDDING_COLS = ["A%02d" % idx for idx in range(64)]

# Per-area centre point as (longitude, latitude); timelapse_sample() jitters
# the first component into a longitude and the second into a latitude.
AOI_CENTERS = {
    "Canada": (-115.0, 57.5),
    "Guinea": (-11.75, 10.0),
    "Amazon Basin": (-55.0, -4.0),
    "Congo Basin DRC": (26.0, -1.0),
    "Indonesia-Malaysia": (113.0, 0.0),
    "Mekong Region": (103.0, 22.0),
    "Cerrado Brazil": (-47.0, -13.0),
}
def main():
    """Build every dummy resource file, then print a completion message.

    The builders run in a fixed order. Several of them draw from the shared
    module-level ``RNG``, so reordering them would change the generated values.
    """
    builders = (
        kpi,
        evaluations,
        feature_importance,
        confusion_matrix,
        embedding_profile,
        drift_by_area_year,
        target_distribution,
        timelapse_sample,
        pca_sample,
    )
    for build in builders:
        build()
    print("Dummy data created in resources/")
def kpi():
    """Write the headline KPI summary to ``kpi_summary.json`` in RESOURCES.

    The figures describe the ``xgboost_optuna`` model. The F1 value matches the
    ``xgboost_optuna`` row written by ``evaluations()`` (0.3802), so the two
    files do not report different F1 scores for the same model. Values that
    can be derived (area count, PR-AUC improvement) are computed rather than
    duplicated as literals.
    """
    pr_auc = 0.4231
    baseline_pr_auc = 0.1912
    # Relative PR-AUC gain over the baseline, as a percentage (-> 121.3).
    improvement_pct = round((pr_auc - baseline_pr_auc) / baseline_pr_auc * 100, 1)

    data = {
        "total_rows": 803054,
        "n_areas": len(AREAS),
        "n_features_engineered": 19,
        "best_model": "xgboost_optuna",
        "pr_auc": pr_auc,
        "f1": 0.3802,
        "brier": 0.0413,
        "precision": 0.4102,
        "recall": 0.3541,
        "baseline_pr_auc": baseline_pr_auc,
        "improvement_pct": improvement_pct,
    }
    (RESOURCES / "kpi_summary.json").write_text(json.dumps(data, indent=2))
def evaluations():
    """Write the per-model evaluation table to ``evaluations_clean.parquet``.

    One row per model: identifier, display name, description, six metric
    columns and the four confusion-matrix counts. All values are fixed
    placeholders.
    """
    columns = (
        "model", "display_name", "description",
        "accuracy", "precision", "recall", "f1", "pr_auc", "brier",
        "tn", "fp", "fn", "tp",
    )
    records = (
        (
            "logistic_regression",
            "Logistic Regression (Baseline)",
            "The humble baseline. Respect the classic.",
            0.8821, 0.1243, 0.6112, 0.2072, 0.1912, 0.0721,
            715_000, 85_000, 1_500, 2_354,
        ),
        (
            "random_forest",
            "Random Forest",
            "Solid tree ensemble with balanced class weights.",
            0.9321, 0.2841, 0.5203, 0.3673, 0.3541, 0.0512,
            741_000, 59_000, 1_842, 2_012,
        ),
        (
            "sgd_optuna",
            "SGD (Optuna Tuned)",
            "Stochastic gradient descent with Bayesian-tuned hyperparameters.",
            0.9011, 0.2013, 0.5841, 0.2982, 0.2803, 0.0634,
            728_000, 72_000, 1_591, 2_263,
        ),
        (
            "xgboost_optuna",
            "XGBoost (Optuna Tuned) ✓",
            "Winner: 100-trial Optuna search maximizing PR-AUC.",
            0.9612, 0.4102, 0.3541, 0.3802, 0.4231, 0.0413,
            768_000, 32_000, 2_483, 1_371,
        ),
    )
    # Rebuild one dict per model so the DataFrame sees the same input shape.
    rows = [dict(zip(columns, record)) for record in records]
    frame = pd.DataFrame(rows)
    frame.to_parquet(RESOURCES / "evaluations_clean.parquet", index=False)
def feature_importance():
    """Write fixed feature-importance scores to ``feature_importance.json``.

    The output is a JSON list of ``{"feature": ..., "importance": ...}``
    objects, in descending order of importance.
    """
    ranked = [
        ("drift_magnitude", 850),
        ("A03", 420),
        ("A28", 380),
        ("emb_norm", 310),
        ("A51", 290),
        ("A14", 260),
        ("loss_prototype_sim", 240),
        ("A07", 220),
        ("emb_std", 195),
        ("A33", 185),
        ("noloss_prototype_sim", 170),
        ("A42", 155),
        ("drift_z_within_area", 140),
        ("A19", 130),
        ("emb_norm_z", 120),
    ]
    payload = [{"feature": name, "importance": score} for name, score in ranked]
    target = RESOURCES / "feature_importance.json"
    target.write_text(json.dumps(payload, indent=2))
def confusion_matrix():
    """Write the fixed ``xgboost_optuna`` confusion counts to ``confusion_matrix.json``."""
    payload = dict(
        tn=768000,
        fp=32000,
        fn=2483,
        tp=1371,
        model="xgboost_optuna",
    )
    target = RESOURCES / "confusion_matrix.json"
    target.write_text(json.dumps(payload, indent=2))
def embedding_profile():
    """Write a long-format mean-embedding table to ``mean_embedding_profile.parquet``.

    For every (area, year, loss label) combination a 64-value profile is
    drawn from the shared ``RNG`` and emitted as 64 rows, one per dimension.
    """
    records = []
    for area in AREAS:
        for year in YEARS:
            for label in (0, 1):
                profile = RNG.normal(0, 0.05, 64)
                if label == 1:
                    # loss pixels drift more
                    profile = profile + RNG.normal(0.04, 0.02, 64)
                records.extend(
                    {
                        "loss_label": label,
                        "name": area,
                        "year": year,
                        "dim": f"A{idx:02d}",
                        "dim_idx": idx,
                        "mean_value": round(float(value), 6),
                    }
                    for idx, value in enumerate(profile)
                )
    frame = pd.DataFrame(records)
    frame.to_parquet(RESOURCES / "mean_embedding_profile.parquet", index=False)
def drift_by_area_year():
    """Write per-area, per-year, per-label drift statistics to ``drift_by_area_year.parquet``.

    Loss rows (label 1) start from a base drift of 0.45 and no-loss rows from
    0.22. The "Indonesia-Malaysia" area is scaled by 1.4, and each mean gets
    a uniform jitter in [-0.05, 0.05).
    """
    base_by_label = {0: 0.22, 1: 0.45}
    records = []
    for area in AREAS:
        scale = 1.4 if area == "Indonesia-Malaysia" else 1.0
        for year in YEARS:
            for label in (0, 1):
                # Draw order (jitter, std, count) fixes the generated values.
                mean_drift = base_by_label[label] * scale + RNG.uniform(-0.05, 0.05)
                std_drift = RNG.uniform(0.05, 0.15)
                n_obs = RNG.integers(500, 3000)
                records.append({
                    "name": area,
                    "year": year,
                    "loss_label": label,
                    "mean_drift": round(float(mean_drift), 4),
                    "std_drift": round(float(std_drift), 4),
                    "count": n_obs,
                })
    frame = pd.DataFrame(records)
    frame.to_parquet(RESOURCES / "drift_by_area_year.parquet", index=False)
def target_distribution():
    """Write overall and per-area loss / no-loss counts to ``target_distribution.json``.

    The top-level totals are fixed constants. The ``per_area`` entries are
    drawn independently from the shared ``RNG`` and are not derived from, or
    reconciled with, those totals.
    """
    def _area_entry(area):
        # Draw order (total, then rate) fixes the generated values.
        total = RNG.integers(80_000, 140_000)
        rate = RNG.uniform(0.04, 0.12)
        n_loss = int(total * rate)
        return {
            "name": area,
            "loss": n_loss,
            "no_loss": int(total - n_loss),
            "loss_rate": round(float(rate), 4),
        }

    summary = {
        "total": 803054,
        "loss": 38547,
        "no_loss": 764507,
        "loss_rate": 0.048,
        "per_area": [_area_entry(area) for area in AREAS],
    }
    target = RESOURCES / "target_distribution.json"
    target.write_text(json.dumps(summary, indent=2))
def timelapse_sample():
    """Write 10,000 random geo-located points to ``timelapse_sample.parquet``.

    5,000 loss points are generated first, then 5,000 no-loss points. Each
    point sits within +/-5 degrees longitude and +/-3 degrees latitude of a
    randomly chosen area's centre. Loss points draw drift from [0.3, 0.9)
    and no-loss points from [0.05, 0.4).
    """
    # (label, number of points, (drift low, drift high))
    plan = [
        (1, 5000, (0.3, 0.9)),
        (0, 5000, (0.05, 0.4)),
    ]
    records = []
    for label, n_points, (drift_lo, drift_hi) in plan:
        for _ in range(n_points):
            # Draw order (area, lon, lat, year, drift) fixes the values.
            area = AREAS[RNG.integers(0, len(AREAS))]
            center_lon, center_lat = AOI_CENTERS[area]
            lon = center_lon + RNG.uniform(-5, 5)
            lat = center_lat + RNG.uniform(-3, 3)
            year = int(YEARS[RNG.integers(0, len(YEARS))])
            drift = float(RNG.uniform(drift_lo, drift_hi))
            records.append({
                "latitude": round(lat, 5),
                "longitude": round(lon, 5),
                "year": year,
                "drift_magnitude": round(drift, 4),
                "loss_label": label,
                "name": area,
            })
    frame = pd.DataFrame(records)
    frame.to_parquet(RESOURCES / "timelapse_sample.parquet", index=False)
def pca_sample():
    """Write a 2,000-point two-component sample to ``pca_sample.parquet``.

    The first 1,900 points are labelled 0 and the last 100 are labelled 1.
    Each component is drawn from a different normal distribution depending
    on the label, and areas are assigned round-robin over ``AREAS``.
    """
    n = 2000
    labels = np.array([0] * 1900 + [1] * 100)
    is_loss = labels == 1

    # Full-length draws for both classes, loss first, then pick per point.
    loss_pc1 = RNG.normal(2.5, 1.2, n)
    other_pc1 = RNG.normal(-0.3, 1.8, n)
    pc1 = np.where(is_loss, loss_pc1, other_pc1)

    loss_pc2 = RNG.normal(1.8, 1.0, n)
    other_pc2 = RNG.normal(0.1, 1.5, n)
    pc2 = np.where(is_loss, loss_pc2, other_pc2)

    n_areas = len(AREAS)
    areas = [AREAS[idx % n_areas] for idx in range(n)]

    frame = pd.DataFrame(
        {"pc1": pc1, "pc2": pc2, "loss_label": labels, "name": areas}
    )
    frame.to_parquet(RESOURCES / "pca_sample.parquet", index=False)
# Generate the files only when executed as a script, not when imported.
if __name__ == "__main__":
    main()

Xet Storage Details

Size:
7.59 kB
·
Xet hash:
3e082516c93fe8f39ec9162ac00fa123af9a24f20339a4ca4d8ecb7805508f83

Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.