Buckets:

tkbarb10
/

deforestation-app-storage

Files

xet

tkbarb10/deforestation-app-storage / resources /create_dummy_data.py

tkbarb10

about 1 month ago

download

raw

7.59 kB

	"""
	Creates realistic dummy data files in resources/ for prototyping the Streamlit app.
	Run once: python resources/create_dummy_data.py
	These files are overwritten when resources/precompute.py runs against real data.
	"""
	import json
	import numpy as np
	import pandas as pd
	from pathlib import Path

	# Directory that holds this script; every dummy artifact is written next to it.
	RESOURCES = Path(__file__).parent

	# (longitude, latitude) centre of each area of interest.
	AOI_CENTERS = {
	    "Canada": (-115.0, 57.5),
	    "Guinea": (-11.75, 10.0),
	    "Amazon Basin": (-55.0, -4.0),
	    "Congo Basin DRC": (26.0, -1.0),
	    "Indonesia-Malaysia": (113.0, 0.0),
	    "Mekong Region": (103.0, 22.0),
	    "Cerrado Brazil": (-47.0, -13.0),
	}

	# Area names, in the insertion order of the AOI_CENTERS keys.
	AREAS = list(AOI_CENTERS)
	YEARS = list(range(2018, 2025))  # drift starts 2018 (first diff of 2017 baseline)
	EMBEDDING_COLS = ["A%02d" % idx for idx in range(64)]

	# Single seeded generator shared by every function in this module.
	RNG = np.random.default_rng(42)


	def main():
	    """Run every dummy-data generator once, in a fixed order, then print a confirmation."""
	    # The generators share one module-level RNG, so this order determines
	    # which random values each output file receives.
	    steps = (
	        kpi,
	        evaluations,
	        feature_importance,
	        confusion_matrix,
	        embedding_profile,
	        drift_by_area_year,
	        target_distribution,
	        timelapse_sample,
	        pca_sample,
	    )
	    for step in steps:
	        step()
	    print("Dummy data created in resources/")


	def kpi():
	    """Write the placeholder headline metrics to ``kpi_summary.json``.

	    The model metrics repeat the ``xgboost_optuna`` row written by
	    ``evaluations()``, and ``baseline_pr_auc`` repeats the PR-AUC of its
	    ``logistic_regression`` row, so both files describe the same models.
	    """
	    summary = {
	        "total_rows": 803054,
	        # Derived from the area list instead of a second hard-coded 7.
	        "n_areas": len(AREAS),
	        "n_features_engineered": 19,
	        "best_model": "xgboost_optuna",
	        "pr_auc": 0.4231,
	        # Same F1 that evaluations() stores for xgboost_optuna; the previous
	        # 0.3814 disagreed with that row.
	        "f1": 0.3802,
	        "brier": 0.0413,
	        "precision": 0.4102,
	        "recall": 0.3541,
	        "baseline_pr_auc": 0.1912,
	        "improvement_pct": 121.3,
	    }
	    (RESOURCES / "kpi_summary.json").write_text(json.dumps(summary, indent=2))


	def evaluations():
	    """Write one row of placeholder metrics per model to ``evaluations_clean.parquet``."""
	    # Column order of the resulting table; each entry in `table` follows it.
	    columns = (
	        "model", "display_name", "description",
	        "accuracy", "precision", "recall",
	        "f1", "pr_auc", "brier",
	        "tn", "fp", "fn", "tp",
	    )
	    table = (
	        (
	            "logistic_regression",
	            "Logistic Regression (Baseline)",
	            "The humble baseline. Respect the classic.",
	            0.8821, 0.1243, 0.6112,
	            0.2072, 0.1912, 0.0721,
	            715_000, 85_000, 1_500, 2_354,
	        ),
	        (
	            "random_forest",
	            "Random Forest",
	            "Solid tree ensemble with balanced class weights.",
	            0.9321, 0.2841, 0.5203,
	            0.3673, 0.3541, 0.0512,
	            741_000, 59_000, 1_842, 2_012,
	        ),
	        (
	            "sgd_optuna",
	            "SGD (Optuna Tuned)",
	            "Stochastic gradient descent with Bayesian-tuned hyperparameters.",
	            0.9011, 0.2013, 0.5841,
	            0.2982, 0.2803, 0.0634,
	            728_000, 72_000, 1_591, 2_263,
	        ),
	        (
	            "xgboost_optuna",
	            "XGBoost (Optuna Tuned) ✓",
	            "Winner: 100-trial Optuna search maximizing PR-AUC.",
	            0.9612, 0.4102, 0.3541,
	            0.3802, 0.4231, 0.0413,
	            768_000, 32_000, 2_483, 1_371,
	        ),
	    )
	    records = [dict(zip(columns, values)) for values in table]
	    frame = pd.DataFrame(records)
	    frame.to_parquet(RESOURCES / "evaluations_clean.parquet", index=False)


	def feature_importance():
	    """Write placeholder (feature, importance) pairs to ``feature_importance.json``."""
	    ranked = (
	        ("drift_magnitude", 850),
	        ("A03", 420),
	        ("A28", 380),
	        ("emb_norm", 310),
	        ("A51", 290),
	        ("A14", 260),
	        ("loss_prototype_sim", 240),
	        ("A07", 220),
	        ("emb_std", 195),
	        ("A33", 185),
	        ("noloss_prototype_sim", 170),
	        ("A42", 155),
	        ("drift_z_within_area", 140),
	        ("A19", 130),
	        ("emb_norm_z", 120),
	    )
	    payload = [{"feature": name, "importance": score} for name, score in ranked]
	    target = RESOURCES / "feature_importance.json"
	    target.write_text(json.dumps(payload, indent=2))


	def confusion_matrix():
	    """Write placeholder confusion-matrix counts for one model to ``confusion_matrix.json``."""
	    counts = dict(tn=768000, fp=32000, fn=2483, tp=1371)
	    counts["model"] = "xgboost_optuna"
	    serialized = json.dumps(counts, indent=2)
	    (RESOURCES / "confusion_matrix.json").write_text(serialized)


	def embedding_profile():
	    """Write a long-format table of embedding values to ``mean_embedding_profile.parquet``.

	    One row is emitted per (area, year, loss label, embedding dimension).
	    Dimension names and their count come from ``EMBEDDING_COLS`` rather than
	    a repeated literal 64 and a re-typed name format.
	    """
	    n_dims = len(EMBEDDING_COLS)
	    rows = []
	    for area in AREAS:
	        for year in YEARS:
	            for label in (0, 1):
	                profile = RNG.normal(0, 0.05, n_dims)
	                if label == 1:
	                    profile += RNG.normal(0.04, 0.02, n_dims)  # loss pixels drift more
	                rows.extend(
	                    {
	                        "loss_label": label,
	                        "name": area,
	                        "year": year,
	                        "dim": dim,
	                        "dim_idx": idx,
	                        "mean_value": round(float(value), 6),
	                    }
	                    for idx, (dim, value) in enumerate(zip(EMBEDDING_COLS, profile))
	                )
	    pd.DataFrame(rows).to_parquet(RESOURCES / "mean_embedding_profile.parquet", index=False)


	def drift_by_area_year():
	    """Write placeholder drift statistics per (area, year, loss label) to ``drift_by_area_year.parquet``."""
	    records = []
	    for area in AREAS:
	        # Indonesia-Malaysia is the only area given a multiplier other than 1.0.
	        if area == "Indonesia-Malaysia":
	            scale = 1.4
	        else:
	            scale = 1.0
	        for year in YEARS:
	            for label in (0, 1):
	                level = 0.22 if label != 1 else 0.45
	                # Draw order (mean, std, count) is kept fixed so the shared
	                # RNG yields the same values on every run.
	                mean_drift = level * scale + RNG.uniform(-0.05, 0.05)
	                std_drift = RNG.uniform(0.05, 0.15)
	                n_obs = RNG.integers(500, 3000)
	                records.append({
	                    "name": area,
	                    "year": year,
	                    "loss_label": label,
	                    "mean_drift": round(float(mean_drift), 4),
	                    "std_drift": round(float(std_drift), 4),
	                    "count": n_obs,
	                })
	    frame = pd.DataFrame(records)
	    frame.to_parquet(RESOURCES / "drift_by_area_year.parquet", index=False)


	def target_distribution():
	    """Write overall and per-area loss / no-loss counts to ``target_distribution.json``.

	    The overall figures are fixed literals; the per-area figures are drawn
	    from the shared RNG and are not constrained to add up to them.
	    """

	    def area_entry(area):
	        # Total first, then rate: the draw order fixes the generated values.
	        total = RNG.integers(80_000, 140_000)
	        rate = RNG.uniform(0.04, 0.12)
	        n_loss = int(total * rate)
	        return {
	            "name": area,
	            "loss": n_loss,
	            "no_loss": int(total - n_loss),
	            "loss_rate": round(float(rate), 4),
	        }

	    summary = {
	        "total": 803054,
	        "loss": 38547,
	        "no_loss": 764507,
	        "loss_rate": 0.048,
	        "per_area": [area_entry(area) for area in AREAS],
	    }
	    target = RESOURCES / "target_distribution.json"
	    target.write_text(json.dumps(summary, indent=2))


	def timelapse_sample():
	    """Write 5000 loss and 5000 no-loss random points to ``timelapse_sample.parquet``.

	    Each point gets a random area, a position scattered around that area's
	    centre, a random year and a drift magnitude whose range depends on the
	    label.
	    """
	    # (loss label, number of points, drift lower bound, drift upper bound)
	    plan = (
	        (1, 5000, 0.3, 0.9),
	        (0, 5000, 0.05, 0.4),
	    )
	    records = []
	    for label, count, drift_lo, drift_hi in plan:
	        for _ in range(count):
	            area = AREAS[RNG.integers(0, len(AREAS))]
	            center_lon, center_lat = AOI_CENTERS[area]
	            lon = center_lon + RNG.uniform(-5, 5)
	            lat = center_lat + RNG.uniform(-3, 3)
	            year = int(YEARS[RNG.integers(0, len(YEARS))])
	            drift = float(RNG.uniform(drift_lo, drift_hi))
	            records.append({
	                "latitude": round(lat, 5),
	                "longitude": round(lon, 5),
	                "year": year,
	                "drift_magnitude": round(drift, 4),
	                "loss_label": label,
	                "name": area,
	            })
	    frame = pd.DataFrame(records)
	    frame.to_parquet(RESOURCES / "timelapse_sample.parquet", index=False)


	def pca_sample(n=2000, loss_fraction=0.05):
	    """Write a two-component scatter sample to ``pca_sample.parquet``.

	    Args:
	        n: Number of points to generate. Defaults to 2000.
	        loss_fraction: Share of the points labelled as loss (label 1).
	            Defaults to 0.05, i.e. 100 loss and 1900 no-loss points.

	    The label vector is sized from ``n``. It used to be hard-coded to
	    1900 + 100 entries, so any other ``n`` made ``np.where`` fail on
	    mismatched shapes.
	    """
	    n_loss = int(round(n * loss_fraction))
	    labels = np.array([0] * (n - n_loss) + [1] * n_loss, dtype=int)
	    is_loss = labels == 1
	    # Both candidate arrays are drawn in full, loss-distribution first, to
	    # keep the sequence of RNG draws the same as before.
	    pc1 = np.where(is_loss, RNG.normal(2.5, 1.2, n), RNG.normal(-0.3, 1.8, n))
	    pc2 = np.where(is_loss, RNG.normal(1.8, 1.0, n), RNG.normal(0.1, 1.5, n))
	    # Area names are assigned round-robin over AREAS.
	    areas = [AREAS[i % len(AREAS)] for i in range(n)]
	    frame = pd.DataFrame({"pc1": pc1, "pc2": pc2, "loss_label": labels, "name": areas})
	    frame.to_parquet(RESOURCES / "pca_sample.parquet", index=False)


	# Generate the files only when executed as a script, not when imported.
	if __name__ == "__main__":
	    main()

Xet Storage Details

Size: 7.59 kB
Xet hash: 3e082516c93fe8f39ec9162ac00fa123af9a24f20339a4ca4d8ecb7805508f83

Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.