Spaces:

stephmnt
/

rendement_agricole

Sleeping

App Files Files Community

rendement_agricole / scripts /simulation_dataset.py

stephmnt

Sync from GitHub via hub-sync

23b1977 verified 18 days ago

raw

history blame contribute delete

3.39 kB

	"""Shared building blocks for the local simulation dataset.

	This module centralizes the renaming and cleaning of
	`data/simulation/crop_yield.csv` so that the PCA (ACP) step and the runtime
	engine do not drift apart in their data-preparation assumptions.
	"""

	from __future__ import annotations

	from pathlib import Path
	from typing import Any, Literal

	import pandas as pd


	# Raw CSV header -> snake_case column name applied when the dataset is loaded.
	SIMULATION_COLUMN_RENAMES = {
	    "Region": "region",
	    "Soil_Type": "soil_type",
	    "Crop": "crop",
	    "Rainfall_mm": "rainfall_mm",
	    "Temperature_Celsius": "temperature_celsius",
	    "Fertilizer_Used": "fertilizer_used",
	    "Irrigation_Used": "irrigation_used",
	    "Weather_Condition": "weather_condition",
	    "Days_to_Harvest": "days_to_harvest",
	    "Yield_tons_per_hectare": "yield_tons_per_hectare",
	}

	# Text columns whose labels are cleaned with ``normalize_simulation_label``.
	SIMULATION_CATEGORICAL_COLUMNS = ["region", "soil_type", "crop", "weather_condition"]

	# Flag columns converted to a boolean dtype by the loader.
	SIMULATION_BOOLEAN_COLUMNS = ["fertilizer_used", "irrigation_used"]

	# Columns coerced to numeric by the loader (includes the yield column).
	SIMULATION_NUMERIC_COLUMNS = [
	    "rainfall_mm",
	    "temperature_celsius",
	    "days_to_harvest",
	    "yield_tons_per_hectare",
	]

	# Same numeric columns minus the yield; presumably the PCA (ACP) inputs —
	# not referenced elsewhere in this module, confirm against the ACP script.
	SIMULATION_ACP_NUMERIC_COLUMNS = ["rainfall_mm", "temperature_celsius", "days_to_harvest"]


	def normalize_simulation_label(value: Any) -> str:
	    """Return ``value`` as text with leading and trailing whitespace removed.

	    Any input is first converted with ``str()``, so non-string values come
	    back as their string representation.
	    """
	    label = str(value)
	    return label.strip()


	def _coerce_boolean_value(value: Any) -> bool | pd._libs.missing.NAType:
	    """Defensively convert a raw cell value to ``True``, ``False`` or ``pd.NA``.

	    Args:
	        value: Scalar taken from a boolean-like column (bool, number, text
	            or a missing marker).

	    Returns:
	        ``pd.NA`` when ``pd.isna`` flags the value or when text parses to
	        NaN; otherwise a plain ``bool``. Text is stripped, lower-cased and
	        matched against English/French yes/no tokens, then tried as a
	        number. Anything still unrecognized falls back to Python
	        truthiness of the original value.
	    """
	    if pd.isna(value):
	        return pd.NA
	    if isinstance(value, bool):
	        return value
	    if isinstance(value, (int, float)):
	        # Plain truthiness: ``bool(int(value))`` raised OverflowError on
	        # +/-inf and truncated fractional values such as 0.5 to False.
	        return bool(value)

	    normalized = str(value).strip().lower()
	    if normalized in {"true", "1", "yes", "y", "oui"}:
	        return True
	    if normalized in {"false", "0", "no", "n", "non"}:
	        return False

	    # Numeric text ("0.0", "1.0", "2") must follow numeric truthiness;
	    # otherwise any non-empty string, including "0.0", would coerce to True.
	    try:
	        number = float(normalized)
	    except ValueError:
	        # Best-effort fallback for unrecognized tokens: keep the historical
	        # behavior rather than failing on unexpected input.
	        return bool(value)
	    if pd.isna(number):
	        return pd.NA
	    return bool(number)


	def load_normalized_simulation_dataset(
	    csv_path: str | Path,
	    *,
	    boolean_dtype: Literal["boolean", "bool"] = "bool",
	) -> pd.DataFrame:
	    """Load the local simulation CSV and return it with a normalized schema.

	    Steps, in order: rename the raw headers with
	    ``SIMULATION_COLUMN_RENAMES``, clean the categorical labels, coerce the
	    numeric columns (unparseable values become NaN), convert the boolean
	    columns, then keep only rows whose yield is ``>= 0``.

	    Args:
	        csv_path: Source CSV file to load.
	        boolean_dtype: ``"bool"`` (default) produces plain ``bool`` columns
	            and rejects missing values; ``"boolean"`` produces pandas
	            nullable boolean columns that keep ``pd.NA``.

	    Returns:
	        pd.DataFrame: Cleaned dataset with a fresh 0..n-1 index. Rows with
	        a negative or non-numeric (NaN) yield are dropped.

	    Raises:
	        ValueError: If ``boolean_dtype`` is not ``"bool"`` or ``"boolean"``,
	            or if it is ``"bool"`` and a boolean column contains missing
	            values.
	    """
	    # Fail fast, before any I/O: an unknown value used to be treated
	    # silently as "boolean".
	    if boolean_dtype not in ("boolean", "bool"):
	        raise ValueError(
	            f"Unsupported boolean_dtype {boolean_dtype!r}; expected 'boolean' or 'bool'."
	        )

	    simulation_df = pd.read_csv(Path(csv_path)).rename(columns=SIMULATION_COLUMN_RENAMES)

	    # NOTE(review): missing labels become the literal string "nan" because
	    # the normalizer calls str() on every cell — confirm this is intended.
	    simulation_df[SIMULATION_CATEGORICAL_COLUMNS] = simulation_df[SIMULATION_CATEGORICAL_COLUMNS].apply(
	        lambda column: column.map(normalize_simulation_label)
	    )
	    simulation_df[SIMULATION_NUMERIC_COLUMNS] = simulation_df[SIMULATION_NUMERIC_COLUMNS].apply(
	        pd.to_numeric,
	        errors="coerce",
	    )

	    for column in SIMULATION_BOOLEAN_COLUMNS:
	        # Go through the nullable dtype first so missing values are
	        # detectable before an optional cast to plain bool.
	        normalized_series = simulation_df[column].map(_coerce_boolean_value).astype("boolean")
	        if boolean_dtype == "bool":
	            if normalized_series.isna().any():
	                raise ValueError(
	                    f"Column {column!r} contains missing values and cannot be coerced to bool."
	                )
	            simulation_df[column] = normalized_series.astype(bool)
	        else:
	            simulation_df[column] = normalized_series

	    # NaN >= 0 is False, so rows whose yield failed numeric coercion are
	    # dropped here together with negative yields.
	    valid_yield = simulation_df["yield_tons_per_hectare"] >= 0
	    return simulation_df.loc[valid_yield].reset_index(drop=True)