rendement_agricole / scripts /simulation_dataset.py
stephmnt's picture
Sync from GitHub via hub-sync
23b1977 verified
"""Briques communes autour du dataset de simulation locale.
Ce module centralise le renommage et le nettoyage de
`data/simulation/crop_yield.csv` afin d'eviter que l'ACP et le moteur runtime
fassent diverger leurs hypotheses de preparation.
"""
from __future__ import annotations
from pathlib import Path
from typing import Any, Literal
import pandas as pd
# Raw CSV headers mapped to the column names used after loading. Every
# normalized name is exactly the lower-cased raw header, so the mapping is
# derived rather than spelled out twice.
SIMULATION_COLUMN_RENAMES = {
    raw_header: raw_header.lower()
    for raw_header in (
        "Region",
        "Soil_Type",
        "Crop",
        "Rainfall_mm",
        "Temperature_Celsius",
        "Fertilizer_Used",
        "Irrigation_Used",
        "Weather_Condition",
        "Days_to_Harvest",
        "Yield_tons_per_hectare",
    )
}

# Text columns whose labels are whitespace-trimmed by the loader.
SIMULATION_CATEGORICAL_COLUMNS = ["region", "soil_type", "crop", "weather_condition"]

# Flag columns coerced to a boolean dtype by the loader.
SIMULATION_BOOLEAN_COLUMNS = ["fertilizer_used", "irrigation_used"]

# Columns converted with pd.to_numeric by the loader.
SIMULATION_NUMERIC_COLUMNS = [
    "rainfall_mm",
    "temperature_celsius",
    "days_to_harvest",
    "yield_tons_per_hectare",
]

# Numeric columns listed for the ACP (PCA): same as above minus the yield.
SIMULATION_ACP_NUMERIC_COLUMNS = ["rainfall_mm", "temperature_celsius", "days_to_harvest"]
def normalize_simulation_label(value: Any) -> str:
    """Return the text form of *value* with surrounding whitespace removed.

    Args:
        value: Raw label; it is converted with ``str()`` whatever its type.

    Returns:
        str: The stringified value, stripped on both ends.
    """
    label_text = str(value)
    return label_text.strip()
def _coerce_boolean_value(value: Any) -> bool | pd._libs.missing.NAType:
    """Defensively convert a raw cell value to ``True``, ``False`` or ``pd.NA``.

    Rules, applied in order:

    * missing values (``None``, ``NaN``, ``pd.NA``) give ``pd.NA``;
    * a ``bool`` is returned unchanged;
    * an ``int`` or ``float`` is ``True`` when it is non-zero;
    * any other value is compared through its stripped, lower-cased text:
      known true/false tokens first, then numeric text such as ``"0.0"``
      using the same non-zero rule;
    * text that matches nothing (blank strings included) gives ``pd.NA``
      rather than being guessed as ``True``.

    Args:
        value: Scalar cell value to convert.

    Returns:
        bool | NAType: The coerced flag, or ``pd.NA`` when the value is
        missing or cannot be interpreted.
    """
    if pd.isna(value):
        return pd.NA
    if isinstance(value, bool):
        return value
    if isinstance(value, (int, float)):
        # Non-zero test instead of bool(int(value)): int() raises
        # OverflowError on +/-inf and truncates fractions such as 0.5 to 0.
        return bool(value != 0)

    normalized = str(value).strip().lower()
    if normalized in {"true", "1", "yes", "y", "oui", "vrai", "t", "on"}:
        return True
    if normalized in {"false", "0", "no", "n", "non", "faux", "f", "off"}:
        return False

    # Numeric text ("0.0", "1.0", "2", ...) follows the numeric rule above.
    try:
        number = float(normalized)
    except ValueError:
        # Unknown token: report it as missing instead of relying on the
        # truthiness of a non-empty string, which is always True.
        return pd.NA
    if number != number:  # textual NaN
        return pd.NA
    return number != 0
def load_normalized_simulation_dataset(
    csv_path: str | Path,
    *,
    boolean_dtype: Literal["boolean", "bool"] = "bool",
) -> pd.DataFrame:
    """Load the local simulation dataset and normalize its schema.

    Steps, in order: rename the raw CSV headers, trim categorical labels,
    convert numeric columns (unparseable values become ``NaN``), coerce the
    flag columns to a boolean dtype, then keep only rows whose yield is
    ``>= 0``. Because ``NaN >= 0`` is false, rows with a missing or
    non-numeric yield are dropped by that last filter as well.

    Args:
        csv_path: Source CSV file to load.
        boolean_dtype: Dtype for the flag columns. ``"bool"`` produces plain
            NumPy booleans and refuses missing values; any other value keeps
            the nullable pandas ``"boolean"`` dtype.

    Returns:
        pd.DataFrame: Cleaned dataset with a homogeneous schema and a fresh
        ``0..n-1`` index.

    Raises:
        ValueError: If ``boolean_dtype`` is ``"bool"`` and a flag column
            contains missing values.
    """
    simulation_df = pd.read_csv(Path(csv_path)).rename(columns=SIMULATION_COLUMN_RENAMES)

    # na_action="ignore" keeps missing labels missing; without it str(NaN)
    # would turn them into the literal category "nan".
    simulation_df[SIMULATION_CATEGORICAL_COLUMNS] = simulation_df[
        SIMULATION_CATEGORICAL_COLUMNS
    ].apply(lambda column: column.map(normalize_simulation_label, na_action="ignore"))

    simulation_df[SIMULATION_NUMERIC_COLUMNS] = simulation_df[SIMULATION_NUMERIC_COLUMNS].apply(
        pd.to_numeric,
        errors="coerce",
    )

    for column in SIMULATION_BOOLEAN_COLUMNS:
        # Go through the nullable dtype first so missing flags stay visible.
        normalized_series = simulation_df[column].map(_coerce_boolean_value).astype("boolean")
        if boolean_dtype == "bool":
            if normalized_series.isna().any():
                raise ValueError(
                    f"Column {column!r} contains missing values and cannot be coerced to bool."
                )
            simulation_df[column] = normalized_series.astype(bool)
        else:
            simulation_df[column] = normalized_series

    # Drops negative yields and, implicitly, NaN yields (NaN >= 0 is False).
    non_negative_yield = simulation_df["yield_tons_per_hectare"] >= 0
    return simulation_df.loc[non_negative_yield].reset_index(drop=True)