Spaces:
Sleeping
Sleeping
| """Briques communes autour du dataset de simulation locale. | |
| Ce module centralise le renommage et le nettoyage de | |
| `data/simulation/crop_yield.csv` afin d'eviter que l'ACP et le moteur runtime | |
| fassent diverger leurs hypotheses de preparation. | |
| """ | |
| from __future__ import annotations | |
| from pathlib import Path | |
| from typing import Any, Literal | |
| import pandas as pd | |
# Raw CSV header -> normalized snake_case column name.
SIMULATION_COLUMN_RENAMES = dict(
    Region="region",
    Soil_Type="soil_type",
    Crop="crop",
    Rainfall_mm="rainfall_mm",
    Temperature_Celsius="temperature_celsius",
    Fertilizer_Used="fertilizer_used",
    Irrigation_Used="irrigation_used",
    Weather_Condition="weather_condition",
    Days_to_Harvest="days_to_harvest",
    Yield_tons_per_hectare="yield_tons_per_hectare",
)

# Normalized columns holding textual labels.
SIMULATION_CATEGORICAL_COLUMNS = ["region", "soil_type", "crop", "weather_condition"]

# Normalized columns holding yes/no flags.
SIMULATION_BOOLEAN_COLUMNS = ["fertilizer_used", "irrigation_used"]

# Normalized columns holding numeric measurements.
SIMULATION_NUMERIC_COLUMNS = [
    "rainfall_mm",
    "temperature_celsius",
    "days_to_harvest",
    "yield_tons_per_hectare",
]

# Numeric columns used for the ACP (presumably PCA -- confirm with callers):
# every numeric column except the yield itself.
SIMULATION_ACP_NUMERIC_COLUMNS = [
    name for name in SIMULATION_NUMERIC_COLUMNS if name != "yield_tons_per_hectare"
]
def normalize_simulation_label(value: Any) -> str:
    """Return ``value`` rendered as text with surrounding whitespace removed."""
    text = str(value)
    return text.strip()
def _coerce_boolean_value(value: Any) -> bool | pd._libs.missing.NAType:
    """Defensively convert a raw cell value to ``True``, ``False`` or ``pd.NA``.

    Args:
        value: Raw value to interpret (bool, number, text, or a missing marker).

    Returns:
        ``pd.NA`` when ``value`` is missing (including the text ``"nan"``),
        otherwise a plain Python ``bool``.
    """
    if pd.isna(value):
        return pd.NA
    if isinstance(value, bool):
        return value
    if isinstance(value, (int, float)):
        # Plain truthiness (non-zero) instead of ``bool(int(value))``: truncation
        # turned 0.5 into False and raised OverflowError on infinity.
        return bool(value)

    normalized = str(value).strip().lower()
    if normalized in {"true", "1", "yes", "y", "oui"}:
        return True
    if normalized in {"false", "0", "no", "n", "non"}:
        return False

    # Numeric text such as "0.0" or "1.0" must follow the numeric rule; falling
    # through to ``bool(<non-empty str>)`` would make "0.0" evaluate to True.
    try:
        number = float(normalized)
    except ValueError:
        # Unrecognized token: keep the historical truthiness fallback.
        return bool(value)
    if pd.isna(number):
        return pd.NA
    return bool(number)
def load_normalized_simulation_dataset(
    csv_path: str | Path,
    *,
    boolean_dtype: Literal["boolean", "bool"] = "bool",
) -> pd.DataFrame:
    """Load the local simulation dataset and normalize its schema.

    The CSV columns are renamed according to ``SIMULATION_COLUMN_RENAMES``,
    categorical labels are stripped of surrounding whitespace, numeric columns
    are coerced to numbers (unparseable values become NaN), boolean columns are
    coerced through ``_coerce_boolean_value``, and only rows whose
    ``yield_tons_per_hectare`` is ``>= 0`` are kept (rows with a missing yield
    fail that comparison and are dropped as well).

    Args:
        csv_path: Source CSV file to load.
        boolean_dtype: ``"bool"`` for plain NumPy booleans (missing values are
            rejected) or ``"boolean"`` for the nullable pandas dtype.

    Returns:
        pd.DataFrame: Cleaned dataset with a homogenized schema and a fresh
        0..n-1 index.

    Raises:
        ValueError: If ``boolean_dtype`` is ``"bool"`` and a boolean column
            contains missing values.
    """
    simulation_df = pd.read_csv(Path(csv_path)).rename(columns=SIMULATION_COLUMN_RENAMES)

    # na_action="ignore" leaves missing labels as NaN; without it str(NaN)
    # would turn them into the literal category "nan".
    simulation_df[SIMULATION_CATEGORICAL_COLUMNS] = simulation_df[
        SIMULATION_CATEGORICAL_COLUMNS
    ].apply(lambda column: column.map(normalize_simulation_label, na_action="ignore"))

    simulation_df[SIMULATION_NUMERIC_COLUMNS] = simulation_df[SIMULATION_NUMERIC_COLUMNS].apply(
        pd.to_numeric,
        errors="coerce",
    )

    for column in SIMULATION_BOOLEAN_COLUMNS:
        # Go through the nullable dtype first so missing values stay detectable.
        normalized_series = simulation_df[column].map(_coerce_boolean_value).astype("boolean")
        if boolean_dtype == "bool":
            if normalized_series.isna().any():
                raise ValueError(
                    f"Column {column!r} contains missing values and cannot be coerced to bool."
                )
            simulation_df[column] = normalized_series.astype(bool)
        else:
            simulation_df[column] = normalized_series

    # Negative yields are discarded; NaN yields fail the comparison and go too.
    valid_yield = simulation_df["yield_tons_per_hectare"] >= 0
    return simulation_df.loc[valid_yield].reset_index(drop=True)