File size: 3,385 Bytes
23b1977
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
"""Briques communes autour du dataset de simulation locale.

Ce module centralise le renommage et le nettoyage de
`data/simulation/crop_yield.csv` afin d'eviter que l'ACP et le moteur runtime
fassent diverger leurs hypotheses de preparation.
"""

from __future__ import annotations

import numbers
from pathlib import Path
from typing import Any, Literal

import pandas as pd


# Maps each header of the raw simulation CSV to the name used after loading.
# Every target is simply the lowercase form of its source header, so the
# mapping is derived instead of being spelled out pair by pair.
SIMULATION_COLUMN_RENAMES = {
    source_header: source_header.lower()
    for source_header in (
        "Region",
        "Soil_Type",
        "Crop",
        "Rainfall_mm",
        "Temperature_Celsius",
        "Fertilizer_Used",
        "Irrigation_Used",
        "Weather_Condition",
        "Days_to_Harvest",
        "Yield_tons_per_hectare",
    )
}

# Renamed columns that hold free-text labels.
SIMULATION_CATEGORICAL_COLUMNS = ["region", "soil_type", "crop", "weather_condition"]

# Renamed columns that hold yes/no flags.
SIMULATION_BOOLEAN_COLUMNS = ["fertilizer_used", "irrigation_used"]

# Numeric columns used for the ACP step: the numeric columns without the yield.
SIMULATION_ACP_NUMERIC_COLUMNS = ["rainfall_mm", "temperature_celsius", "days_to_harvest"]

# All numeric columns: the ACP inputs followed by the yield column. Unpacking
# builds a separate list, so mutating one list never affects the other.
SIMULATION_NUMERIC_COLUMNS = [*SIMULATION_ACP_NUMERIC_COLUMNS, "yield_tons_per_hectare"]


def normalize_simulation_label(value: Any) -> str:
    """Return a simulation-dataset label as text without outer whitespace.

    Args:
        value: Raw cell value. It is converted with ``str()`` first, so
            non-string inputs are accepted as well.

    Returns:
        The stringified value with leading and trailing whitespace removed.
    """
    label_text = str(value)
    return label_text.strip()


def _coerce_boolean_value(value: Any) -> bool | pd._libs.missing.NAType:
    """Convert one raw cell value to ``True``, ``False`` or ``pd.NA``.

    Args:
        value: Raw scalar cell value: a bool, a number, text, or a missing
            marker such as ``None`` or NaN.

    Returns:
        ``pd.NA`` when the value is missing or is text that is not one of the
        recognised boolean tokens; otherwise the matching Python ``bool``.
    """
    if pd.isna(value):
        return pd.NA
    # ``bool`` is a subclass of ``int``, so it has to be handled before numbers.
    if isinstance(value, bool):
        return value
    if isinstance(value, numbers.Real):
        # Compare with zero instead of truncating through ``int()``: 0.5 must
        # not collapse to False, and infinities must not raise OverflowError.
        return bool(value != 0)

    token = str(value).strip().lower()
    if token in {"true", "1", "yes", "y", "oui"}:
        return True
    if token in {"false", "0", "no", "n", "non"}:
        return False
    # Unknown text is reported as missing rather than guessed. ``bool()`` is
    # True for every non-empty string, so tokens such as "F" or "off" would
    # otherwise silently become True.
    return pd.NA


def load_normalized_simulation_dataset(
    csv_path: str | Path,
    *,
    boolean_dtype: Literal["boolean", "bool"] = "bool",
) -> pd.DataFrame:
    """Load the local simulation CSV and return it with a normalized schema.

    The columns are renamed with ``SIMULATION_COLUMN_RENAMES``, categorical
    labels are cleaned with ``normalize_simulation_label``, numeric columns
    are converted with ``pd.to_numeric(errors="coerce")`` and boolean columns
    are coerced cell by cell with ``_coerce_boolean_value``. Finally only the
    rows whose yield is greater than or equal to zero are kept.

    Args:
        csv_path: Path of the CSV file to read.
        boolean_dtype: ``"bool"`` stores the boolean columns with the plain
            ``bool`` dtype and rejects missing values; any other value keeps
            the nullable pandas ``"boolean"`` dtype.

    Returns:
        pd.DataFrame: The cleaned dataset with a fresh 0-based index.

    Raises:
        ValueError: If ``boolean_dtype`` is ``"bool"`` and a boolean column
            holds missing values after coercion.
    """
    raw_frame = pd.read_csv(Path(csv_path))
    frame = raw_frame.rename(columns=SIMULATION_COLUMN_RENAMES)

    def _clean_labels(series: pd.Series) -> pd.Series:
        # Clean every label of one categorical column.
        return series.map(normalize_simulation_label)

    frame[SIMULATION_CATEGORICAL_COLUMNS] = frame[SIMULATION_CATEGORICAL_COLUMNS].apply(_clean_labels)
    # Values that cannot be parsed as numbers become NaN.
    frame[SIMULATION_NUMERIC_COLUMNS] = frame[SIMULATION_NUMERIC_COLUMNS].apply(
        lambda series: pd.to_numeric(series, errors="coerce")
    )

    plain_bool_requested = boolean_dtype == "bool"
    for name in SIMULATION_BOOLEAN_COLUMNS:
        nullable_flags = frame[name].map(_coerce_boolean_value).astype("boolean")
        if not plain_bool_requested:
            frame[name] = nullable_flags
            continue
        # The plain ``bool`` dtype has no slot for a missing value.
        if nullable_flags.isna().any():
            raise ValueError(
                f"Column {name!r} contains missing values and cannot be coerced to bool."
            )
        frame[name] = nullable_flags.astype(bool)

    # NaN compares as False here, so rows whose yield could not be parsed are
    # dropped together with the negative yields.
    has_valid_yield = frame["yield_tons_per_hectare"] >= 0
    return frame.loc[has_valid_yield].reset_index(drop=True)