File size: 1,573 Bytes
d6c1540
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
"""
add_water_variable.py

Adds a Water_mm column to the farm dataset.
Water is drawn uniformly from [WATER_MIN, min(Rainfall_mm, WATER_MAX)];
rows whose Rainfall_mm is below WATER_MIN (or missing) get Water_mm = 0.
Rainfall_mm is reduced by the water drawn to prevent bias.
"""

import sys
from pathlib import Path

import numpy as np
import pandas as pd

# Default bounds for the water drawn per row (same units as Rainfall_mm).
WATER_MIN = 20
WATER_MAX = 200


def add_water(
    df: pd.DataFrame,
    seed: int = 42,
    *,
    water_min: float = WATER_MIN,
    water_max: float = WATER_MAX,
) -> pd.DataFrame:
    """Return a copy of *df* with a ``Water_mm`` column split off from rainfall.

    For each row, water is drawn uniformly between ``water_min`` and
    ``min(Rainfall_mm, water_max)`` and rounded to 2 decimals. Rows whose
    capped rainfall is below ``water_min`` (this includes missing values)
    get ``Water_mm == 0``. The drawn amount is then subtracted from
    ``Rainfall_mm`` and the remainder is rounded to 2 decimals.

    Args:
        df: Input data with a ``Rainfall_mm`` column (expected to be
            numeric). The caller's frame is not modified.
        seed: Seed passed to ``numpy.random.default_rng``; the same seed and
            input produce the same output.
        water_min: Smallest amount of water that may be drawn for a row.
        water_max: Largest amount of water that may be drawn for a row.

    Returns:
        A new DataFrame with ``Water_mm`` added and ``Rainfall_mm`` reduced
        by the drawn water.

    Raises:
        ValueError: If ``water_min`` is greater than ``water_max``.
        KeyError: If *df* has no ``Rainfall_mm`` column.
    """
    if water_min > water_max:
        raise ValueError(
            f"water_min ({water_min}) must not exceed water_max ({water_max})"
        )

    rng = np.random.default_rng(seed)
    df = df.copy()

    upper = df["Rainfall_mm"].clip(upper=water_max)
    can_irrigate = upper >= water_min

    # Rows that cannot be irrigated get the degenerate interval
    # [water_min, water_min], so every bound handed to the generator is
    # finite and ordered; their draw is discarded by np.where below.
    high = upper.where(can_irrigate, water_min)
    water = np.where(can_irrigate, rng.uniform(water_min, high), 0.0)

    df["Water_mm"] = np.round(water, 2)
    df["Rainfall_mm"] = np.round(df["Rainfall_mm"] - df["Water_mm"], 2)
    return df


def main() -> None:
    """Command-line entry point: read a CSV, add ``Water_mm``, write the result.

    Usage: ``add_water_variable.py [INPUT_CSV [OUTPUT_CSV]]``. The input
    defaults to ``farm_data.csv``. The output defaults to the input file
    name with ``_watered`` inserted before its extension.

    Raises:
        ValueError: If the input file has no ``Rainfall_mm`` column.
    """
    path = sys.argv[1] if len(sys.argv) > 1 else "farm_data.csv"
    if len(sys.argv) > 2:
        out = sys.argv[2]
    else:
        # Derive the default from the final suffix only. A plain
        # str.replace(".csv", ...) leaves the name unchanged when the input
        # does not contain a lowercase ".csv" (e.g. "data.CSV", "data.tsv"),
        # so the output would overwrite the input file; it would also
        # rewrite ".csv" inside directory names.
        src = Path(path)
        out = str(src.with_name(f"{src.stem}_watered{src.suffix}"))

    df = pd.read_csv(path)
    required = {"Rainfall_mm"}
    missing = required - set(df.columns)
    if missing:
        raise ValueError(f"Missing columns: {missing}")

    df_out = add_water(df)

    # Print summary statistics so the effect of the split is visible
    # without opening the output file.
    print(
        f"Water_mm - min: {df_out['Water_mm'].min():.1f} "
        f"max: {df_out['Water_mm'].max():.1f} "
        f"mean: {df_out['Water_mm'].mean():.1f}"
    )
    print(
        "Rainfall_mm after subtraction - "
        f"min: {df_out['Rainfall_mm'].min():.1f} "
        f"mean: {df_out['Rainfall_mm'].mean():.1f}"
    )

    df_out.to_csv(out, index=False)
    print(f"Saved -> {out}")


# Run the command-line entry point only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == "__main__":
    main()