File size: 3,409 Bytes
9142902
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
import pandas as pd
import os
import re

def clean_state_name(name: str):
    """Cleans and standardizes state/subdivision names."""
    if not isinstance(name, str):
        return ""
    name = name.lower().strip()
    name = re.sub(r"&", "and", name)
    name = re.sub(r"\s+", " ", name)
    name = re.sub(r"[^a-z\s]", "", name)  # remove special chars
    return name

def integrate_data(agri_df: pd.DataFrame, rain_df: pd.DataFrame):
    """
    ๐Ÿ”— Final Integration Logic โ€” Clean, Normalize, and Merge
    Works even if & or trailing spaces exist.
    """

    os.makedirs("hybrid_dataset", exist_ok=True)

    print(f"๐Ÿงพ Agriculture unique states: {agri_df['state_name'].nunique()}")
    print(f"โ˜๏ธ Rainfall unique subdivisions: {rain_df['subdivision'].nunique()}")

    # Clean columns
    agri_df.columns = agri_df.columns.str.lower().str.strip()
    rain_df.columns = rain_df.columns.str.lower().str.strip()

    # Clean text values
    agri_df["state_name"] = agri_df["state_name"].apply(clean_state_name)
    rain_df["subdivision"] = rain_df["subdivision"].apply(clean_state_name)

    # Create mapping
    mapping = {
        "andaman and nicobar islands": "andaman and nicobar islands",
        "orissa": "odisha",
        "sub himalayan west bengal and sikkim": "west bengal",
        "gangetic west bengal": "west bengal",
        "east uttar pradesh": "uttar pradesh",
        "west uttar pradesh": "uttar pradesh",
        "east rajasthan": "rajasthan",
        "west rajasthan": "rajasthan",
        "haryana delhi and chandigarh": "haryana",
        "assam and meghalaya": "assam",
        "naga mani mizo tripura": "tripura",
    }

    # Apply mapping to rainfall data
    rain_df["state_name"] = rain_df["subdivision"].replace(mapping)

    # Ensure year columns match type
    agri_df["crop_year"] = pd.to_numeric(agri_df["crop_year"], errors="coerce").astype("Int64")
    rain_df["year"] = pd.to_numeric(rain_df["year"], errors="coerce").astype("Int64")
    rain_df.rename(columns={"year": "crop_year"}, inplace=True)

    # Show whatโ€™s common after full cleaning
    common_states = sorted(set(agri_df["state_name"].unique()) & set(rain_df["state_name"].unique()))
    print(f"โœ… Common states found: {common_states}")

    if not common_states:
        print("โš ๏ธ No matching states even after cleaning โ€” check character mismatches manually!")
        print("๐Ÿ” Example Agri states:", agri_df['state_name'].unique().tolist())
        print("๐Ÿ” Example Rainfall states:", rain_df['state_name'].unique().tolist())
        return pd.DataFrame()

    # Filter only matching states
    agri_df = agri_df[agri_df["state_name"].isin(common_states)]
    rain_df = rain_df[rain_df["state_name"].isin(common_states)]

    # Merge datasets
    merged = pd.merge(agri_df, rain_df, on=["state_name", "crop_year"], how="inner")

    # Save output
    output_path = "hybrid_dataset/merged_agri_rainfall.csv"
    merged.to_csv(output_path, index=False)

    print(f"โœ… Data integrated and saved โ†’ {output_path} ({len(merged)} rows, {len(merged.columns)} columns)")
    print("๐Ÿ›๏ธ Unique merged states:", merged["state_name"].unique().tolist())

    return merged


# ๐Ÿงช Quick standalone test
if __name__ == "__main__":
    ag = pd.read_csv("hybrid_dataset/agriculture_data.csv")
    rd = pd.read_csv("hybrid_dataset/imd_rainfall_data.csv")
    integrate_data(ag, rd)