|
|
import pandas as pd |
|
|
import os |
|
|
import re |
|
|
|
|
|
def clean_state_name(name: str): |
|
|
"""Cleans and standardizes state/subdivision names.""" |
|
|
if not isinstance(name, str): |
|
|
return "" |
|
|
name = name.lower().strip() |
|
|
name = re.sub(r"&", "and", name) |
|
|
name = re.sub(r"\s+", " ", name) |
|
|
name = re.sub(r"[^a-z\s]", "", name) |
|
|
return name |
|
|
|
|
|
def integrate_data(agri_df: pd.DataFrame, rain_df: pd.DataFrame): |
|
|
""" |
|
|
π Final Integration Logic β Clean, Normalize, and Merge |
|
|
Works even if & or trailing spaces exist. |
|
|
""" |
|
|
|
|
|
os.makedirs("hybrid_dataset", exist_ok=True) |
|
|
|
|
|
print(f"π§Ύ Agriculture unique states: {agri_df['state_name'].nunique()}") |
|
|
print(f"βοΈ Rainfall unique subdivisions: {rain_df['subdivision'].nunique()}") |
|
|
|
|
|
|
|
|
agri_df.columns = agri_df.columns.str.lower().str.strip() |
|
|
rain_df.columns = rain_df.columns.str.lower().str.strip() |
|
|
|
|
|
|
|
|
agri_df["state_name"] = agri_df["state_name"].apply(clean_state_name) |
|
|
rain_df["subdivision"] = rain_df["subdivision"].apply(clean_state_name) |
|
|
|
|
|
|
|
|
mapping = { |
|
|
"andaman and nicobar islands": "andaman and nicobar islands", |
|
|
"orissa": "odisha", |
|
|
"sub himalayan west bengal and sikkim": "west bengal", |
|
|
"gangetic west bengal": "west bengal", |
|
|
"east uttar pradesh": "uttar pradesh", |
|
|
"west uttar pradesh": "uttar pradesh", |
|
|
"east rajasthan": "rajasthan", |
|
|
"west rajasthan": "rajasthan", |
|
|
"haryana delhi and chandigarh": "haryana", |
|
|
"assam and meghalaya": "assam", |
|
|
"naga mani mizo tripura": "tripura", |
|
|
} |
|
|
|
|
|
|
|
|
rain_df["state_name"] = rain_df["subdivision"].replace(mapping) |
|
|
|
|
|
|
|
|
agri_df["crop_year"] = pd.to_numeric(agri_df["crop_year"], errors="coerce").astype("Int64") |
|
|
rain_df["year"] = pd.to_numeric(rain_df["year"], errors="coerce").astype("Int64") |
|
|
rain_df.rename(columns={"year": "crop_year"}, inplace=True) |
|
|
|
|
|
|
|
|
common_states = sorted(set(agri_df["state_name"].unique()) & set(rain_df["state_name"].unique())) |
|
|
print(f"β
Common states found: {common_states}") |
|
|
|
|
|
if not common_states: |
|
|
print("β οΈ No matching states even after cleaning β check character mismatches manually!") |
|
|
print("π Example Agri states:", agri_df['state_name'].unique().tolist()) |
|
|
print("π Example Rainfall states:", rain_df['state_name'].unique().tolist()) |
|
|
return pd.DataFrame() |
|
|
|
|
|
|
|
|
agri_df = agri_df[agri_df["state_name"].isin(common_states)] |
|
|
rain_df = rain_df[rain_df["state_name"].isin(common_states)] |
|
|
|
|
|
|
|
|
merged = pd.merge(agri_df, rain_df, on=["state_name", "crop_year"], how="inner") |
|
|
|
|
|
|
|
|
output_path = "hybrid_dataset/merged_agri_rainfall.csv" |
|
|
merged.to_csv(output_path, index=False) |
|
|
|
|
|
print(f"β
Data integrated and saved β {output_path} ({len(merged)} rows, {len(merged.columns)} columns)") |
|
|
print("ποΈ Unique merged states:", merged["state_name"].unique().tolist()) |
|
|
|
|
|
return merged |
|
|
|
|
|
|
|
|
|
|
|
if __name__ == "__main__": |
|
|
ag = pd.read_csv("hybrid_dataset/agriculture_data.csv") |
|
|
rd = pd.read_csv("hybrid_dataset/imd_rainfall_data.csv") |
|
|
integrate_data(ag, rd) |
|
|
|