File size: 5,458 Bytes
ee8f3db
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
import pandas as pd
import numpy as np
import joblib
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.metrics import mean_squared_error, accuracy_score, classification_report
from sklearn.cluster import KMeans
from sklearn.preprocessing import LabelEncoder
from sklearn.decomposition import PCA

# Paths
DATA_PATH = "data/processed/california_wildfire.csv"
MODEL_DIR = "models"

def train_all_tasks():
    print("πŸš€ Loading Processed Data...")
    df = pd.read_csv(DATA_PATH)
    
    # Ensure we have data
    if df.empty:
        print("❌ Error: Dataset is empty. Run preprocess.py first.")
        return
    
    # ==========================================
    # TASK 1: REGRESSION (Predict Fire Intensity)
    # Target: 'bi' (Burning Index)
    # Features: Weather metrics (Temp, Humidity, Wind, Rain)
    # ==========================================
    print("\nπŸ”₯ Training Task 1: Regression (Predict Burning Index)...")
    
    # Features: Temp Min/Max, Humidity Min/Max, Wind Speed, Precipitation, Energy Release Component
    reg_features = ['tmmn', 'tmmx', 'rmin', 'rmax', 'vs', 'pr', 'erc']
    target_reg = 'bi'
    
    X_reg = df[reg_features]
    y_reg = df[target_reg]
    
    X_train_r, X_test_r, y_train_r, y_test_r = train_test_split(X_reg, y_reg, test_size=0.2, random_state=42)
    
    # Train Random Forest Regressor
    reg_model = RandomForestRegressor(n_estimators=50, max_depth=10, n_jobs=-1, random_state=42)
    reg_model.fit(X_train_r, y_train_r)
    
    rmse = np.sqrt(mean_squared_error(y_test_r, reg_model.predict(X_test_r)))
    print(f"βœ… Regression RMSE: {rmse:.4f}")
    
    joblib.dump(reg_model, f"{MODEL_DIR}/regression_model.pkl")

    # ==========================================
    # TASK 2: CLASSIFICATION (Predict Risk Level)
    # Target: Custom 'Risk_Level' based on Burning Index
    # Logic: 0-40 Low, 40-80 Medium, >80 High
    # ==========================================
    print("\n⚠️ Training Task 2: Classification (Fire Risk Level)...")
    
    def get_risk_level(bi_val):
        if bi_val < 40: return 'Low'
        elif bi_val < 80: return 'Medium'
        else: return 'High'
        
    df['risk_level'] = df['bi'].apply(get_risk_level)
    
    # Encode Target (Low=0, Medium=1, High=2)
    le = LabelEncoder()
    y_clf = le.fit_transform(df['risk_level'])
    
    # Use same weather features for classification
    X_clf = df[reg_features]
    
    X_train_c, X_test_c, y_train_c, y_test_c = train_test_split(X_clf, y_clf, test_size=0.2, random_state=42)
    
    clf_model = RandomForestClassifier(n_estimators=50, max_depth=10, n_jobs=-1, random_state=42)
    clf_model.fit(X_train_c, y_train_c)
    
    acc = accuracy_score(y_test_c, clf_model.predict(X_test_c))
    print(f"βœ… Classification Accuracy: {acc:.4f}")
    
    # Save Model + Encoder (needed to decode predictions later)
    joblib.dump(clf_model, f"{MODEL_DIR}/classification_model.pkl")
    joblib.dump(le, f"{MODEL_DIR}/label_encoder.pkl")

    # ==========================================
    # TASK 3: CLUSTERING (Recovery Zones)
    # Group by Location (Lat/Lon) and Fire Intensity (bi)
    # ==========================================
    print("\n🌍 Training Task 3: Clustering (Recovery Zones)...")
    
    X_cluster = df[['latitude', 'longitude', 'bi']]
    
    kmeans = KMeans(n_clusters=5, random_state=42, n_init=10)
    kmeans.fit(X_cluster)
    
    joblib.dump(kmeans, f"{MODEL_DIR}/clustering_model.pkl")
    print("βœ… Clustering Model Saved.")

    # ==========================================
    # TASK 4: DIMENSIONALITY REDUCTION (PCA)
    # Reduce weather features to 2D for visualization
    # ==========================================
    print("\n🧩 Training Task 4: Dimensionality Reduction (PCA)...")
    
    # Fit PCA on the weather features to reduce to 2D
    pca = PCA(n_components=2)
    pca.fit(df[reg_features])
    
    # Save the PCA model
    joblib.dump(pca, f"{MODEL_DIR}/pca_model.pkl")
    print("βœ… PCA Model Saved.")

    # ==========================================
    # TASK 5: TIME SERIES (SEASONALITY)
    # Calculate monthly average burning index trends
    # ==========================================
    print("\nπŸ“ˆ Training Task 5: Time Series (Seasonality)...")
    
    # Ensure datetime is correct
    if 'datetime' in df.columns:
        df['datetime'] = pd.to_datetime(df['datetime'])
        df['month'] = df['datetime'].dt.month
        
        # Calculate average Burning Index (BI) per month
        seasonal_trend = df.groupby('month')['bi'].mean().to_dict()
        
        # Save this dictionary (Month -> Avg BI)
        joblib.dump(seasonal_trend, f"{MODEL_DIR}/seasonal_model.pkl")
        print("βœ… Seasonal Model Saved.")
    else:
        print("⚠️ Skipping Seasonality Task: 'datetime' column not found")

    print("\nπŸŽ‰ All Systems Go! Models are ready in 'models/'")
    print("πŸ“ Models created:")
    print("   - regression_model.pkl")
    print("   - classification_model.pkl")
    print("   - label_encoder.pkl")
    print("   - clustering_model.pkl")
    print("   - pca_model.pkl")
    print("   - seasonal_model.pkl")

if __name__ == "__main__":
    train_all_tasks()