File size: 5,458 Bytes
ee8f3db |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 |
import os

import joblib
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.metrics import accuracy_score, classification_report, mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
# Paths
# DATA_PATH: processed dataset produced by preprocess.py (required input).
# MODEL_DIR: directory where all trained model artifacts (.pkl) are written.
DATA_PATH = "data/processed/california_wildfire.csv"
MODEL_DIR = "models"
def train_all_tasks():
    """Train and persist models for all five wildfire-analysis tasks.

    Reads the processed dataset from ``DATA_PATH`` and writes one serialized
    artifact per task into ``MODEL_DIR``:

      1. Regression      -> regression_model.pkl      (predict Burning Index)
      2. Classification  -> classification_model.pkl + label_encoder.pkl
      3. Clustering      -> clustering_model.pkl      (recovery zones)
      4. Dim. reduction  -> pca_model.pkl             (2-D weather projection)
      5. Seasonality     -> seasonal_model.pkl        (month -> mean BI)

    Returns early (with a message) if the dataset is empty.
    """
    print("Loading processed data...")
    df = pd.read_csv(DATA_PATH)

    if df.empty:
        print("Error: dataset is empty. Run preprocess.py first.")
        return

    # Ensure the output directory exists before any joblib.dump call,
    # otherwise the first save crashes on a fresh checkout.
    os.makedirs(MODEL_DIR, exist_ok=True)

    # Weather features shared by the regression, classification, and PCA
    # tasks: min/max temperature, min/max humidity, wind speed,
    # precipitation, and energy release component.
    weather_features = ['tmmn', 'tmmx', 'rmin', 'rmax', 'vs', 'pr', 'erc']

    _train_regression(df, weather_features)
    _train_classification(df, weather_features)
    _train_clustering(df)
    _train_pca(df, weather_features)
    _train_seasonality(df)

    print("\nAll systems go! Models are ready in 'models/'")
    print("Models created:")
    for artifact in ("regression_model.pkl", "classification_model.pkl",
                     "label_encoder.pkl", "clustering_model.pkl",
                     "pca_model.pkl", "seasonal_model.pkl"):
        print(f"   - {artifact}")


def _train_regression(df, features):
    """Task 1: random-forest regression predicting the Burning Index ('bi')."""
    print("\nTraining Task 1: Regression (Predict Burning Index)...")
    X = df[features]
    y = df['bi']
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42)

    model = RandomForestRegressor(
        n_estimators=50, max_depth=10, n_jobs=-1, random_state=42)
    model.fit(X_train, y_train)

    rmse = np.sqrt(mean_squared_error(y_test, model.predict(X_test)))
    print(f"Regression RMSE: {rmse:.4f}")
    joblib.dump(model, f"{MODEL_DIR}/regression_model.pkl")


def _risk_level(bi_val):
    """Map a Burning Index value to a categorical risk label.

    Thresholds: bi < 40 -> 'Low'; 40 <= bi < 80 -> 'Medium'; bi >= 80 -> 'High'.
    """
    if bi_val < 40:
        return 'Low'
    if bi_val < 80:
        return 'Medium'
    return 'High'


def _train_classification(df, features):
    """Task 2: random-forest classifier for the derived fire-risk level.

    Adds a 'risk_level' column to *df* as a side effect (mirrors the original
    implementation). Saves both the classifier and the LabelEncoder needed to
    decode its integer predictions back to 'Low'/'Medium'/'High'.
    """
    print("\nTraining Task 2: Classification (Fire Risk Level)...")
    df['risk_level'] = df['bi'].apply(_risk_level)

    # Encode target labels as integers (alphabetical: High=0, Low=1, Medium=2).
    le = LabelEncoder()
    y = le.fit_transform(df['risk_level'])
    X = df[features]
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42)

    model = RandomForestClassifier(
        n_estimators=50, max_depth=10, n_jobs=-1, random_state=42)
    model.fit(X_train, y_train)

    acc = accuracy_score(y_test, model.predict(X_test))
    print(f"Classification Accuracy: {acc:.4f}")

    # Save model + encoder (encoder is needed to decode predictions later).
    joblib.dump(model, f"{MODEL_DIR}/classification_model.pkl")
    joblib.dump(le, f"{MODEL_DIR}/label_encoder.pkl")


def _train_clustering(df):
    """Task 3: k-means clustering of fires by location (lat/lon) and intensity (bi)."""
    print("\nTraining Task 3: Clustering (Recovery Zones)...")
    X = df[['latitude', 'longitude', 'bi']]
    kmeans = KMeans(n_clusters=5, random_state=42, n_init=10)
    kmeans.fit(X)
    joblib.dump(kmeans, f"{MODEL_DIR}/clustering_model.pkl")
    print("Clustering model saved.")


def _train_pca(df, features):
    """Task 4: PCA reducing the weather features to 2-D for visualization."""
    print("\nTraining Task 4: Dimensionality Reduction (PCA)...")
    pca = PCA(n_components=2)
    pca.fit(df[features])
    joblib.dump(pca, f"{MODEL_DIR}/pca_model.pkl")
    print("PCA model saved.")


def _train_seasonality(df):
    """Task 5: monthly mean Burning Index, saved as a {month: mean_bi} dict.

    Skips (with a warning) if the dataset has no 'datetime' column.
    """
    print("\nTraining Task 5: Time Series (Seasonality)...")
    if 'datetime' not in df.columns:
        print("Skipping seasonality task: 'datetime' column not found")
        return

    df['datetime'] = pd.to_datetime(df['datetime'])
    df['month'] = df['datetime'].dt.month
    seasonal_trend = df.groupby('month')['bi'].mean().to_dict()
    joblib.dump(seasonal_trend, f"{MODEL_DIR}/seasonal_model.pkl")
    print("Seasonal model saved.")
# Script entry point: train and save all models when run directly.
if __name__ == "__main__":
    train_all_tasks()