File size: 11,911 Bytes
64e892b |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 |
"""
Script 02: Exploratory Data Analysis (EDA)
This script performs comprehensive EDA on the wildfire dataset:
- Class distribution analysis (original 7 classes and grouped 3 classes)
- Geographic distribution of fires
- Temporal patterns (yearly, monthly, seasonal)
- Missing value analysis
- Feature relationships (fire cause vs. size category, land owner breakdown)
Generates visualization plots saved to reports/figures/
Usage:
python scripts/02_eda.py
"""
import sys
from pathlib import Path
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
# Add project root to path
project_root = Path(__file__).parent.parent
sys.path.insert(0, str(project_root))
from config.config import (
RAW_PARQUET,
FIGURES_DIR,
FIRE_SIZE_CLASS_MAPPING,
TARGET_CLASS_NAMES
)
# Global plot styling, applied once when the module is loaded: select
# matplotlib's 'seaborn-v0_8-whitegrid' style sheet and set seaborn's "husl"
# palette as the default colour cycle.
plt.style.use('seaborn-v0_8-whitegrid')
sns.set_palette("husl")
def load_data() -> pd.DataFrame:
    """Read the raw wildfire records from ``RAW_PARQUET``.

    Prints a progress line before reading and the record count afterwards.

    Returns:
        The DataFrame produced by ``pd.read_parquet(RAW_PARQUET)``.
    """
    print("Loading data...")
    records = pd.read_parquet(RAW_PARQUET)
    n_records = len(records)
    print(f" Loaded {n_records:,} records")
    return records
def analyze_class_distribution(df: pd.DataFrame) -> None:
    """Print and plot the fire size class distribution.

    Reports counts and percentages for the raw ``FIRE_SIZE_CLASS`` values and
    for the grouped target classes obtained by mapping them through
    ``FIRE_SIZE_CLASS_MAPPING``, then saves a two-panel bar chart to
    ``FIGURES_DIR / 'class_distribution.png'``.

    The grouped classes are computed on a local Series; ``df`` itself is not
    modified, so analyses that run afterwards on the same frame (for example
    the missing-value report) do not see a derived column.

    Args:
        df: Fire records; must contain a ``FIRE_SIZE_CLASS`` column.
    """
    print("\n" + "="*60)
    print("CLASS DISTRIBUTION ANALYSIS")
    print("="*60)

    total = len(df)

    # Original 7 classes
    print("\nOriginal Fire Size Classes:")
    original_dist = df['FIRE_SIZE_CLASS'].value_counts().sort_index()
    for cls, count in original_dist.items():
        pct = count / total * 100
        print(f" Class {cls}: {count:>10,} ({pct:>6.2f}%)")

    # Grouped 3 classes (kept local instead of being written back into df)
    grouped = df['FIRE_SIZE_CLASS'].map(FIRE_SIZE_CLASS_MAPPING)
    grouped_dist = grouped.value_counts().sort_index()
    # The mapped codes are used as positions into TARGET_CLASS_NAMES. They come
    # back as floats when an unmapped/null class produced NaN, so cast to int.
    # NOTE(review): assumes the mapping values are integer class codes - confirm
    # against config.
    group_codes = [int(code) for code in grouped_dist.index]
    group_names = [TARGET_CLASS_NAMES[code] for code in group_codes]

    print("\nGrouped Classes (Target Variable):")
    for code, name, count in zip(group_codes, group_names, grouped_dist):
        pct = count / total * 100
        print(f" {code} ({name:>6}): {count:>10,} ({pct:>6.2f}%)")

    # Visualize
    fig, axes = plt.subplots(1, 2, figsize=(14, 5))

    # Original distribution
    colors_orig = sns.color_palette("YlOrRd", 7)
    ax1 = axes[0]
    original_dist.plot(kind='bar', ax=ax1, color=colors_orig, edgecolor='black')
    ax1.set_title('Original Fire Size Class Distribution', fontsize=14, fontweight='bold')
    ax1.set_xlabel('Fire Size Class')
    ax1.set_ylabel('Count')
    ax1.tick_params(axis='x', rotation=0)
    # Add percentage labels above each bar
    for i, val in enumerate(original_dist):
        pct = val / total * 100
        ax1.annotate(f'{pct:.1f}%', (i, val), ha='center', va='bottom', fontsize=9)

    # Grouped distribution
    palette = ['#2ecc71', '#f39c12', '#e74c3c']  # Green, Orange, Red
    # Pick colours and tick labels by class code so they stay aligned with the
    # bars even when one of the target classes is absent from the data.
    colors_grouped = [palette[code % len(palette)] for code in group_codes]
    ax2 = axes[1]
    grouped_dist.plot(kind='bar', ax=ax2, color=colors_grouped, edgecolor='black')
    ax2.set_title('Grouped Fire Size Distribution (Target)', fontsize=14, fontweight='bold')
    ax2.set_xlabel('Fire Size Category')
    ax2.set_ylabel('Count')
    ax2.set_xticklabels(group_names, rotation=0)
    # Add percentage labels above each bar
    for i, val in enumerate(grouped_dist):
        pct = val / total * 100
        ax2.annotate(f'{pct:.1f}%', (i, val), ha='center', va='bottom', fontsize=10)

    plt.tight_layout()
    plt.savefig(FIGURES_DIR / 'class_distribution.png', dpi=150, bbox_inches='tight')
    plt.close()
    print("\n Saved: class_distribution.png")
def analyze_geographic_distribution(df: pd.DataFrame) -> None:
    """Summarise where fires occur and save a two-panel location scatter plot.

    Prints the 15 most frequent ``STATE`` values with their share of all
    records, then writes ``FIGURES_DIR / 'geographic_distribution.png'``:
    the left panel shows a random sample of at most 100,000 fires coloured by
    size class, the right panel shows only class E/F/G fires.

    Args:
        df: Fire records with ``STATE``, ``LONGITUDE``, ``LATITUDE`` and
            ``FIRE_SIZE_CLASS`` columns.
    """
    rule = "="*60
    print("\n" + rule)
    print("GEOGRAPHIC DISTRIBUTION")
    print(rule)

    n_total = len(df)

    # Top states
    print("\nTop 15 States by Fire Count:")
    top_states = df['STATE'].value_counts().head(15)
    for state, n_fires in top_states.items():
        share = n_fires / n_total * 100
        print(f" {state}: {n_fires:>10,} ({share:>5.1f}%)")

    def _frame_map(ax, title):
        # Shared title/labels/extent for both map panels.
        ax.set_title(title, fontsize=14, fontweight='bold')
        ax.set_xlabel('Longitude')
        ax.set_ylabel('Latitude')
        ax.set_xlim(-130, -65)
        ax.set_ylim(24, 50)

    fig, (ax_all, ax_large) = plt.subplots(1, 2, figsize=(16, 6))

    # Left panel: all classes, sampled for performance (fixed seed).
    sample_size = min(100000, n_total)
    sampled = df.sample(n=sample_size, random_state=42)
    rank_all = {letter: rank for rank, letter in enumerate('ABCDEFG')}
    points_all = ax_all.scatter(
        sampled['LONGITUDE'],
        sampled['LATITUDE'],
        c=sampled['FIRE_SIZE_CLASS'].map(rank_all),
        cmap='YlOrRd',
        alpha=0.3,
        s=1,
    )
    _frame_map(ax_all, f'Fire Locations (n={sample_size:,} sample)')
    plt.colorbar(points_all, ax=ax_all, label='Fire Size Class (A=0 to G=6)')

    # Right panel: large fires only (E, F, G), unsampled.
    is_large = df['FIRE_SIZE_CLASS'].isin(['E', 'F', 'G'])
    large_fires = df[is_large]
    rank_large = {letter: rank for rank, letter in enumerate('EFG')}
    ax_large.scatter(
        large_fires['LONGITUDE'],
        large_fires['LATITUDE'],
        c=large_fires['FIRE_SIZE_CLASS'].map(rank_large),
        cmap='Reds',
        alpha=0.5,
        s=5,
    )
    _frame_map(ax_large, f'Large Fires Only (E/F/G, n={len(large_fires):,})')

    plt.tight_layout()
    plt.savefig(FIGURES_DIR / 'geographic_distribution.png', dpi=150, bbox_inches='tight')
    plt.close()
    print("\n Saved: geographic_distribution.png")
def _discovery_month(df: pd.DataFrame) -> pd.Series:
    """Return the calendar month of each fire's discovery date.

    The date is built as 1 January of ``FIRE_YEAR`` plus
    ``DISCOVERY_DOY - 1`` days, so leap years are handled correctly.
    Parsing the day-of-year with ``format='%j'`` alone carries no year
    information (strptime-style parsing falls back to a default year), which
    shifts leap-year dates from 29 Feb onward and pushes day 366 out of the
    year.

    Assumes both columns are numeric - TODO confirm against the raw schema.
    """
    year_start = pd.to_datetime(
        pd.DataFrame({'year': df['FIRE_YEAR'], 'month': 1, 'day': 1})
    )
    day_offset = pd.to_timedelta(df['DISCOVERY_DOY'] - 1, unit='D')
    return (year_start + day_offset).dt.month.rename('month')


def analyze_temporal_patterns(df: pd.DataFrame) -> None:
    """Plot and print temporal patterns of the fire records.

    Saves a 2x2 figure to ``FIGURES_DIR / 'temporal_patterns.png'`` showing
    fires per year, fires per month, grouped size category per month, and the
    ten most frequent causes, then prints the per-month counts and shares.

    Derived values (discovery month, grouped size class) are held in a local
    frame; ``df`` is not modified, so they do not leak into analyses that run
    afterwards on the same DataFrame.

    Args:
        df: Fire records with ``FIRE_YEAR``, ``DISCOVERY_DOY``,
            ``FIRE_SIZE_CLASS`` and ``STAT_CAUSE_DESCR`` columns.
    """
    print("\n" + "="*60)
    print("TEMPORAL PATTERNS")
    print("="*60)

    # Derived columns live in their own frame (same index as df).
    derived = pd.DataFrame({
        'month': _discovery_month(df),
        'fire_size_grouped': df['FIRE_SIZE_CLASS'].map(FIRE_SIZE_CLASS_MAPPING),
    })

    fig, axes = plt.subplots(2, 2, figsize=(14, 10))

    # Yearly trend
    ax1 = axes[0, 0]
    yearly = df.groupby('FIRE_YEAR').size()
    yearly.plot(kind='line', ax=ax1, marker='o', linewidth=2, markersize=4)
    ax1.set_title('Fires per Year', fontsize=12, fontweight='bold')
    ax1.set_xlabel('Year')
    ax1.set_ylabel('Number of Fires')
    ax1.grid(True, alpha=0.3)

    # Monthly distribution
    ax2 = axes[0, 1]
    monthly = derived.groupby('month').size()
    monthly.plot(kind='bar', ax=ax2, color='coral', edgecolor='black')
    ax2.set_title('Fires by Month', fontsize=12, fontweight='bold')
    ax2.set_xlabel('Month')
    ax2.set_ylabel('Number of Fires')
    ax2.tick_params(axis='x', rotation=0)

    # Size category by month
    ax3 = axes[1, 0]
    monthly_by_class = (
        derived.groupby(['month', 'fire_size_grouped']).size().unstack(fill_value=0)
    )
    # Name and colour each column by its class code rather than by position,
    # so a class that is absent from the data cannot cause a length mismatch
    # or shift the labels. Codes are cast to int because they are floats when
    # the mapping produced NaN for some rows.
    # NOTE(review): assumes mapping values are integer positions into
    # TARGET_CLASS_NAMES - confirm against config.
    palette = ['#2ecc71', '#f39c12', '#e74c3c']
    class_codes = [int(code) for code in monthly_by_class.columns]
    monthly_by_class.columns = [TARGET_CLASS_NAMES[code] for code in class_codes]
    monthly_by_class.plot(kind='bar', ax=ax3, width=0.8,
                          color=[palette[code % len(palette)] for code in class_codes],
                          edgecolor='black')
    ax3.set_title('Fire Size Category by Month', fontsize=12, fontweight='bold')
    ax3.set_xlabel('Month')
    ax3.set_ylabel('Number of Fires')
    ax3.tick_params(axis='x', rotation=0)
    ax3.legend(title='Size Category')

    # Fire causes
    ax4 = axes[1, 1]
    cause_dist = df['STAT_CAUSE_DESCR'].value_counts().head(10)
    cause_dist.plot(kind='barh', ax=ax4, color='steelblue', edgecolor='black')
    ax4.set_title('Top 10 Fire Causes', fontsize=12, fontweight='bold')
    ax4.set_xlabel('Number of Fires')
    ax4.invert_yaxis()

    plt.tight_layout()
    plt.savefig(FIGURES_DIR / 'temporal_patterns.png', dpi=150, bbox_inches='tight')
    plt.close()
    print("\n Saved: temporal_patterns.png")

    # Print monthly stats
    print("\nFires by Month:")
    month_names = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
                   'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
    total = len(df)
    for month, count in monthly.items():
        pct = count / total * 100
        # int(): the month key is a float when some dates were missing.
        print(f" {month_names[int(month) - 1]}: {count:>10,} ({pct:>5.1f}%)")
def analyze_missing_values(df: pd.DataFrame) -> None:
    """Report per-column missing values and plot the 20 worst columns.

    Prints how many columns contain nulls and, for up to 20 of them (ordered
    by descending null count), the count and percentage of missing rows.
    When at least one column has nulls, a horizontal bar chart of those
    percentages is saved to ``FIGURES_DIR / 'missing_values.png'``.

    Args:
        df: The DataFrame to inspect; it is not modified.
    """
    print("\n" + "="*60)
    print("MISSING VALUE ANALYSIS")
    print("="*60)

    missing = df.isnull().sum()
    missing_pct = (missing / len(df) * 100).round(2)
    missing_df = pd.DataFrame({
        'Missing Count': missing,
        'Missing %': missing_pct
    }).sort_values('Missing Count', ascending=False)

    # Only show columns with missing values
    missing_df = missing_df[missing_df['Missing Count'] > 0]
    print(f"\nColumns with missing values: {len(missing_df)}")

    print("\nTop 20 columns with missing values:")
    top = missing_df.head(20)
    # Iterate the two columns directly: iterrows() would upcast each mixed
    # int/float row to float and print the count as e.g. "1,234.0".
    for col, count, pct in zip(top.index, top['Missing Count'], top['Missing %']):
        print(f" {col}: {int(count):,} ({pct:.1f}%)")

    # Visualize (skipped when nothing is missing - there would be nothing to plot)
    if not missing_df.empty:
        fig, ax = plt.subplots(figsize=(12, 8))
        top['Missing %'].plot(
            kind='barh', ax=ax, color='salmon', edgecolor='black'
        )
        ax.set_title('Missing Values by Column (Top 20)', fontsize=14, fontweight='bold')
        ax.set_xlabel('Missing %')
        ax.invert_yaxis()
        plt.tight_layout()
        plt.savefig(FIGURES_DIR / 'missing_values.png', dpi=150, bbox_inches='tight')
        plt.close()
        print("\n Saved: missing_values.png")
def analyze_cause_by_size(df: pd.DataFrame) -> None:
    """Cross-tabulate fire cause against grouped size category.

    Prints, for every ``STAT_CAUSE_DESCR`` value, the percentage of its fires
    that fall into each grouped size class (rows sum to 100), and saves a
    stacked horizontal bar chart to ``FIGURES_DIR / 'cause_by_size.png'``.

    The grouped size class is computed on a local Series; ``df`` is not
    modified.

    Args:
        df: Fire records with ``STAT_CAUSE_DESCR`` and ``FIRE_SIZE_CLASS``
            columns.
    """
    print("\n" + "="*60)
    print("FIRE CAUSE BY SIZE ANALYSIS")
    print("="*60)

    grouped = df['FIRE_SIZE_CLASS'].map(FIRE_SIZE_CLASS_MAPPING)

    # Cross-tabulation, normalised per cause and scaled to percent
    cause_size = pd.crosstab(
        df['STAT_CAUSE_DESCR'],
        grouped,
        normalize='index'
    ) * 100
    # Name and colour each column by its class code rather than by position,
    # so an absent class cannot cause a length mismatch or shift the labels.
    # NOTE(review): assumes mapping values are integer positions into
    # TARGET_CLASS_NAMES - confirm against config.
    palette = ['#2ecc71', '#f39c12', '#e74c3c']
    class_codes = [int(code) for code in cause_size.columns]
    cause_size.columns = [TARGET_CLASS_NAMES[code] for code in class_codes]

    print("\nFire Cause Distribution by Size Category (% of each cause):")
    print(cause_size.round(1).to_string())

    # Visualize
    fig, ax = plt.subplots(figsize=(12, 8))
    cause_size.plot(kind='barh', ax=ax, stacked=True,
                    color=[palette[code % len(palette)] for code in class_codes],
                    edgecolor='white')
    ax.set_title('Fire Size Distribution by Cause', fontsize=14, fontweight='bold')
    ax.set_xlabel('Percentage')
    ax.legend(title='Size Category', loc='lower right')
    ax.invert_yaxis()
    plt.tight_layout()
    plt.savefig(FIGURES_DIR / 'cause_by_size.png', dpi=150, bbox_inches='tight')
    plt.close()
    print("\n Saved: cause_by_size.png")
def analyze_owner_distribution(df: pd.DataFrame) -> None:
    """Print the ten most frequent land owners and their share of all fires.

    Args:
        df: Fire records with an ``OWNER_DESCR`` column. Percentages are
            taken relative to the total number of rows in ``df``.
    """
    rule = "="*60
    print("\n" + rule)
    print("LAND OWNER ANALYSIS")
    print(rule)

    top_owners = df['OWNER_DESCR'].value_counts().head(10)
    n_records = len(df)

    print("\nFires by Land Owner:")
    for owner, n_fires in top_owners.items():
        share = n_fires / n_records * 100
        print(f" {owner}: {n_fires:,} ({share:.1f}%)")
def main():
    """Run the full EDA pipeline.

    Ensures ``FIGURES_DIR`` exists, loads the raw data once, and passes the
    same DataFrame to each analysis function in a fixed order, printing a
    banner before and after.
    """
    rule = "="*60
    print("\n" + rule)
    print("EXPLORATORY DATA ANALYSIS")
    print(rule)

    # Create figures directory
    FIGURES_DIR.mkdir(parents=True, exist_ok=True)

    # Load data
    df = load_data()

    # Run analyses in order
    analyses = (
        analyze_class_distribution,
        analyze_geographic_distribution,
        analyze_temporal_patterns,
        analyze_missing_values,
        analyze_cause_by_size,
        analyze_owner_distribution,
    )
    for run_analysis in analyses:
        run_analysis(df)

    print("\n" + rule)
    print("✓ EDA Complete!")
    print(f" Figures saved to: {FIGURES_DIR}")
    print(rule + "\n")


if __name__ == "__main__":
    main()
|