"""
Script 02: Exploratory Data Analysis (EDA)

This script performs comprehensive EDA on the wildfire dataset:
- Class distribution analysis (original 7 classes and grouped 3 classes)
- Geographic distribution of fires
- Temporal patterns (yearly, monthly, seasonal)
- Missing value analysis
- Feature correlations

Generates visualization plots saved to reports/figures/

Usage:
    python scripts/02_eda.py
"""

import sys
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# Add project root to path
project_root = Path(__file__).parent.parent
sys.path.insert(0, str(project_root))

from config.config import (
    RAW_PARQUET,
    FIGURES_DIR,
    FIRE_SIZE_CLASS_MAPPING,
    TARGET_CLASS_NAMES
)

# Set style
plt.style.use('seaborn-v0_8-whitegrid')
sns.set_palette("husl")

# One colour per grouped size category, in class-index order.
_SIZE_COLORS = ['#2ecc71', '#f39c12', '#e74c3c']  # Green, Orange, Red

_SECTION_RULE = "=" * 60


def _print_header(title: str) -> None:
    """Print a section banner: blank line, rule, title, rule."""
    print("\n" + _SECTION_RULE)
    print(title)
    print(_SECTION_RULE)


def _grouped_size(df: pd.DataFrame) -> pd.Series:
    """Return FIRE_SIZE_CLASS mapped through FIRE_SIZE_CLASS_MAPPING.

    The result is a new Series; the input frame is not modified, so derived
    columns never leak into later analyses (e.g. the missing-value report).
    """
    mapped = df['FIRE_SIZE_CLASS'].map(FIRE_SIZE_CLASS_MAPPING)
    return mapped.rename('fire_size_grouped')


def _size_class_ids() -> list:
    """Return the grouped class indices used to look up TARGET_CLASS_NAMES."""
    # NOTE(review): assumes grouped values are the positions 0..n-1, which is
    # what the TARGET_CLASS_NAMES[cls_idx] lookup below relies on.
    return list(range(len(TARGET_CLASS_NAMES)))


def _with_all_size_columns(table: pd.DataFrame) -> pd.DataFrame:
    """Return *table* with one named column per grouped size class.

    Classes absent from the data are added as all-zero columns so that the
    column labels and the per-class colours always line up.
    """
    class_ids = _size_class_ids()
    table = table.reindex(columns=class_ids, fill_value=0)
    table.columns = [TARGET_CLASS_NAMES[i] for i in class_ids]
    return table


def _discovery_month(df: pd.DataFrame) -> pd.Series:
    """Return the calendar month (1-12) in which each fire was discovered.

    The month is computed from FIRE_YEAR plus the DISCOVERY_DOY offset.
    Parsing the day-of-year on its own (format='%j') implicitly uses the
    non-leap year 1900, which shifts every leap-year date after Feb 28 by one
    day and cannot represent day 366 as December 31.
    Rows whose year or day-of-year cannot be interpreted yield NaN.
    """
    year = pd.to_numeric(df['FIRE_YEAR'], errors='coerce')
    day_of_year = pd.to_numeric(df['DISCOVERY_DOY'], errors='coerce')
    year_start = pd.to_datetime(
        pd.DataFrame({'year': year, 'month': 1, 'day': 1}),
        errors='coerce'
    )
    discovered = year_start + pd.to_timedelta(day_of_year - 1, unit='D')
    return discovered.dt.month.rename('month')


def _annotate_percentages(ax, counts: pd.Series, total: int,
                          fontsize: int) -> None:
    """Write each bar's share of *total* (as a percentage) above the bar."""
    for position, value in enumerate(counts.values):
        pct = value / total * 100
        ax.annotate(f'{pct:.1f}%', (position, value),
                    ha='center', va='bottom', fontsize=fontsize)


def _save_figure(fig, filename: str) -> None:
    """Save *fig* under FIGURES_DIR, close it, and report the file name."""
    fig.tight_layout()
    fig.savefig(FIGURES_DIR / filename, dpi=150, bbox_inches='tight')
    # Close this specific figure rather than whichever one is "current".
    plt.close(fig)
    print(f"\n Saved: {filename}")


def load_data() -> pd.DataFrame:
    """Load the raw parquet data from RAW_PARQUET and report the row count."""
    print("Loading data...")
    df = pd.read_parquet(RAW_PARQUET)
    print(f" Loaded {len(df):,} records")
    return df


def analyze_class_distribution(df: pd.DataFrame) -> None:
    """Analyze and visualize fire size class distribution.

    Prints counts and percentages for the original FIRE_SIZE_CLASS values and
    for the grouped target classes, then saves a two-panel bar chart as
    class_distribution.png. The input frame is not modified.
    """
    _print_header("CLASS DISTRIBUTION ANALYSIS")
    total = len(df)

    # Original 7 classes
    print("\nOriginal Fire Size Classes:")
    original_dist = df['FIRE_SIZE_CLASS'].value_counts().sort_index()
    for cls, count in original_dist.items():
        pct = count / total * 100
        print(f" Class {cls}: {count:>10,} ({pct:>6.2f}%)")

    # Grouped 3 classes; reindex so a class with no rows still gets a
    # (zero-height) bar and the tick labels/colours stay aligned.
    print("\nGrouped Classes (Target Variable):")
    grouped_dist = (
        _grouped_size(df)
        .value_counts()
        .sort_index()
        .reindex(_size_class_ids(), fill_value=0)
    )
    for cls_idx, count in grouped_dist.items():
        pct = count / total * 100
        cls_name = TARGET_CLASS_NAMES[cls_idx]
        print(f" {cls_idx} ({cls_name:>6}): {count:>10,} ({pct:>6.2f}%)")

    # Visualize
    fig, axes = plt.subplots(1, 2, figsize=(14, 5))

    # Original distribution
    colors_orig = sns.color_palette("YlOrRd", 7)
    ax1 = axes[0]
    original_dist.plot(kind='bar', ax=ax1, color=colors_orig,
                       edgecolor='black')
    ax1.set_title('Original Fire Size Class Distribution',
                  fontsize=14, fontweight='bold')
    ax1.set_xlabel('Fire Size Class')
    ax1.set_ylabel('Count')
    ax1.tick_params(axis='x', rotation=0)
    _annotate_percentages(ax1, original_dist, total, fontsize=9)

    # Grouped distribution
    ax2 = axes[1]
    grouped_dist.plot(kind='bar', ax=ax2, color=_SIZE_COLORS,
                      edgecolor='black')
    ax2.set_title('Grouped Fire Size Distribution (Target)',
                  fontsize=14, fontweight='bold')
    ax2.set_xlabel('Fire Size Category')
    ax2.set_ylabel('Count')
    ax2.set_xticklabels(
        [TARGET_CLASS_NAMES[i] for i in grouped_dist.index], rotation=0
    )
    _annotate_percentages(ax2, grouped_dist, total, fontsize=10)

    _save_figure(fig, 'class_distribution.png')


def analyze_geographic_distribution(df: pd.DataFrame) -> None:
    """Analyze and visualize geographic distribution of fires.

    Prints the 15 most frequent STATE values and saves a two-panel
    longitude/latitude scatter plot (a random sample of all fires, and the
    E/F/G classes only) as geographic_distribution.png.
    """
    _print_header("GEOGRAPHIC DISTRIBUTION")

    # Top states
    print("\nTop 15 States by Fire Count:")
    state_dist = df['STATE'].value_counts().head(15)
    for state, count in state_dist.items():
        pct = count / len(df) * 100
        print(f" {state}: {count:>10,} ({pct:>5.1f}%)")

    # Fire locations scatter plot
    fig, axes = plt.subplots(1, 2, figsize=(16, 6))

    # All fires (sampled for performance; fixed seed keeps the plot stable)
    sample_size = min(100000, len(df))
    df_sample = df.sample(n=sample_size, random_state=42)
    class_rank = {'A': 0, 'B': 1, 'C': 2, 'D': 3, 'E': 4, 'F': 5, 'G': 6}

    ax1 = axes[0]
    scatter = ax1.scatter(
        df_sample['LONGITUDE'],
        df_sample['LATITUDE'],
        c=df_sample['FIRE_SIZE_CLASS'].map(class_rank),
        cmap='YlOrRd',
        alpha=0.3,
        s=1
    )
    ax1.set_title(f'Fire Locations (n={sample_size:,} sample)',
                  fontsize=14, fontweight='bold')
    ax1.set_xlabel('Longitude')
    ax1.set_ylabel('Latitude')
    ax1.set_xlim(-130, -65)
    ax1.set_ylim(24, 50)
    plt.colorbar(scatter, ax=ax1, label='Fire Size Class (A=0 to G=6)')

    # Large fires only (E, F, G)
    df_large = df[df['FIRE_SIZE_CLASS'].isin(['E', 'F', 'G'])]

    ax2 = axes[1]
    ax2.scatter(
        df_large['LONGITUDE'],
        df_large['LATITUDE'],
        c=df_large['FIRE_SIZE_CLASS'].map({'E': 0, 'F': 1, 'G': 2}),
        cmap='Reds',
        alpha=0.5,
        s=5
    )
    ax2.set_title(f'Large Fires Only (E/F/G, n={len(df_large):,})',
                  fontsize=14, fontweight='bold')
    ax2.set_xlabel('Longitude')
    ax2.set_ylabel('Latitude')
    ax2.set_xlim(-130, -65)
    ax2.set_ylim(24, 50)

    _save_figure(fig, 'geographic_distribution.png')


def analyze_temporal_patterns(df: pd.DataFrame) -> None:
    """Analyze temporal patterns in the data.

    Saves a 2x2 figure (fires per year, fires per month, grouped size
    category per month, top 10 causes) as temporal_patterns.png and prints
    the per-month counts. The input frame is not modified.
    """
    _print_header("TEMPORAL PATTERNS")

    # Leap-year-aware month of discovery (see _discovery_month).
    month = _discovery_month(df)
    grouped = _grouped_size(df)

    fig, axes = plt.subplots(2, 2, figsize=(14, 10))

    # Yearly trend
    ax1 = axes[0, 0]
    yearly = df.groupby('FIRE_YEAR').size()
    yearly.plot(kind='line', ax=ax1, marker='o', linewidth=2, markersize=4)
    ax1.set_title('Fires per Year', fontsize=12, fontweight='bold')
    ax1.set_xlabel('Year')
    ax1.set_ylabel('Number of Fires')
    ax1.grid(True, alpha=0.3)

    # Monthly distribution
    ax2 = axes[0, 1]
    monthly = df.groupby(month).size()
    monthly.plot(kind='bar', ax=ax2, color='coral', edgecolor='black')
    ax2.set_title('Fires by Month', fontsize=12, fontweight='bold')
    ax2.set_xlabel('Month')
    ax2.set_ylabel('Number of Fires')
    ax2.tick_params(axis='x', rotation=0)

    # Large fires by month
    ax3 = axes[1, 0]
    monthly_by_class = _with_all_size_columns(
        df.groupby([month, grouped]).size().unstack(fill_value=0)
    )
    monthly_by_class.plot(kind='bar', ax=ax3, width=0.8,
                          color=_SIZE_COLORS, edgecolor='black')
    ax3.set_title('Fire Size Category by Month',
                  fontsize=12, fontweight='bold')
    ax3.set_xlabel('Month')
    ax3.set_ylabel('Number of Fires')
    ax3.tick_params(axis='x', rotation=0)
    ax3.legend(title='Size Category')

    # Fire causes
    ax4 = axes[1, 1]
    cause_dist = df['STAT_CAUSE_DESCR'].value_counts().head(10)
    cause_dist.plot(kind='barh', ax=ax4, color='steelblue',
                    edgecolor='black')
    ax4.set_title('Top 10 Fire Causes', fontsize=12, fontweight='bold')
    ax4.set_xlabel('Number of Fires')
    ax4.invert_yaxis()

    _save_figure(fig, 'temporal_patterns.png')

    # Print monthly stats
    print("\nFires by Month:")
    month_names = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
                   'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
    for month_number, count in monthly.items():
        pct = count / len(df) * 100
        # int(): the month index is float when any row had no usable date.
        label = month_names[int(month_number) - 1]
        print(f" {label}: {count:>10,} ({pct:>5.1f}%)")


def analyze_missing_values(df: pd.DataFrame) -> None:
    """Analyze missing values in the dataset.

    Prints the number of columns containing nulls and the 20 columns with the
    most nulls; when any exist, saves a horizontal bar chart of their missing
    percentages as missing_values.png.
    """
    _print_header("MISSING VALUE ANALYSIS")

    missing = df.isnull().sum()
    missing_pct = (missing / len(df) * 100).round(2)

    missing_df = pd.DataFrame({
        'Missing Count': missing,
        'Missing %': missing_pct
    }).sort_values('Missing Count', ascending=False)

    # Only show columns with missing values
    missing_df = missing_df[missing_df['Missing Count'] > 0]
    top_missing = missing_df.head(20)

    print(f"\nColumns with missing values: {len(missing_df)}")
    print("\nTop 20 columns with missing values:")
    for col, row in top_missing.iterrows():
        # iterrows() upcasts the mixed int/float row to float, so cast the
        # count back to int to avoid printing e.g. "1,234.0".
        count = int(row['Missing Count'])
        print(f" {col}: {count:,} ({row['Missing %']:.1f}%)")

    # Visualize
    if len(missing_df) > 0:
        fig, ax = plt.subplots(figsize=(12, 8))
        top_missing['Missing %'].plot(
            kind='barh', ax=ax, color='salmon', edgecolor='black'
        )
        ax.set_title('Missing Values by Column (Top 20)',
                     fontsize=14, fontweight='bold')
        ax.set_xlabel('Missing %')
        ax.invert_yaxis()

        _save_figure(fig, 'missing_values.png')


def analyze_cause_by_size(df: pd.DataFrame) -> None:
    """Analyze fire causes by fire size category.

    Prints, for every STAT_CAUSE_DESCR value, the percentage of its fires in
    each grouped size category, and saves the same table as a stacked
    horizontal bar chart (cause_by_size.png). The input frame is not modified.
    """
    _print_header("FIRE CAUSE BY SIZE ANALYSIS")

    # Cross-tabulation, row-normalised so each cause sums to 100%
    cause_size = pd.crosstab(
        df['STAT_CAUSE_DESCR'],
        _grouped_size(df),
        normalize='index'
    ) * 100
    cause_size = _with_all_size_columns(cause_size)

    print("\nFire Cause Distribution by Size Category (% of each cause):")
    print(cause_size.round(1).to_string())

    # Visualize
    fig, ax = plt.subplots(figsize=(12, 8))
    cause_size.plot(kind='barh', ax=ax, stacked=True,
                    color=_SIZE_COLORS, edgecolor='white')
    ax.set_title('Fire Size Distribution by Cause',
                 fontsize=14, fontweight='bold')
    ax.set_xlabel('Percentage')
    ax.legend(title='Size Category', loc='lower right')
    ax.invert_yaxis()

    _save_figure(fig, 'cause_by_size.png')


def analyze_owner_distribution(df: pd.DataFrame) -> None:
    """Print the 10 most frequent OWNER_DESCR values with their share."""
    _print_header("LAND OWNER ANALYSIS")

    owner_dist = df['OWNER_DESCR'].value_counts()

    print("\nFires by Land Owner:")
    for owner, count in owner_dist.head(10).items():
        pct = count / len(df) * 100
        print(f" {owner}: {count:,} ({pct:.1f}%)")


def main() -> None:
    """Main EDA pipeline: load the data and run every analysis in order."""
    _print_header("EXPLORATORY DATA ANALYSIS")

    # Create figures directory
    FIGURES_DIR.mkdir(parents=True, exist_ok=True)

    # Load data
    df = load_data()

    # Run analyses
    analyze_class_distribution(df)
    analyze_geographic_distribution(df)
    analyze_temporal_patterns(df)
    analyze_missing_values(df)
    analyze_cause_by_size(df)
    analyze_owner_distribution(df)

    print("\n" + "=" * 60)
    print("✓ EDA Complete!")
    print(f" Figures saved to: {FIGURES_DIR}")
    print("=" * 60 + "\n")


if __name__ == "__main__":
    main()