File size: 4,853 Bytes
64e892b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
"""

Script 01: Extract Data from SQLite Database



This script connects to the FPA FOD SQLite database, extracts the Fires table,

and saves it as a Parquet file for faster subsequent processing.



Usage:

    python scripts/01_extract_data.py

"""

import sqlite3
import sys
from pathlib import Path

import pandas as pd

# Add project root to path
project_root = Path(__file__).parent.parent
sys.path.insert(0, str(project_root))

from config.config import (
    SQLITE_DB_PATH,
    PROCESSED_DATA_DIR,
    RAW_PARQUET
)


def connect_to_database(db_path: Path) -> sqlite3.Connection:
    """Connect to SQLite database."""
    if not db_path.exists():
        raise FileNotFoundError(
            f"Database not found at {db_path}. "
            "Please ensure the SQLite file is in the project root."
        )
    
    print(f"Connecting to database: {db_path}")
    return sqlite3.connect(db_path)


def get_table_info(conn: sqlite3.Connection) -> None:
    """Print information about tables in the database."""
    cursor = conn.cursor()
    
    # Get list of user tables (skip SpatiaLite system tables)
    cursor.execute("""

        SELECT name FROM sqlite_master 

        WHERE type='table' AND name NOT LIKE 'sqlite_%' 

        AND name NOT LIKE 'spatial%' 

        AND name NOT LIKE 'virt%'

        AND name NOT LIKE 'view%'

        AND name NOT LIKE 'geometry%'

    """)
    tables = cursor.fetchall()
    
    print("\n" + "="*60)
    print("DATABASE TABLES")
    print("="*60)
    
    for table in tables:
        table_name = table[0]
        try:
            cursor.execute(f"SELECT COUNT(*) FROM {table_name}")
            count = cursor.fetchone()[0]
            print(f"  {table_name}: {count:,} rows")
        except Exception as e:
            print(f"  {table_name}: (could not read - {str(e)[:30]})")
    
    print("="*60 + "\n")


def extract_fires_table(conn: sqlite3.Connection) -> pd.DataFrame:
    """Extract the Fires table from the database."""
    print("Extracting Fires table...")
    
    query = "SELECT * FROM Fires"
    df = pd.read_sql_query(query, conn)
    
    print(f"  Loaded {len(df):,} records")
    print(f"  Columns: {len(df.columns)}")
    print(f"  Memory usage: {df.memory_usage(deep=True).sum() / 1e6:.1f} MB")
    
    return df


def save_to_parquet(df: pd.DataFrame, output_path: Path) -> None:
    """Save DataFrame to Parquet format."""
    # Create directory if it doesn't exist
    output_path.parent.mkdir(parents=True, exist_ok=True)
    
    print(f"\nSaving to Parquet: {output_path}")
    df.to_parquet(output_path, index=False, compression='snappy')
    
    file_size_mb = output_path.stat().st_size / 1e6
    print(f"  File size: {file_size_mb:.1f} MB")


def print_data_summary(df: pd.DataFrame) -> None:
    """Print summary statistics of the extracted data."""
    print("\n" + "="*60)
    print("DATA SUMMARY")
    print("="*60)
    
    print(f"\nDate Range: {df['FIRE_YEAR'].min()} - {df['FIRE_YEAR'].max()}")
    
    print("\nFire Size Class Distribution:")
    size_dist = df['FIRE_SIZE_CLASS'].value_counts().sort_index()
    for cls, count in size_dist.items():
        pct = count / len(df) * 100
        print(f"  Class {cls}: {count:>10,} ({pct:>5.1f}%)")
    
    print("\nTop 10 States by Fire Count:")
    state_dist = df['STATE'].value_counts().head(10)
    for state, count in state_dist.items():
        print(f"  {state}: {count:,}")
    
    print("\nTop Causes:")
    cause_dist = df['STAT_CAUSE_DESCR'].value_counts().head(5)
    for cause, count in cause_dist.items():
        pct = count / len(df) * 100
        print(f"  {cause}: {count:,} ({pct:.1f}%)")
    
    print("\nMissing Values (top 10 columns):")
    missing = df.isnull().sum().sort_values(ascending=False).head(10)
    for col, count in missing.items():
        if count > 0:
            pct = count / len(df) * 100
            print(f"  {col}: {count:,} ({pct:.1f}%)")
    
    print("="*60 + "\n")


def main():
    """Main extraction pipeline."""
    print("\n" + "="*60)
    print("WILDFIRE DATA EXTRACTION")
    print("="*60 + "\n")
    
    # Connect to database
    conn = connect_to_database(SQLITE_DB_PATH)
    
    try:
        # Show database info
        get_table_info(conn)
        
        # Extract Fires table
        df = extract_fires_table(conn)
        
        # Print summary
        print_data_summary(df)
        
        # Save to Parquet
        save_to_parquet(df, RAW_PARQUET)
        
        print("\n✓ Data extraction complete!")
        print(f"  Output: {RAW_PARQUET}")
        
    finally:
        conn.close()
        print("  Database connection closed.")


if __name__ == "__main__":
    main()