|
|
"""
|
|
|
Script 01: Extract Data from SQLite Database
|
|
|
|
|
|
This script connects to the FPA FOD SQLite database, extracts the Fires table,
|
|
|
and saves it as a Parquet file for faster subsequent processing.
|
|
|
|
|
|
Usage:
|
|
|
python scripts/01_extract_data.py
|
|
|
"""
|
|
|
|
|
|
import sqlite3
|
|
|
import sys
|
|
|
from pathlib import Path
|
|
|
|
|
|
import pandas as pd
|
|
|
|
|
|
|
|
|
project_root = Path(__file__).parent.parent
|
|
|
sys.path.insert(0, str(project_root))
|
|
|
|
|
|
from config.config import (
|
|
|
SQLITE_DB_PATH,
|
|
|
PROCESSED_DATA_DIR,
|
|
|
RAW_PARQUET
|
|
|
)
|
|
|
|
|
|
|
|
|
def connect_to_database(db_path: Path) -> sqlite3.Connection:
|
|
|
"""Connect to SQLite database."""
|
|
|
if not db_path.exists():
|
|
|
raise FileNotFoundError(
|
|
|
f"Database not found at {db_path}. "
|
|
|
"Please ensure the SQLite file is in the project root."
|
|
|
)
|
|
|
|
|
|
print(f"Connecting to database: {db_path}")
|
|
|
return sqlite3.connect(db_path)
|
|
|
|
|
|
|
|
|
def get_table_info(conn: sqlite3.Connection) -> None:
|
|
|
"""Print information about tables in the database."""
|
|
|
cursor = conn.cursor()
|
|
|
|
|
|
|
|
|
cursor.execute("""
|
|
|
SELECT name FROM sqlite_master
|
|
|
WHERE type='table' AND name NOT LIKE 'sqlite_%'
|
|
|
AND name NOT LIKE 'spatial%'
|
|
|
AND name NOT LIKE 'virt%'
|
|
|
AND name NOT LIKE 'view%'
|
|
|
AND name NOT LIKE 'geometry%'
|
|
|
""")
|
|
|
tables = cursor.fetchall()
|
|
|
|
|
|
print("\n" + "="*60)
|
|
|
print("DATABASE TABLES")
|
|
|
print("="*60)
|
|
|
|
|
|
for table in tables:
|
|
|
table_name = table[0]
|
|
|
try:
|
|
|
cursor.execute(f"SELECT COUNT(*) FROM {table_name}")
|
|
|
count = cursor.fetchone()[0]
|
|
|
print(f" {table_name}: {count:,} rows")
|
|
|
except Exception as e:
|
|
|
print(f" {table_name}: (could not read - {str(e)[:30]})")
|
|
|
|
|
|
print("="*60 + "\n")
|
|
|
|
|
|
|
|
|
def extract_fires_table(conn: sqlite3.Connection) -> pd.DataFrame:
|
|
|
"""Extract the Fires table from the database."""
|
|
|
print("Extracting Fires table...")
|
|
|
|
|
|
query = "SELECT * FROM Fires"
|
|
|
df = pd.read_sql_query(query, conn)
|
|
|
|
|
|
print(f" Loaded {len(df):,} records")
|
|
|
print(f" Columns: {len(df.columns)}")
|
|
|
print(f" Memory usage: {df.memory_usage(deep=True).sum() / 1e6:.1f} MB")
|
|
|
|
|
|
return df
|
|
|
|
|
|
|
|
|
def save_to_parquet(df: pd.DataFrame, output_path: Path) -> None:
|
|
|
"""Save DataFrame to Parquet format."""
|
|
|
|
|
|
output_path.parent.mkdir(parents=True, exist_ok=True)
|
|
|
|
|
|
print(f"\nSaving to Parquet: {output_path}")
|
|
|
df.to_parquet(output_path, index=False, compression='snappy')
|
|
|
|
|
|
file_size_mb = output_path.stat().st_size / 1e6
|
|
|
print(f" File size: {file_size_mb:.1f} MB")
|
|
|
|
|
|
|
|
|
def print_data_summary(df: pd.DataFrame) -> None:
|
|
|
"""Print summary statistics of the extracted data."""
|
|
|
print("\n" + "="*60)
|
|
|
print("DATA SUMMARY")
|
|
|
print("="*60)
|
|
|
|
|
|
print(f"\nDate Range: {df['FIRE_YEAR'].min()} - {df['FIRE_YEAR'].max()}")
|
|
|
|
|
|
print("\nFire Size Class Distribution:")
|
|
|
size_dist = df['FIRE_SIZE_CLASS'].value_counts().sort_index()
|
|
|
for cls, count in size_dist.items():
|
|
|
pct = count / len(df) * 100
|
|
|
print(f" Class {cls}: {count:>10,} ({pct:>5.1f}%)")
|
|
|
|
|
|
print("\nTop 10 States by Fire Count:")
|
|
|
state_dist = df['STATE'].value_counts().head(10)
|
|
|
for state, count in state_dist.items():
|
|
|
print(f" {state}: {count:,}")
|
|
|
|
|
|
print("\nTop Causes:")
|
|
|
cause_dist = df['STAT_CAUSE_DESCR'].value_counts().head(5)
|
|
|
for cause, count in cause_dist.items():
|
|
|
pct = count / len(df) * 100
|
|
|
print(f" {cause}: {count:,} ({pct:.1f}%)")
|
|
|
|
|
|
print("\nMissing Values (top 10 columns):")
|
|
|
missing = df.isnull().sum().sort_values(ascending=False).head(10)
|
|
|
for col, count in missing.items():
|
|
|
if count > 0:
|
|
|
pct = count / len(df) * 100
|
|
|
print(f" {col}: {count:,} ({pct:.1f}%)")
|
|
|
|
|
|
print("="*60 + "\n")
|
|
|
|
|
|
|
|
|
def main():
|
|
|
"""Main extraction pipeline."""
|
|
|
print("\n" + "="*60)
|
|
|
print("WILDFIRE DATA EXTRACTION")
|
|
|
print("="*60 + "\n")
|
|
|
|
|
|
|
|
|
conn = connect_to_database(SQLITE_DB_PATH)
|
|
|
|
|
|
try:
|
|
|
|
|
|
get_table_info(conn)
|
|
|
|
|
|
|
|
|
df = extract_fires_table(conn)
|
|
|
|
|
|
|
|
|
print_data_summary(df)
|
|
|
|
|
|
|
|
|
save_to_parquet(df, RAW_PARQUET)
|
|
|
|
|
|
print("\n✓ Data extraction complete!")
|
|
|
print(f" Output: {RAW_PARQUET}")
|
|
|
|
|
|
finally:
|
|
|
conn.close()
|
|
|
print(" Database connection closed.")
|
|
|
|
|
|
|
|
|
if __name__ == "__main__":
|
|
|
main()
|
|
|
|