# Wildfire-Intelligence-MLOps / src/preprocessing.py
# HashirAwaiz's picture
# Create src/preprocessing.py
# 7b37976 verified
import pandas as pd
import os
# Input and output CSV locations (relative paths, resolved against the
# current working directory).
RAW_DATA_PATH = "data/raw/california_wildfire.csv"
PROCESSED_DATA_PATH = "data/processed/Wildfire_Dataset.csv"

# California bounding box: latitude limits first, then longitude limits.
LAT_MIN = 32.5
LAT_MAX = 42.0
LON_MIN = -124.5
LON_MAX = -114.0
def process_data(raw_path=None, processed_path=None, chunk_size=100000, bounds=None):
    """Filter the raw wildfire CSV to a bounding box and save the cleaned rows.

    The raw file is read in chunks so it never has to be loaded in one go.
    From each chunk, only rows whose ``latitude``/``longitude`` lie inside the
    bounding box (limits inclusive) are kept, and rows with a missing value in
    any of the ``bi``, ``tmmn``, ``rmax`` or ``vs`` columns are dropped.

    Args:
        raw_path: CSV file to read. Defaults to ``RAW_DATA_PATH``.
        processed_path: CSV file to write. Defaults to
            ``PROCESSED_DATA_PATH``. Its parent directory is created if
            needed.
        chunk_size: Number of rows read per chunk.
        bounds: ``(lat_min, lat_max, lon_min, lon_max)``. Defaults to the
            module-level ``LAT_MIN``/``LAT_MAX``/``LON_MIN``/``LON_MAX``.

    Returns:
        None. Progress, warnings and errors are printed to stdout. Nothing is
        written when the input file is missing, the coordinate columns are
        absent, or no row survives the filter.

    Raises:
        KeyError: If one of the weather columns passed to ``dropna`` is not
            present in the input.
    """
    # Resolve defaults at call time so a no-argument call keeps using the
    # module constants.
    if raw_path is None:
        raw_path = RAW_DATA_PATH
    if processed_path is None:
        processed_path = PROCESSED_DATA_PATH
    if bounds is None:
        bounds = (LAT_MIN, LAT_MAX, LON_MIN, LON_MAX)
    lat_min, lat_max, lon_min, lon_max = bounds

    print("🔥 Starting Data Processing... (Chunking 9.5M rows)")

    if not os.path.exists(raw_path):
        print(f"❌ Error: File not found at {raw_path}")
        return

    chunks = []
    for i, chunk in enumerate(pd.read_csv(raw_path, chunksize=chunk_size)):
        if i % 10 == 0:
            print(f" Processing chunk {i}...")

        # Without coordinates no row can be filtered; report that explicitly
        # instead of falling through to the "no data for region" message.
        missing = [c for c in ('latitude', 'longitude') if c not in chunk.columns]
        if missing:
            print(f"❌ Error: Missing coordinate column(s): {missing}")
            return

        # ``between`` is inclusive on both ends, matching >= / <= comparisons.
        in_region = (
            chunk['latitude'].between(lat_min, lat_max)
            & chunk['longitude'].between(lon_min, lon_max)
        )
        # Simple clean: drop rows where critical weather info is missing.
        cali_chunk = chunk[in_region].dropna(subset=['bi', 'tmmn', 'rmax', 'vs'])

        # Keep only chunks that contributed rows, so an empty result is
        # detected below rather than written out as a header-only file.
        if not cali_chunk.empty:
            chunks.append(cali_chunk)

    if not chunks:
        print("⚠️ No data found for the specified region.")
        return

    df_cali = pd.concat(chunks, ignore_index=True)
    print(f"✅ Filtered Data Shape: {df_cali.shape}")

    # A bare filename has an empty dirname, which os.makedirs rejects.
    out_dir = os.path.dirname(processed_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    df_cali.to_csv(processed_path, index=False)
    print(f"💾 Saved processed data to: {processed_path}")
# Run the preprocessing step only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    process_data()