File size: 1,779 Bytes
7b37976 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 |
import pandas as pd
import os
# Define paths (relative to the working directory the script is run from).
RAW_DATA_PATH = "data/raw/california_wildfire.csv"  # raw input CSV (~1.3 GB per the note in process_data)
PROCESSED_DATA_PATH = "data/processed/Wildfire_Dataset.csv"  # filtered output written by process_data
# California Bounding Box — latitude/longitude bounds in decimal degrees
# used to keep only rows whose coordinates fall inside California.
LAT_MIN, LAT_MAX = 32.5, 42.0
LON_MIN, LON_MAX = -124.5, -114.0
def process_data():
print("🔥 Starting Data Processing... (Chunking 9.5M rows)")
if not os.path.exists(RAW_DATA_PATH):
print(f"❌ Error: File not found at {RAW_DATA_PATH}")
return
chunk_size = 100000
chunks = []
# Read in chunks to handle the 1.3GB size
for i, chunk in enumerate(pd.read_csv(RAW_DATA_PATH, chunksize=chunk_size)):
if i % 10 == 0:
print(f" Processing chunk {i}...")
# Filter for California Coordinates using new column names
if 'latitude' in chunk.columns and 'longitude' in chunk.columns:
cali_chunk = chunk[
(chunk['latitude'] >= LAT_MIN) &
(chunk['latitude'] <= LAT_MAX) &
(chunk['longitude'] >= LON_MIN) &
(chunk['longitude'] <= LON_MAX)
]
# Simple clean: Drop rows where critical weather info is missing
cali_chunk = cali_chunk.dropna(subset=['bi', 'tmmn', 'rmax', 'vs'])
chunks.append(cali_chunk)
if chunks:
df_cali = pd.concat(chunks)
print(f"✅ Filtered Data Shape: {df_cali.shape}")
# Save to processed folder
os.makedirs(os.path.dirname(PROCESSED_DATA_PATH), exist_ok=True)
df_cali.to_csv(PROCESSED_DATA_PATH, index=False)
print(f"💾 Saved processed data to: {PROCESSED_DATA_PATH}")
else:
print("⚠️ No data found for the specified region.")
# Script entry point: run the full filtering pipeline with default paths.
# (A stray trailing "|" artifact in the original line was removed.)
if __name__ == "__main__":
    process_data()