|
|
import pandas as pd |
|
|
import os |
|
|
|
|
|
|
|
|
# Path to the raw input CSV (~9.5M rows per the log message below).
RAW_DATA_PATH = "data/raw/california_wildfire.csv"

# Destination for the filtered, cleaned California-only subset.
PROCESSED_DATA_PATH = "data/processed/Wildfire_Dataset.csv"

# Approximate bounding box for California, in decimal degrees (inclusive).
LAT_MIN, LAT_MAX = 32.5, 42.0
LON_MIN, LON_MAX = -124.5, -114.0

# Columns that must be non-NaN for a row to be kept.
# NOTE(review): names look like gridMET variables (bi = burning index,
# tmmn = min temperature, rmax = max rel. humidity, vs = wind speed) —
# confirm against the upstream dataset's data dictionary.
REQUIRED_COLUMNS = ['bi', 'tmmn', 'rmax', 'vs']


def _filter_california(chunk):
    """Return rows of *chunk* inside the California bounding box with all
    REQUIRED_COLUMNS present (non-NaN).

    Returns None when the chunk has no latitude/longitude columns, so the
    caller can skip it explicitly instead of risking an unbound reference.
    """
    if 'latitude' not in chunk.columns or 'longitude' not in chunk.columns:
        return None
    # Series.between is inclusive on both ends, matching >= / <= checks.
    in_box = (
        chunk['latitude'].between(LAT_MIN, LAT_MAX)
        & chunk['longitude'].between(LON_MIN, LON_MAX)
    )
    return chunk[in_box].dropna(subset=REQUIRED_COLUMNS)


def process_data():
    """Stream the raw CSV in chunks, keep California rows with complete
    weather data, and write the result to PROCESSED_DATA_PATH.

    Progress and errors are reported via print; the function returns None
    in all cases (a missing input file is reported, not raised).
    """
    print("🔥 Starting Data Processing... (Chunking 9.5M rows)")

    if not os.path.exists(RAW_DATA_PATH):
        print(f"❌ Error: File not found at {RAW_DATA_PATH}")
        return

    # Chunked read keeps peak memory bounded for the ~9.5M-row input.
    chunk_size = 100000
    chunks = []

    for i, chunk in enumerate(pd.read_csv(RAW_DATA_PATH, chunksize=chunk_size)):
        if i % 10 == 0:
            print(f"   Processing chunk {i}...")

        filtered = _filter_california(chunk)
        if filtered is not None:
            chunks.append(filtered)

    if chunks:
        # ignore_index avoids duplicated index labels carried over from the
        # per-chunk frames (output file is unchanged since index=False below).
        df_cali = pd.concat(chunks, ignore_index=True)
        print(f"✅ Filtered Data Shape: {df_cali.shape}")

        os.makedirs(os.path.dirname(PROCESSED_DATA_PATH), exist_ok=True)
        df_cali.to_csv(PROCESSED_DATA_PATH, index=False)
        print(f"💾 Saved processed data to: {PROCESSED_DATA_PATH}")
    else:
        print("⚠️ No data found for the specified region.")
|
|
|
|
|
if __name__ == "__main__":
    # Run the chunked filter pass only when executed as a script,
    # not when this module is imported.
    process_data()