Delete models/src
Browse files- models/src/preprocessing.py +0 -51
models/src/preprocessing.py
DELETED
|
@@ -1,51 +0,0 @@
|
|
| 1 |
-
import pandas as pd
|
| 2 |
-
import os
|
| 3 |
-
|
| 4 |
-
# Define paths
RAW_DATA_PATH = "data/raw/california_wildfire.csv"
PROCESSED_DATA_PATH = "data/processed/Wildfire_Dataset.csv"

# California bounding box; both ends of each range are treated as inclusive.
LAT_MIN, LAT_MAX = 32.5, 42.0
LON_MIN, LON_MAX = -124.5, -114.0

# Coordinate columns used for the bounding-box filter.
_COORD_COLUMNS = ('latitude', 'longitude')
# Rows with a missing value in any of these columns are dropped.
_WEATHER_COLUMNS = ['bi', 'tmmn', 'rmax', 'vs']


def process_data(raw_path=None, processed_path=None, chunk_size=100000):
    """Filter the raw wildfire CSV to the California bounding box and save it.

    The input is read in chunks of ``chunk_size`` rows. Each chunk is
    restricted to rows whose ``latitude``/``longitude`` fall inside
    ``[LAT_MIN, LAT_MAX]`` x ``[LON_MIN, LON_MAX]``, and rows with a missing
    value in any of ``bi``, ``tmmn``, ``rmax`` or ``vs`` are dropped. The
    surviving rows are concatenated and written, without the index, to
    ``processed_path``.

    Nothing is written when the input file does not exist, when the
    coordinate columns are absent, or when no row survives the filtering.

    Args:
        raw_path: CSV file to read. Defaults to ``RAW_DATA_PATH``.
        processed_path: CSV file to write. Defaults to
            ``PROCESSED_DATA_PATH``. Its parent directory is created if
            needed.
        chunk_size: Number of rows read per chunk.

    Returns:
        None. Progress and outcome are reported on standard output.

    Raises:
        KeyError: Propagated from ``DataFrame.dropna`` when one of the
            weather columns is absent from the input.
    """
    if raw_path is None:
        raw_path = RAW_DATA_PATH
    if processed_path is None:
        processed_path = PROCESSED_DATA_PATH

    print("🔥 Starting Data Processing... (Chunking 9.5M rows)")

    if not os.path.exists(raw_path):
        print(f"❌ Error: File not found at {raw_path}")
        return

    kept = []

    # Read in chunks so the full raw file is never parsed in one go.
    for i, chunk in enumerate(pd.read_csv(raw_path, chunksize=chunk_size)):
        if i % 10 == 0:
            print(f" Processing chunk {i}...")

        # Every chunk shares the file's header, so a missing coordinate
        # column can never appear later: report it and stop instead of
        # scanning the rest of the file and skipping each chunk silently.
        missing = [name for name in _COORD_COLUMNS if name not in chunk.columns]
        if missing:
            print(f"❌ Error: Missing required column(s): {', '.join(missing)}")
            return

        # Series.between is inclusive on both ends, matching >= / <=.
        in_region = (
            chunk['latitude'].between(LAT_MIN, LAT_MAX)
            & chunk['longitude'].between(LON_MIN, LON_MAX)
        )

        # Simple clean: drop rows where critical weather info is missing.
        cali_chunk = chunk[in_region].dropna(subset=_WEATHER_COLUMNS)

        # Keep only chunks that contributed rows, so an all-empty result is
        # reported as "no data" rather than saved as a header-only file.
        if not cali_chunk.empty:
            kept.append(cali_chunk)

    if not kept:
        print("⚠️ No data found for the specified region.")
        return

    df_cali = pd.concat(kept)
    print(f"✅ Filtered Data Shape: {df_cali.shape}")

    # Save to processed folder. A bare filename has an empty dirname, which
    # os.makedirs would reject, so only create the directory when one is given.
    out_dir = os.path.dirname(processed_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    df_cali.to_csv(processed_path, index=False)
    print(f"💾 Saved processed data to: {processed_path}")


if __name__ == "__main__":
    process_data()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|