HashirAwaiz commited on
Commit
e0a854f
·
verified ·
1 Parent(s): 28c25ef

Delete models/src

Browse files
Files changed (1) hide show
  1. models/src/preprocessing.py +0 -51
models/src/preprocessing.py DELETED
@@ -1,51 +0,0 @@
1
- import pandas as pd
2
- import os
3
-
4
# Input / output locations for the preprocessing step.
RAW_DATA_PATH = "data/raw/california_wildfire.csv"
PROCESSED_DATA_PATH = "data/processed/Wildfire_Dataset.csv"

# Rough bounding box covering the state of California, in degrees.
LAT_MIN, LAT_MAX = 32.5, 42.0
LON_MIN, LON_MAX = -124.5, -114.0
-
11
def process_data():
    """Filter the raw wildfire CSV down to California and save the result.

    Streams the raw file in 100k-row chunks (the full dataset is ~9.5M
    rows, ~1.3GB, so it cannot be loaded in one read), keeps only rows
    inside the California bounding box, drops rows where critical weather
    info is missing, and writes the combined frame to PROCESSED_DATA_PATH.

    Returns:
        None. Prints progress/diagnostics and writes the output CSV as a
        side effect; returns early if the raw file does not exist.
    """
    print("🔥 Starting Data Processing... (Chunking 9.5M rows)")

    if not os.path.exists(RAW_DATA_PATH):
        print(f"❌ Error: File not found at {RAW_DATA_PATH}")
        return

    kept_parts = []

    # Stream the raw file in chunks to keep peak memory bounded.
    for idx, frame in enumerate(pd.read_csv(RAW_DATA_PATH, chunksize=100000)):
        # Progress report on every 10th chunk.
        if idx % 10 == 0:
            print(f" Processing chunk {idx}...")

        # Only filter when the expected coordinate columns are present;
        # chunks without them are silently skipped.
        if 'latitude' in frame.columns and 'longitude' in frame.columns:
            # Inclusive bounding-box membership for California.
            in_california = (
                frame['latitude'].between(LAT_MIN, LAT_MAX)
                & frame['longitude'].between(LON_MIN, LON_MAX)
            )
            cali_part = frame[in_california]

            # Drop rows where critical weather info is missing
            # (assumes 'bi', 'tmmn', 'rmax', 'vs' are the required
            # weather columns — confirm against the dataset schema).
            cali_part = cali_part.dropna(subset=['bi', 'tmmn', 'rmax', 'vs'])
            kept_parts.append(cali_part)

    if kept_parts:
        df_cali = pd.concat(kept_parts)
        print(f"✅ Filtered Data Shape: {df_cali.shape}")

        # Ensure the output directory exists before writing.
        os.makedirs(os.path.dirname(PROCESSED_DATA_PATH), exist_ok=True)
        df_cali.to_csv(PROCESSED_DATA_PATH, index=False)
        print(f"💾 Saved processed data to: {PROCESSED_DATA_PATH}")
    else:
        print("⚠️ No data found for the specified region.")
50
# Script entry point: run the preprocessing pipeline when invoked directly.
if __name__ == "__main__":
    process_data()