# Preprocessing script for the California wildfire dataset (source snapshot 7b37976).
import pandas as pd
import os

# Define paths
RAW_DATA_PATH = "data/raw/california_wildfire.csv"
PROCESSED_DATA_PATH = "data/processed/Wildfire_Dataset.csv"
# California Bounding Box (inclusive lat/lon ranges used to filter rows)
LAT_MIN, LAT_MAX = 32.5, 42.0
LON_MIN, LON_MAX = -124.5, -114.0

# Weather columns a row must have for the "simple clean" step.
REQUIRED_WEATHER_COLS = ['bi', 'tmmn', 'rmax', 'vs']

def process_data():
    """Filter the raw wildfire CSV down to California rows and save the result.

    Streams RAW_DATA_PATH in chunks (the source is ~9.5M rows / ~1.3 GB),
    keeps only rows whose coordinates fall inside the California bounding
    box, drops rows missing critical weather fields, and writes the combined
    result to PROCESSED_DATA_PATH (creating the directory if needed).

    Returns None; progress and errors are reported via print().
    """
    print("🔥 Starting Data Processing... (Chunking 9.5M rows)")

    if not os.path.exists(RAW_DATA_PATH):
        print(f"❌ Error: File not found at {RAW_DATA_PATH}")
        return

    chunk_size = 100000
    chunks = []

    # Read in chunks to handle the 1.3GB size
    for i, chunk in enumerate(pd.read_csv(RAW_DATA_PATH, chunksize=chunk_size)):
        if i % 10 == 0:
            print(f"   Processing chunk {i}...")

        # Fail loudly (once) on a bad schema. Previously a missing coordinate
        # column silently skipped every chunk, and the run ended with the
        # misleading "No data found" message.
        if 'latitude' not in chunk.columns or 'longitude' not in chunk.columns:
            print("❌ Error: 'latitude'/'longitude' columns missing from raw data.")
            return

        # Filter for California Coordinates (between() is inclusive, matching
        # the original >= / <= comparisons).
        cali_chunk = chunk[
            chunk['latitude'].between(LAT_MIN, LAT_MAX)
            & chunk['longitude'].between(LON_MIN, LON_MAX)
        ]

        # Simple clean: drop rows where critical weather info is missing.
        # Only use columns that actually exist so a schema change does not
        # raise KeyError halfway through a multi-gigabyte read.
        present = [c for c in REQUIRED_WEATHER_COLS if c in cali_chunk.columns]
        if present:
            cali_chunk = cali_chunk.dropna(subset=present)
        chunks.append(cali_chunk)

    if chunks:
        df_cali = pd.concat(chunks)
        print(f"✅ Filtered Data Shape: {df_cali.shape}")

        # Save to processed folder
        os.makedirs(os.path.dirname(PROCESSED_DATA_PATH), exist_ok=True)
        df_cali.to_csv(PROCESSED_DATA_PATH, index=False)
        print(f"💾 Saved processed data to: {PROCESSED_DATA_PATH}")
    else:
        print("⚠️ No data found for the specified region.")

if __name__ == "__main__":
    process_data()