File size: 3,154 Bytes
73e9c25
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
import os
import sys
import glob
import pandas as pd
from process import prepare_data_df, build_sensor_index, map_pms_to_sensors

index_map = {
    '405': 'I',
    '101': 'US',
    '101': 'US',
    '110': 'I',
    '170': 'CA',
    '118': 'CA',
    '134': 'CA',
    '605': 'I',
    '210': 'I',
    '5': 'I'
}
direction_map = {
    'E': 'East',
    'W': 'West',
    'N': 'North',
    'S': 'South'
}


PROJECT_ROOT = os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))
sys.path.append(PROJECT_ROOT)


DATA_DIR = os.path.join(PROJECT_ROOT, "data_collection", "data")
COORDINATE_DIR = os.path.join(PROJECT_ROOT,"data_collection", "coordinates")
full_df = pd.DataFrame() 

for entry in os.scandir(DATA_DIR):
    if entry.is_dir():
        road_number = entry.name
        road_name = index_map[road_number] +" "+ road_number 

        with os.scandir(entry.path) as it:
            for sub in it:
                if sub.is_dir() and sub.name in {'E', 'W', 'N', 'S'}:
                    direction = direction_map[sub.name]
                    print(f"Processing {road_name} {direction}")  
                    data_dir = os.path.join(entry.path, sub.name)
                    coordinate_dir = os.path.join(COORDINATE_DIR, f"{road_name} {direction}.xlsx")

                    for i in range(1,32):
                        if i <=9:
                            raw_data_pattern = os.path.join(data_dir, f"{road_number}_{sub.name}_03*0{i}*2025.xlsx")
                            date = f"2025-03-0{i}"
                        else:
                            raw_data_pattern = os.path.join(data_dir, f"{road_number}_{sub.name}_03*{i}*2025.xlsx")
                            date = f"2025-03-{i}"

                        matching_files = glob.glob(raw_data_pattern)
                        if not matching_files:
                            print(f"No data file found for {road_name} {direction} on {date}, skipping...")
                            continue
                        
                        raw_data = matching_files[0]

                        df_coord = pd.read_excel(coordinate_dir)
                        df_data = pd.read_excel(raw_data)
                        clean_data_df = prepare_data_df(df_data, df_coord,date)
                        sensors = build_sensor_index(clean_data_df)

                        enriched = map_pms_to_sensors(clean_data_df, sensors)
                        enriched["road_name"] = road_name
                        enriched["direction"] = direction

                        full_df = pd.concat([full_df, enriched], ignore_index=True)
                        print(f"finished {date}")
    

full_df.drop(columns=["Postmile (Abs)", "Postmile (CA)", "VDS", "Time_hour", "matched_sensor_lat", "matched_sensor_lon", "distance_m"], inplace=True)
desired_order = [
    "Time","sensor_id", "Latitude", "Longitude",
    "road_name", "direction", "# Lane Points",
    "% Observed", "weather", "Day", "AggSpeed"
]
full_df = full_df[desired_order]
full_df.rename(columns={
    "AggSpeed": "speed_mph",
    "# Lane Points": "lanes"
}, inplace=True)
full_df.to_csv('full_df_weather.csv',index=False)