# Page residue from the file viewer: Spaces status "Sleeping"; file size: 2,037 bytes; revision 90776a1.
import pandas as pd
import numpy as np
def load_and_prepare(
    filepath: str,
    *,
    low_threshold: float = 1000,
    high_threshold: float = 3000,
) -> pd.DataFrame:
    """Load a ridership CSV and derive time features plus a crowd-level label.

    The CSV must provide the columns ``datetime``, ``passenger_count`` and
    ``transport_type``; adjust the column names below if your dataset
    differs.

    Args:
        filepath: Path of the CSV file, handed to ``pd.read_csv``.
        low_threshold: Passenger counts strictly below this value are
            labelled 0 (LOW). Tune to your dataset's ridership range.
        high_threshold: Passenger counts that are not LOW and are strictly
            below this value are labelled 1 (MEDIUM); all remaining counts
            are labelled 2 (HIGH).

    Returns:
        The loaded frame with the added columns ``hour``, ``day_of_week``
        (0=Mon, 6=Sun), ``month``, ``is_weekend``, ``is_peak_hour``,
        ``crowd_level`` and ``transport_encoded``. Rows whose
        ``crowd_level``, ``hour`` or ``passenger_count`` is null are
        removed. Transport types other than ``bus``, ``metro`` and
        ``train`` yield NaN in ``transport_encoded`` and are kept.

    Raises:
        ValueError: If ``low_threshold`` is greater than ``high_threshold``.
    """
    if low_threshold > high_threshold:
        raise ValueError(
            f"low_threshold ({low_threshold}) must not exceed "
            f"high_threshold ({high_threshold})"
        )

    df = pd.read_csv(filepath)

    # -- Parse datetime ------------------------------------------------
    df['datetime'] = pd.to_datetime(df['datetime'])  # adjust column name
    df['hour'] = df['datetime'].dt.hour
    df['day_of_week'] = df['datetime'].dt.dayofweek  # 0=Mon, 6=Sun
    df['month'] = df['datetime'].dt.month
    df['is_weekend'] = df['day_of_week'].isin([5, 6]).astype(int)
    df['is_peak_hour'] = df['hour'].isin([7, 8, 9, 17, 18, 19]).astype(int)

    # -- Create target label -------------------------------------------
    # Conditions are tested in order, so the second one only applies to
    # counts that are not LOW. A missing count fails both comparisons and
    # falls through to the default; such rows are removed by the dropna
    # below, so the placeholder label never reaches the caller.
    counts = df['passenger_count']
    df['crowd_level'] = np.select(
        [
            (counts < low_threshold).to_numpy(),
            (counts < high_threshold).to_numpy(),
        ],
        [0, 1],      # LOW, MEDIUM
        default=2,   # HIGH
    )

    # -- Encode transport type -----------------------------------------
    transport_map = {'bus': 0, 'metro': 1, 'train': 2}
    df['transport_encoded'] = df['transport_type'].map(transport_map)

    # -- Drop rows with nulls ------------------------------------------
    df = df.dropna(subset=['crowd_level', 'hour', 'passenger_count'])
    return df
def get_features_and_target(df: pd.DataFrame):
    """Split a prepared frame into a feature matrix and the target column.

    Args:
        df: Frame containing a ``crowd_level`` column and any subset of the
            candidate feature columns listed below.

    Returns:
        A tuple ``(X, y)``: ``X`` holds the candidate feature columns that
        are present in ``df``, in the order they are listed here, and ``y``
        is ``df['crowd_level']``.

    Raises:
        KeyError: If ``df`` has no ``crowd_level`` column.
    """
    features = [
        'hour',
        'day_of_week',
        'month',
        'is_weekend',
        'is_peak_hour',
        'is_holiday',        # add if available
        'temperature',       # add if available
        'transport_encoded',
    ]
    # Only use columns that exist in your dataset
    features = [f for f in features if f in df.columns]
    X = df[features]
    y = df['crowd_level']
    return X, y