Spaces:
Build error
Build error
elli-teu
committed on
Commit
·
5184bfe
1
Parent(s):
7762bc9
Removed duplicate trips
Browse files
app.py
CHANGED
|
@@ -74,6 +74,48 @@ def get_buses():
|
|
| 74 |
short_bus_list = list(pd.unique(bus_df["route_short_name"]))
|
| 75 |
return bus_df, bus_list, short_bus_list
|
| 76 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 77 |
def plot_graph(plot_df):
|
| 78 |
#Nu vill vi plotta!
|
| 79 |
categories = {0 : 'Empty',
|
|
@@ -154,6 +196,9 @@ def main():
|
|
| 154 |
if is_local_data_valid():
|
| 155 |
st.write("Using cached local data.")
|
| 156 |
st.session_state.data = load_local_data("data.csv")
|
|
|
|
|
|
|
|
|
|
| 157 |
else:
|
| 158 |
# Fetch data if local data is invalid
|
| 159 |
if st.session_state.hopsworks_project is None:
|
|
|
|
| 74 |
short_bus_list = list(pd.unique(bus_df["route_short_name"]))
|
| 75 |
return bus_df, bus_list, short_bus_list
|
| 76 |
|
| 77 |
+
|
| 78 |
+
def remove_duplicate_trips(df, route_id_col="route_id", trip_id_col="trip_id", stop_id_col="stop_name", datetime_col="datetime", time_window='3min'):
    """
    Remove duplicate trips based on route, starting stop, and start-time proximity.

    A trip's "start" is its earliest row according to ``datetime_col``. Two trips
    are considered duplicates when they belong to the same route, start at the
    same stop, and the later one starts no more than ``time_window`` after the
    previous trip in that (route, stop) sequence. The comparison is made against
    the immediately preceding trip, so a chain of closely spaced trips collapses
    into the first trip of the chain.

    The input DataFrame is not modified.

    Parameters:
        df (pd.DataFrame): Input DataFrame containing trip data.
        route_id_col (str): Column name for route IDs.
        trip_id_col (str): Column name for trip IDs.
        stop_id_col (str): Column name for stop IDs.
        datetime_col (str): Column name for departure times.
        time_window (str): Time window for considering trips as duplicates (e.g., '3min').

    Returns:
        pd.DataFrame: Rows of the non-duplicate trips, sorted by route, stop and
        datetime, with ``datetime_col`` converted to datetime dtype.
    """
    window = pd.Timedelta(time_window)
    sort_cols = [route_id_col, stop_id_col, datetime_col]

    # Convert on a new frame (assign) so the caller's DataFrame is left untouched.
    df = df.assign(**{datetime_col: pd.to_datetime(df[datetime_col])})
    df = df.sort_values(by=sort_cols, kind="stable")

    # One row per trip: its chronologically first row, i.e. where the trip starts.
    # (Taking the first row of the stop-sorted frame would pick the alphabetically
    # first stop instead of the earliest one.)
    trip_starts = (
        df.sort_values(by=datetime_col, kind="stable")
        .drop_duplicates(subset=trip_id_col, keep="first")
        .sort_values(by=sort_cols, kind="stable")
    )

    # After sorting, trips sharing a route and start stop are adjacent and in
    # time order, so each trip only needs to be compared with the previous row.
    same_route = trip_starts[route_id_col].eq(trip_starts[route_id_col].shift())
    same_stop = trip_starts[stop_id_col].eq(trip_starts[stop_id_col].shift())
    gap = trip_starts[datetime_col].diff()
    # A missing gap (first row, NaT timestamps) compares as False, so such trips are kept.
    is_duplicate = same_route & same_stop & (gap <= window)

    # Filter the full DataFrame to retain every row of the non-duplicate trips.
    unique_trip_ids = trip_starts.loc[~is_duplicate, trip_id_col].unique()
    result = df[df[trip_id_col].isin(unique_trip_ids)]

    return result
|
| 118 |
+
|
| 119 |
def plot_graph(plot_df):
|
| 120 |
#Nu vill vi plotta!
|
| 121 |
categories = {0 : 'Empty',
|
|
|
|
| 196 |
if is_local_data_valid():
|
| 197 |
st.write("Using cached local data.")
|
| 198 |
st.session_state.data = load_local_data("data.csv")
|
| 199 |
+
if "first" not in st.session_state:
|
| 200 |
+
st.session_state.first = True
|
| 201 |
+
st.session_state.data = remove_duplicate_trips(st.session_state.data)
|
| 202 |
else:
|
| 203 |
# Fetch data if local data is invalid
|
| 204 |
if st.session_state.hopsworks_project is None:
|