Spaces:
Build error
Build error
| import os | |
| import pandas as pd | |
| # Load local data | |
def load_local_data(filepath: str = "WheelyFunTimes/test_data.csv") -> pd.DataFrame:
    """Read a CSV file into a DataFrame.

    Args:
        filepath: Path of the CSV file to read. A relative path is resolved
            against the current working directory. Defaults to
            ``"WheelyFunTimes/test_data.csv"``, the path this function
            previously had hard-coded.

    Returns:
        The parsed file contents as a ``pandas.DataFrame``.

    Raises:
        FileNotFoundError: If ``filepath`` does not exist. The error from
            ``pandas.read_csv`` is deliberately left to propagate rather
            than being turned into a ``None`` return.
    """
    return pd.read_csv(filepath)
def remove_near_duplicates(
    data: pd.DataFrame,
    min_gap_minutes: float = 3,
) -> pd.DataFrame:
    """Drop rows that follow a kept row too closely for the same route and stop.

    Rows are grouped by ``(route_id, stop_name)``. Within each group the rows
    are visited in their existing order: the first row is always kept, and a
    later row is kept only when its ``datetime`` is more than
    ``min_gap_minutes`` after the most recently kept row of that group.
    Rows are NOT sorted here, so the input should already be in chronological
    order within each group; a row that goes backwards in time is dropped.

    NOTE(review): the original indentation of this function was lost. This
    implementation assumes the reference row is updated only when a row is
    kept (not on every row) -- confirm against the intended behaviour.

    Args:
        data: Frame with at least the columns ``trip_id``, ``route_id``,
            ``stop_name`` and ``datetime``. Its ``datetime`` column is
            converted with ``pd.to_datetime`` IN PLACE, so the caller's
            frame is modified.
        min_gap_minutes: Minimum spacing, in minutes, that a row must exceed
            relative to the last kept row in its group. Defaults to 3.

    Returns:
        A new DataFrame holding the kept rows, ordered group by group and
        carrying the original index labels. If nothing is kept, an empty
        frame with the same columns as ``data`` is returned.
    """
    # Debug output: number of distinct trips before filtering.
    print(data["trip_id"].nunique())

    # Kept as an in-place assignment so existing callers that rely on the
    # converted column on their own frame keep working.
    data["datetime"] = pd.to_datetime(data["datetime"])
    min_gap = pd.Timedelta(minutes=min_gap_minutes)

    kept_rows = []
    for _, group in data.groupby(["route_id", "stop_name"]):
        last_kept_time = None
        for _, row in group.iterrows():
            if last_kept_time is None or row["datetime"] - last_kept_time > min_gap:
                kept_rows.append(row)
                last_kept_time = row["datetime"]

    if kept_rows:
        filtered_df = pd.DataFrame(kept_rows)
    else:
        # pd.DataFrame([]) would have no columns at all, making the
        # "trip_id" lookup below raise KeyError; keep the input's schema.
        filtered_df = data.iloc[0:0].copy()

    # Debug output: number of distinct trips after filtering.
    print(filtered_df["trip_id"].nunique())
    return filtered_df
# Script body: load the CSV, preview the first 12 rows, filter out the
# near-duplicate rows, then preview the first 12 rows of the result.
df = load_local_data()
print(df.head(n=12))
df = remove_near_duplicates(data=df)
print(df.head(n=12))