Spaces:
Sleeping
Sleeping
| import pandas as pd | |
| import numpy as np | |
| import random | |
| from datetime import datetime, timedelta | |
| import os | |
| import time | |
| import requests | |
| from geopy.geocoders import Nominatim | |
| # Set random seed for reproducibility | |
| np.random.seed(42) | |
| def generate_delivery_data(n_deliveries=50, use_geocoding=False): | |
| """ | |
| Generate synthetic delivery data with realistic Singapore addresses | |
| """ | |
| # Define real Singapore neighborhoods and their actual streets | |
| # Format: [neighborhood_name, [list_of_real_streets], postal_code_prefix] | |
| sg_neighborhoods = [ | |
| ['Ang Mo Kio', ['Ang Mo Kio Avenue 1', 'Ang Mo Kio Avenue 3', 'Ang Mo Kio Avenue 4', 'Ang Mo Kio Avenue 10'], '56'], | |
| ['Bedok', ['Bedok North Avenue 1', 'Bedok North Road', 'Bedok Reservoir Road', 'New Upper Changi Road'], '46'], | |
| ['Bishan', ['Bishan Street 11', 'Bishan Street 12', 'Bishan Street 13', 'Bishan Street 22'], '57'], | |
| ['Bukit Merah', ['Jalan Bukit Merah', 'Henderson Road', 'Tiong Bahru Road', 'Redhill Close'], '15'], | |
| ['Bukit Batok', ['Bukit Batok East Avenue 6', 'Bukit Batok West Avenue 8', 'Bukit Batok Street 21'], '65'], | |
| ['Clementi', ['Clementi Avenue 1', 'Clementi Avenue 4', 'Clementi Road', 'Commonwealth Avenue West'], '12'], | |
| ['Geylang', ['Geylang East Avenue 1', 'Geylang Road', 'Guillemard Road', 'Sims Avenue'], '38'], | |
| ['Hougang', ['Hougang Avenue 1', 'Hougang Avenue 7', 'Hougang Street 91', 'Upper Serangoon Road'], '53'], | |
| ['Jurong East', ['Jurong East Street 13', 'Jurong East Avenue 1', 'Jurong Gateway Road'], '60'], | |
| ['Jurong West', ['Jurong West Street 41', 'Jurong West Street 52', 'Jurong West Street 93'], '64'], | |
| ['Kallang', ['Kallang Avenue', 'Geylang Bahru', 'Boon Keng Road', 'Upper Boon Keng Road'], '33'], | |
| ['Punggol', ['Punggol Central', 'Punggol Field', 'Punggol Road', 'Punggol Way'], '82'], | |
| ['Queenstown', ['Commonwealth Avenue', 'Commonwealth Drive', 'Mei Chin Road', 'Stirling Road'], '14'], | |
| ['Sengkang', ['Sengkang East Way', 'Sengkang West Way', 'Compassvale Road', 'Fernvale Road'], '54'], | |
| ['Serangoon', ['Serangoon Avenue 2', 'Serangoon Avenue 3', 'Serangoon North Avenue 1'], '55'], | |
| ['Tampines', ['Tampines Street 11', 'Tampines Street 21', 'Tampines Avenue 1', 'Tampines Avenue 4'], '52'], | |
| ['Toa Payoh', ['Toa Payoh Lorong 1', 'Toa Payoh Lorong 2', 'Toa Payoh Lorong 4', 'Toa Payoh Central'], '31'], | |
| ['Woodlands', ['Woodlands Avenue 1', 'Woodlands Drive 16', 'Woodlands Drive 72', 'Woodlands Circle'], '73'], | |
| ['Yishun', ['Yishun Avenue 1', 'Yishun Avenue 4', 'Yishun Ring Road', 'Yishun Street 22'], '76'] | |
| ] | |
| # Bounding boxes for neighborhoods (for fallback coordinates) | |
| # Format: [name, min_lat, max_lat, min_lon, max_lon] | |
| neighborhood_bounds = { | |
| 'Ang Mo Kio': [1.360000, 1.380000, 103.830000, 103.860000], | |
| 'Bedok': [1.320000, 1.335000, 103.920000, 103.950000], | |
| 'Bishan': [1.345000, 1.360000, 103.830000, 103.855000], | |
| 'Bukit Merah': [1.270000, 1.290000, 103.800000, 103.830000], | |
| 'Bukit Batok': [1.340000, 1.360000, 103.740000, 103.770000], | |
| 'Clementi': [1.310000, 1.325000, 103.750000, 103.780000], | |
| 'Geylang': [1.310000, 1.325000, 103.880000, 103.900000], | |
| 'Hougang': [1.370000, 1.385000, 103.880000, 103.900000], | |
| 'Jurong East': [1.330000, 1.345000, 103.730000, 103.750000], | |
| 'Jurong West': [1.340000, 1.360000, 103.690000, 103.720000], | |
| 'Kallang': [1.300000, 1.320000, 103.850000, 103.880000], | |
| 'Punggol': [1.390000, 1.410000, 103.900000, 103.920000], | |
| 'Queenstown': [1.290000, 1.310000, 103.780000, 103.805000], | |
| 'Sengkang': [1.380000, 1.395000, 103.870000, 103.900000], | |
| 'Serangoon': [1.345000, 1.360000, 103.865000, 103.885000], | |
| 'Tampines': [1.345000, 1.365000, 103.930000, 103.960000], | |
| 'Toa Payoh': [1.326000, 1.341000, 103.840000, 103.865000], | |
| 'Woodlands': [1.430000, 1.450000, 103.770000, 103.800000], | |
| 'Yishun': [1.410000, 1.430000, 103.820000, 103.850000] | |
| } | |
| # Generate delivery IDs | |
| delivery_ids = [f'DEL{str(i).zfill(4)}' for i in range(1, n_deliveries + 1)] | |
| # Generate customer names (fictional) | |
| first_names = ['Tan', 'Lim', 'Lee', 'Ng', 'Wong', 'Chan', 'Goh', 'Ong', 'Teo', 'Koh', | |
| 'Chua', 'Loh', 'Yeo', 'Sim', 'Ho', 'Ang', 'Tay', 'Yap', 'Leong', 'Foo'] | |
| last_names = ['Wei', 'Ming', 'Hui', 'Ling', 'Yong', 'Jun', 'Hong', 'Xin', 'Yi', 'Jie', | |
| 'Cheng', 'Kai', 'Zhi', 'Tian', 'Yu', 'En', 'Yang', 'Hao', 'Chong', 'Zheng'] | |
| customer_names = [f"{random.choice(first_names)} {random.choice(last_names)}" for _ in range(n_deliveries)] | |
| addresses = [] | |
| postal_codes = [] | |
| latitudes = [] | |
| longitudes = [] | |
| neighborhood_names = [] | |
| # Initialize geocoder if using geocoding | |
| if use_geocoding: | |
| geolocator = Nominatim(user_agent="delivery_app") | |
| # Generate realistic addresses | |
| for i in range(n_deliveries): | |
| # Randomly select a neighborhood | |
| neighborhood_data = random.choice(sg_neighborhoods) | |
| neighborhood = neighborhood_data[0] | |
| streets = neighborhood_data[1] | |
| postal_prefix = neighborhood_data[2] | |
| # Randomly select a street in that neighborhood | |
| street = random.choice(streets) | |
| # Generate block number (realistic for HDB) | |
| block = random.randint(100, 600) | |
| # Generate unit number | |
| unit_floor = random.randint(2, 20) | |
| unit_number = random.randint(1, 150) | |
| # Generate postal code (with realistic prefix) | |
| postal_suffix = str(random.randint(0, 999)).zfill(3) | |
| postal_code = postal_prefix + postal_suffix | |
| # Create two formats of address - one for display, one for geocoding | |
| display_address = f"Block {block}, #{unit_floor:02d}-{unit_number:03d}, {street}, Singapore {postal_code}" | |
| geocode_address = f"{block} {street}, Singapore {postal_code}" # Simpler format for geocoding | |
| # Default coordinates from neighborhood bounding box (fallback) | |
| bounds = neighborhood_bounds[neighborhood] | |
| default_lat = round(random.uniform(bounds[0], bounds[1]), 6) | |
| default_lon = round(random.uniform(bounds[2], bounds[3]), 6) | |
| # Use geocoding API if requested | |
| if use_geocoding: | |
| try: | |
| location = geolocator.geocode(geocode_address) | |
| if location: | |
| lat = location.latitude | |
| lon = location.longitude | |
| print(f"✓ Successfully geocoded: {geocode_address} → ({lat}, {lon})") | |
| else: | |
| # First fallback: try with just street and postal code | |
| simpler_address = f"{street}, Singapore {postal_code}" | |
| location = geolocator.geocode(simpler_address) | |
| if location: | |
| lat = location.latitude | |
| lon = location.longitude | |
| print(f"✓ Fallback geocoded: {simpler_address} → ({lat}, {lon})") | |
| else: | |
| # Second fallback: just use the neighborhood center | |
| lat = default_lat | |
| lon = default_lon | |
| print(f"✗ Could not geocode: {geocode_address}, using neighborhood coordinates") | |
| # Add delay to avoid being rate limited | |
| time.sleep(1) | |
| except Exception as e: | |
| print(f"✗ Geocoding error for {geocode_address}: {str(e)}") | |
| lat = default_lat | |
| lon = default_lon | |
| else: | |
| # Without geocoding, use the default coordinates | |
| lat = default_lat | |
| lon = default_lon | |
| addresses.append(display_address) | |
| postal_codes.append(postal_code) | |
| latitudes.append(lat) | |
| longitudes.append(lon) | |
| neighborhood_names.append(neighborhood) | |
| # Generate delivery dates (within the next 7 days) | |
| base_date = datetime.now().date() | |
| delivery_dates = [base_date + timedelta(days=random.randint(1, 7)) for _ in range(n_deliveries)] | |
| # Generate time windows (between 9 AM and 5 PM) | |
| time_windows = [] | |
| for _ in range(n_deliveries): | |
| start_hour = random.randint(9, 16) | |
| window_length = random.choice([1, 2, 3]) # 1, 2, or 3 hour windows | |
| end_hour = min(start_hour + window_length, 18) | |
| start_time = f"{start_hour:02d}:00" | |
| end_time = f"{end_hour:02d}:00" | |
| time_windows.append(f"{start_time}-{end_time}") | |
| # Generate package details | |
| weights = np.random.uniform(0.5, 20.0, n_deliveries) # in kg | |
| volumes = np.random.uniform(0.01, 0.5, n_deliveries) # in m³ | |
| # Priority levels | |
| priorities = np.random.choice(['High', 'Medium', 'Low'], n_deliveries, | |
| p=[0.2, 0.5, 0.3]) # 20% High, 50% Medium, 30% Low | |
| # Required vehicle type | |
| vehicle_types = np.random.choice(['Standard', 'Large', 'Refrigerated'], n_deliveries, | |
| p=[0.7, 0.2, 0.1]) | |
| # Status | |
| statuses = np.random.choice(['Pending', 'Assigned', 'In Transit', 'Delivered'], n_deliveries, | |
| p=[0.6, 0.2, 0.15, 0.05]) | |
| # Additional notes | |
| notes = [] | |
| special_instructions = [ | |
| 'Call customer before delivery', | |
| 'Fragile items', | |
| 'Leave at door', | |
| 'Signature required', | |
| 'No delivery on weekends', | |
| None | |
| ] | |
| for _ in range(n_deliveries): | |
| if random.random() < 0.7: # 70% chance of having a note | |
| notes.append(random.choice(special_instructions)) | |
| else: | |
| notes.append(None) | |
| # Create DataFrame | |
| df = pd.DataFrame({ | |
| 'delivery_id': delivery_ids, | |
| 'customer_name': customer_names, | |
| 'address': addresses, | |
| 'postal_code': postal_codes, | |
| 'neighborhood': neighborhood_names, | |
| 'latitude': latitudes, | |
| 'longitude': longitudes, | |
| 'delivery_date': delivery_dates, | |
| 'time_window': time_windows, | |
| 'weight_kg': weights.round(2), | |
| 'volume_m3': volumes.round(3), | |
| 'priority': priorities, | |
| 'vehicle_type': vehicle_types, | |
| 'status': statuses, | |
| 'special_instructions': notes | |
| }) | |
| # Ensure the directory exists | |
| data_dir = os.path.join(os.path.dirname(os.path.dirname(os.path.dirname(__file__))), 'data', 'delivery-data') | |
| os.makedirs(data_dir, exist_ok=True) | |
| # Save to CSV | |
| output_path = os.path.join(data_dir, 'delivery_data.csv') | |
| df.to_csv(output_path, index=False) | |
| print(f"Delivery data generated and saved to {output_path}") | |
| return df | |
| if __name__ == "__main__": | |
| # Set to True if you want to use real geocoding (slower but more accurate) | |
| USE_GEOCODING = True | |
| delivery_data = generate_delivery_data(50, use_geocoding=USE_GEOCODING) | |
| print("Sample of delivery data:") | |
| print(delivery_data.head()) | |