Spaces:
Runtime error
Runtime error
Create utils2.py
Browse files
utils2.py
ADDED
|
@@ -0,0 +1,83 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import ast
import logging
from collections import Counter

import numpy as np
import pandas as pd
from scipy.spatial import cKDTree
|
| 5 |
+
|
| 6 |
+
|
| 7 |
+
# Point datasets, loaded once at import time. The coordinate columns are
# renamed to the short ``lat`` / ``lon`` names used throughout this module.
_COORD_RENAMES = {"latitude": "lat", "longitude": "lon"}

df_amenities = pd.read_csv("df_indonesia.csv").rename(columns=_COORD_RENAMES)
df_banks = pd.read_csv("df_bank_indonesia.csv").rename(columns=_COORD_RENAMES)

# ``fsq_category_labels`` is parsed from its text form in the CSV into a
# Python object per row (compute_features indexes it and calls ``len`` on it,
# so each cell is presumably a stringified list of label strings -- confirm
# against the CSV).
# SECURITY: this used to call ``eval`` on every cell, which executes arbitrary
# expressions found in the data file. ``ast.literal_eval`` only accepts Python
# literals and raises ``ValueError``/``SyntaxError`` on anything else.
df_amenities["fsq_category_labels"] = df_amenities["fsq_category_labels"].apply(
    ast.literal_eval
)

# KD-trees over the raw (lat, lon) pairs for fast radius queries. Distances
# are plain Euclidean distances in the coordinate units of the CSV columns.
bank_coords = df_banks[['lat', 'lon']].values
tree_banks = cKDTree(bank_coords)

amenity_coords = df_amenities[['lat', 'lon']].values
tree_amenities = cKDTree(amenity_coords)

# Top-level category names for which compute_features always emits a
# ``num_<category>`` count (0 when the category is absent near the point).
DATASET_COLUMNS = [
    'Dining and Drinking', 'Community and Government', 'Retail',
    'Business and Professional Services', 'Landmarks and Outdoors',
    'Arts and Entertainment', 'Health and Medicine',
    'Travel and Transportation', 'Sports and Recreation',
    'Event',
]
|
| 31 |
+
|
| 32 |
+
def compute_features(candidate_point, radius=0.005):
    """Build neighbourhood features for a candidate location.

    Counts the banks and amenities whose coordinates lie within ``radius`` of
    ``candidate_point`` and summarises the nearby amenities by top-level
    category.

    Args:
        candidate_point: ``(lat, lon)`` pair to evaluate.
        radius: Search radius handed to ``cKDTree.query_ball_point``. It is a
            Euclidean distance in the same units as the ``lat``/``lon``
            columns the trees were built from (presumably decimal degrees,
            not metres -- confirm against the source CSVs).

    Returns:
        dict with the keys, in this order:
            ``num_banks_in_radius``: number of banks inside the radius.
            ``total_amenities``: number of amenities inside the radius.
            ``category_diversity``: number of distinct top-level categories
                among those amenities.
            ``num_<category>``: one entry per name in ``DATASET_COLUMNS``,
                the count of nearby amenities in that category (0 if none).
    """
    lat, lon = candidate_point
    query_point = [lat, lon]
    logger = logging.getLogger(__name__)

    # Banks: only the count is reported, so the matching rows are never
    # materialised from df_banks.
    bank_idxs = tree_banks.query_ball_point(query_point, r=radius)
    logger.debug("[BANK] %s", bank_idxs)

    # Amenities inside the same radius.
    amenity_idxs = tree_amenities.query_ball_point(query_point, r=radius)
    amenities = df_amenities.iloc[amenity_idxs]

    # Each amenity contributes the top-level segment (text before the first
    # ">") of its FIRST label only; amenities with no labels are skipped.
    top_level_categories = [
        labels[0].split(">")[0].strip()
        for labels in amenities['fsq_category_labels']
        if len(labels) > 0
    ]
    logger.debug("[CATEGORIES] %s", top_level_categories)
    category_counts = Counter(top_level_categories)

    features = {
        'num_banks_in_radius': len(bank_idxs),
        'total_amenities': len(amenities),
        'category_diversity': len(category_counts),
    }

    # Emit a fixed set of count columns so every call returns the same keys;
    # a Counter yields 0 for categories that were not seen.
    for feat in DATASET_COLUMNS:
        features[f'num_{feat}'] = category_counts[feat]

    return features
|