jonathanjordan21 committed on
Commit
4662da0
·
verified ·
1 Parent(s): ef3f9b1

Create utils2.py

Browse files
Files changed (1) hide show
  1. utils2.py +83 -0
utils2.py ADDED
@@ -0,0 +1,83 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from collections import Counter
2
+ import pandas as pd
3
+ import numpy as np
4
+ from scipy.spatial import cKDTree
5
+
6
+
7
# Both CSVs use long coordinate column names; shorten them the same way.
_COORD_COLUMNS = {"latitude": "lat", "longitude": "lon"}

# Load the amenity and bank point datasets (paths are relative to the CWD).
df_amenities = pd.read_csv("df_indonesia.csv")
df_amenities = df_amenities.rename(columns=_COORD_COLUMNS)
df_banks = pd.read_csv("df_bank_indonesia.csv")
df_banks = df_banks.rename(columns=_COORD_COLUMNS)

# Turn each stored label cell back into a Python object (presumably a list of
# "Top > Sub > ..." strings -- confirm against the CSV).
# NOTE(review): eval() executes arbitrary expressions found in the CSV. If the
# cells are always plain literals, ast.literal_eval would be the safer parser.
df_amenities["fsq_category_labels"] = df_amenities["fsq_category_labels"].apply(
    lambda raw_labels: eval(raw_labels)
)

# Spatial indexes over (lat, lon) pairs for radius queries.
bank_coords = df_banks[["lat", "lon"]].values
tree_banks = cKDTree(bank_coords)

amenity_coords = df_amenities[["lat", "lon"]].values
tree_amenities = cKDTree(amenity_coords)
23
+
24
# Top-level category names that each get their own ``num_<name>`` feature.
DATASET_COLUMNS = [
    "Dining and Drinking",
    "Community and Government",
    "Retail",
    "Business and Professional Services",
    "Landmarks and Outdoors",
    "Arts and Entertainment",
    "Health and Medicine",
    "Travel and Transportation",
    "Sports and Recreation",
    "Event",
]
31
+
32
def compute_features(candidate_point, radius=0.005):
    """Build neighbourhood count features for one candidate location.

    Every bank and amenity within ``radius`` of the point is looked up through
    the module-level KD-trees (``tree_banks`` / ``tree_amenities``) and
    summarised as counts.

    Args:
        candidate_point: ``(lat, lon)`` pair, expressed in the same units as
            the ``lat``/``lon`` columns the trees were built from.
        radius: Search radius handed to ``query_ball_point``. It is a
            straight-line distance in coordinate units (presumably degrees,
            not metres -- confirm with the data).

    Returns:
        dict containing ``num_banks_in_radius``, ``total_amenities``,
        ``category_diversity`` and one ``num_<category>`` count for each entry
        of ``DATASET_COLUMNS`` (0 when that category is absent).
    """
    lat, lon = candidate_point
    query_point = [lat, lon]

    # Banks: only the neighbour count is a feature. Mean/min bank distances
    # were previously computed here but never returned, so that dead work
    # has been dropped.
    bank_idxs = tree_banks.query_ball_point(query_point, r=radius)

    print("[BANK]", bank_idxs)

    # Amenities
    amenity_idxs = tree_amenities.query_ball_point(query_point, r=radius)
    amenities = df_amenities.iloc[amenity_idxs]

    # Reduce each amenity to the top-level part of its first label
    # ("A > B > C" -> "A"); amenities with no labels are skipped.
    top_level_categories = [
        labels[0].split(">")[0].strip()
        for labels in amenities['fsq_category_labels']
        if len(labels) > 0
    ]

    features = {
        'num_banks_in_radius': len(bank_idxs),
        'total_amenities': len(amenities),
        'category_diversity': len(set(top_level_categories)),
    }

    # Count occurrences per category, always emitting every known column so
    # the feature vector has a fixed set of keys.
    print("[CATEGORIES]", top_level_categories)
    count_per_category = Counter(top_level_categories)
    for feat in DATASET_COLUMNS:
        print("[FEAT]", feat)
        features[f'num_{feat}'] = count_per_category.get(feat, 0)

    return features