| | import hydra |
| | import torch |
| | import numpy as np |
| | import pandas as pd |
| | import statistics |
| | from os.path import join, dirname |
| | import matplotlib.pyplot as plt |
| |
|
| |
|
class QuadTree(object):
    """Adaptive quadtree over a table of geotagged rows.

    Every node keeps its rows, the tight latitude/longitude bounding box of
    those rows and up to four children. A node is split at the median
    latitude of its rows; each latitude half is then split at its own median
    longitude, so the cells follow the density of the data.

    Args:
        data: DataFrame with ``latitude`` and ``longitude`` columns. It must
            contain at least one row (the bounding box of an empty table is
            undefined and numpy raises ``ValueError``).
        id: String identifier of this node. Children get the parent's id
            with "0", "1", "2" or "3" appended.
        depth: Number of levels this node is still allowed to split.
        do_split: Minimum number of rows a node needs in order to be split.
    """

    def __init__(self, data, id="", depth=3, do_split=5000):
        self.id = id
        self.data = data

        coord = data[["latitude", "longitude"]].to_numpy()

        # Tight bounding box of the rows held by this node.
        self.mins = np.asarray(coord.min(0))
        self.maxs = np.asarray(coord.max(0))
        self.sizes = self.maxs - self.mins

        self.children = []

        # Leaves need no medians at all. Computing them eagerly was both
        # slow (pure-Python sort of every row) and crash-prone: a node whose
        # rows all share the median latitude has an empty "upper" half.
        if depth <= 0 or len(data) < do_split:
            return

        lat, lon = coord[:, 0], coord[:, 1]
        median_lat = np.median(lat)

        # Longitude medians are taken over "<= median" / "> median" latitude
        # halves, which keeps the split points identical to the original
        # implementation whenever both halves are non-empty.
        lower_half = lat <= median_lat
        upper_half = lat > median_lat
        if not upper_half.any():
            # No row lies strictly above the median latitude: fall back to
            # the rows that the ">=" quadrants below will actually receive.
            upper_half = lat >= median_lat
        median_lon_left = np.median(lon[lower_half])
        median_lon_right = np.median(lon[upper_half])

        lat_col, lon_col = data["latitude"], data["longitude"]
        # Quadrant order defines the id suffix: 0, 1, 2, 3.
        quadrants = (
            data[(lat_col < median_lat) & (lon_col < median_lon_left)],
            data[(lat_col < median_lat) & (lon_col >= median_lon_left)],
            data[(lat_col >= median_lat) & (lon_col < median_lon_right)],
            data[(lat_col >= median_lat) & (lon_col >= median_lon_right)],
        )

        # Empty quadrants are skipped, so a node may have fewer than four
        # children.
        for suffix, subset in enumerate(quadrants):
            if subset.shape[0] > 0:
                self.children.append(
                    QuadTree(
                        subset,
                        id + str(suffix),
                        depth - 1,
                        do_split=do_split,
                    )
                )

    def unwrap(self):
        """Collect the leaves below this node.

        Returns:
            Dict mapping each leaf id to ``[mins, maxs, data_copy]`` where
            ``data_copy`` is a copy of the leaf's rows.
        """
        if not self.children:
            return {self.id: [self.mins, self.maxs, self.data.copy()]}

        leaves = {}
        for child in self.children:
            leaves.update(child.unwrap())
        return leaves
| |
|
| |
|
def extract(qt, name_new_column):
    """Flatten a quadtree into per-cell boundaries and one labelled table.

    Leaves are numbered 0, 1, 2, ... in the order ``qt.unwrap()`` yields
    them; that number is the cluster index.

    Args:
        qt: Object whose ``unwrap()`` returns a dict mapping leaf id to
            ``[(min_lat, min_lon), (max_lat, max_lon), points]``, where
            ``points`` is a DataFrame with ``latitude`` and ``longitude``
            columns. The new column is written into ``points`` in place.
        name_new_column: Name of the column that receives the cluster index.

    Returns:
        Tuple ``(boundaries, data)``. ``boundaries`` maps each cluster index
        (plain ``int``) to a tuple of plain floats
        ``(min_lat, min_lon, max_lat, max_lon, mean_lat, mean_lon)``;
        ``data`` is the concatenation of every leaf's rows, leaf by leaf.
    """
    boundaries, frames = {}, []
    for index, (mins, maxs, points) in enumerate(qt.unwrap().values()):
        (min_lat, min_lon), (max_lat, max_lon) = mins, maxs
        points[name_new_column] = index
        frames.append(points)
        # Cast everything to float so the tuple holds no numpy scalars.
        boundaries[index] = (
            float(min_lat),
            float(min_lon),
            float(max_lat),
            float(max_lon),
            float(points["latitude"].mean()),
            float(points["longitude"].mean()),
        )

    return boundaries, pd.concat(frames)
| |
|
| |
|
def vizu(name_new_column, df_train, boundaries, do_split):
    """Save two diagnostic figures for a quadtree clustering.

    Writes ``{name_new_column}_distrib.png`` (log-scale histogram of the
    cluster column) and ``{name_new_column}_map.png`` (longitude/latitude
    scatter coloured per cluster) into the current working directory.

    Args:
        name_new_column: Column of ``df_train`` holding the cluster index;
            also used as the prefix of both file names.
        df_train: DataFrame with ``latitude``, ``longitude`` and the cluster
            column.
        boundaries: Sized container with one entry per cluster.
        do_split: Upper limit of the histogram's y axis.
    """
    n_cells = len(boundaries)

    # Figure 1: how many rows fall into each cluster.
    plt.hist(df_train[name_new_column], bins=n_cells)
    plt.xlabel("Cluster ID")
    plt.ylabel("Number of images")
    plt.title("Cluster distribution")
    plt.yscale("log")
    plt.ylim(10, do_split)
    plt.savefig(f"{name_new_column}_distrib.png")
    plt.clf()

    # Figure 2: map of the rows. Cluster indices are shuffled before being
    # used as colour values so that consecutive indices do not end up with
    # neighbouring colours of the colormap.
    xs = df_train["longitude"].to_numpy()
    ys = df_train["latitude"].to_numpy()
    shuffled = np.random.permutation(n_cells)
    colour_values = shuffled[df_train[name_new_column].to_numpy()]
    plt.scatter(xs, ys, c=colour_values, cmap="tab20", s=0.1, alpha=0.5)
    plt.xlabel("Longitude")
    plt.ylabel("Latitude")
    plt.title("Quadtree map")
    plt.savefig(f"{name_new_column}_map.png")
| |
|
| |
|
def _assign_clusters(lat, lon, boundaries, chunk=4096):
    """Return the cluster index of every (lat, lon) point.

    A point gets the first cell whose bounding box contains it (bounds are
    inclusive). Points contained in no box are given the cell whose mean
    coordinate is nearest, using plain squared distance in degrees (no
    longitude wrap-around); ``chunk`` bounds the memory of that fallback.

    ``boundaries`` is a DataFrame with ``min_lat``, ``max_lat``, ``min_lon``,
    ``max_lon``, ``mean_lat`` and ``mean_lon`` columns whose row position is
    the cluster index.
    """
    lat_col = lat[:, None]
    lon_col = lon[:, None]

    inside = lat_col >= boundaries["min_lat"].to_numpy()[None, :]
    inside &= lat_col <= boundaries["max_lat"].to_numpy()[None, :]
    inside &= lon_col >= boundaries["min_lon"].to_numpy()[None, :]
    inside &= lon_col <= boundaries["max_lon"].to_numpy()[None, :]

    labels = np.argmax(inside, axis=1)

    # argmax of an all-False row is 0, which would silently dump every
    # unmatched point into cluster 0. Re-label those rows explicitly.
    unmatched = np.flatnonzero(~inside.any(axis=1))
    mean_lat = boundaries["mean_lat"].to_numpy()[None, :]
    mean_lon = boundaries["mean_lon"].to_numpy()[None, :]
    for start in range(0, unmatched.size, chunk):
        rows = unmatched[start : start + chunk]
        dist2 = (lat_col[rows] - mean_lat) ** 2 + (lon_col[rows] - mean_lon) ** 2
        labels[rows] = np.argmin(dist2, axis=1)

    return labels


@hydra.main(
    config_path="../configs/scripts",
    config_name="enrich-metadata-quadtree",
    version_base=None,
)
def main(cfg):
    """Cluster the training set with an adaptive quadtree and label both splits.

    Reads ``train.csv`` and ``test.csv`` from ``<cfg.data_dir>/osv5m``, then:

    * builds a ``QuadTree`` on the training rows (``cfg.depth``,
      ``cfg.do_split``) and adds the cluster column
      ``adaptive_quadtree_<depth>_<do_split>`` to them;
    * saves the two diagnostic figures and a ``<column>.csv`` table of cell
      boundaries in the current working directory;
    * labels every test row with the cell containing it, or the nearest
      cell centre when no cell contains it;
    * saves the cell centres, scaled as ``(lat / 90, lon / 180)``, to
      ``index_to_gps_<column>.pt`` in the data directory;
    * when ``cfg.overwrite_csv`` is set, rewrites both CSV files with the
      new column.
    """
    data_path = join(cfg.data_dir, "osv5m")
    name_new_column = f"adaptive_quadtree_{cfg.depth}_{cfg.do_split}"

    # Build the quadtree on the training split.
    train_fp = join(data_path, "train.csv")
    df_train = pd.read_csv(train_fp)

    qt = QuadTree(df_train, depth=cfg.depth, do_split=cfg.do_split)
    boundaries, df_train = extract(qt, name_new_column)

    vizu(name_new_column, df_train, boundaries, cfg.do_split)

    # One row per cell; the row index is the cluster id.
    boundaries = pd.DataFrame.from_dict(
        boundaries,
        orient="index",
        columns=["min_lat", "min_lon", "max_lat", "max_lon", "mean_lat", "mean_lon"],
    )
    boundaries.to_csv(f"{name_new_column}.csv", index_label="cluster_id")

    # Label the test split with the cells learnt on the training split.
    test_fp = join(data_path, "test.csv")
    df_test = pd.read_csv(test_fp)
    df_test[name_new_column] = _assign_clusters(
        df_test["latitude"].to_numpy(),
        df_test["longitude"].to_numpy(),
        boundaries,
    )

    # Cluster index -> normalised GPS lookup table. Hand torch numpy arrays
    # rather than pandas Series so the conversion does not depend on the
    # Series index.
    lat = torch.tensor(boundaries["mean_lat"].to_numpy())
    lon = torch.tensor(boundaries["mean_lon"].to_numpy())
    coord = torch.stack([lat / 90, lon / 180], dim=-1)
    torch.save(coord, join(data_path, f"index_to_gps_{name_new_column}.pt"))

    # Optionally persist the new column into the original CSV files.
    if cfg.overwrite_csv:
        df_train.to_csv(train_fp, index=False)
        df_test.to_csv(test_fp, index=False)
| |
|
| |
|
# Run the hydra-decorated entry point only when this file is executed as a
# script, not when it is imported.
if __name__ == "__main__":
    main()
| |
|