import dask
import dask.dataframe as dd
from dask.diagnostics import ProgressBar

# Build geotagged train/test CSV splits from the YFCC100M metadata dump:
#   1. read the tab-separated metadata and keep only the columns needed later,
#   2. keep rows that have both coordinates and media_type == 0,
#   3. attach each row's hash from the companion hash file,
#   4. route rows whose photo_id is listed in yfcc_4k_ids.txt to the test
#      split and everything else to the train split,
#   5. write the test split as one file and the train split as many parts.
with ProgressBar():
    ddf = dd.read_csv(
        "../datasets/YFCC100M/yfcc100m_dataset",
        # The file is read without a header row; names are assigned here.
        names=[
            "photo_id",
            "user_nsid",
            "user_nickname",
            "date_taken",
            "date_uploaded",
            "capture_device",
            "title",
            "description",
            "user_tags",
            "machine_tags",
            "longitude",
            "latitude",
            "accuracy",
            "page_url",
            "download_url",
            "license_name",
            "license_url",
            "server_id",
            "farm_id",
            "secret",
            "secret_original",
            "extension",
            "media_type",
        ],
        # Identifier-like columns are forced to str; numeric columns are
        # read as float so that missing values can be represented as NaN.
        dtype={
            "photo_id": str,
            "user_nsid": str,
            "user_nickname": str,
            "user_tags": str,
            "machine_tags": str,
            "longitude": float,
            "latitude": float,
            "accuracy": float,
            "server_id": str,
            "farm_id": str,
            "secret": str,
            "secret_original": str,
            "extension": str,
            "media_type": float,
        },
        sep="\t",
    )
    ddf = ddf[
        [
            "photo_id",
            "longitude",
            "latitude",
            "accuracy",
            "extension",
            "download_url",
            "media_type",
        ]
    ]
    # Keep only rows with both coordinates present and media_type == 0
    # (presumably 0 marks photos as opposed to videos -- TODO confirm).
    filtered_ddf = ddf[
        ddf["longitude"].notnull()
        & ddf["latitude"].notnull()
        & (ddf["media_type"] == 0)
    ]
    # media_type was only needed for the filter above. Drop it from the
    # filtered frame itself: the previous `del ddf["media_type"]` mutated the
    # parent frame after filtered_ddf had already been derived from it, so
    # the column still ended up in both output files.
    filtered_ddf = filtered_ddf.drop(columns=["media_type"])
    hash_ddf = dd.read_csv(
        "../datasets/YFCC100M/yfcc100m_hash",
        names=["photo_id", "hash"],
        dtype={"photo_id": str, "hash": str},
        sep="\t",
    )
    # Left join: rows without a matching hash are kept, with a missing hash.
    filtered_ddf = filtered_ddf.merge(hash_ddf, on="photo_id", how="left")

    # One photo id per line; a set removes duplicates before the isin() test.
    with open(
        "../datasets/YFCC100M/yfcc_4k_ids.txt", "r", encoding="utf-8"
    ) as f:
        test_photo_ids = set(f.read().splitlines())

    # Boolean mask selecting the held-out test photos (named `is_test`
    # rather than `filter` so the builtin is not shadowed).
    is_test = filtered_ddf["photo_id"].isin(test_photo_ids)
    test_ddf = filtered_ddf[is_test]
    train_ddf = filtered_ddf[~is_test]

    # Only the train split is restricted by accuracy; the test split keeps
    # every listed photo. (Threshold meaning not shown here -- TODO confirm.)
    train_ddf = train_ddf[train_ddf["accuracy"] >= 12]

    test_ddf.to_csv(
        "../datasets/YFCC100M/yfcc_4k_dataset_with_gps.csv",
        sep="\t",
        index=False,
        single_file=True,
    )
    # Target roughly 100000 rows per output part. NOTE: len() has to evaluate
    # the whole pipeline once just to count rows, before to_csv runs it again.
    train_ddf = train_ddf.repartition(npartitions=len(train_ddf) // 100000 + 1)
    train_ddf.to_csv(
        "../datasets/YFCC100M/yfcc100m_dataset_with_gps_train/*.csv",
        sep="\t",
        index=False,
        single_file=False,
    )