Spaces:
Running
Running
| import subprocess | |
| import pandas as pd | |
| import json | |
| taskname = "fathomnet-out-of-sample-detection" | |
| # download_dir = f"benchmarks/{taskname}/env" | |
| download_dir = "benchmarks/" + taskname + "/env" | |
| input(f"Consent to the competition at https://www.kaggle.com/competitions/{taskname}/data; Press any key after you have accepted the rules online.") | |
| subprocess.run(["kaggle", "competitions", "download", "-c", taskname], cwd=download_dir) | |
| subprocess.run(["unzip", "-n", f"{taskname}.zip"], cwd=download_dir) | |
| subprocess.run(["rm", f"{taskname}.zip"], cwd=download_dir) | |
| ### download images | |
| input(f"""Download large amount of images to current directory by doing this: | |
| conda create -n fgvc_test python=3.9 pip | |
| conda activate fgvc_test | |
| pip install -r requirements.txt | |
| python download_images.py ../env/object_detection/train.json --outpath ../env/images | |
| Press any key after done""") | |
| subprocess.run(["rm", "download_images.py"], cwd=download_dir) | |
| subprocess.run(["rm", "demo_download.ipynb"], cwd=download_dir) | |
| subprocess.run(["rm", "requirements.txt"], cwd=download_dir) | |
| # ## split train to train and test in env | |
| trainset = pd.read_csv(f"{download_dir}/multilabel_classification/train.csv") | |
| trainset = trainset.sample(frac=1, random_state=42) | |
| trainset = trainset.reset_index(drop=True) | |
| trainset.iloc[:int(len(trainset)*0.98)].to_csv(f"{download_dir}/multilabel_classification/train.csv", index=False) | |
| testset = trainset.iloc[int(len(trainset)*0.98):] | |
| # split testset to only full_text and labels | |
| testset.to_csv(f"answer.csv", index=False) | |
| # split train json | |
| orig_train_json = json.load(open(f"{download_dir}/object_detection/train.json")) | |
| test_json = json.load(open(f"{download_dir}/object_detection/eval.json")) | |
| # split train_json according to trainset | |
| train_json = orig_train_json.copy() | |
| train_json["images"] = [x for x in orig_train_json["images"] if x["file_name"][:-4] not in testset["id"].values] | |
| images_ids = [x["id"] for x in train_json["images"]] | |
| train_json["annotations"] = [x for x in orig_train_json["annotations"] if x["image_id"] in images_ids] | |
| test_json["images"] = [x for x in orig_train_json["images"] if x["file_name"][:-4] in testset["id"].values] | |
| # relabel ids | |
| for i, x in enumerate(train_json["images"]): | |
| for y in train_json["annotations"]: | |
| if y["image_id"] == x["id"]: | |
| y["image_id"] = i + 1 | |
| x["id"] = i + 1 | |
| for i, x in enumerate(train_json["annotations"]): | |
| x["id"] = i + 1 | |
| for i, x in enumerate(test_json["images"]): | |
| x["id"] = i + 1 | |
| # write train_json and test_json | |
| with open(f"{download_dir}/object_detection/train.json", "w") as f: | |
| json.dump(train_json, f, indent=4) | |
| with open(f"{download_dir}/object_detection/eval.json", "w") as f: | |
| json.dump(test_json, f, indent=4) | |