# GraPHFormer / scripts / prepare_data.py
# uzshah's picture
# Initial commit: GraPHFormer codebase
# cf84204
"""
Please refer to data/README.md for data download and preparation.
Then run this script to preprocess data.
"""
import os, sys
from data_io.process_raw_script import (
convert_janelia_json,
normalize_root_and_check,
filter_axon_and_check,
summarize_branch,
split_sample_10fold_cv_and_merge,
)
import pandas as pd
# Root directory that every input and output path below is derived from.
data_path = "./data"

# Convert the Janelia MouseLight (JML) JSON files. Arguments, in order: the
# glob of input JSON files, the SWC folder, and the JML info CSV path.
convert_janelia_json(
    os.path.join(data_path, "raw/janelia_mouselight/json30/*.json"),
    os.path.join(data_path, "raw/janelia_mouselight/swc"),
    os.path.join(data_path, "info/JML_info_swc.csv"),
)

# Run the same three-stage pipeline on every dataset:
#   raw/<source>/swc/  ->  raw/<source>/swc_soma0/
#                      ->  dendrite/<source>/swc_soma0/
#                      ->  dendrite/<source>/eswc_soma0/
for source in ("janelia_mouselight", "allen_cell_type", "bil"):
    print(f"Processing:{source}")

    raw_swc_dir = f"{data_path}/raw/{source}/swc/"
    raw_soma0_dir = f"{data_path}/raw/{source}/swc_soma0/"
    dendrite_soma0_dir = f"{data_path}/dendrite/{source}/swc_soma0/"
    dendrite_eswc_dir = f"{data_path}/dendrite/{source}/eswc_soma0/"

    # Stage 1: normalize the neuron's center, orientation and size.
    print("Normalize neuron")
    if source != "bil":
        normalize_root_and_check(raw_swc_dir + "*.swc", raw_soma0_dir)
    else:
        normalize_root_and_check(raw_swc_dir + "*reg.swc", raw_soma0_dir)
        # Some BIL reconstructions are not correctly scaled; this second
        # pass over the "*__reg.swc" files (also matched by the pattern
        # above) fixes them by supplying an explicit scale.
        normalize_root_and_check(
            raw_swc_dir + "*__reg.swc",
            raw_soma0_dir,
            scale=[0.114, 0.114, 0.28],
        )

    # Stage 2: remove axons, writing the result under the dendrite tree.
    print("Remove axons")
    filter_axon_and_check(raw_soma0_dir + "*.swc", dendrite_soma0_dir)

    # Stage 3: compute branch features from the axon-filtered files.
    print("Calculate features")
    summarize_branch(dendrite_soma0_dir + "*.swc", dendrite_eswc_dir)

# Split data into 10 folds once every source has been processed.
split_sample_10fold_cv_and_merge(data_path)
# Build a symlink tree that groups the feature files by class and fold:
#   <data_path>/dendrite/all_eswc_soma0_ssl/<acronym>/<dataset>-<fold>/<fname>
# Each link points at the absolute path of
#   <data_path>/dendrite/<dataset>/eswc_soma0/<fname>
folder_names = ["allen_cell_type", "bil", "janelia_mouselight"]

# Classes that get a folder. Only the keys are consulted in this block; any
# sample whose (normalized) acronym is not listed here is skipped.
all_wo_others = {
    "VPM": 0,
    "Isocortex_layer23": 1,
    "Isocortex_layer4": 2,
    "PRE": 3,
    "SUB": 4,
    "CP": 5,
    "VPL": 6,
    "Isocortex_layer6": 7,
    "MG": 8,
    "Isocortex_layer5": 9,
}

# Fold-assignment CSVs, listed in the same order as `folder_names`.
split_csvs = [
    f"{data_path}/info/ACT_info_swc_10folds.csv",
    f"{data_path}/info/BIL_info_swc_10folds.csv",
    f"{data_path}/info/JML_info_swc_10folds.csv",
]

for folder_name, split_csv in zip(folder_names, split_csvs):
    fold_table = pd.read_csv(split_csv)

    # Map each file name to the acronym of its first row in the table, so the
    # inner loop does a dict lookup instead of re-scanning the whole frame
    # for every file.
    acronym_by_fname = {}
    for name, label in zip(
        fold_table["swc__fname"], fold_table["structure_merge__acronym"]
    ):
        acronym_by_fname.setdefault(name, label)

    for split in range(10):
        in_fold = fold_table["model__fold"] == split
        for fname in fold_table.loc[in_fold, "swc__fname"]:
            acronym = acronym_by_fname[fname]
            # "2/3" contains a path separator, so it is stored as "23".
            if acronym == "Isocortex_layer2/3":
                acronym = "Isocortex_layer23"
            if acronym not in all_wo_others:
                continue

            target_dir = (
                f"{data_path}/dendrite/all_eswc_soma0_ssl/"
                f"{acronym}/{folder_name}-{split}/"
            )
            os.makedirs(target_dir, exist_ok=True)

            source_path = os.path.abspath(
                f"{data_path}/dendrite/{folder_name}/eswc_soma0/{fname}"
            )
            target_path = f"{target_dir}{fname}"

            # Check if the source path is a valid file
            if not os.path.isfile(source_path):
                print(f"{source_path} is not a valid file!")
                continue

            # Re-running the script must not crash with FileExistsError:
            # replace a link left by a previous run. Only symlinks are
            # removed; a real file at this path still raises as before.
            if os.path.islink(target_path):
                os.unlink(target_path)
            os.symlink(source_path, target_path)
# # hack folder creation
# folder_names = ["allen_cell_type"]
# all_wo_others = {
# "Isocortex_layer23": 0,
# "Isocortex_layer4": 1,
# "Isocortex_layer5": 2,
# "Isocortex_layer6": 3,
# }
# for i, split_csv in enumerate(
# [
# f"{data_path}/info/ACT_info_swc_10folds.csv",
# # f"{data_path}/info/BIL_info_swc_10folds.csv",
# # f"{data_path}/info/JML_info_swc_10folds.csv",
# ]
# ):
# csv = pd.read_csv(split_csv)
# folder_name = folder_names[i]
# for split in range(10):
# for fname in csv[csv["model__fold"] == split]["swc__fname"]:
# # get acronym from "structure_merge__acronym"
# acronym = csv[csv["swc__fname"] == fname][
# "structure_merge__acronym"
# ].values[0]
# if acronym not in all_wo_others:
# if acronym == "Isocortex_layer2/3":
# acronym = "Isocortex_layer23"
# else:
# continue
# os.makedirs(
# f"{data_path}/dendrite/ACT/{acronym}/{folder_name}-{split}/",
# exist_ok=True,
# )
# source_path = os.path.abspath(
# f"{data_path}/dendrite/{folder_name}/eswc_soma0/{fname}"
# )
# target_path = f"{data_path}/dendrite/ACT/{acronym}/{folder_name}-{split}/{fname}"
# # Check if the source path is a valid file
# if os.path.isfile(source_path):
# os.symlink(source_path, target_path)
# else:
# print(f"{source_path} is not a valid file!")
# # hack folder creation
# folder_names = ["janelia_mouselight"]
# all_wo_others = {
# "Isocortex_layer23": 0,
# "Isocortex_layer5": 1,
# "Isocortex_layer6": 2,
# "VPM": 3,
# }
# for i, split_csv in enumerate(
# [
# # f"{data_path}/info/ACT_info_swc_10folds.csv",
# # f"{data_path}/info/BIL_info_swc_10folds.csv",
# f"{data_path}/info/JML_info_swc_10folds.csv",
# ]
# ):
# csv = pd.read_csv(split_csv)
# folder_name = folder_names[i]
# for split in range(10):
# for fname in csv[csv["model__fold"] == split]["swc__fname"]:
# # get acronym from "structure_merge__acronym"
# acronym = csv[csv["swc__fname"] == fname][
# "structure_merge__acronym"
# ].values[0]
# if acronym not in all_wo_others:
# if acronym == "Isocortex_layer2/3":
# acronym = "Isocortex_layer23"
# else:
# continue
# os.makedirs(
# f"{data_path}/dendrite/JML/{acronym}/{folder_name}-{split}/",
# exist_ok=True,
# )
# source_path = os.path.abspath(
# f"{data_path}/dendrite/{folder_name}/eswc_soma0/{fname}"
# )
# target_path = f"{data_path}/dendrite/JML/{acronym}/{folder_name}-{split}/{fname}"
# # Check if the source path is a valid file
# if os.path.isfile(source_path):
# os.symlink(source_path, target_path)
# else:
# print(f"{source_path} is not a valid file!")
# # hack folder creation
# folder_names = ["bil"]
# all_wo_others = {
# "CP": 0,
# "Isocortex_layer23": 1,
# "Isocortex_layer4": 2,
# "Isocortex_layer5": 3,
# "Isocortex_layer6": 4,
# "VPM": 5,
# }
# for i, split_csv in enumerate(
# [
# # f"{data_path}/info/ACT_info_swc_10folds.csv",
# f"{data_path}/info/BIL_info_swc_10folds.csv",
# # f"{data_path}/info/JML_info_swc_10folds.csv",
# ]
# ):
# csv = pd.read_csv(split_csv)
# folder_name = folder_names[i]
# for split in range(10):
# for fname in csv[csv["model__fold"] == split]["swc__fname"]:
# # get acronym from "structure_merge__acronym"
# acronym = csv[csv["swc__fname"] == fname][
# "structure_merge__acronym"
# ].values[0]
# if acronym not in all_wo_others:
# if acronym == "Isocortex_layer2/3":
# acronym = "Isocortex_layer23"
# else:
# continue
# os.makedirs(
# f"{data_path}/dendrite/BIL/{acronym}/{folder_name}-{split}/",
# exist_ok=True,
# )
# source_path = os.path.abspath(
# f"{data_path}/dendrite/{folder_name}/eswc_soma0/{fname}"
# )
# target_path = f"{data_path}/dendrite/BIL/{acronym}/{folder_name}-{split}/{fname}"
# # Check if the source path is a valid file
# if os.path.isfile(source_path):
# os.symlink(source_path, target_path)
# else:
# print(f"{source_path} is not a valid file!")