|
|
import pandas as pd |
|
|
import datasets |
|
|
import re |
|
|
import os |
|
|
import shutil |
|
|
|
|
|
# Build a cleaned copy of the PVQA dataset laid out for the Hugging Face
# "imagefolder" loader, then push it to the Hub.
#
# For every split this script:
#   1. reads "<split>.csv" from the current working directory,
#   2. normalizes the "question" and "answer" text,
#   3. copies each referenced image to "our_clean/<split>/img_<n>.jpg",
#   4. writes "our_clean/<split>/metadata.csv" whose first column,
#      "file_name", points at the copied image.

splits = ["test", "train", "val"]

# Compiled once: the pattern is applied to every question and answer cell.
_MULTI_SPACE = re.compile(r" +")


def clean_text(value):
    """Return *value* as normalized question/answer text.

    The value is converted with ``str()``, lowercased, runs of spaces are
    collapsed to a single space, the space before a question mark is removed
    (``"what ?"`` -> ``"what?"``) and surrounding whitespace is stripped.
    """
    return _MULTI_SPACE.sub(" ", str(value).lower()).replace(" ?", "?").strip()


for item in splits:
    os.makedirs(f"our_clean/{item}/", exist_ok=True)

    data = pd.read_csv(f"{item}.csv")

    data["question"] = data["question"].apply(clean_text)
    data["answer"] = data["answer"].apply(clean_text)

    # Number the images by row position and create the column in one step.
    # The previous per-row `data["file_name"].iloc[i] = ...` was a chained
    # assignment: it warns on older pandas and, with Copy-on-Write enabled
    # (the default from pandas 3.0), never updates `data`, which would leave
    # the "file_name" column empty in metadata.csv.
    file_names = [f"img_{position}.jpg" for position in range(len(data))]
    data.insert(0, "file_name", file_names)

    # Copy every source image under its new sequential name.
    for file_name, image_id in zip(file_names, data["image"]):
        shutil.copyfile(
            src=f"author-folder/pvqa/pvqa/images/{item}/{image_id}.jpg",
            dst=f"our_clean/{item}/{file_name}",
        )

    # These columns are not exported. "image_path" used to be rewritten to
    # its basename before being dropped; that dead step has been removed.
    data.drop(["image", "pathology", "image_path"], axis=1, inplace=True)
    data.to_csv(f"our_clean/{item}/metadata.csv", index=False)

# Load all prepared split folders as one dataset and publish it.
dataset = datasets.load_dataset("imagefolder", data_dir="our_clean/")
dataset.push_to_hub("CNX-PathLLM/PVQAClean")