English
File size: 774 Bytes
cbff41a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
import pandas as pd
from datasets import load_dataset, DatasetDict

# Number of folds to materialise; fold labels 0 .. k-1 are looked up in the
# CSV's 'fold' column below.
k = 10

# Load the 'train' split, using the given directory as the download cache.
dataset = load_dataset(
    'CNX-PathLLM/TCGA-WSI-Text',
    split='train',
    cache_dir='/bask/projects/p/phwq4930-gbm/Zeyu/PathVLM/.cache',
)

# Fold assignment table. Only the 'fold' and 'index' columns are read here;
# 'index' values are passed to dataset.select() as row positions.
# NOTE(review): presumably every label in 0 .. k-1 occurs in the CSV; a label
# with no rows would produce an empty fold -- confirm against the CSV.
df_indices = pd.read_csv('./dataset_csv/indices_and_slide_ids_with_folds.csv')

# One entry per fold, keyed 'fold_0' .. 'fold_{k-1}', each holding the rows
# of `dataset` whose positions are listed for that fold.
dataset_dict = DatasetDict({
    f'fold_{fold_id}': dataset.select(
        df_indices.loc[df_indices['fold'] == fold_id, 'index'].tolist()
    )
    for fold_id in range(k)
})

print(dataset_dict)

dataset_dict.save_to_disk('/bask/projects/p/phwq4930-gbm/Zeyu/WSI_Dataset/TCGA-WSI-Text-Folds')

# Optional publishing step, left disabled.
# dataset_dict.push_to_hub('CNX-PathLLM/TCGA-WSI-Text-Folds')