File size: 5,123 Bytes

78b4171

import subprocess
import pandas as pd
import numpy as np
import ast
import os
import tqdm
import shutil
import wfdb


##############################
# Class Aggregation Function #
##############################
def aggregate_diagnostic_one_class(y_dic):
    best_class = "None"
    best_value = -float('inf')
    for key, value in y_dic.items():
        if key in agg_df.index and value > best_value:
            best_value = value
            best_class = agg_df.loc[key].diagnostic_class
    return best_class

def aggregate_diagnostic_one_subclass(y_dic):
    best_class = "None"
    best_value = -float('inf')
    for key, value in y_dic.items():
        if key in agg_df.index and value > best_value:
            best_value = value
            best_class = agg_df.loc[key].diagnostic_subclass
    return best_class

def aggregate_diagnostic_all_classes(y_dic):
    best_class = "None"
    best_value = -float('inf')
    for key, value in y_dic.items():
        if key in agg_df.index and value > best_value:
            best_value = value
            best_class = key
    return best_class
    
'''
Function to create patches for NeuroRVQ
'''
def create_patches(ecg_signal, maximum_patches, patch_size, channels_use):
    n, c, t = ecg_signal.shape  # Batch / trials, channels, time
    n_time = (maximum_patches // len(channels_use))
    ecg_signal = ecg_signal[:, :, :n_time * patch_size]
    ecg_signal_patches = ecg_signal[:, channels_use, :]
    return ecg_signal_patches, n_time


if not os.path.exists("./example_files/ecg_sample/ptb_xl_cut_benchmarking.npy"):
    ##############################
    #      Downloading PTB-XL    #
    ##############################
    print("Downloading ptb_xl....")
    subprocess.run([
        "wget",
        "-r", "-N", "-c", "-np",
        "-P", "./ptb_xl",
        "https://physionet.org/files/ptb-xl/1.0.3/"
    ], check=False)


    ##############################
    #     Merging Folders        #
    ##############################
    source_root = "./ptb_xl/physionet.org/files/ptb-xl/1.0.3/records500"
    target_root = "./ptb_xl/records500_all"
    os.makedirs(target_root, exist_ok=True)

    for subfolder in os.listdir(source_root):
        subfolder_path = os.path.join(source_root, subfolder)

        if os.path.isdir(subfolder_path):
            for filename in os.listdir(subfolder_path):
                src_file = os.path.join(subfolder_path, filename)
                dst_file = os.path.join(target_root, filename)

                # If filename already exists, rename to avoid overwrite
                if os.path.exists(dst_file):
                    name, ext = os.path.splitext(filename)
                    dst_file = os.path.join(target_root, f"{name}_{subfolder}{ext}")

                shutil.move(src_file, dst_file)

    print("Merge complete.")
    print("Dataset cutting....")

    ##############################
    #     Dataset Cutting        #
    ##############################
    path = './ptb_xl/physionet.org/files/ptb-xl/1.0.3/'
    data_path = './ptb_xl/records500_all'

    # load and convert annotation data
    Y = pd.read_csv(path+'ptbxl_database.csv', index_col='ecg_id')
    Y.scp_codes = Y.scp_codes.apply(lambda x: ast.literal_eval(x))

    new_Y = Y.loc[:, ['patient_id', 'scp_codes', 'strat_fold', 'filename_lr', 'filename_hr']]
    new_Y.index.name == 'ecg_id'

    # Load scp_statements.csv for diagnostic aggregation
    agg_df_all = pd.read_csv(path+'scp_statements.csv', index_col=0)
    agg_df = agg_df_all[agg_df_all.diagnostic == 1]

    # Apply diagnostic superclass
    new_Y['diagnostic_5_classes'] = new_Y.scp_codes.apply(aggregate_diagnostic_one_class)
    new_Y['diagnostic_23_classes'] = new_Y.scp_codes.apply(aggregate_diagnostic_one_subclass)
    new_Y['diagnostic_44_classes'] = new_Y.scp_codes.apply(aggregate_diagnostic_all_classes)
    new_Y['filename_hr'] = new_Y['filename_hr'].str.split('/').str[-1]
    new_Y.to_csv('./example_files/ecg_sample/ptb_xl_cut_benchmarking.csv')

    # Initialize 3D array: files x channels x time
    X = np.zeros((len(new_Y['filename_hr']), 12, 5000))

    for idx, f_i in enumerate(tqdm.tqdm(new_Y['filename_hr'])):
        x = wfdb.rdsamp(os.path.join(data_path, f_i))[0].T
        if x.shape != (12, 5000):
            raise ValueError(f"Signal {f_i} has shape {x.shape}, expected (12, 5000)")
        ch_names = np.array(wfdb.rdsamp(os.path.join(data_path, f_i.split('.')[0]))[1]['sig_name'])
        ch_names = np.array([e.lower() for e in ch_names])
        if (idx==0):
            ref_ch_names = ch_names
            X[idx, :, :] = x
        else:
            try:
                # Find the indices that map current channels to reference order
                reorder_idx = [np.where(ch_names == ch)[0][0] for ch in ref_ch_names]
                x_reor = x[reorder_idx, :]
            except IndexError:
                raise ValueError(f"Channel names in {f_i} do not match reference channels.")
            X[idx, :, :] = x_reor

    print(X.shape)
    np.save('./example_files/ecg_sample/ptb_xl_cut_benchmarking.npy', X)
    print("Dataset is ready")