| | import numpy as np |
| | import pandas as pd |
| | from sklearn.model_selection import train_test_split |
| | from random import choices |
| |
|
| |
|
def log(*messages):
    """
    Print the given values on a single line and flush the output stream right away.

    Parameters:
        *messages: any number of objects; they are forwarded to print() unchanged,
                   so they are separated by a single space.
    Returns:
        None
    """
    print(*messages, flush=True)
| |
|
def create_group(code):
    """
    Derive the ICD-10 group name from a full ICD-10 code name.

    Parameters:
        code (str): string with ICD-10 code name
    Returns:
        str: the part of the code before its first '.'; the whole string
             when it contains no '.'
    """
    prefix, _, _ = code.partition('.')
    return prefix
| |
|
def df_creation(texts, labels,
                all_classes, prompt_column_name,
                code_column_name, group_column_name):
    """
    Build a three-column DataFrame: report text, ICD-10 code and ICD-10 group.

    Parameters:
        texts (sequence of str): the medical reports, one per row.
        labels (sequence): one entry per text; each entry is used as an index/key
                           into 'all_classes' to look up the code name (integer
                           positions when 'all_classes' is a list).
        all_classes (List[str]): all ICD-10 code names; 'labels' refer into it.
        prompt_column_name (str): name of the output column holding the texts.
        code_column_name (str): name of the output column holding the code names.
        group_column_name (str): name of the output column holding the group names.

    Returns:
        pandas.DataFrame: one row per text with the text, its code name and the
                          group, i.e. the code name up to its first '.'.
    """
    # Resolve every label to its code name once, then derive the group from it.
    codes = [all_classes[label] for label in labels]
    groups = [code.split('.')[0] for code in codes]

    frame = pd.DataFrame()
    frame[prompt_column_name] = texts
    frame[code_column_name] = codes
    frame[group_column_name] = groups
    return frame
| |
|
def select_random_rows(df_test, balance_column, random_n):
    """
    Draws the same number of random rows for every unique value of a column.

    For each unique value found in 'balance_column', 'random_n' rows carrying that
    value are drawn with random.choices, i.e. WITH replacement: the same source row
    can appear several times, and a value with fewer than 'random_n' rows is still
    represented 'random_n' times. Values are processed in order of first appearance.

    Parameters:
        df_test (pandas.DataFrame): The DataFrame to select rows from.
        balance_column (str): The name of the column used to balance the data.
        random_n (int): The number of rows to draw for each unique value in the
                        balance column.

    Returns:
        pandas.DataFrame: A new DataFrame (fresh 0..n-1 index) with 'random_n' rows
                          per unique value. It always has the columns of 'df_test',
                          also when no row was drawn (empty input or random_n == 0).
    """
    balanced_data = []
    for class_name in df_test[balance_column].unique():
        class_rows = df_test[df_test[balance_column] == class_name].to_dict('records')
        balanced_data.extend(choices(class_rows, k=random_n))

    # Passing the source columns keeps the schema when balanced_data is empty;
    # without it an empty result would have no columns at all.
    return pd.DataFrame(balanced_data, columns=df_test.columns)
| |
|
def extract_valuable_data(path_to_raw_csv, prompt_column_name,
                          code_column_name, path_to_processed_csv, min_text_len, min_samples_per_cls,
                          group_column_name='group'):
    """
    Extracts and processes valuable data from a raw CSV file based on specified criteria.

    Steps, in this order:
      1. load the raw CSV;
      2. drop rows whose prompt or code is missing;
      3. drop every code that occurs 'min_samples_per_cls' times or fewer;
      4. drop rows whose prompt is shorter than 'min_text_len' characters;
      5. add the group column, derived from the code with create_group;
      6. save the result to 'path_to_processed_csv' (without the index).

    Parameters:
        path_to_raw_csv (str): The file path to the raw CSV data file.
        prompt_column_name (str): The column name in the CSV file for prompts.
        code_column_name (str): The column name in the CSV file for codes.
        path_to_processed_csv (str): The file path where the processed CSV data will be saved.
        min_text_len (int): Prompts with fewer characters than this are removed.
        min_samples_per_cls (int): A code is kept only when it has strictly MORE
                                   rows than this value.
        group_column_name (str): Name of the group column that is added.
                                 Defaults to 'group'.

    Returns:
        pandas.DataFrame: A DataFrame containing the processed dataset.
    """
    df = pd.read_csv(path_to_raw_csv)
    log(path_to_raw_csv, prompt_column_name, code_column_name, path_to_processed_csv, min_text_len, min_samples_per_cls)

    # Keep only rows that have both a prompt and a code.
    df = df[df[prompt_column_name].notna() & df[code_column_name].notna()]
    log(f"New data is loaded. New data has {len(df)} reports.")
    log(f"New data contains {df[code_column_name].nunique()} unique codes.")

    # Remove rare codes. The comparison is '<=', so a code with exactly
    # min_samples_per_cls rows is removed as well.
    code_counts = df[code_column_name].value_counts()
    rare_codes = code_counts[code_counts <= min_samples_per_cls].index
    df = df[~df[code_column_name].isin(rare_codes)]

    # Remove short prompts.
    # NOTE(review): this runs after the rare-code filter, so a code can end up
    # with fewer rows than the threshold above - confirm this order is intended.
    df = df[df[prompt_column_name].str.len() >= min_text_len]

    # Work on an independent copy so adding a column does not target a filtered
    # slice of the loaded frame.
    df = df.copy()
    df[group_column_name] = df[code_column_name].apply(create_group)

    log(f"New data is processed. Processed data has {len(df)} reports.")
    log(f"Processed dataset contains {df[code_column_name].nunique()} codes.")
    log(f"Processed dataset contains {df[group_column_name].nunique()} groups.")

    df.to_csv(path_to_processed_csv, index=False)
    log(f"Processed dataset is saved to {path_to_processed_csv}.")
    return df
| |
|
| |
|
def balance_data(df, prompt_column_name, code_column_name,
                 group_column_name, random_n, test_size, path_to_train_csv,
                 path_to_csv_test_codes, path_to_csv_test_groups):
    """
    Splits a dataset into stratified training and test sets and saves them to CSV files.

    The split is stratified on 'code_column_name' with a fixed random_state of 42.
    The training set is written to 'path_to_train_csv'. The test set is written
    twice, unchanged, to 'path_to_csv_test_codes' and 'path_to_csv_test_groups'.
    The saved frames are rebuilt with df_creation, so they contain only the prompt,
    code and group columns, and the group is re-derived from the code.

    Parameters:
        df (pandas.DataFrame): The DataFrame to be processed and split.
        prompt_column_name (str): The column name in the DataFrame for the prompts.
        code_column_name (str): The column name in the DataFrame for the codes.
        group_column_name (str): The column name used for the groups in the saved
                                 files. The column is not read from 'df'.
        random_n (int): Currently unused: no per-code / per-group sampling is
                        applied, both test files contain the full test split.
        test_size (float): The proportion of the dataset to include in the test split.
        path_to_train_csv (str): The file path where the training dataset CSV will be saved.
        path_to_csv_test_codes (str): The file path where the test dataset for codes CSV will be saved.
        path_to_csv_test_groups (str): The file path where the test dataset for groups CSV will be saved.

    Returns:
        None
    """
    texts = np.array(df[prompt_column_name])

    # One pass gives the sorted distinct codes and, for every row, the position of
    # its code within them - the same integer labels as a per-row list.index()
    # lookup, without the repeated linear scans.
    all_classes, labels = np.unique(np.array(df[code_column_name]), return_inverse=True)
    all_classes = all_classes.tolist()
    labels = labels.tolist()

    log('='*50)
    log(f"texts={len(texts)} labels={len(labels)} uniq_labels={len(np.unique(labels))} test_size={test_size}")
    log('='*50)
    texts_train, texts_test, labels_train, labels_test = train_test_split(
        texts, labels, test_size=test_size, random_state=42, stratify=labels
    )

    log(f"Train dataset len={len(texts_train)}")
    log(f"Test dataset len={len(texts_test)}")
    log(f"Count of classes={len(np.unique(labels))}")

    # Training set.
    df_train = df_creation(texts_train, labels_train, all_classes,
                           prompt_column_name, code_column_name, group_column_name)
    df_train.to_csv(path_to_train_csv, index=False)
    log(f"TRAIN dataset is saved to {path_to_train_csv}")

    # Test set: the same frame is saved for the code-level and group-level files.
    df_test = df_creation(texts_test, labels_test, all_classes,
                          prompt_column_name, code_column_name, group_column_name)

    df_test.to_csv(path_to_csv_test_codes, index=False)
    log(f"TEST dataset for codes is saved to {path_to_csv_test_codes}")

    df_test.to_csv(path_to_csv_test_groups, index=False)
    log(f"TEST dataset for groups is saved to {path_to_csv_test_groups}")
| |
|