opdx

Sleeping

opdx / helpers /data_processor.py

lyangas

missed files

6931ba0 over 1 year ago

8.28 kB

	import numpy as np
	import pandas as pd
	from sklearn.model_selection import train_test_split
	from random import choices


	def log(*args):
	    """
	    Print the given values to standard output and flush the stream immediately.

	    Parameters:
	    *args: Values forwarded unchanged to the built-in print().

	    Returns:
	    None
	    """
	    # flush=True so messages show up right away even when stdout is buffered
	    print(*args, flush=True)

	def create_group(code):
	    """
	    Derive the ICD-10 group name from an ICD-10 code name.

	    Parameters:
	    code (str): string with ICD-10 code name

	    Returns:
	    group (str): the part of the code before the first '.', or the whole
	    string when it contains no '.'
	    """
	    # partition() yields the text before the first dot, same as split('.')[0]
	    head, _sep, _tail = code.partition('.')
	    return head

	def df_creation(texts, labels,
	                all_classes, prompt_column_name,
	                code_column_name, group_column_name):
	    """
	    Creates a DataFrame from medical reports, their class indices, and class information.

	    Parameters:
	    texts (List[str]): A list of strings, where each string is a medical report.
	    labels (Iterable[int]): Integer class indices, one per entry of 'texts'. Each index
	    is a position in 'all_classes' (NOT the code name itself).
	    all_classes (List[str]): A list of all ICD-10 code names from the initial dataset.
	    prompt_column_name (str): The column name in the DataFrame for the prompts.
	    code_column_name (str): The column name in the DataFrame for the codes.
	    group_column_name (str): The column name in the DataFrame for the groups.

	    Returns:
	    pandas.DataFrame: A DataFrame where each row contains the text of the report,
	    its corresponding ICD-10 code, and the group category derived
	    from the code (the part of the code before the first '.').
	    """
	    # Resolve indices to code names once; the original walked `labels` twice,
	    # which failed for one-shot iterables and repeated every lookup.
	    codes = [all_classes[idx] for idx in labels]
	    groups = [code.split('.')[0] for code in codes]

	    df = pd.DataFrame()
	    df[prompt_column_name] = texts
	    df[code_column_name] = codes
	    df[group_column_name] = groups
	    return df

	def select_random_rows(df_test, balance_column, random_n):
	    """
	    Draws a random, balanced sample of rows from a DataFrame based on a specified column.

	    For every unique value found in the balance column, exactly 'random_n' rows holding that
	    value are drawn with random.choices(), i.e. WITH replacement: the same row can appear
	    more than once, and a value with fewer than 'random_n' rows still contributes
	    'random_n' rows.

	    Parameters:
	    df_test (pandas.DataFrame): The DataFrame to select rows from.
	    balance_column (str): The name of the column used to balance the data.
	    random_n (int): The number of rows to draw for each unique value in the balance column.

	    Returns:
	    pandas.DataFrame: A new DataFrame with the drawn rows, grouped by balance value in
	    order of first appearance.
	    """
	    sampled_rows = []
	    for value in df_test[balance_column].unique():
	        # All rows of the current class, as plain dict records
	        is_current = df_test[balance_column] == value
	        candidates = df_test[is_current].to_dict('records')
	        sampled_rows.extend(choices(candidates, k=random_n))

	    return pd.DataFrame(sampled_rows)

	def extract_valuable_data(path_to_raw_csv, prompt_column_name,
	                          code_column_name, path_to_processed_csv, min_text_len, min_samples_per_cls):
	    """
	    Extracts and processes valuable data from a raw CSV file based on specified criteria.

	    This function loads data from a CSV file, drops rows with a null prompt or code, removes codes
	    that do not have more than 'min_samples_per_cls' rows, drops prompts shorter than
	    'min_text_len' characters, adds a 'group' column derived from the code, and saves the
	    processed data to a new CSV file.

	    Parameters:
	    path_to_raw_csv (str): The file path to the raw CSV data file.
	    prompt_column_name (str): The column name in the CSV file for prompts.
	    code_column_name (str): The column name in the CSV file for codes.
	    path_to_processed_csv (str): The file path where the processed CSV data will be saved.
	    min_text_len (int): Minimum prompt length in characters; shorter prompts are dropped.
	    min_samples_per_cls (int): Codes with this many rows or fewer are dropped
	    (a code is kept only when it has MORE rows than this).

	    Returns:
	    pandas.DataFrame: A DataFrame containing the processed dataset, including the
	    added 'group' column.
	    """

	    df = pd.read_csv(path_to_raw_csv)
	    log(path_to_raw_csv, prompt_column_name, code_column_name, path_to_processed_csv, min_text_len, min_samples_per_cls)

	    df = df[df[prompt_column_name].notna() & df[code_column_name].notna()]
	    log(f"New data is loaded. New data has {len(df)} reports.")
	    # Use the caller-supplied code column everywhere (it used to be hard-coded as 'code').
	    log(f"New data contains {len(df[code_column_name].unique())} unique codes.")

	    # Keep only codes that have more than min_samples_per_cls prompts.
	    code_counts = df[code_column_name].value_counts()
	    rare_codes = code_counts[code_counts <= min_samples_per_cls].index
	    df = df[~df[code_column_name].isin(rare_codes)]

	    # Keep prompts that are at least min_text_len characters long.
	    # NOTE(review): this runs after the per-code count filter, so a code can end up
	    # with fewer than min_samples_per_cls rows here - confirm this order is intended.
	    # .copy() gives an independent frame so the column assignment below does not
	    # write into a slice of the previous frame (SettingWithCopyWarning).
	    df = df[df[prompt_column_name].str.len() >= min_text_len].copy()

	    # Creating GROUP column in dataset
	    df['group'] = df[code_column_name].apply(create_group)

	    log(f"New data is processed. Processed data has {len(df)} reports.")
	    log(f"Processed dataset contains {len(df[code_column_name].unique())} codes.")
	    log(f"Processed dataset contains {len(df['group'].unique())} groups.")

	    # Saving processed dataset
	    df.to_csv(path_to_processed_csv, index=False)
	    log(f"Processed dataset is saved to {path_to_processed_csv}.")
	    return df


	def balance_data(df, prompt_column_name, code_column_name,
	                 group_column_name, random_n, test_size, path_to_train_csv,
	                 path_to_csv_test_codes, path_to_csv_test_groups):
	    """
	    Splits a dataset into training and test sets, then saves these sets to CSV files.

	    This function takes a DataFrame and performs a stratified split (fixed random_state=42)
	    on the values of 'code_column_name'. It saves the training dataset to one CSV file and
	    the test dataset to two CSV files (one for codes and one for groups).

	    NOTE: per-class random balancing of the test sets via select_random_rows() is
	    currently disabled, so both test CSV files contain the same full test split and
	    'random_n' has no effect.

	    Parameters:
	    df (pandas.DataFrame): The DataFrame to be processed and split.
	    prompt_column_name (str): The column name in the DataFrame for the prompts.
	    code_column_name (str): The column name in the DataFrame for the codes.
	    group_column_name (str): The column name used for the groups in the saved CSV files.
	    Group values are re-derived from the codes, so 'df' does not need this column.
	    random_n (int): Currently unused; kept for interface compatibility.
	    test_size (float): The proportion of the dataset to include in the test split.
	    path_to_train_csv (str): The file path where the training dataset CSV will be saved.
	    path_to_csv_test_codes (str): The file path where the test dataset for codes CSV will be saved.
	    path_to_csv_test_groups (str): The file path where the test dataset for groups CSV will be saved.

	    Returns:
	    None
	    """

	    texts = np.array(df[prompt_column_name])
	    raw_labels = np.array(df[code_column_name])

	    # Encode every code name as its position in the sorted list of unique codes.
	    # A dict lookup replaces list.index(), which was a linear scan per row.
	    all_classes = np.unique(raw_labels).tolist()
	    class_to_index = {name: position for position, name in enumerate(all_classes)}
	    labels = [class_to_index[name] for name in raw_labels]

	    log('='*50)
	    log(f"texts={len(texts)} labels={len(labels)} uniq_labels={len(np.unique(labels))} test_size={test_size}")
	    log('='*50)
	    # Stratify on the encoded labels to keep class proportions in both parts.
	    texts_train, texts_test, labels_train, labels_test = train_test_split(
	        texts, labels, test_size=test_size, random_state=42, stratify=labels
	    )

	    log(f"Train dataset len={len(texts_train)}")
	    log(f"Test dataset len={len(texts_test)}")
	    log(f"Count of classes={len(np.unique(labels))}")

	    # Creating TRAIN dataset
	    df_train = df_creation(texts_train, labels_train, all_classes,
	                           prompt_column_name, code_column_name, group_column_name)
	    df_train.to_csv(path_to_train_csv, index=False)
	    log(f"TRAIN dataset is saved to {path_to_train_csv}")

	    # Creating TEST dataset; the same frame is written to both test paths
	    df_test = df_creation(texts_test, labels_test, all_classes,
	                          prompt_column_name, code_column_name, group_column_name)

	    df_test.to_csv(path_to_csv_test_codes, index=False)
	    log(f"TEST dataset for codes is saved to {path_to_csv_test_codes}")

	    df_test.to_csv(path_to_csv_test_groups, index=False)
	    log(f"TEST dataset for groups is saved to {path_to_csv_test_groups}")