Spaces:

78anand
/

KasaHealth

Running

App Files Files Community

KasaHealth / utils /organize_data.py

78anand

Upload folder using huggingface_hub

f317798 verified about 2 months ago

raw

history blame contribute delete

1.8 kB

	import os
	import shutil
	import pandas as pd

	# Input locations: everything is resolved relative to the project root.
	base_dir = r"c:\Users\ASUS\lung_ai_project"
	extracted_dir = os.path.join(
	    base_dir, "data", "extracted_cough", "Respiratory_Sound_Dataset-main"
	)
	diagnosis_file = os.path.join(extracted_dir, "patient_diagnosis.csv")
	audio_txt_dir = os.path.join(extracted_dir, "audio_and_txt_files")

	# Output locations: one folder per class under <base_dir>/data/cough.
	output_healthy, output_sick = (
	    os.path.join(base_dir, "data", "cough", label) for label in ("healthy", "sick")
	)

	# Make sure both class folders exist; folders that are already there are kept.
	for class_dir in (output_healthy, output_sick):
	    os.makedirs(class_dir, exist_ok=True)

	# Load the per-patient diagnosis table and echo its columns and first rows
	# so the expected header can be checked by eye.
	df = pd.read_csv(diagnosis_file)
	print("Columns:", df.columns)
	print(df.head())

	# Lookup table: patient id -> diagnosis label.
	# Relies on the CSV header providing 'Patient_ID' and 'DIAGNOSIS' columns.
	patient_ids = df['Patient_ID']
	diagnoses = df['DIAGNOSIS']
	diagnosis_map = dict(zip(patient_ids, diagnoses))

	# Gather the .wav recordings from the dataset folder.
	files = os.listdir(audio_txt_dir)
	wav_files = [name for name in files if name.endswith('.wav')]
	print(f"Found {len(wav_files)} wav files")

	count_healthy = 0
	count_sick = 0

	for wav_file in wav_files:
	    # The patient id is the leading field of the file name,
	    # e.g. 101_1b1_Al_sc_Meditron.wav -> 101.
	    try:
	        patient_id = int(wav_file.partition('_')[0])
	    except ValueError:
	        # File name does not start with a number: leave the file where it is.
	        print(f"Skipping {wav_file}")
	        continue

	    diagnosis = diagnosis_map.get(patient_id, "Unknown")

	    # Patients missing from the diagnosis table are reported and not copied.
	    if diagnosis == "Unknown":
	        print(f"Unknown diagnosis for patient {patient_id}")
	        continue

	    source_path = os.path.join(audio_txt_dir, wav_file)
	    if diagnosis == 'Healthy':
	        shutil.copy2(source_path, os.path.join(output_healthy, wav_file))
	        count_healthy += 1
	    else:
	        # Every known diagnosis other than 'Healthy' is filed as sick.
	        shutil.copy2(source_path, os.path.join(output_sick, wav_file))
	        count_sick += 1

	print(f"Result: {count_healthy} healthy files, {count_sick} sick files.")