KasaHealth / utils /organize_data.py
78anand's picture
Upload folder using huggingface_hub
f317798 verified
import os
import shutil
import pandas as pd
# Paths
# NOTE: base_dir is a machine-specific absolute path; edit it (or call the
# helpers below with other directories) when running on a different machine.
base_dir = r"c:\Users\ASUS\lung_ai_project"
extracted_dir = os.path.join(base_dir, "data", "extracted_cough", "Respiratory_Sound_Dataset-main")
audio_txt_dir = os.path.join(extracted_dir, "audio_and_txt_files")
diagnosis_file = os.path.join(extracted_dir, "patient_diagnosis.csv")
output_healthy = os.path.join(base_dir, "data", "cough", "healthy")
output_sick = os.path.join(base_dir, "data", "cough", "sick")


def load_diagnosis_map(csv_path):
    """Read the diagnosis CSV and return a ``{Patient_ID: DIAGNOSIS}`` dict.

    Args:
        csv_path: Path to a CSV file with 'Patient_ID' and 'DIAGNOSIS' columns.

    Returns:
        A dict mapping each value of the 'Patient_ID' column to the value of
        the 'DIAGNOSIS' column on the same row.

    Raises:
        KeyError: If either expected column is absent from the CSV header.
    """
    df = pd.read_csv(csv_path)
    print("Columns:", df.columns)
    print(df.head())

    # Fail with an actionable message instead of a bare KeyError on lookup.
    missing = [col for col in ("Patient_ID", "DIAGNOSIS") if col not in df.columns]
    if missing:
        raise KeyError(f"{csv_path} is missing expected column(s): {missing}")

    return dict(zip(df["Patient_ID"], df["DIAGNOSIS"]))


def parse_patient_id(filename):
    """Return the integer patient id that prefixes *filename*, or None.

    Filename format: 101_1b1_Al_sc_Meditron.wav -- the id is the text before
    the first underscore. None is returned when that text is not an integer.
    """
    try:
        return int(filename.split("_")[0])
    except ValueError:
        return None


def classify_diagnosis(diagnosis):
    """Map a raw diagnosis value to 'healthy', 'sick', or None (unknown).

    Surrounding whitespace and letter case are ignored when matching
    'Healthy'. A missing value (anything that is not a string, e.g. the NaN
    pandas yields for an empty cell), an empty string, or the literal
    'Unknown' is reported as None so such recordings are never silently
    labelled as sick.
    """
    if not isinstance(diagnosis, str):
        return None
    label = diagnosis.strip()
    if not label or label == "Unknown":
        return None
    if label.lower() == "healthy":
        return "healthy"
    return "sick"


def organize_recordings(audio_dir, diagnosis_map, healthy_dir, sick_dir):
    """Copy every .wav file in *audio_dir* into *healthy_dir* or *sick_dir*.

    Args:
        audio_dir: Directory holding the recordings.
        diagnosis_map: Dict of patient id -> diagnosis string.
        healthy_dir: Destination for recordings of patients diagnosed 'Healthy'.
        sick_dir: Destination for recordings with any other known diagnosis.

    Returns:
        A ``(healthy_count, sick_count)`` tuple of the number of files copied.

    Files whose name does not start with an integer patient id, or whose
    patient has no usable diagnosis, are reported on stdout and skipped.
    """
    os.makedirs(healthy_dir, exist_ok=True)
    os.makedirs(sick_dir, exist_ok=True)

    # Match the extension case-insensitively so '*.WAV' files are not missed;
    # sort for a reproducible processing order.
    wav_files = [f for f in sorted(os.listdir(audio_dir)) if f.lower().endswith(".wav")]
    print(f"Found {len(wav_files)} wav files")

    targets = {"healthy": healthy_dir, "sick": sick_dir}
    counts = {"healthy": 0, "sick": 0}

    for wav_file in wav_files:
        patient_id = parse_patient_id(wav_file)
        if patient_id is None:
            print(f"Skipping {wav_file}")
            continue

        category = classify_diagnosis(diagnosis_map.get(patient_id, "Unknown"))
        if category is None:
            print(f"Unknown diagnosis for patient {patient_id}")
            continue

        # copy2 keeps file metadata (timestamps) alongside the contents.
        shutil.copy2(os.path.join(audio_dir, wav_file), os.path.join(targets[category], wav_file))
        counts[category] += 1

    return counts["healthy"], counts["sick"]


def main():
    """Sort the dataset's recordings into healthy/sick folders and print totals."""
    diagnosis_map = load_diagnosis_map(diagnosis_file)
    count_healthy, count_sick = organize_recordings(
        audio_txt_dir, diagnosis_map, output_healthy, output_sick
    )
    print(f"Result: {count_healthy} healthy files, {count_sick} sick files.")


if __name__ == "__main__":
    main()