respitriage / scripts /fix_kauh_parser.py
SujalSha's picture
Upload folder using huggingface_hub
d0ace1e verified
"""
scripts/fix_kauh_parser.py — Parse KAUH dataset filenames into a clean CSV.
KAUH filenames encode: patient_id_DISEASE,SOUND,REGION,AGE,GENDER.wav
Example: BP108_COPD,E W,P R L,63,M.wav
Run this FIRST before any preprocessing or embedding extraction.
Output: data/kauh_parsed.csv
"""
import os
import pandas as pd
# Directory scanned for KAUH .wav recordings; the path is relative to the
# current working directory, so run the script from the project root.
KAUH_DIR = "./DATASET/KAUH_DATASET/Audio Files/"
def map_disease(raw: str) -> str:
    """Map a raw disease label to its canonical class name.

    Matching ignores surrounding whitespace and letter case. Labels that are
    not an exact match but mention COPD without any heart condition are
    counted as COPD; everything else becomes ``'OTHER'``.

    Args:
        raw: Disease field exactly as it appears in the filename.

    Returns:
        One of ``'COPD'``, ``'Pneumonia'``, ``'Normal'``, ``'Asthma'``,
        ``'Heart_Failure'`` or ``'OTHER'``.
    """
    exact_labels = {
        'copd': 'COPD',
        'pneumonia': 'Pneumonia',
        'n': 'Normal',
        'normal': 'Normal',
        'healthy': 'Normal',
        'asthma': 'Asthma',
        'heart failure': 'Heart_Failure',
        'heart_failure': 'Heart_Failure',
    }
    key = raw.strip().lower()

    canonical = exact_labels.get(key)
    if canonical is not None:
        return canonical

    # Comorbidity labels: only COPD combined with non-cardiac conditions is
    # still treated as a COPD sample.
    if 'heart' in key or 'copd' not in key:
        return 'OTHER'
    return 'COPD'
# Translates the sound field of a filename into a coarse sound label.
# Codes are grouped by the label they collapse to.
SOUND_MAP = {
    'N': 'Normal',
    **dict.fromkeys(('E W', 'I E W', 'I W', 'B W', 'W'), 'Wheeze'),
    **dict.fromkeys(('C', 'Crep', 'E Crep'), 'Crackle'),
}
# Parse every .wav filename in KAUH_DIR into one metadata record. Names follow
#   <patient_id>_<disease>,<sound>,<region>,<age>,<gender>.wav
# and the region field (index 2) is not kept.
records = []
unmapped_sounds = set()  # sound codes absent from SOUND_MAP, reported after the loop
# os.listdir() gives no ordering guarantee; sorting keeps the row order reproducible.
for fname in sorted(os.listdir(KAUH_DIR)):
    if not fname.endswith('.wav'):
        continue
    try:
        # Cut off only the trailing extension: str.replace() would also remove
        # any ".wav" occurring inside the name itself.
        base = fname[:-len('.wav')]
        # Split on the FIRST underscore only; index() raises ValueError if absent.
        underscore_idx = base.index('_')
        patient_id = base[:underscore_idx]
        rest = base[underscore_idx + 1:]
        # Fields are stripped once here, so no further strip() is needed below.
        fields = [f.strip() for f in rest.split(',')]
        disease_raw = fields[0]  # str.split() always yields at least one element
        sound_raw = fields[1] if len(fields) > 1 else 'N'
        # A missing or non-numeric age is stored as -1.
        age = int(fields[3]) if len(fields) > 3 and fields[3].isdigit() else -1
        gender = fields[4] if len(fields) > 4 else 'Unknown'
        disease = map_disease(disease_raw)
        # Unknown codes still fall back to 'Normal', but are remembered so the
        # fallback does not go unnoticed.
        if sound_raw not in SOUND_MAP:
            unmapped_sounds.add(sound_raw)
        sound = SOUND_MAP.get(sound_raw, 'Normal')
        records.append({
            'file_path': os.path.abspath(os.path.join(KAUH_DIR, fname)),
            'patient_id': patient_id,
            'disease': disease,
            'sound_type': sound,
            'age': age,
            'gender': gender,
            'source': 'kauh',
        })
    except ValueError as e:
        # Only a malformed name (no underscore, unparsable age) is expected
        # here; any other exception is a real bug and should surface.
        print(f"Failed to parse {fname}: {e}")
        continue
if unmapped_sounds:
    print(
        "Warning: sound codes missing from SOUND_MAP were labelled 'Normal': "
        f"{sorted(unmapped_sounds)}"
    )
# Build the table, print summary counts, sanity-check them, then write the CSV.
# Columns are named explicitly so that an empty `records` list still produces
# the expected columns; otherwise df['disease'] below fails with a KeyError
# before the sanity checks can report the real problem.
df = pd.DataFrame(records, columns=[
    'file_path', 'patient_id', 'disease', 'sound_type', 'age', 'gender', 'source',
])
print("KAUH per-disease counts:")
print(df['disease'].value_counts())
print("\nKAUH per-sound counts:")
print(df['sound_type'].value_counts())
print(f"\nTotal files parsed: {len(df)}")
# Sanity checks — KAUH is a small dataset (336 files total)
# COPD: expect ~24-30, Pneumonia: expect ~5-15
n_copd = int((df['disease'] == 'COPD').sum())
n_pneu = int((df['disease'] == 'Pneumonia').sum())
print(f"\nCOPD samples: {n_copd}")
print(f"Pneumonia samples: {n_pneu}")
# Raise explicitly instead of using `assert`, which is removed when Python
# runs with -O; the exception type and message stay the same.
if n_copd <= 10:
    raise AssertionError(f"COPD count {n_copd} too low — parser broken")
if n_pneu <= 3:
    raise AssertionError(f"Pneumonia count {n_pneu} too low — check filenames")
# The CSV is written only after both checks pass.
os.makedirs('data', exist_ok=True)
df.to_csv('data/kauh_parsed.csv', index=False)
print("\nSaved to data/kauh_parsed.csv")