Spaces:

SujalSha
/

respitriage

Sleeping

App Files Files Community

respitriage / scripts /fix_kauh_parser.py

SujalSha

Upload folder using huggingface_hub

d0ace1e verified 30 days ago

raw

history blame contribute delete

3.15 kB

	"""
	scripts/fix_kauh_parser.py — Parse KAUH dataset filenames into a clean CSV.

	KAUH filenames encode: patient_id_DISEASE,SOUND,REGION,AGE,GENDER.wav
	Example: BP108_COPD,E W,P R L,63,M.wav

	Run this FIRST before any preprocessing or embedding extraction.
	Output: data/kauh_parsed.csv
	"""

	import os
	import pandas as pd

	KAUH_DIR = "./DATASET/KAUH_DATASET/Audio Files/"

	def map_disease(raw: str) -> str:
	"""Normalise disease name to canonical form (case-insensitive)."""
	r = raw.strip().lower()
	if r in ('copd', 'copd '):
	return 'COPD'
	if r in ('pneumonia', 'pneumonia '):
	return 'Pneumonia'
	if r in ('n', 'normal', 'healthy'):
	return 'Normal'
	if r in ('asthma',):
	return 'Asthma'
	if r in ('heart failure', 'heart_failure'):
	return 'Heart_Failure'
	# Comorbidities containing COPD — still useful as COPD samples
	if 'copd' in r and 'heart' not in r:
	return 'COPD'
	return 'OTHER'


	SOUND_MAP = {
	'N': 'Normal',
	'E W': 'Wheeze', 'I E W': 'Wheeze', 'I W': 'Wheeze',
	'B W': 'Wheeze', 'W': 'Wheeze',
	'C': 'Crackle', 'Crep': 'Crackle', 'E Crep': 'Crackle',
	}

	records = []
	for fname in os.listdir(KAUH_DIR):
	if not fname.endswith('.wav'):
	continue
	try:
	# Remove .wav, split on FIRST underscore only
	base = fname.replace('.wav', '')
	underscore_idx = base.index('_')
	patient_id = base[:underscore_idx]
	rest = base[underscore_idx + 1:]

	# Split rest on comma
	fields = [f.strip() for f in rest.split(',')]

	disease_raw = fields[0] if len(fields) > 0 else 'Unknown'
	sound_raw = fields[1] if len(fields) > 1 else 'N'
	age = int(fields[3]) if len(fields) > 3 and fields[3].strip().isdigit() else -1
	gender = fields[4].strip() if len(fields) > 4 else 'Unknown'

	disease = map_disease(disease_raw)
	sound = SOUND_MAP.get(sound_raw.strip(), 'Normal')

	records.append({
	'file_path': os.path.abspath(os.path.join(KAUH_DIR, fname)),
	'patient_id': patient_id,
	'disease': disease,
	'sound_type': sound,
	'age': age,
	'gender': gender,
	'source': 'kauh',
	})
	except Exception as e:
	print(f"Failed to parse {fname}: {e}")
	continue

	df = pd.DataFrame(records)
	print("KAUH per-disease counts:")
	print(df['disease'].value_counts())
	print("\nKAUH per-sound counts:")
	print(df['sound_type'].value_counts())
	print(f"\nTotal files parsed: {len(df)}")

	# Sanity checks — KAUH is a small dataset (336 files total)
	# COPD: expect ~24-30, Pneumonia: expect ~5-15
	n_copd = df[df['disease'] == 'COPD'].shape[0]
	n_pneu = df[df['disease'] == 'Pneumonia'].shape[0]
	print(f"\nCOPD samples: {n_copd}")
	print(f"Pneumonia samples: {n_pneu}")
	assert n_copd > 10, f"COPD count {n_copd} too low — parser broken"
	assert n_pneu > 3, f"Pneumonia count {n_pneu} too low — check filenames"

	os.makedirs('data', exist_ok=True)
	df.to_csv('data/kauh_parsed.csv', index=False)
	print("\nSaved to data/kauh_parsed.csv")