Spaces:
Sleeping
Sleeping
"""
scripts/fix_kauh_parser.py — Parse KAUH dataset filenames into a clean CSV.
KAUH filenames encode: patient_id_DISEASE,SOUND,REGION,AGE,GENDER.wav
Example: BP108_COPD,E W,P R L,63,M.wav
Run this FIRST before any preprocessing or embedding extraction.
Output: data/kauh_parsed.csv
"""
| import os | |
| import pandas as pd | |
# Directory that is scanned for KAUH ``.wav`` recordings. The path is relative,
# so it is resolved against the working directory the script is launched from.
KAUH_DIR = "./DATASET/KAUH_DATASET/Audio Files/"
# Exact (already stripped and lower-cased) spellings -> canonical disease label.
_DISEASE_ALIASES = {
    'copd': 'COPD',
    'pneumonia': 'Pneumonia',
    'n': 'Normal',
    'normal': 'Normal',
    'healthy': 'Normal',
    'asthma': 'Asthma',
    'heart failure': 'Heart_Failure',
    'heart_failure': 'Heart_Failure',
}


def map_disease(raw: str) -> str:
    """Normalise a raw disease field to its canonical label.

    Matching is case-insensitive and ignores surrounding whitespace.

    Args:
        raw: Disease field exactly as it appears in the filename.

    Returns:
        One of ``'COPD'``, ``'Pneumonia'``, ``'Normal'``, ``'Asthma'``,
        ``'Heart_Failure'`` or ``'OTHER'`` (anything unrecognised).
    """
    key = raw.strip().lower()

    label = _DISEASE_ALIASES.get(key)
    if label is not None:
        return label

    # Comorbidities containing COPD are still useful as COPD samples, except
    # when heart disease is also named (those fall through to 'OTHER').
    if 'copd' in key and 'heart' not in key:
        return 'COPD'

    return 'OTHER'
# Raw SOUND field from the filename -> coarse sound class.
# Lookups are exact and case-sensitive on the whitespace-stripped field.
# NOTE(review): the codes look like I/E/B = inspiratory/expiratory/both and
# W/C/Crep = wheeze/crackle/crepitation - confirm against the dataset docs.
SOUND_MAP = {
    'N': 'Normal',
    'E W': 'Wheeze',
    'I E W': 'Wheeze',
    'I W': 'Wheeze',
    'B W': 'Wheeze',
    'W': 'Wheeze',
    'C': 'Crackle',
    'Crep': 'Crackle',
    'E Crep': 'Crackle',
}
# Column order of the output CSV. Passing it to DataFrame explicitly keeps the
# columns present even when no file could be parsed.
_CSV_COLUMNS = [
    'file_path', 'patient_id', 'disease', 'sound_type',
    'age', 'gender', 'source',
]


def _parse_kauh_filename(fname: str) -> tuple:
    """Split one KAUH filename into its raw metadata fields.

    Expected layout: ``patient_id_DISEASE,SOUND,REGION,AGE,GENDER.wav``.

    Args:
        fname: Bare filename (no directory part).

    Returns:
        ``(patient_id, disease_raw, sound_raw, age, gender)``. A missing sound
        defaults to ``'N'``, a missing or non-numeric age to ``-1`` and a
        missing gender to ``'Unknown'``.

    Raises:
        ValueError: If the name has no underscore after the patient id.
    """
    # Strip only the final extension; str.replace would also remove a '.wav'
    # that happened to occur in the middle of the name.
    base, _ext = os.path.splitext(fname)

    # Split on the FIRST underscore only.
    patient_id, sep, rest = base.partition('_')
    if not sep:
        raise ValueError("no '_' separating patient id from metadata")

    fields = [part.strip() for part in rest.split(',')]
    disease_raw = fields[0]  # str.split always yields at least one element
    sound_raw = fields[1] if len(fields) > 1 else 'N'
    # isdecimal() matches exactly the digit strings that int() accepts.
    age = int(fields[3]) if len(fields) > 3 and fields[3].isdecimal() else -1
    gender = fields[4] if len(fields) > 4 else 'Unknown'
    return patient_id, disease_raw, sound_raw, age, gender


records = []
unmapped_sounds = set()

# os.listdir returns names in arbitrary order; sort for a reproducible CSV.
for fname in sorted(os.listdir(KAUH_DIR)):
    if not fname.lower().endswith('.wav'):
        continue
    try:
        patient_id, disease_raw, sound_raw, age, gender = _parse_kauh_filename(fname)
    except ValueError as e:
        print(f"Failed to parse {fname}: {e}")
        continue

    # Unknown sound codes keep the historical 'Normal' fallback, but are
    # collected so the mislabelling risk is reported below.
    if sound_raw not in SOUND_MAP:
        unmapped_sounds.add(sound_raw)

    records.append({
        'file_path': os.path.abspath(os.path.join(KAUH_DIR, fname)),
        'patient_id': patient_id,
        'disease': map_disease(disease_raw),
        'sound_type': SOUND_MAP.get(sound_raw, 'Normal'),
        'age': age,
        'gender': gender,
        'source': 'kauh',
    })

df = pd.DataFrame(records, columns=_CSV_COLUMNS)

print("KAUH per-disease counts:")
print(df['disease'].value_counts())
print("\nKAUH per-sound counts:")
print(df['sound_type'].value_counts())
print(f"\nTotal files parsed: {len(df)}")
if unmapped_sounds:
    print(f"\nWARNING: sound codes not in SOUND_MAP were labelled 'Normal': "
          f"{sorted(unmapped_sounds)}")

# Sanity checks — KAUH is a small dataset (336 files total)
# COPD: expect ~24-30, Pneumonia: expect ~5-15
n_copd = df[df['disease'] == 'COPD'].shape[0]
n_pneu = df[df['disease'] == 'Pneumonia'].shape[0]
print(f"\nCOPD samples: {n_copd}")
print(f"Pneumonia samples: {n_pneu}")

# Raised explicitly rather than via `assert`, so the checks survive `python -O`.
if n_copd <= 10:
    raise AssertionError(f"COPD count {n_copd} too low — parser broken")
if n_pneu <= 3:
    raise AssertionError(f"Pneumonia count {n_pneu} too low — check filenames")

os.makedirs('data', exist_ok=True)
df.to_csv('data/kauh_parsed.csv', index=False)
print("\nSaved to data/kauh_parsed.csv")