# respitriage / scripts / build_label_csvs.py
# SujalSha's picture
# Upload folder using huggingface_hub
# d0ace1e verified
"""
scripts/build_label_csvs.py β€” Build the three label CSVs from all datasets.
Run after fix_kauh_parser.py. Creates:
data/copd_binary_labels.csv β€” COPD vs Normal (binary)
data/pneumonia_binary_labels.csv β€” Pneumonia vs Normal (binary)
data/sound_labels.csv β€” Normal/Crackle/Wheeze/Both (4-class)
Sources:
COPD labels : ICBHI + KAUH
Pneumonia labels: ICBHI + KAUH
Normal (negatives): ICBHI Normal + KAUH Normal + COUGHVID Healthy
Sound labels : ICBHI + KAUH + HF Lung V1
"""
import os
import sys
import warnings
import numpy as np
import pandas as pd
# Silence every Python warning for the rest of the process.
warnings.filterwarnings('ignore')
# Put the parent of this script's directory at the front of sys.path so the
# `config` import that follows resolves (presumably the project root that
# holds config.py — confirm against the repository layout).
sys.path.insert(0, os.path.dirname(os.path.dirname(__file__)))
from config import (
ICBHI_AUDIO_DIR, ICBHI_DIAGNOSIS,
KAUH_AUDIO_DIR,
COUGHVID_AUDIO_DIR, COUGHVID_METADATA,
HF_LUNG_MANIFEST,
)
# Output directory for the generated CSVs; the path is relative to the
# current working directory, and creation happens at import time.
os.makedirs('data', exist_ok=True)
# Integer class ids written to the `sound_label` column of data/sound_labels.csv.
SOUND_INT = {'Normal': 0, 'Crackle': 1, 'Wheeze': 2, 'Both': 3}
# ══════════════════════════════════════════════════════════════════════════════
# 1. ICBHI 2017
# ══════════════════════════════════════════════════════════════════════════════
def _icbhi_sound_type(txt_path):
    """Collapse one ICBHI annotation file into a recording-level sound label.

    The file is read as header-less, tab-separated text; the third and
    fourth columns are treated as per-cycle crackle / wheeze flags.

    Returns:
        'Both', 'Crackle', 'Wheeze' or 'Normal'. A missing or unreadable
        annotation yields 'Normal' (best effort), but unreadable files are
        reported on stdout instead of being silently ignored.
    """
    if not os.path.exists(txt_path):
        return 'Normal'
    try:
        ann = pd.read_csv(txt_path, sep='\t', header=None)
        has_crackle = bool(ann.iloc[:, 2].sum() > 0)
        has_wheeze = bool(ann.iloc[:, 3].sum() > 0)
    except (OSError, ValueError, TypeError, IndexError) as exc:
        # ValueError covers pandas' EmptyDataError / ParserError; IndexError
        # covers files with fewer than four columns. print() is used rather
        # than warnings.warn because this script silences all warnings.
        print(f"[ICBHI] Could not parse annotation {txt_path}: {exc!r}; "
              f"defaulting sound_type to 'Normal'")
        return 'Normal'
    if has_crackle and has_wheeze:
        return 'Both'
    if has_crackle:
        return 'Crackle'
    if has_wheeze:
        return 'Wheeze'
    return 'Normal'


def load_icbhi(audio_dir=None, diagnosis_csv=None):
    """
    Parse the ICBHI dataset: patient diagnosis CSV + per-recording .txt annotations.

    Args:
        audio_dir: Directory holding the .wav recordings and their companion
            .txt annotation files. Defaults to ICBHI_AUDIO_DIR from config.
        diagnosis_csv: Header-less CSV of ``patient_id,diagnosis`` rows.
            Defaults to ICBHI_DIAGNOSIS from config.

    Returns:
        DataFrame with columns:
            file_path, disease, sound_type, patient_id, source
        Rows are ordered by file name so repeated runs give identical output.

    Raises:
        RuntimeError: if no recording with a COPD / Pneumonia / Healthy
            diagnosis is found in ``audio_dir``.
    """
    if audio_dir is None:
        audio_dir = ICBHI_AUDIO_DIR
    if diagnosis_csv is None:
        diagnosis_csv = ICBHI_DIAGNOSIS

    # Raw ICBHI diagnosis -> our 3-class disease set; anything else
    # (URTI, Bronchiectasis, unknown patients, ...) is skipped.
    disease_map = {'COPD': 'COPD', 'Pneumonia': 'Pneumonia', 'Healthy': 'Normal'}

    # Patient-level diagnosis (CSV: patient_id,diagnosis — no header).
    diag_df = pd.read_csv(diagnosis_csv, header=None,
                          names=['patient_id', 'diagnosis'])
    diag_map = dict(zip(diag_df['patient_id'].astype(str),
                        diag_df['diagnosis']))

    records = []
    # sorted(): os.listdir order is arbitrary and would otherwise leak into
    # the row order of the generated CSVs.
    for fname in sorted(os.listdir(audio_dir)):
        if not fname.endswith('.wav'):
            continue
        # The patient id is the part of the file name before the first '_'.
        patient_id = fname.split('_')[0]
        disease = disease_map.get(diag_map.get(patient_id, 'Unknown'))
        if disease is None:
            continue
        # splitext swaps only the final extension, unlike str.replace which
        # would also rewrite a '.wav' occurring earlier in the name.
        txt_path = os.path.join(audio_dir, os.path.splitext(fname)[0] + '.txt')
        records.append({
            'file_path': os.path.abspath(os.path.join(audio_dir, fname)),
            'disease': disease,
            'sound_type': _icbhi_sound_type(txt_path),
            'patient_id': patient_id,
            'source': 'icbhi',
        })

    df = pd.DataFrame(records)
    print(f"[ICBHI] Loaded {len(df)} files")
    if df.empty:
        raise RuntimeError(
            f"ICBHI loaded 0 files. Check ICBHI_AUDIO_DIR in config.py.\n"
            f" AUDIO_DIR = {audio_dir}\n"
            f" DIAGNOSIS = {diagnosis_csv}"
        )
    print(f" Disease counts:\n{df['disease'].value_counts().to_string()}")
    return df
# ══════════════════════════════════════════════════════════════════════════════
# 2. KAUH (requires data/kauh_parsed.csv from fix_kauh_parser.py)
# ══════════════════════════════════════════════════════════════════════════════
def load_kauh(kauh_csv='data/kauh_parsed.csv'):
    """Load KAUH from the pre-parsed CSV (run fix_kauh_parser.py first).

    Args:
        kauh_csv: Path to the parsed KAUH CSV. It must provide the columns
            file_path, disease, sound_type and patient_id.

    Returns:
        DataFrame restricted to COPD / Pneumonia / Normal rows, with columns:
            file_path, disease, sound_type, patient_id, source
        The row index of the input CSV is preserved.

    Raises:
        FileNotFoundError: if ``kauh_csv`` does not exist.
        KeyError: if a required column is missing from the CSV.
    """
    if not os.path.exists(kauh_csv):
        raise FileNotFoundError(
            f"{kauh_csv} not found. "
            "Run: python scripts/fix_kauh_parser.py"
        )
    df = pd.read_csv(kauh_csv)

    # Fail with a readable message instead of a bare column-name KeyError.
    required = ['file_path', 'disease', 'sound_type', 'patient_id']
    missing = [col for col in required if col not in df.columns]
    if missing:
        raise KeyError(f"{kauh_csv} is missing required column(s): {missing}")

    # Keep only COPD, Pneumonia, Normal
    df = df[df['disease'].isin(['COPD', 'Pneumonia', 'Normal'])].copy()
    df['source'] = 'kauh'
    print(f"[KAUH] Loaded {len(df)} files (COPD/Pneumonia/Normal only)")
    print(f" Disease counts:\n{df['disease'].value_counts().to_string()}")
    return df[required + ['source']]
# ══════════════════════════════════════════════════════════════════════════════
# 3. COUGHVID (Healthy samples only β€” used as negatives)
# ══════════════════════════════════════════════════════════════════════════════
def load_coughvid_healthy(max_samples: int = 1000, metadata_path=None,
                          audio_dir=None):
    """Load healthy COUGHVID samples as Normal-class negatives.

    Args:
        max_samples: Upper bound on the number of healthy metadata rows
            considered. When more exist, a fixed-seed random sample is drawn
            *before* checking which audio files are present, so fewer than
            ``max_samples`` rows may be returned.
        metadata_path: COUGHVID metadata CSV with ``uuid`` and ``status``
            columns. Defaults to COUGHVID_METADATA from config.
        audio_dir: Directory containing ``<uuid>.<ext>`` audio files.
            Defaults to COUGHVID_AUDIO_DIR from config.

    Returns:
        DataFrame with columns file_path, disease, sound_type, patient_id,
        source — or an empty, column-less DataFrame when the metadata file is
        missing or no audio file is found.
    """
    if metadata_path is None:
        metadata_path = COUGHVID_METADATA
    if audio_dir is None:
        audio_dir = COUGHVID_AUDIO_DIR

    if not os.path.exists(metadata_path):
        print("[COUGHVID] Metadata not found — skipping")
        return pd.DataFrame()
    meta = pd.read_csv(metadata_path)

    # Keep only rows whose status is the string "healthy" (any case). The
    # isinstance guard keeps missing / non-string statuses from crashing the
    # comparison, which `.str.lower()` does on an all-NaN column.
    is_healthy = meta['status'].map(
        lambda status: isinstance(status, str) and status.lower() == 'healthy'
    ).astype(bool)
    healthy = meta[is_healthy]
    if len(healthy) > max_samples:
        healthy = healthy.sample(max_samples, random_state=42)

    records = []
    for raw_uuid in healthy['uuid']:
        uuid = str(raw_uuid)
        # Try .webm first, then .wav, then .mp3; the first existing file wins.
        for ext in ('.webm', '.wav', '.mp3'):
            fpath = os.path.join(audio_dir, uuid + ext)
            if os.path.exists(fpath):
                records.append({
                    'file_path': os.path.abspath(fpath),
                    'disease': 'Normal',
                    'sound_type': 'Normal',
                    'patient_id': uuid,
                    'source': 'coughvid',
                })
                break
    df = pd.DataFrame(records)
    print(f"[COUGHVID] Loaded {len(df)} healthy samples")
    return df
# ══════════════════════════════════════════════════════════════════════════════
# 4. HF Lung V1 (sound labels only β€” no disease label)
# ══════════════════════════════════════════════════════════════════════════════
def load_hf_lung(manifest_path=None, project_root=None):
    """Load HF Lung V1 sound labels (for sound_labels.csv only).

    Manifest columns: audio_path, label, split
    label values: Crackle, Wheeze, Normal, Artifact (Artifact is skipped)

    Args:
        manifest_path: Manifest CSV. Defaults to HF_LUNG_MANIFEST from config.
        project_root: Base directory that relative ``audio_path`` entries are
            resolved against. Defaults to the parent of this script's
            directory.

    Returns:
        DataFrame with columns file_path, disease, sound_type, patient_id,
        source — or an empty, column-less DataFrame when the manifest file
        does not exist. ``disease`` is always 'Unknown' and ``patient_id`` is
        a synthetic ``hf_<manifest row index>``.
    """
    if manifest_path is None:
        manifest_path = HF_LUNG_MANIFEST
    if not os.path.exists(manifest_path):
        print("[HF Lung] Manifest not found — skipping")
        return pd.DataFrame()
    if project_root is None:
        project_root = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))

    df = pd.read_csv(manifest_path)
    # Rename to the standard column names shared by the other loaders.
    df = df.rename(columns={'audio_path': 'file_path', 'label': 'sound_type'})

    # Skip Artifact rows (not a real respiratory sound class) and rows with no
    # audio path, which would otherwise crash the path normalisation below.
    keep = (df['sound_type'].isin(['Normal', 'Crackle', 'Wheeze'])
            & df['file_path'].notna())
    df = df[keep].copy()

    # Make file paths absolute (the manifest uses Windows-backslash relative paths).
    df['file_path'] = [
        os.path.abspath(os.path.join(project_root, str(p).replace('\\', os.sep)))
        for p in df['file_path']
    ]
    df['disease'] = 'Unknown'  # no disease label in HF Lung
    df['source'] = 'hf_lung'
    df['patient_id'] = [f'hf_{idx}' for idx in df.index]
    print(f"[HF Lung] Loaded {len(df)} files")
    print(f" Sound counts:\n{df['sound_type'].value_counts().to_string()}")
    return df[['file_path', 'disease', 'sound_type', 'patient_id', 'source']]
# ══════════════════════════════════════════════════════════════════════════════
# Build and save the three CSVs
# ══════════════════════════════════════════════════════════════════════════════
def _write_binary_labels(labeled, extra_negatives, disease, out_path):
    """Write one ``<disease> vs Normal`` label CSV and return the written frame.

    Positives are the rows of ``labeled`` whose disease equals ``disease``
    (label 1); negatives are its 'Normal' rows plus ``extra_negatives``
    (label 0). The output keeps file_path, label and source, de-duplicated.
    """
    positives = labeled[labeled['disease'] == disease].copy()
    negatives = pd.concat([
        labeled[labeled['disease'] == 'Normal'],
        extra_negatives,
    ], ignore_index=True)
    positives['label'] = 1
    negatives['label'] = 0
    out = pd.concat([positives, negatives], ignore_index=True)
    out = out[['file_path', 'label', 'source']].drop_duplicates()
    out.to_csv(out_path, index=False)
    print(f"\n[OUT] {out_path} — {len(out)} rows")
    print(f" Labels: {out['label'].value_counts().to_dict()}")
    return out


def main():
    """Load every dataset and write the three label CSVs under ``data/``.

    Writes data/copd_binary_labels.csv, data/pneumonia_binary_labels.csv and
    data/sound_labels.csv, printing a short summary for each.
    """
    icbhi = load_icbhi()
    kauh = load_kauh()
    coughvid = load_coughvid_healthy()
    hf_lung = load_hf_lung()

    # Only ICBHI and KAUH carry disease labels.
    all_labeled = pd.concat([icbhi, kauh], ignore_index=True)

    # ── copd_binary_labels.csv ────────────────────────────────────────────────
    _write_binary_labels(all_labeled, coughvid, 'COPD',
                         'data/copd_binary_labels.csv')

    # ── pneumonia_binary_labels.csv ───────────────────────────────────────────
    _write_binary_labels(all_labeled, coughvid, 'Pneumonia',
                         'data/pneumonia_binary_labels.csv')

    # ── sound_labels.csv ──────────────────────────────────────────────────────
    # COUGHVID is deliberately not part of the sound-label set; HF Lung is.
    sound_sources = pd.concat([all_labeled, hf_lung], ignore_index=True)
    sound_sources = sound_sources[
        sound_sources['sound_type'].isin(list(SOUND_INT))
    ].copy()
    sound_sources['sound_label'] = sound_sources['sound_type'].map(SOUND_INT)
    df_sound = sound_sources[['file_path', 'sound_label', 'source']].drop_duplicates()
    df_sound.to_csv('data/sound_labels.csv', index=False)
    print(f"\n[OUT] data/sound_labels.csv — {len(df_sound)} rows")
    print(f" Sound labels: {df_sound['sound_label'].value_counts().to_dict()}")

    print("\nAll label CSVs created. Next: run extract_opera_embeddings.py")


# Run the full build only when executed as a script, not when imported.
if __name__ == '__main__':
    main()