# KasaHealth / utils / extract_elite_samples.py
# 78anand: "Upload folder using huggingface_hub" (commit 4fcfef4, verified)
import os
import pandas as pd
import numpy as np
from tqdm import tqdm
import sys
# Add project root to path.
# The default is the original development checkout; set the
# LUNG_AI_PROJECT_ROOT environment variable to run from another location.
PROJECT_ROOT = os.environ.get(
    "LUNG_AI_PROJECT_ROOT", r"c:\Users\ASUS\lung_ai_project"
)
if PROJECT_ROOT not in sys.path:
    sys.path.append(PROJECT_ROOT)

# Must stay below the sys.path update: `utils` is resolved via PROJECT_ROOT.
from utils.hear_extractor import HeARExtractor

# --- Config ---
# Every data location is derived from PROJECT_ROOT so the checkout path is
# defined in exactly one place.
_DATA_DIR = os.path.join(PROJECT_ROOT, "data")
# CSV read by main(); it uses the 'status', 'cough_detected', 'SNR' and
# 'uuid' columns.
META_PATH = os.path.join(_DATA_DIR, "coughvid_public", "metadata_compiled.csv")
# Root of the audio files; main() looks in its 'sick' and 'healthy' subfolders.
AUDIO_ROOT = os.path.join(_DATA_DIR, "coughvid_public", "organized")
# Destination of X_elite.npy / y_elite.npy written by main().
OUTPUT_DIR = os.path.join(_DATA_DIR, "hear_embeddings_elite")
# Created at import time so the final np.save calls cannot fail on a missing
# directory.
os.makedirs(OUTPUT_DIR, exist_ok=True)
# Candidate file extensions, tried in this order for every recording.
_AUDIO_EXTENSIONS = ('.webm', '.wav', '.ogg')
# Files in the organized folder normally start with 'cv_'; the bare UUID is
# tried afterwards in case some files were stored without the prefix.
_FILENAME_PREFIXES = ('cv_', '')


def _find_audio_file(folder, uuid):
    """Return the first existing recording for ``uuid`` in ``folder``, else None."""
    for prefix in _FILENAME_PREFIXES:
        for ext in _AUDIO_EXTENSIONS:
            candidate = os.path.join(folder, f"{prefix}{uuid}{ext}")
            if os.path.exists(candidate):
                return candidate
    return None


def _collect_tasks(label, uuids):
    """Resolve ``uuids`` to ``(path, label)`` tasks under ``AUDIO_ROOT/<label>``.

    Returns a ``(tasks, missing)`` pair where ``missing`` counts the UUIDs for
    which no audio file was found on disk.
    """
    folder = os.path.join(AUDIO_ROOT, label)
    tasks = []
    missing = 0
    for uuid in uuids:
        path = _find_audio_file(folder, uuid)
        if path is None:
            missing += 1
        else:
            tasks.append((path, label))
    return tasks, missing


def main():
    """Select "elite" recordings, extract their embeddings and save them.

    Steps:
      1. Read the metadata CSV at ``META_PATH``.
      2. Keep sick rows (status 'COVID-19' or 'sick', cough_detected > 0.8,
         SNR > 5) and healthy rows (status 'healthy', cough_detected > 0.95).
      3. Take at most 1,000 UUIDs per class and resolve them to audio files
         under ``AUDIO_ROOT/sick`` and ``AUDIO_ROOT/healthy``.
      4. Run ``HeARExtractor.extract`` on each file and write the collected
         embeddings and labels to ``X_elite.npy`` / ``y_elite.npy`` in
         ``OUTPUT_DIR``.

    Files that cannot be found or processed are skipped, but are counted and
    reported instead of being dropped silently.
    """
    print("Identifying Elite Samples...")
    df = pd.read_csv(META_PATH)

    # SICK (Elite Match)
    sick_mask = (
        df['status'].isin(['COVID-19', 'sick'])
        & (df['cough_detected'] > 0.8)
        & (df['SNR'] > 5)
    )
    # HEALTHY (Elite Match)
    healthy_mask = (df['status'] == 'healthy') & (df['cough_detected'] > 0.95)

    # Rows without a uuid cannot be mapped to a file name, so drop them here
    # rather than failing later while building the path.
    elite_sick_uuids = df.loc[sick_mask, 'uuid'].dropna().tolist()
    elite_healthy_uuids = df.loc[healthy_mask, 'uuid'].dropna().tolist()

    print(f"Total Eligible Elite Sick: {len(elite_sick_uuids)}")
    print(f"Total Eligible Elite Healthy: {len(elite_healthy_uuids)}")

    # Cap both classes at 1,000 samples for speed.
    max_per_class = 1000
    sick_to_process = elite_sick_uuids[:max_per_class]
    healthy_to_process = elite_healthy_uuids[:max_per_class]

    # Map UUIDs to actual paths (sick first, then healthy).
    sick_tasks, sick_missing = _collect_tasks('sick', sick_to_process)
    healthy_tasks, healthy_missing = _collect_tasks('healthy', healthy_to_process)
    all_tasks = sick_tasks + healthy_tasks

    if sick_missing or healthy_missing:
        print(
            f"Audio file not found for {sick_missing} sick and "
            f"{healthy_missing} healthy samples."
        )

    print(f"Starting Elite Extraction (Total: {len(all_tasks)} samples)...")
    extractor = HeARExtractor()

    features = []
    labels = []
    failed = 0
    empty = 0
    for path, label in tqdm(all_tasks):
        try:
            emb = extractor.extract(path)
        except Exception as exc:
            # Best-effort per file: one unreadable recording must not abort
            # the whole run, but the failure is surfaced for the operator.
            failed += 1
            tqdm.write(f"Extraction failed for {path}: {exc!r}")
            continue
        if emb is None:
            # The extractor signalled "no embedding" for this file.
            empty += 1
            continue
        features.append(emb)
        labels.append(label)

    np.save(os.path.join(OUTPUT_DIR, "X_elite.npy"), np.array(features))
    np.save(os.path.join(OUTPUT_DIR, "y_elite.npy"), np.array(labels))
    print(f"Elite Data Saved: {len(features)} samples.")
    if failed or empty:
        print(f"Skipped {failed} failed and {empty} empty extractions.")


if __name__ == "__main__":
    main()