Spaces:
Running
Running
| import os | |
| import pandas as pd | |
| import numpy as np | |
| from tqdm import tqdm | |
| import sys | |
# Add project root to path
PROJECT_ROOT = r"c:\Users\ASUS\lung_ai_project"
if PROJECT_ROOT not in sys.path:
    sys.path.append(PROJECT_ROOT)

# Keep this import below the sys.path tweak: `utils` is presumably resolved
# through PROJECT_ROOT (TODO confirm it is not installed as a package).
from utils.hear_extractor import HeARExtractor  # noqa: E402

# --- Config ---
# Every data location is derived from PROJECT_ROOT so that relocating the
# project only requires editing the single literal above.
_DATA_DIR = os.path.join(PROJECT_ROOT, "data")
_COUGHVID_DIR = os.path.join(_DATA_DIR, "coughvid_public")

# CSV read by main(); it uses the uuid, status, cough_detected and SNR columns.
META_PATH = os.path.join(_COUGHVID_DIR, "metadata_compiled.csv")
# Root containing the per-class audio folders ("sick", "healthy").
AUDIO_ROOT = os.path.join(_COUGHVID_DIR, "organized")
# Destination for the saved X_elite.npy / y_elite.npy arrays.
OUTPUT_DIR = os.path.join(_DATA_DIR, "hear_embeddings_elite")

# NOTE: this runs at import time, exactly as before.
os.makedirs(OUTPUT_DIR, exist_ok=True)
# Extensions probed for each recording, in priority order.
_AUDIO_EXTENSIONS = ('.webm', '.wav', '.ogg')

# How many failed files to list individually in the final summary.
_MAX_REPORTED_FAILURES = 5


def _find_audio_path(folder, uuid):
    """Return the first existing audio file for *uuid* in *folder*, or None.

    Files in the organized folder start with ``cv_``, so the prefixed name is
    tried with every extension first; the bare UUID is only a fallback in
    case some files were stored without the prefix.
    """
    for prefix in ("cv_", ""):
        for ext in _AUDIO_EXTENSIONS:
            path = os.path.join(folder, prefix + uuid + ext)
            if os.path.exists(path):
                return path
    return None


def _collect_tasks(uuids, label):
    """Resolve *uuids* inside ``AUDIO_ROOT/<label>``.

    Returns a ``(tasks, missing)`` pair: ``tasks`` is a list of
    ``(path, label)`` tuples in the order of *uuids*, and ``missing`` is the
    number of UUIDs for which no audio file was found.
    """
    folder = os.path.join(AUDIO_ROOT, label)
    tasks = []
    missing = 0
    for uuid in uuids:
        path = _find_audio_path(folder, uuid)
        if path is None:
            missing += 1
        else:
            tasks.append((path, label))
    return tasks, missing


def main(max_per_class=1000):
    """Extract HeAR embeddings for the "elite" subset and save them as .npy.

    Steps:
      1. Read the metadata CSV at ``META_PATH``.
      2. Select sick rows (status ``COVID-19`` or ``sick``,
         ``cough_detected > 0.8``, ``SNR > 5``) and healthy rows
         (status ``healthy``, ``cough_detected > 0.95``).
      3. Map each selected UUID to an audio file under ``AUDIO_ROOT``.
      4. Run ``HeARExtractor().extract(path)`` on every file found.
      5. Save the embeddings to ``X_elite.npy`` and the string labels to
         ``y_elite.npy`` inside ``OUTPUT_DIR``.

    Args:
        max_per_class: Upper bound on the number of UUIDs considered per
            class (the first N in metadata order). ``None`` disables the cap.

    Extraction is best-effort: a file that raises or yields ``None`` is
    skipped, but skipped files are counted and reported instead of being
    dropped silently.
    """
    print("Identifying Elite Samples...")
    df = pd.read_csv(META_PATH)

    # SICK (Elite Match)
    sick_mask = (
        df['status'].isin(['COVID-19', 'sick'])
        & (df['cough_detected'] > 0.8)
        & (df['SNR'] > 5)
    )
    # HEALTHY (Elite Match)
    healthy_mask = (df['status'] == 'healthy') & (df['cough_detected'] > 0.95)

    # Rows without a UUID cannot be mapped to a file (and would break the
    # string concatenation used to build file names), so drop them here.
    elite_sick_uuids = df.loc[sick_mask, 'uuid'].dropna().tolist()
    elite_healthy_uuids = df.loc[healthy_mask, 'uuid'].dropna().tolist()

    print(f"Total Eligible Elite Sick: {len(elite_sick_uuids)}")
    print(f"Total Eligible Elite Healthy: {len(elite_healthy_uuids)}")

    # Both classes are capped at max_per_class for speed. Per the original
    # author's note, only approx 600-700 sick samples are expected, so the cap
    # mostly affects the healthy class.
    sick_to_process = elite_sick_uuids[:max_per_class]
    healthy_to_process = elite_healthy_uuids[:max_per_class]

    # Map UUIDs to actual paths (sick first, then healthy).
    sick_tasks, sick_missing = _collect_tasks(sick_to_process, 'sick')
    healthy_tasks, healthy_missing = _collect_tasks(healthy_to_process, 'healthy')
    all_tasks = sick_tasks + healthy_tasks

    if sick_missing or healthy_missing:
        print(
            f"Audio file not found for {sick_missing} sick and "
            f"{healthy_missing} healthy UUIDs."
        )

    print(f"Starting Elite Extraction (Total: {len(all_tasks)} samples)...")
    extractor = HeARExtractor()

    features = []
    labels = []
    failures = []  # (path, reason) for every file that produced no embedding
    for path, label in tqdm(all_tasks):
        try:
            emb = extractor.extract(path)
        except Exception as exc:
            # Deliberately broad: one unreadable file must not abort the
            # whole run, but the failure is recorded rather than hidden.
            failures.append((path, repr(exc)))
            continue
        if emb is None:
            failures.append((path, "extractor returned None"))
            continue
        features.append(emb)
        labels.append(label)

    np.save(os.path.join(OUTPUT_DIR, "X_elite.npy"), np.array(features))
    np.save(os.path.join(OUTPUT_DIR, "y_elite.npy"), np.array(labels))
    print(f"Elite Data Saved: {len(features)} samples.")

    if failures:
        print(f"Skipped {len(failures)} files that produced no embedding:")
        for path, reason in failures[:_MAX_REPORTED_FAILURES]:
            print(f"  {path}: {reason}")
        if len(failures) > _MAX_REPORTED_FAILURES:
            print(f"  ... and {len(failures) - _MAX_REPORTED_FAILURES} more")


if __name__ == "__main__":
    main()