""" scripts/extract_opera_embeddings.py — One-time OPERA-CT embedding extraction. Runs OPERA-CT on every audio file in the three label CSVs and saves embeddings as .npy files. After this, OPERA never runs again — training loads only from .npy files. OPERA-CT checkpoint auto-downloads from HuggingFace on first run. Prerequisites: - data/copd_binary_labels.csv (from scripts/build_label_csvs.py) - data/pneumonia_binary_labels.csv - data/sound_labels.csv - ./OPERA/ cloned from github.com/evelyn0414/OPERA Output: - data/opera_embeddings//.npy - data/copd_binary_labels_with_embeddings.csv - data/pneumonia_binary_labels_with_embeddings.csv - data/sound_labels_with_embeddings.csv """ import os import sys import numpy as np import pandas as pd from tqdm import tqdm sys.path.insert(0, os.path.dirname(os.path.dirname(__file__))) from models.opera_encoder import OPERAEncoder OUTPUT_DIR = './data/opera_embeddings' CHUNK_SIZE = 64 # files sent to encoder.encode_batch() at once GPU_BATCH = 16 # GPU forward pass batch size (safe for GTX 1650 4GB) N_WORKERS = 4 # CPU threads for parallel audio preprocessing PRETRAIN = 'operaCT' INPUT_SEC = 8 os.makedirs(OUTPUT_DIR, exist_ok=True) def extract_and_save(csv_path: str, encoder: OPERAEncoder) -> pd.DataFrame: """ Extract OPERA embeddings for all files in csv_path. Skips files whose .npy already exists (safe to re-run after interruption). Returns updated DataFrame with 'embedding_path' column filled. """ df = pd.read_csv(csv_path) if 'embedding_path' not in df.columns: df['embedding_path'] = None # Determine which rows still need extraction todo_mask = df['embedding_path'].isna() | (df['embedding_path'] == '') # Also check if the .npy file actually exists for non-null paths for idx, row in df[~todo_mask].iterrows(): if not os.path.exists(str(row['embedding_path'])): todo_mask.at[idx] = True todo_df = df[todo_mask].copy() print(f" {len(todo_df)} files to extract ({len(df) - len(todo_df)} already done)") if len(todo_df) == 0: return df failed = [] # Process in chunks for batch_start in tqdm(range(0, len(todo_df), CHUNK_SIZE), desc=f" Extracting {os.path.basename(csv_path)}"): batch = todo_df.iloc[batch_start: batch_start + CHUNK_SIZE] # Build output paths first out_paths = [] valid_rows = [] for _, row in batch.iterrows(): file_path = row['file_path'] if not os.path.exists(file_path): failed.append(file_path) continue fname = os.path.basename(file_path) fname = (fname.replace('.wav', '.npy') .replace('.webm', '.npy') .replace('.mp3', '.npy')) source = str(row.get('source', 'unknown')) out_dir = os.path.join(OUTPUT_DIR, source) os.makedirs(out_dir, exist_ok=True) out_path = os.path.join(out_dir, fname) out_paths.append(out_path) valid_rows.append((row.name, file_path, out_path)) if not valid_rows: continue # Skip rows where .npy already exists to_encode = [(idx, fp, op) for idx, fp, op in valid_rows if not os.path.exists(op)] already_done = [(idx, fp, op) for idx, fp, op in valid_rows if os.path.exists(op)] # Update already-done rows for idx, fp, op in already_done: df.at[idx, 'embedding_path'] = op if not to_encode: continue file_paths_batch = [fp for _, fp, _ in to_encode] out_paths_batch = [op for _, _, op in to_encode] indices_batch = [idx for idx, _, _ in to_encode] try: embeddings = encoder.encode_batch(file_paths_batch) for i, (idx, out_path) in enumerate(zip(indices_batch, out_paths_batch)): np.save(out_path, embeddings[i].astype(np.float32)) df.at[idx, 'embedding_path'] = out_path except Exception as e: print(f"\n Batch failed: {e}") # Fall back to one-by-one for idx, file_path, out_path in to_encode: try: emb = encoder.encode_batch([file_path]) np.save(out_path, emb[0].astype(np.float32)) df.at[idx, 'embedding_path'] = out_path except Exception as e2: print(f"\n Failed: {file_path} — {e2}") failed.append(file_path) n_done = df['embedding_path'].notna().sum() print(f" Done: {n_done} extracted | Failed: {len(failed)}") if failed: print(f" Failed files: {failed[:5]}{'...' if len(failed) > 5 else ''}") return df def main(): encoder = OPERAEncoder(pretrain=PRETRAIN, input_sec=INPUT_SEC, batch_size=GPU_BATCH, n_workers=N_WORKERS) print("\n[1/3] COPD binary dataset") df_copd = extract_and_save('data/copd_binary_labels.csv', encoder) df_copd.to_csv('data/copd_binary_labels_with_embeddings.csv', index=False) print(" Saved: data/copd_binary_labels_with_embeddings.csv") print("\n[2/3] Pneumonia binary dataset") df_pneu = extract_and_save('data/pneumonia_binary_labels.csv', encoder) df_pneu.to_csv('data/pneumonia_binary_labels_with_embeddings.csv', index=False) print(" Saved: data/pneumonia_binary_labels_with_embeddings.csv") print("\n[3/3] Sound labels dataset") df_snd = extract_and_save('data/sound_labels.csv', encoder) df_snd.to_csv('data/sound_labels_with_embeddings.csv', index=False) print(" Saved: data/sound_labels_with_embeddings.csv") print("\nAll embeddings extracted. OPERA will not run again during training.") if __name__ == '__main__': main()