Spaces:
Sleeping
Sleeping
File size: 5,998 Bytes
d0ace1e | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 | """
scripts/extract_opera_embeddings.py — One-time OPERA-CT embedding extraction.
Runs OPERA-CT on every audio file in the three label CSVs and saves
embeddings as .npy files. After this, OPERA never runs again — training
loads only from .npy files.
OPERA-CT checkpoint auto-downloads from HuggingFace on first run.
Prerequisites:
- data/copd_binary_labels.csv (from scripts/build_label_csvs.py)
- data/pneumonia_binary_labels.csv
- data/sound_labels.csv
- ./OPERA/ cloned from github.com/evelyn0414/OPERA
Output:
- data/opera_embeddings/<source>/<filename>.npy
- data/copd_binary_labels_with_embeddings.csv
- data/pneumonia_binary_labels_with_embeddings.csv
- data/sound_labels_with_embeddings.csv
"""
import os
import sys
import numpy as np
import pandas as pd
from tqdm import tqdm
sys.path.insert(0, os.path.dirname(os.path.dirname(__file__)))
from models.opera_encoder import OPERAEncoder
OUTPUT_DIR = './data/opera_embeddings'
CHUNK_SIZE = 64 # files sent to encoder.encode_batch() at once
GPU_BATCH = 16 # GPU forward pass batch size (safe for GTX 1650 4GB)
N_WORKERS = 4 # CPU threads for parallel audio preprocessing
PRETRAIN = 'operaCT'
INPUT_SEC = 8
os.makedirs(OUTPUT_DIR, exist_ok=True)
def extract_and_save(csv_path: str, encoder: OPERAEncoder) -> pd.DataFrame:
"""
Extract OPERA embeddings for all files in csv_path.
Skips files whose .npy already exists (safe to re-run after interruption).
Returns updated DataFrame with 'embedding_path' column filled.
"""
df = pd.read_csv(csv_path)
if 'embedding_path' not in df.columns:
df['embedding_path'] = None
# Determine which rows still need extraction
todo_mask = df['embedding_path'].isna() | (df['embedding_path'] == '')
# Also check if the .npy file actually exists for non-null paths
for idx, row in df[~todo_mask].iterrows():
if not os.path.exists(str(row['embedding_path'])):
todo_mask.at[idx] = True
todo_df = df[todo_mask].copy()
print(f" {len(todo_df)} files to extract ({len(df) - len(todo_df)} already done)")
if len(todo_df) == 0:
return df
failed = []
# Process in chunks
for batch_start in tqdm(range(0, len(todo_df), CHUNK_SIZE),
desc=f" Extracting {os.path.basename(csv_path)}"):
batch = todo_df.iloc[batch_start: batch_start + CHUNK_SIZE]
# Build output paths first
out_paths = []
valid_rows = []
for _, row in batch.iterrows():
file_path = row['file_path']
if not os.path.exists(file_path):
failed.append(file_path)
continue
fname = os.path.basename(file_path)
fname = (fname.replace('.wav', '.npy')
.replace('.webm', '.npy')
.replace('.mp3', '.npy'))
source = str(row.get('source', 'unknown'))
out_dir = os.path.join(OUTPUT_DIR, source)
os.makedirs(out_dir, exist_ok=True)
out_path = os.path.join(out_dir, fname)
out_paths.append(out_path)
valid_rows.append((row.name, file_path, out_path))
if not valid_rows:
continue
# Skip rows where .npy already exists
to_encode = [(idx, fp, op) for idx, fp, op in valid_rows
if not os.path.exists(op)]
already_done = [(idx, fp, op) for idx, fp, op in valid_rows
if os.path.exists(op)]
# Update already-done rows
for idx, fp, op in already_done:
df.at[idx, 'embedding_path'] = op
if not to_encode:
continue
file_paths_batch = [fp for _, fp, _ in to_encode]
out_paths_batch = [op for _, _, op in to_encode]
indices_batch = [idx for idx, _, _ in to_encode]
try:
embeddings = encoder.encode_batch(file_paths_batch)
for i, (idx, out_path) in enumerate(zip(indices_batch, out_paths_batch)):
np.save(out_path, embeddings[i].astype(np.float32))
df.at[idx, 'embedding_path'] = out_path
except Exception as e:
print(f"\n Batch failed: {e}")
# Fall back to one-by-one
for idx, file_path, out_path in to_encode:
try:
emb = encoder.encode_batch([file_path])
np.save(out_path, emb[0].astype(np.float32))
df.at[idx, 'embedding_path'] = out_path
except Exception as e2:
print(f"\n Failed: {file_path} — {e2}")
failed.append(file_path)
n_done = df['embedding_path'].notna().sum()
print(f" Done: {n_done} extracted | Failed: {len(failed)}")
if failed:
print(f" Failed files: {failed[:5]}{'...' if len(failed) > 5 else ''}")
return df
def main():
encoder = OPERAEncoder(pretrain=PRETRAIN, input_sec=INPUT_SEC,
batch_size=GPU_BATCH, n_workers=N_WORKERS)
print("\n[1/3] COPD binary dataset")
df_copd = extract_and_save('data/copd_binary_labels.csv', encoder)
df_copd.to_csv('data/copd_binary_labels_with_embeddings.csv', index=False)
print(" Saved: data/copd_binary_labels_with_embeddings.csv")
print("\n[2/3] Pneumonia binary dataset")
df_pneu = extract_and_save('data/pneumonia_binary_labels.csv', encoder)
df_pneu.to_csv('data/pneumonia_binary_labels_with_embeddings.csv', index=False)
print(" Saved: data/pneumonia_binary_labels_with_embeddings.csv")
print("\n[3/3] Sound labels dataset")
df_snd = extract_and_save('data/sound_labels.csv', encoder)
df_snd.to_csv('data/sound_labels_with_embeddings.csv', index=False)
print(" Saved: data/sound_labels_with_embeddings.csv")
print("\nAll embeddings extracted. OPERA will not run again during training.")
if __name__ == '__main__':
main()
|