Spaces:
Sleeping
Sleeping
| """ | |
| models/embedding_dataset.py — PyTorch Dataset for pre-computed OPERA embeddings. | |
| Training is extremely fast because no audio processing happens at runtime — | |
| embeddings are loaded directly from pre-saved .npy files. | |
| Label columns: | |
| - Binary disease tasks: 'label' (0 or 1) | |
| - Sound classification: 'sound_label' (0-3) | |
| """ | |
| import numpy as np | |
| import pandas as pd | |
| import torch | |
| from torch.utils.data import Dataset | |
class EmbeddingDataset(Dataset):
    """
    Loads pre-computed OPERA embeddings from .npy files.

    Parameters
    ----------
    csv_path : path to CSV with columns [embedding_path, <label_col>]
    label_col : column name for the class label
    augment : whether to apply lightweight embedding-space augmentation

    Notes
    -----
    Rows whose ``embedding_path`` is missing, is not a string, or does not
    end in ``.npy`` (ignoring surrounding whitespace) are dropped. The paths
    that are kept are stored with surrounding whitespace removed, so every
    row that passes the filter is loaded from exactly the path that was
    checked.

    Raises
    ------
    KeyError
        If the CSV has no ``embedding_path`` column or no ``label_col`` column.
    """

    def __init__(self, csv_path: str,
                 label_col: str = 'label',
                 augment: bool = False):
        self.label_col = label_col
        self.augment = augment

        df = pd.read_csv(csv_path)

        # Drop rows without a valid embedding path
        df = df.dropna(subset=['embedding_path'])
        # Coerce the mask to bool so that an empty frame still yields a
        # proper boolean indexer rather than an object-dtype Series.
        is_npy = df['embedding_path'].map(
            lambda p: isinstance(p, str) and p.strip().endswith('.npy')
        ).astype(bool)
        df = df[is_npy].reset_index(drop=True)

        # The filter above tests the *stripped* path; keep that same stripped
        # value so np.load() in __getitem__ does not fail on a cell such as
        # "emb/001.npy " that passed the filter.
        df['embedding_path'] = df['embedding_path'].map(str.strip)

        if label_col not in df.columns:
            raise KeyError(
                f"label column {label_col!r} not found in {csv_path}"
            )

        self.df = df
        print(f"[EmbeddingDataset] {csv_path}: {len(self.df)} samples")
        print(f" Label distribution:\n{self.df[label_col].value_counts().to_string()}")

    def __len__(self) -> int:
        """Return the number of rows that passed the embedding-path filter."""
        return len(self.df)

    def __getitem__(self, idx: int):
        """
        Load one sample.

        Returns
        -------
        (embedding, label) : tuple of torch.Tensor
            ``embedding`` is the array stored at the row's ``embedding_path``
            converted to float32; ``label`` is a scalar int64 tensor built
            from the row's ``label_col`` value.
        """
        row = self.df.iloc[idx]
        # astype() copies, so the in-place augmentation below never touches
        # anything but this sample's private array.
        embedding = np.load(row['embedding_path']).astype(np.float32)
        label = int(row[self.label_col])

        if self.augment:
            # Gaussian noise in embedding space (very small — maintains semantics)
            if np.random.random() < 0.3:
                embedding += np.random.randn(*embedding.shape).astype(np.float32) * 0.01
            # Random amplitude scaling
            # NOTE(review): the re-normalisation below divides by the norm, so
            # a pure scale factor is cancelled out again (up to rounding). It
            # is kept so the random stream stays identical to earlier runs.
            if np.random.random() < 0.2:
                embedding *= np.random.uniform(0.95, 1.05)
            # Re-normalise to unit sphere after augmentation
            # NOTE(review): non-augmented samples are returned as stored, so
            # this presumably assumes the saved embeddings are already
            # unit-norm — TODO confirm.
            norm = np.linalg.norm(embedding)
            if norm > 0:
                embedding = embedding / norm

        return (
            torch.tensor(embedding, dtype=torch.float32),
            torch.tensor(label, dtype=torch.long),
        )