Upload folder using huggingface_hub

Changed files:
- README.md +49 -18
- __pycache__/pipeline.cpython-310.pyc +0 -0
- config.json +1 -1
- pipeline.py +73 -12
- requirements.txt +8 -0
- test.py +12 -0
README.md CHANGED

````diff
@@ -17,6 +17,8 @@ A regression probe trained on top of Whisper-large-v3 encoder features for estim
 
 **Score scale:** 1.0 (most severe dysarthria) to 7.0 (typical speech)
 
+**GitHub:** [JaesungBae/DA-DSQA](https://github.com/JaesungBae/DA-DSQA)
+
 ## Model Description
 
 This model uses a three-stage training pipeline:
@@ -37,14 +39,43 @@ This repository contains **9 checkpoints** trained with different contrastive lo
 |---|---|---|
 | `proposed_L_coarse_tau0.1` | Proposed (L_coarse) | 0.1 |
 | `proposed_L_coarse_tau1.0` | Proposed (L_coarse) | 1.0 |
-
+| `proposed_L_coarse_tau10.0` | Proposed (L_coarse) | 10.0 |
 | `proposed_L_coarse_tau50.0` | Proposed (L_coarse) | 50.0 |
-| `proposed_L_coarse_tau100.0` | Proposed (L_coarse) | 100.0 |
+| **`proposed_L_coarse_tau100.0`** (default) | Proposed (L_coarse) | 100.0 |
 | `proposed_L_cont_tau0.1` | Proposed (L_cont) | 0.1 |
 | `proposed_L_dis_tau1.0` | Proposed (L_dis) | 1.0 |
 | `rank-n-contrast_tau100.0` | Rank-N-Contrast | 100.0 |
 | `simclr_tau0.1` | SimCLR | 0.1 |
 
+## Setup
+
+### 1. Create conda environment
+
+```bash
+conda create -n da-dsqa python=3.10 -y
+conda activate da-dsqa
+```
+
+### 2. Install PyTorch with CUDA
+
+```bash
+conda install pytorch torchaudio -c pytorch -y
+```
+
+> For a GPU build with a specific CUDA version, see [pytorch.org](https://pytorch.org/get-started/locally/) for the appropriate command.
+
+### 3. Install remaining dependencies
+
+```bash
+pip install -r requirements.txt
+```
+
+> **Note:** [Silero VAD](https://github.com/snakers4/silero-vad) is loaded automatically at runtime via `torch.hub`; no separate installation needed.
+
+### Runtime Dependencies
+
+This model loads **openai/whisper-large-v3** (~6GB) and **Silero VAD** at initialization time. Ensure sufficient memory is available.
+
 ## Usage
 
 ### With the custom pipeline
@@ -53,16 +84,16 @@ This repository contains **9 checkpoints** trained with different contrastive lo
 from huggingface_hub import snapshot_download
 
 # Download the model
-model_dir = snapshot_download("jaesungbae/
+model_dir = snapshot_download("jaesungbae/da-dsqa")
 
-# Load pipeline (defaults to
+# Load pipeline (defaults to proposed_L_coarse_tau100.0)
 from pipeline import PreTrainedPipeline
 pipe = PreTrainedPipeline(model_dir)
 
 # Run inference
 result = pipe("/path/to/audio.wav")
 print(result)
-# {"severity_score": 4.25, "raw_score": 4.2483, "model_name": "
+# {"severity_score": 4.25, "raw_score": 4.2483, "model_name": "proposed_L_coarse_tau100.0"}
 ```
 
 ### Select a specific checkpoint
@@ -79,6 +110,18 @@ result = pipe("/path/to/audio.wav", model_name="proposed_L_dis_tau1.0")
 result = pipe("/path/to/audio.wav", model_name="proposed_L_dis_tau1.0")
 ```
 
+### Batch inference
+
+```python
+results = pipe.batch_inference([
+    "/path/to/audio1.wav",
+    "/path/to/audio2.wav",
+    "/path/to/audio3.wav",
+])
+for r in results:
+    print(f"{r['file']}: {r['severity_score']}")
+```
+
 ### List available checkpoints
 
 ```python
@@ -101,21 +144,9 @@ Clone the [full repository](https://github.com/JaesungBae/DA-DSQA) and run:
 ```bash
 python inference.py \
   --wav /path/to/audio.wav \
-  --checkpoint ./checkpoints/stage3/
+  --checkpoint ./checkpoints/stage3/proposed_L_coarse_tau100.0/average
 ```
 
-## Requirements
-
-- Python 3.10+
-- PyTorch + torchaudio
-- transformers >= 4.40.0
-- safetensors >= 0.4.0
-- Silero VAD (loaded via `torch.hub` at runtime)
-
-## Runtime Dependencies
-
-This model loads **openai/whisper-large-v3** (~6GB) and **Silero VAD** at initialization time. Ensure sufficient memory is available.
-
 ## Citation
 
 ```bibtex
````
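Note: the Setup section above says Silero VAD is fetched through `torch.hub` at pipeline initialization. A minimal sketch of what that runtime load looks like, assuming the public `snakers4/silero-vad` hub entry (the exact call inside `pipeline.py` may differ):

```python
import torch

# Fetch Silero VAD from torch.hub; the hub entry returns (model, utils).
# get_speech_timestamps is the helper used to find speech regions.
vad_model, utils = torch.hub.load(
    repo_or_dir="snakers4/silero-vad", model="silero_vad"
)
(get_speech_timestamps, save_audio, read_audio,
 VADIterator, collect_chunks) = utils
```

This is also why the first pipeline construction needs network access; `torch.hub` caches the repository afterwards.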
__pycache__/pipeline.cpython-310.pyc ADDED

Binary file (9.13 kB).
config.json CHANGED

```diff
@@ -7,7 +7,7 @@
   "num_classes": 1,
   "whisper_model_name": "openai/whisper-large-v3",
   "sampling_rate": 16000,
-  "default_checkpoint": "
+  "default_checkpoint": "proposed_L_coarse_tau100.0",
   "available_checkpoints": [
     "proposed_L_coarse_tau0.1",
     "proposed_L_coarse_tau1.0",
```
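Note: since `default_checkpoint` and `available_checkpoints` live in plain JSON, checkpoint discovery does not require instantiating the (heavy) pipeline. A small sketch using the field names from the diff above; the `snapshot_download` call mirrors the README usage:

```python
import json
import os

from huggingface_hub import snapshot_download

# Inspect the checkpoint config without loading Whisper or the probe.
model_dir = snapshot_download("jaesungbae/da-dsqa")
with open(os.path.join(model_dir, "config.json")) as f:
    cfg = json.load(f)

print(cfg["default_checkpoint"])     # "proposed_L_coarse_tau100.0"
print(cfg["available_checkpoints"])  # the nine checkpoint names
```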
pipeline.py CHANGED

```diff
@@ -28,12 +28,13 @@ import os
 
 import torch
 import torch.nn as nn
+import soundfile as sf
 import torchaudio
 
 SAMPLING_RATE = 16000
 WHISPER_MODEL_NAME = "openai/whisper-large-v3"
 WHISPER_HIDDEN_DIM = 1280
-DEFAULT_CHECKPOINT = "
+DEFAULT_CHECKPOINT = "proposed_L_coarse_tau100.0"
 
 
 class WhisperFeatureProbeV2(nn.Module):
@@ -260,6 +261,19 @@ class PreTrainedPipeline:
         """Return list of available checkpoint names."""
         return list(self.available_checkpoints)
 
+    def _load_wav(self, inputs):
+        """Load and preprocess a single audio input to a 1D waveform tensor."""
+        if isinstance(inputs, (bytes, bytearray)):
+            data, sr = sf.read(io.BytesIO(inputs), dtype="float32")
+        else:
+            data, sr = sf.read(inputs, dtype="float32")
+        wav = torch.from_numpy(data).float()
+        if wav.dim() > 1:
+            wav = wav.mean(dim=-1)
+        if sr != SAMPLING_RATE:
+            wav = torchaudio.functional.resample(wav, sr, SAMPLING_RATE)
+        return wav
+
     def __call__(self, inputs, model_name: str = None):
         """
         Run severity estimation on audio input.
@@ -275,17 +289,7 @@ class PreTrainedPipeline:
         if model_name is not None:
             self.switch_model(model_name)
 
-
-        if isinstance(inputs, str):
-            wav, sr = torchaudio.load(inputs)
-        elif isinstance(inputs, bytes):
-            wav, sr = torchaudio.load(io.BytesIO(inputs))
-        else:
-            wav, sr = torchaudio.load(io.BytesIO(inputs))
-
-        if sr != SAMPLING_RATE:
-            wav = torchaudio.functional.resample(wav, sr, SAMPLING_RATE)
-        wav = wav.squeeze()
+        wav = self._load_wav(inputs)
 
         # VAD
         wav = _apply_vad(wav, self.vad_model, self.get_speech_timestamps)
@@ -305,3 +309,60 @@ class PreTrainedPipeline:
             "raw_score": round(score, 4),
             "model_name": self.current_model_name,
         }
+
+    def batch_inference(self, input_list, model_name: str = None):
+        """
+        Run severity estimation on a batch of audio files.
+
+        Whisper processes one file at a time (due to variable-length VAD output),
+        but the probe runs as a single padded batch for efficiency.
+
+        Args:
+            input_list: list of file paths (str) or raw audio bytes
+            model_name: optionally override the checkpoint for this call
+
+        Returns:
+            list of dicts, each with "file", "severity_score", "raw_score",
+            and "model_name"
+        """
+        if model_name is not None:
+            self.switch_model(model_name)
+
+        # Extract features for each file
+        all_features = []
+        lengths = []
+        for inputs in input_list:
+            wav = self._load_wav(inputs)
+            wav = _apply_vad(wav, self.vad_model, self.get_speech_timestamps)
+            features = _extract_features(
+                wav, self.whisper, self.processor, self.device
+            )
+            feat = features.squeeze(0)  # (T, hidden_dim)
+            all_features.append(feat)
+            lengths.append(feat.shape[0])
+
+        # Pad and batch
+        max_len = max(lengths)
+        hidden_dim = all_features[0].shape[1]
+        batch_size = len(all_features)
+
+        padded = torch.zeros(batch_size, max_len, hidden_dim, device=self.device)
+        for i, feat in enumerate(all_features):
+            padded[i, : lengths[i]] = feat
+        lengths_tensor = torch.tensor(lengths, device=self.device)
+
+        # Batched probe inference
+        with torch.no_grad():
+            output = self.probe(padded, lengths=lengths_tensor)
+            scores = output.logits.squeeze(-1).cpu().tolist()
+
+        results = []
+        for i, inputs in enumerate(input_list):
+            score = scores[i]
+            results.append({
+                "file": inputs if isinstance(inputs, str) else f"input_{i}",
+                "severity_score": round(max(1.0, min(7.0, score)), 2),
+                "raw_score": round(score, 4),
+                "model_name": self.current_model_name,
+            })
+        return results
```
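Note: the manual zero-padding in `batch_inference` is equivalent to `torch.nn.utils.rnn.pad_sequence`. A self-contained sketch of just the padding step, with random tensors standing in for per-file Whisper features (illustration only, not the shipped code):

```python
import torch
from torch.nn.utils.rnn import pad_sequence

# Two fake feature sequences of shape (T_i, hidden_dim), T_i varying per file.
all_features = [torch.randn(120, 1280), torch.randn(87, 1280)]
lengths = torch.tensor([f.shape[0] for f in all_features])

# pad_sequence stacks (T_i, D) tensors into (B, T_max, D), zero-filling the
# tails of shorter sequences -- the same result as the explicit loop above.
padded = pad_sequence(all_features, batch_first=True)
assert padded.shape == (2, 120, 1280)
```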
requirements.txt ADDED

```diff
@@ -0,0 +1,8 @@
+# Install PyTorch separately first (via conda or pip):
+# conda install pytorch torchaudio -c pytorch -y
+# See https://pytorch.org/get-started/locally/ for GPU builds.
+
+transformers>=4.40.0
+safetensors>=0.4.0
+huggingface_hub>=0.20.0
+soundfile>=0.12.0
```
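Note: `torch` and `torchaudio` are deliberately absent from `requirements.txt`; they are installed via conda first, per the Setup section. A quick import check after both install steps, as a sanity test of the assumed environment:

```python
# Verify that the conda half (torch, torchaudio) and the pip half
# (requirements.txt) of the install both resolved.
import safetensors
import soundfile
import torch
import torchaudio
import transformers

print("torch", torch.__version__, "| transformers", transformers.__version__)
```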
test.py ADDED

```diff
@@ -0,0 +1,12 @@
+from huggingface_hub import snapshot_download
+
+# Download the model from HuggingFace
+model_dir = snapshot_download("jaesungbae/da-dsqa")
+
+# Load pipeline (defaults to proposed_L_coarse_tau100.0)
+from pipeline import PreTrainedPipeline
+pipe = PreTrainedPipeline(model_dir)
+
+# Run inference
+result = pipe("/projects/bedl/jbae4/workspace_2026/severity_level_classifier_release/sample_wavs/Naturalness/level_1/d1b9444a-2ed1-438e-fd68-08dcb5d1edd7_1071_8831.wav")
+print(result)
```
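Note: the wav path in `test.py` is specific to the author's cluster. A hypothetical variant that takes the path from the command line instead:

```python
import sys

from huggingface_hub import snapshot_download

from pipeline import PreTrainedPipeline

# Usage: python test.py /path/to/audio.wav
model_dir = snapshot_download("jaesungbae/da-dsqa")
pipe = PreTrainedPipeline(model_dir)
print(pipe(sys.argv[1]))
```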