Spaces:

sheilaseidel
/

kiloVAD

Sleeping

sheilaseidel Claude (@vertexai-global/anthropic.claude-opus-4-5@20251101) committed on Feb 23

Commit

f95eb98

1 Parent(s): 9f65b78

Add pruned 2.1k param model and simplify postprocessing options

- Add pruned model (2.1k params) to model dropdown
- Remove EMA postprocessing options, keep only Hysteresis and Median Filtering
- Add model switching handler to reload model when selection changes

Co-Authored-By: Claude (@vertexai-global/anthropic.claude-opus-4-5@20251101) <noreply@anthropic.com>

Files changed (3) hide show

app.py +38 -8
assets/pruned_2k_params/config.json +88 -0
assets/pruned_2k_params/pruned_model.pth +3 -0

app.py CHANGED Viewed

@@ -105,13 +105,10 @@ current_model_version = "V2"
 vad_preprocessor = None
 # Postprocessing options for kiloVAD
-# Uses hysteresis thresholding with optional smoothing (median or EMA)
 POSTPROCESSING_OPTIONS = [
     "Hysteresis Thresholding",
     "Median Filtering + Hysteresis Threshold",
-    "EMA + Hysteresis Threshold (alpha=0.5)",
-    "EMA + Hysteresis Threshold (alpha=0.4)",
-    "EMA + Hysteresis Threshold (alpha=0.3)",
 ]
 # Model paths configuration with optimized thresholds
@@ -125,6 +122,15 @@ MODEL_PATHS = {
         "version": "V2",
         "frame_sec": 0.2,
         "stride_sec": 0.05  # 50ms stride with overlapping 200ms frames
     }
 }
@@ -1472,7 +1478,7 @@ with gr.Blocks(title="kiloVAD") as demo:
                 # Model Selection
                 gr.Markdown("#### 🤖 Select Model")
                 model_dropdown = gr.Dropdown(
-                    choices=["Unpruned, 81k params"],
                     value="Unpruned, 81k params",
                     label="Model",
                     info="Select a kiloVAD model version"
@@ -1485,9 +1491,6 @@ with gr.Blocks(title="kiloVAD") as demo:
                     choices=[
                         "Hysteresis Thresholding",
                         "Median Filtering + Hysteresis Threshold",
-                        "EMA + Hysteresis Threshold (alpha=0.5)",
-                        "EMA + Hysteresis Threshold (alpha=0.4)",
-                        "EMA + Hysteresis Threshold (alpha=0.3)",
                     ],
                     value="Hysteresis Thresholding",
                     label="Postprocessing",
@@ -1624,6 +1627,33 @@ with gr.Blocks(title="kiloVAD") as demo:
         show_progress=False
     )
 if __name__ == "__main__":
     # Add assets directory to allowed paths for Gradio security
     assets_path = project_root / "assets"

 vad_preprocessor = None
 # Postprocessing options for kiloVAD
+# Uses hysteresis thresholding with optional smoothing (median)
 POSTPROCESSING_OPTIONS = [
     "Hysteresis Thresholding",
     "Median Filtering + Hysteresis Threshold",
 ]
 # Model paths configuration with optimized thresholds
         "version": "V2",
         "frame_sec": 0.2,
         "stride_sec": 0.05  # 50ms stride with overlapping 200ms frames
+    },
+    "Pruned, 2.1k params": {
+        "path": project_root / "assets" / "pruned_2k_params",
+        "model_file": "pruned_model.pth",
+        "threshold_low": 0.7626,
+        "threshold_high": 0.9029,
+        "version": "V2",
+        "frame_sec": 0.2,
+        "stride_sec": 0.05  # 50ms stride with overlapping 200ms frames
     }
 }
                 # Model Selection
                 gr.Markdown("#### 🤖 Select Model")
                 model_dropdown = gr.Dropdown(
+                    choices=["Unpruned, 81k params", "Pruned, 2.1k params"],
                     value="Unpruned, 81k params",
                     label="Model",
                     info="Select a kiloVAD model version"
                     choices=[
                         "Hysteresis Thresholding",
                         "Median Filtering + Hysteresis Threshold",
                     ],
                     value="Hysteresis Thresholding",
                     label="Postprocessing",
         show_progress=False
     )
+    # Handle model selection change: load the selected model and, when audio is present, re-run it
+    def on_model_change(model_name, audio, postprocessing):
+        try:
+            success, message = load_vad_model(model_name)  # unpacked as a (success, message) pair
+            if not success:
+                return f"❌ {message}", None, None  # load failed: status text only, the other two outputs get None
+            # Look up the low/high thresholds for the new model in MODEL_PATHS;
+            # the fallbacks match the values listed for "Pruned, 2.1k params"
+            model_info = MODEL_PATHS.get(model_name, {})
+            new_low = model_info.get("threshold_low", 0.7626)
+            new_high = model_info.get("threshold_high", 0.9029)
+            if audio is None:  # no audio supplied: report the load message without re-processing
+                return f"✅ {message}", None, None
+            # Re-process audio with the new model, using the thresholds looked up above
+            return process_uploaded_audio(audio, new_low, new_high, postprocessing)
+        except Exception as e:  # broad catch: any failure is returned as the status string
+            return f"❌ Error switching model: {str(e)}", None, None
+    model_dropdown.change(
+        fn=on_model_change,
+        inputs=[model_dropdown, audio_input, postprocessing_dropdown],  # map to (model_name, audio, postprocessing)
+        outputs=[status_display, vad_plot, stereo_audio_output],  # receive the three returned values, in order
+        show_progress=True
+    )
 if __name__ == "__main__":
     # Add assets directory to allowed paths for Gradio security
     assets_path = project_root / "assets"

assets/pruned_2k_params/config.json ADDED Viewed

	@@ -0,0 +1,88 @@

+{
+    "experiment": {
+        "name": "flexiblevad_frame200ms_seed256",
+        "debug": false,
+        "gpu_id": null,
+        "seed": 256
+    },
+    "data": {
+        "data_dir": "/mnt/azureml/cr/j/d94de43aa8c14a2eb32c4b6dcd863283/cap/data-capability/wd/INPUT_vad_data",
+        "split_name": "libri_dns_full_no_pure_noise_v2",
+        "num_workers": 12,
+        "frame_sec": 0.2,
+        "use_precomputed": true,
+        "precomputed_dir": "/tmp/vad_data/precomputed_features",
+        "use_shm_cache": false
+    },
+    "model": {
+        "class": "FlexibleVAD",
+        "n_mels": 64,
+        "frame_sec": 0.2
+    },
+    "training": {
+        "num_epochs": 40,
+        "batch_size": 512,
+        "loss": {
+            "type": "CrossEntropyLoss",
+            "label_smoothing": 0.09
+        },
+        "pauc_loss": {
+            "enabled": false,
+            "lambda_pauc": 0.8,
+            "tpr_low": 0.9,
+            "tpr_high": 1.0,
+            "gamma": 10.0,
+            "warmup_epochs": 5
+        },
+        "model_selection": {
+            "use_fpr95_for_best_model": false
+        },
+        "optimizer": {
+            "type": "SGD",
+            "base_lr": 0.0035,
+            "momentum": 0.9,
+            "nesterov": true,
+            "weight_decay": 0.000875
+        },
+        "lr_scheduler": {
+            "type": "cyclic",
+            "min_lr": 1e-05,
+            "warmup_epochs": 5,
+            "hold_epochs": 5,
+            "decay_epochs": 5,
+            "cycle_period": 20
+        }
+    },
+    "dataloader": {
+        "shuffle_train": true,
+        "pin_memory": true,
+        "persistent_workers": true,
+        "prefetch_factor": 4,
+        "drop_last": false
+    },
+    "distributed": {
+        "enabled": true,
+        "backend": "nccl",
+        "auto_scale_workers": true
+    },
+    "logging": {
+        "level": "INFO",
+        "log_to_file": true,
+        "log_to_console": true
+    },
+    "evaluation": {
+        "save_best_model": true,
+        "eval_noise_rejection": true,
+        "plot_roc_curves": true,
+        "test_categories": [
+            "Clean LibriSpeech",
+            "Windy LibriSpeech",
+            "LibriSpeech+DNS Synthetic",
+            "DNS Speech+Noise",
+            "DNS Pure Noise"
+        ]
+    },
+    "actual_num_epochs": 40,
+    "model_total_parameters": 81090,
+    "device": "cuda:0"
+}

assets/pruned_2k_params/pruned_model.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:4ff919ef23f0062e40edfd37b9a8c4f2fd99c233247a401822580adefb6d5247
+size 35175