JusperLee committed on
Commit
44a284d
·
verified ·
1 Parent(s): 972d2eb

Upload Dolphin audio-visual speech separation model

Browse files
Files changed (2) hide show
  1. README.md +6 -54
  2. config.json +122 -124
README.md CHANGED
@@ -1,58 +1,10 @@
1
  ---
2
- license: apache-2.0
3
- datasets:
4
- - alibabasglab/VoxCeleb2-mix
5
- language:
6
- - en
7
  tags:
8
- - speech
9
- pipeline_tag: audio-to-audio
10
  ---
11
- # Dolphin: Audio-Visual Speech Separation Model
12
 
13
- Dolphin is a state-of-the-art audio-visual speech separation model that leverages both audio and visual information to separate target speech from background noise and other speakers.
14
-
15
- ## Model Description
16
-
17
- This model implements the Dolphin architecture for audio-visual speech separation, combining:
18
- - Audio encoder for processing audio signals
19
- - Video encoder for processing visual lip movements
20
- - Multi-modal fusion mechanism
21
- - Transformer-based separator with global and local attention blocks
22
-
23
- ## Usage
24
-
25
- ```python
26
- from huggingface_hub import PyTorchModelHubMixin
27
- import torch
28
-
29
- # Load the model directly from Hugging Face Hub
30
- model = Dolphin.from_pretrained("your-username/dolphin-model")
31
-
32
- # Example usage
33
- audio_input = torch.randn(1, 16000) # 1 second of audio at 16kHz
34
- video_input = torch.randn(1, 1, 25, 88, 88) # 25 frames of 88x88 grayscale video
35
-
36
- # Perform speech separation
37
- separated_audio = model(audio_input, video_input)
38
- ```
39
-
40
- ## Model Architecture
41
-
42
- - **Audio Encoder**: Processes raw audio waveforms
43
- - **Video Encoder**: Processes lip movement sequences
44
- - **Feature Projector**: Projects audio features to appropriate dimensions
45
- - **Separator**: Multi-stage transformer with global and local attention
46
- - **Audio Decoder**: Reconstructs separated audio waveform
47
-
48
- ## Training Data
49
-
50
- The model was trained on audio-visual speech separation datasets with mixed speech scenarios.
51
-
52
- ## Citation
53
-
54
- If you use this model in your research, please cite the original Dolphin paper.
55
-
56
- ## License
57
-
58
- This model is released under the Apache-2.0 License.
 
1
  ---
 
 
 
 
 
2
  tags:
3
+ - model_hub_mixin
4
+ - pytorch_model_hub_mixin
5
  ---
 
6
 
7
+ This model has been pushed to the Hub using the [PyTorchModelHubMixin](https://huggingface.co/docs/huggingface_hub/package_reference/mixins#huggingface_hub.PyTorchModelHubMixin) integration:
8
+ - Code: [More Information Needed]
9
+ - Paper: [More Information Needed]
10
+ - Docs: [More Information Needed]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
config.json CHANGED
@@ -1,138 +1,136 @@
1
  {
2
- "model_type": "dolphin",
3
- "task": "audio_visual_speech_separation",
 
 
 
 
4
  "framework": "pytorch",
5
  "license": "apache-2.0",
6
- "tags": [
7
- "audio",
8
- "speech-separation",
9
- "audio-visual",
10
- "pytorch",
11
- "dolphin"
12
- ],
13
- "model_config": {
14
- "num_stages": 4,
15
- "sample_rate": 16000,
16
- "vpre_channels": 3872,
17
- "vmid_channels": 512,
18
- "vin_channels": 64,
19
- "vout_channels": 64,
20
- "module_audio_enc": {
21
- "in_channels": 1,
22
- "out_channels": 256,
23
- "kernel_size": 16,
24
- "stride": 4,
25
- "groups": 1,
26
- "bias": false
27
- },
28
- "module_feature_projector": {
29
- "num_channels": 256,
30
- "in_channels": 256,
31
- "out_channels": 128,
32
- "kernel_size": 1,
33
- "bias": false
34
- },
35
- "module_separator": {
36
- "num_stages": 4,
37
- "relative_positional_encoding": {
38
  "in_channels": 128,
39
- "num_heads": 8,
40
- "maxlen": 2000,
41
- "embed_v": false
42
- },
43
- "enc_stage": {
44
- "global_blocks": {
45
- "in_channels": 128,
46
- "num_mha_heads": 8,
47
- "dropout_rate": 0.05
48
- },
49
- "local_blocks": {
50
- "in_channels": 128,
51
- "kernel_size": 65,
52
- "dropout_rate": 0.05
53
- },
54
- "down_conv_layer": {
55
- "in_channels": 128,
56
- "samp_kernel_size": 5
57
- }
58
  },
59
- "simple_fusion": {
60
- "out_channels": 128
 
 
61
  },
62
- "dec_stage": {
63
- "global_blocks": {
64
- "in_channels": 128,
65
- "num_mha_heads": 8,
66
- "dropout_rate": 0.05
67
- },
68
- "local_blocks": {
69
- "in_channels": 128,
70
- "kernel_size": 65,
71
- "dropout_rate": 0.05
72
- },
73
- "spk_attention": {
74
- "in_channels": 128,
75
- "num_mha_heads": 8,
76
- "dropout_rate": 0.05
77
- }
78
  }
79
  },
80
- "module_output_layer": {
81
- "in_channels": 256,
82
- "out_channels": 128
 
 
 
 
 
 
 
 
 
 
 
 
83
  },
84
- "module_audio_dec": {
85
- "in_channels": 256,
86
- "out_channels": 1,
87
- "kernel_size": 16,
88
- "stride": 4,
89
- "bias": false
90
  },
91
- "video_encoder_params": {
92
- "layers": [
93
- "residual",
94
- "compress_space",
95
- "consecutive_residual",
96
- "compress_space",
97
- "consecutive_residual",
98
- "linear_attend_space",
99
- "compress_space",
100
- "consecutive_residual",
101
- "attend_space"
102
- ],
103
- "image_size": 88,
104
- "in_channel": 1,
105
- "init_channel": 4,
106
- "max_dim": 32,
107
- "input_conv_kernel_size": [
108
- 7,
109
- 7,
110
- 7
111
- ],
112
- "output_conv_kernel_size": [
113
- 3,
114
- 3,
115
- 3
116
- ],
117
- "residual_conv_kernel_size": 3,
118
- "pad_mode": "constant",
119
- "attn_dim_head": 32,
120
- "attn_heads": 8,
121
- "attn_dropout": 0.0,
122
- "flash_attn": true,
123
- "linear_attn_dim_head": 8,
124
- "linear_attn_heads": 16,
125
- "num_quantizers": 1,
126
- "codebook_size": 256,
127
- "codebook_dim": 64,
128
- "commitment_cost": 1.0,
129
- "distill_cost": 1.0
130
  }
131
  },
132
- "architectures": [
133
- "Dolphin"
 
 
 
 
 
 
134
  ],
135
- "auto_map": {
136
- "AutoModel": "dolphin.Dolphin"
137
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
138
  }
 
1
  {
2
+ "architectures": [
3
+ "Dolphin"
4
+ ],
5
+ "auto_map": {
6
+ "AutoModel": "dolphin.Dolphin"
7
+ },
8
  "framework": "pytorch",
9
  "license": "apache-2.0",
10
+ "model_type": "dolphin",
11
+ "module_audio_dec": {
12
+ "bias": false,
13
+ "in_channels": 256,
14
+ "kernel_size": 16,
15
+ "out_channels": 1,
16
+ "stride": 4
17
+ },
18
+ "module_audio_enc": {
19
+ "bias": false,
20
+ "groups": 1,
21
+ "in_channels": 1,
22
+ "kernel_size": 16,
23
+ "out_channels": 256,
24
+ "stride": 4
25
+ },
26
+ "module_feature_projector": {
27
+ "bias": false,
28
+ "in_channels": 256,
29
+ "kernel_size": 1,
30
+ "num_channels": 256,
31
+ "out_channels": 128
32
+ },
33
+ "module_output_layer": {
34
+ "in_channels": 256,
35
+ "out_channels": 128
36
+ },
37
+ "module_separator": {
38
+ "dec_stage": {
39
+ "global_blocks": {
40
+ "dropout_rate": 0.05,
 
41
  "in_channels": 128,
42
+ "num_mha_heads": 8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
43
  },
44
+ "local_blocks": {
45
+ "dropout_rate": 0.05,
46
+ "in_channels": 128,
47
+ "kernel_size": 65
48
  },
49
+ "spk_attention": {
50
+ "dropout_rate": 0.05,
51
+ "in_channels": 128,
52
+ "num_mha_heads": 8
 
 
 
 
 
 
 
 
 
 
 
 
53
  }
54
  },
55
+ "enc_stage": {
56
+ "down_conv_layer": {
57
+ "in_channels": 128,
58
+ "samp_kernel_size": 5
59
+ },
60
+ "global_blocks": {
61
+ "dropout_rate": 0.05,
62
+ "in_channels": 128,
63
+ "num_mha_heads": 8
64
+ },
65
+ "local_blocks": {
66
+ "dropout_rate": 0.05,
67
+ "in_channels": 128,
68
+ "kernel_size": 65
69
+ }
70
  },
71
+ "num_stages": 4,
72
+ "relative_positional_encoding": {
73
+ "embed_v": false,
74
+ "in_channels": 128,
75
+ "maxlen": 2000,
76
+ "num_heads": 8
77
  },
78
+ "simple_fusion": {
79
+ "out_channels": 128
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
80
  }
81
  },
82
+ "num_stages": 4,
83
+ "sample_rate": 16000,
84
+ "tags": [
85
+ "audio",
86
+ "speech-separation",
87
+ "audio-visual",
88
+ "pytorch",
89
+ "dolphin"
90
  ],
91
+ "task": "audio_visual_speech_separation",
92
+ "video_encoder_params": {
93
+ "attn_dim_head": 32,
94
+ "attn_dropout": 0.0,
95
+ "attn_heads": 8,
96
+ "codebook_dim": 64,
97
+ "codebook_size": 256,
98
+ "commitment_cost": 1.0,
99
+ "distill_cost": 1.0,
100
+ "flash_attn": true,
101
+ "image_size": 88,
102
+ "in_channel": 1,
103
+ "init_channel": 4,
104
+ "input_conv_kernel_size": [
105
+ 7,
106
+ 7,
107
+ 7
108
+ ],
109
+ "layers": [
110
+ "residual",
111
+ "compress_space",
112
+ "consecutive_residual",
113
+ "compress_space",
114
+ "consecutive_residual",
115
+ "linear_attend_space",
116
+ "compress_space",
117
+ "consecutive_residual",
118
+ "attend_space"
119
+ ],
120
+ "linear_attn_dim_head": 8,
121
+ "linear_attn_heads": 16,
122
+ "max_dim": 32,
123
+ "num_quantizers": 1,
124
+ "output_conv_kernel_size": [
125
+ 3,
126
+ 3,
127
+ 3
128
+ ],
129
+ "pad_mode": "constant",
130
+ "residual_conv_kernel_size": 3
131
+ },
132
+ "vin_channels": 64,
133
+ "vmid_channels": 512,
134
+ "vout_channels": 64,
135
+ "vpre_channels": 3872
136
  }