ubuntu committed on
Commit ·
900f8f2
1
Parent(s): 81f0942
v1
Browse files
- OmniCodec.pth +3 -0
- Qwen3AuTEncoder/config.json +28 -0
- Qwen3AuTEncoder/model.safetensors +3 -0
- Qwen3AuTEncoder/preprocessor_config.json +30 -0
OmniCodec.pth
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:30b4850299bd00a00e71c01aa73cadb01776731d6e45a0699a9062f99d866b63
|
| 3 |
+
size 627755403
|
Qwen3AuTEncoder/config.json
ADDED
|
@@ -0,0 +1,28 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"activation_dropout": 0,
|
| 3 |
+
"activation_function": "gelu",
|
| 4 |
+
"architectures": [
|
| 5 |
+
"Qwen3OmniMoeAudioEncoder"
|
| 6 |
+
],
|
| 7 |
+
"attention_dropout": 0,
|
| 8 |
+
"conv_chunksize": 500,
|
| 9 |
+
"d_model": 1280,
|
| 10 |
+
"downsample_hidden_size": 480,
|
| 11 |
+
"dropout": 0,
|
| 12 |
+
"dtype": "bfloat16",
|
| 13 |
+
"encoder_attention_heads": 20,
|
| 14 |
+
"encoder_ffn_dim": 5120,
|
| 15 |
+
"encoder_layers": 32,
|
| 16 |
+
"initializer_range": 0.02,
|
| 17 |
+
"max_source_positions": 1500,
|
| 18 |
+
"model_type": "qwen3_omni_moe_audio_encoder",
|
| 19 |
+
"n_window": 50,
|
| 20 |
+
"n_window_infer": 800,
|
| 21 |
+
"num_hidden_layers": 32,
|
| 22 |
+
"num_mel_bins": 128,
|
| 23 |
+
"output_dim": 2048,
|
| 24 |
+
"scale_embedding": false,
|
| 25 |
+
"tf_legacy_loss": false,
|
| 26 |
+
"transformers_version": "4.57.0.dev0",
|
| 27 |
+
"use_bfloat16": false
|
| 28 |
+
}
|
Qwen3AuTEncoder/model.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:72229aa6fc359722c76c3627a56357e805f8c675a23487c0f827c0fc83e3f77a
|
| 3 |
+
size 1295908432
|
Qwen3AuTEncoder/preprocessor_config.json
ADDED
|
@@ -0,0 +1,30 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"dither": 0.0,
|
| 3 |
+
"feature_extractor_type": "WhisperFeatureExtractor",
|
| 4 |
+
"feature_size": 128,
|
| 5 |
+
"hop_length": 160,
|
| 6 |
+
"image_mean": [
|
| 7 |
+
0.5,
|
| 8 |
+
0.5,
|
| 9 |
+
0.5
|
| 10 |
+
],
|
| 11 |
+
"image_processor_type": "Qwen2VLImageProcessor",
|
| 12 |
+
"image_std": [
|
| 13 |
+
0.5,
|
| 14 |
+
0.5,
|
| 15 |
+
0.5
|
| 16 |
+
],
|
| 17 |
+
"max_pixels": 12845056,
|
| 18 |
+
"merge_size": 2,
|
| 19 |
+
"min_pixels": 3136,
|
| 20 |
+
"n_fft": 400,
|
| 21 |
+
"n_samples": 4800000,
|
| 22 |
+
"nb_max_frames": 30000,
|
| 23 |
+
"padding_side": "right",
|
| 24 |
+
"padding_value": 0.0,
|
| 25 |
+
"patch_size": 16,
|
| 26 |
+
"processor_class": "Qwen3OmniMoeProcessor",
|
| 27 |
+
"return_attention_mask": true,
|
| 28 |
+
"sampling_rate": 16000,
|
| 29 |
+
"temporal_patch_size": 2
|
| 30 |
+
}
|