Upload merged Norwegian fine-tuned VibeVoice-7B model

Browse files

Files changed (8) hide show

README.md +62 -0
config.json +151 -0
model-00001-of-00004.safetensors +3 -0
model-00002-of-00004.safetensors +3 -0
model-00003-of-00004.safetensors +3 -0
model-00004-of-00004.safetensors +3 -0
model.safetensors.index.json +0 -0
preprocessor_config.json +12 -0

README.md ADDED Viewed

	@@ -0,0 +1,62 @@

+---
+license: other
+license_name: vibevoice-community
+license_link: https://huggingface.co/vibevoice/VibeVoice-7B/blob/main/LICENSE
+base_model: vibevoice/VibeVoice-7B
+tags:
+- tts
+- text-to-speech
+- speech-synthesis
+- norwegian
+- bokmal
+- nynorsk
+language:
+- "no"
+- nb
+- nn
+---
+# VibeVoice 7B Norwegian
+This is a Norwegian fine-tuned version of [VibeVoice-7B](https://huggingface.co/vibevoice/VibeVoice-7B), a state-of-the-art text-to-speech model.
+## Training Details
+This model was trained using a progressive 3-stage fine-tuning approach:
+1. **Stage 1**: Initial Norwegian (Bokmål) training on Mozilla Common Voice
+2. **Stage 2**: Continued training on broader Norwegian data
+3. **Stage 3**: Dialect-specific fine-tuning for the Østnorsk/Oslo dialect
+The LoRA adapter was merged into the base model weights to create this standalone fine-tuned model.
+### Training Configuration
+- LoRA rank: 32
+- LoRA alpha: 128
+- Target modules: q_proj, k_proj, v_proj, o_proj, gate_proj, up_proj, down_proj
+- Diffusion head: Fully trained
+- Precision: bfloat16
+## Usage
+```python
+from transformers import AutoProcessor, AutoModel
+import torch
+processor = AutoProcessor.from_pretrained("heiertech/vibevoice-7b-norwegian")
+model = AutoModel.from_pretrained("heiertech/vibevoice-7b-norwegian", torch_dtype=torch.bfloat16)
+# Generate speech
+text = "Hei, dette er en test av den norske stemmen."
+inputs = processor(text=text, return_tensors="pt")
+outputs = model.generate(**inputs)
+```
+## License
+This model inherits the license from the base VibeVoice-7B model. Please see the [original license](https://huggingface.co/vibevoice/VibeVoice-7B/blob/main/LICENSE) for details.
+## Acknowledgments
+- Base model: [vibevoice/VibeVoice-7B](https://huggingface.co/vibevoice/VibeVoice-7B)
+- Training data: Mozilla Common Voice Norwegian

config.json ADDED Viewed

	@@ -0,0 +1,151 @@

+{
+  "_attn_implementation_autoset": false,
+  "acostic_vae_dim": 64,
+  "acoustic_tokenizer_config": {
+    "causal": true,
+    "channels": 1,
+    "conv_bias": true,
+    "conv_norm": "none",
+    "corpus_normalize": 0.0,
+    "decoder_depths": null,
+    "decoder_n_filters": 32,
+    "decoder_ratios": [
+      8,
+      5,
+      5,
+      4,
+      2,
+      2
+    ],
+    "disable_last_norm": true,
+    "dtype": "bfloat16",
+    "encoder_depths": "3-3-3-3-3-3-8",
+    "encoder_n_filters": 32,
+    "encoder_ratios": [
+      8,
+      5,
+      5,
+      4,
+      2,
+      2
+    ],
+    "fix_std": 0.5,
+    "layer_scale_init_value": 1e-06,
+    "layernorm": "RMSNorm",
+    "layernorm_elementwise_affine": true,
+    "layernorm_eps": 1e-05,
+    "mixer_layer": "depthwise_conv",
+    "model_type": "vibevoice_acoustic_tokenizer",
+    "pad_mode": "constant",
+    "std_dist_type": "gaussian",
+    "vae_dim": 64,
+    "weight_init_value": 0.01
+  },
+  "acoustic_vae_dim": 64,
+  "architectures": [
+    "VibeVoiceForConditionalGeneration"
+  ],
+  "decoder_config": {
+    "attention_dropout": 0.0,
+    "dtype": "bfloat16",
+    "hidden_act": "silu",
+    "hidden_size": 3584,
+    "initializer_range": 0.02,
+    "intermediate_size": 18944,
+    "layer_types": [
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention"
+    ],
+    "max_position_embeddings": 32768,
+    "max_window_layers": 28,
+    "model_type": "qwen2",
+    "num_attention_heads": 28,
+    "num_hidden_layers": 28,
+    "num_key_value_heads": 4,
+    "rms_norm_eps": 1e-06,
+    "rope_scaling": null,
+    "rope_theta": 1000000.0,
+    "sliding_window": null,
+    "use_cache": true,
+    "use_mrope": false,
+    "use_sliding_window": false,
+    "vocab_size": 152064
+  },
+  "diffusion_head_config": {
+    "ddpm_batch_mul": 4,
+    "ddpm_beta_schedule": "cosine",
+    "ddpm_num_inference_steps": 20,
+    "ddpm_num_steps": 1000,
+    "diffusion_type": "ddpm",
+    "dtype": "bfloat16",
+    "head_ffn_ratio": 3.0,
+    "head_layers": 4,
+    "hidden_size": 3584,
+    "latent_size": 64,
+    "model_type": "vibevoice_diffusion_head",
+    "prediction_type": "v_prediction",
+    "rms_norm_eps": 1e-05,
+    "speech_vae_dim": 64
+  },
+  "dtype": "bfloat16",
+  "model_type": "vibevoice",
+  "semantic_tokenizer_config": {
+    "causal": true,
+    "channels": 1,
+    "conv_bias": true,
+    "conv_norm": "none",
+    "corpus_normalize": 0.0,
+    "disable_last_norm": true,
+    "dtype": "bfloat16",
+    "encoder_depths": "3-3-3-3-3-3-8",
+    "encoder_n_filters": 32,
+    "encoder_ratios": [
+      8,
+      5,
+      5,
+      4,
+      2,
+      2
+    ],
+    "fix_std": 0,
+    "layer_scale_init_value": 1e-06,
+    "layernorm": "RMSNorm",
+    "layernorm_elementwise_affine": true,
+    "layernorm_eps": 1e-05,
+    "mixer_layer": "depthwise_conv",
+    "model_type": "vibevoice_semantic_tokenizer",
+    "pad_mode": "constant",
+    "std_dist_type": "none",
+    "vae_dim": 128,
+    "weight_init_value": 0.01
+  },
+  "semantic_vae_dim": 128,
+  "tie_word_embeddings": false,
+  "transformers_version": "4.57.3"
+}

model-00001-of-00004.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:eb423717f01ba6aa890e122cf7f662f71a8a4ea0431f218aa62a1f7174b2b6fd
+size 4877662532

model-00002-of-00004.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:71d015dcc94e9d00cc4b4bafc57b567768bd723acab834e395364c21533ffbff
+size 4932752840

model-00003-of-00004.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:f8489291ef93f10bcbd2042df9bba572e1d60972b3c99655c99a796c5ff3753b
+size 4982901128

model-00004-of-00004.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:12575fd8c03051efa93e5f14740b5bf80e43de2f4d7fce4b21ed3f24f9d35ea3
+size 3893553730

model.safetensors.index.json ADDED Viewed

The diff for this file is too large to render. See raw diff

preprocessor_config.json ADDED Viewed

	@@ -0,0 +1,12 @@

+{
+  "processor_class": "VibeVoiceProcessor",
+  "speech_tok_compress_ratio": 3200,
+  "db_normalize": true,
+  "audio_processor": {
+    "feature_extractor_type": "VibeVoiceTokenizerProcessor",
+    "sampling_rate": 24000,
+    "normalize_audio": true,
+    "target_dB_FS": -25,
+    "eps": 1e-06
+  }
+}