remove: tts-large-fp8/ — pre-quantized FP8 variant retired (transformers/torchao reload bugs + no CPU/disk spillover; bf16 + runtime NF4 covers the smaller-card use case better)

Browse files

Files changed (7) hide show

tts-large-fp8/config.json +0 -177
tts-large-fp8/generation_config.json +0 -4
tts-large-fp8/model-00001-of-00003.safetensors +0 -3
tts-large-fp8/model-00002-of-00003.safetensors +0 -3
tts-large-fp8/model-00003-of-00003.safetensors +0 -3
tts-large-fp8/model.safetensors.index.json +0 -0
tts-large-fp8/preprocessor_config.json +0 -12

tts-large-fp8/config.json DELETED Viewed

@@ -1,177 +0,0 @@
-{
-  "_attn_implementation_autoset": false,
-  "acostic_vae_dim": 64,
-  "acoustic_tokenizer_config": {
-    "causal": true,
-    "channels": 1,
-    "conv_bias": true,
-    "conv_norm": "none",
-    "corpus_normalize": 0.0,
-    "decoder_depths": null,
-    "decoder_n_filters": 32,
-    "decoder_ratios": [
-      8,
-      5,
-      5,
-      4,
-      2,
-      2
-    ],
-    "disable_last_norm": true,
-    "dtype": "bfloat16",
-    "encoder_depths": "3-3-3-3-3-3-8",
-    "encoder_n_filters": 32,
-    "encoder_ratios": [
-      8,
-      5,
-      5,
-      4,
-      2,
-      2
-    ],
-    "fix_std": 0.5,
-    "layer_scale_init_value": 1e-06,
-    "layernorm": "RMSNorm",
-    "layernorm_elementwise_affine": true,
-    "layernorm_eps": 1e-05,
-    "mixer_layer": "depthwise_conv",
-    "model_type": "vibevoice_acoustic_tokenizer",
-    "pad_mode": "constant",
-    "std_dist_type": "gaussian",
-    "vae_dim": 64,
-    "weight_init_value": 0.01
-  },
-  "acoustic_vae_dim": 64,
-  "architectures": [
-    "VibeVoiceForConditionalGenerationInference"
-  ],
-  "decoder_config": {
-    "attention_dropout": 0.0,
-    "dtype": "bfloat16",
-    "hidden_act": "silu",
-    "hidden_size": 3584,
-    "initializer_range": 0.02,
-    "intermediate_size": 18944,
-    "layer_types": [
-      "full_attention",
-      "full_attention",
-      "full_attention",
-      "full_attention",
-      "full_attention",
-      "full_attention",
-      "full_attention",
-      "full_attention",
-      "full_attention",
-      "full_attention",
-      "full_attention",
-      "full_attention",
-      "full_attention",
-      "full_attention",
-      "full_attention",
-      "full_attention",
-      "full_attention",
-      "full_attention",
-      "full_attention",
-      "full_attention",
-      "full_attention",
-      "full_attention",
-      "full_attention",
-      "full_attention",
-      "full_attention",
-      "full_attention",
-      "full_attention",
-      "full_attention"
-    ],
-    "max_position_embeddings": 32768,
-    "max_window_layers": 28,
-    "model_type": "qwen2",
-    "num_attention_heads": 28,
-    "num_hidden_layers": 28,
-    "num_key_value_heads": 4,
-    "rms_norm_eps": 1e-06,
-    "rope_scaling": null,
-    "rope_theta": 1000000.0,
-    "sliding_window": null,
-    "use_cache": true,
-    "use_mrope": false,
-    "use_sliding_window": false,
-    "vocab_size": 152064
-  },
-  "diffusion_head_config": {
-    "ddpm_batch_mul": 4,
-    "ddpm_beta_schedule": "cosine",
-    "ddpm_num_inference_steps": 20,
-    "ddpm_num_steps": 1000,
-    "diffusion_type": "ddpm",
-    "dtype": "bfloat16",
-    "head_ffn_ratio": 3.0,
-    "head_layers": 4,
-    "hidden_size": 3584,
-    "latent_size": 64,
-    "model_type": "vibevoice_diffusion_head",
-    "prediction_type": "v_prediction",
-    "rms_norm_eps": 1e-05,
-    "speech_vae_dim": 64
-  },
-  "dtype": "bfloat16",
-  "model_type": "vibevoice",
-  "quantization_config": {
-    "include_input_output_embeddings": false,
-    "modules_to_not_convert": [
-      "diffusion_head",
-      "acoustic_tokenizer",
-      "semantic_tokenizer",
-      "acoustic_connector",
-      "semantic_connector"
-    ],
-    "quant_method": "torchao",
-    "quant_type": {
-      "default": {
-        "_data": {
-          "set_inductor_config": true,
-          "weight_dtype": {
-            "_data": "float8_e4m3fn",
-            "_type": "torch.dtype"
-          }
-        },
-        "_type": "Float8WeightOnlyConfig",
-        "_version": 2
-      }
-    },
-    "quant_type_kwargs": {},
-    "untie_embedding_weights": false
-  },
-  "semantic_tokenizer_config": {
-    "causal": true,
-    "channels": 1,
-    "conv_bias": true,
-    "conv_norm": "none",
-    "corpus_normalize": 0.0,
-    "disable_last_norm": true,
-    "dtype": "bfloat16",
-    "encoder_depths": "3-3-3-3-3-3-8",
-    "encoder_n_filters": 32,
-    "encoder_ratios": [
-      8,
-      5,
-      5,
-      4,
-      2,
-      2
-    ],
-    "fix_std": 0,
-    "layer_scale_init_value": 1e-06,
-    "layernorm": "RMSNorm",
-    "layernorm_elementwise_affine": true,
-    "layernorm_eps": 1e-05,
-    "mixer_layer": "depthwise_conv",
-    "model_type": "vibevoice_semantic_tokenizer",
-    "pad_mode": "constant",
-    "std_dist_type": "none",
-    "vae_dim": 128,
-    "weight_init_value": 0.01
-  },
-  "semantic_vae_dim": 128,
-  "tie_word_embeddings": false,
-  "transformers_version": "4.57.6"
-}

tts-large-fp8/generation_config.json DELETED Viewed

@@ -1,4 +0,0 @@
-{
-  "_from_model_config": true,
-  "transformers_version": "4.57.6"
-}

tts-large-fp8/model-00001-of-00003.safetensors DELETED Viewed

@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:76422a9f9182e4c0cc6f5db412c6d7e2008ced84e962906e0f78c235ac0cd5c7
-size 4987917596

tts-large-fp8/model-00002-of-00003.safetensors DELETED Viewed

@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:9ac2c2692c7983099d880bcfe7336566f2482475e88686ee22bfd75a749eca59
-size 4970843314

tts-large-fp8/model-00003-of-00003.safetensors DELETED Viewed

@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:b2422cfd3ea4ab201c54560530974b4c68e121e22e46f4e582e90b33464aa872
-size 996097944

tts-large-fp8/model.safetensors.index.json DELETED Viewed

The diff for this file is too large to render. See raw diff

tts-large-fp8/preprocessor_config.json DELETED Viewed

@@ -1,12 +0,0 @@
-{
-  "processor_class": "VibeVoiceProcessor",
-  "speech_tok_compress_ratio": 3200,
-  "db_normalize": true,
-  "audio_processor": {
-    "feature_extractor_type": "VibeVoiceTokenizerProcessor",
-    "sampling_rate": 24000,
-    "normalize_audio": true,
-    "target_dB_FS": -25,
-    "eps": 1e-06
-  }
-}