{
"quantization_method": "torchao_Float8WeightOnly",
"weight_dtype": "float8_e4m3fn",
"scale_dtype": "float32",
"scale_granularity": "per_row",
"activation_dtype": "bfloat16",
"torchao_version": "0.16.0",
"torch_version": "2.9.0+cu128",
"source_model": "fishaudio/s2-pro",
"total_params_B": 4.562,
"fp8_linear_params_B": 4.048,
"bf16_other_params_B": 0.514,
"output_size_GB": 6.16,
"key_format": {
"<layer_name>": "float8_e4m3fn quantized weight",
"<layer_name>.scale": "float32 per-row dequantization scale",
"_buf.<name>": "bf16/fp32 buffer (freqs_cis, causal_mask, etc.)",
"other": "bfloat16 (embeddings, norms, non-linear layers)"
},
"inference_requirements": {
"torchao": ">= 0.8.0",
"compute_capability": ">= 8.9 (RTX 4090 / 5090) for native FP8 matmuls"
},
"notes": "All nn.Linear weights are float8_e4m3fn. Activations are bfloat16 (weight-only quantization). codec.pth is unchanged bfloat16."
}