Upload folder using huggingface_hub

8c2f9b0 verified about 1 month ago

4.24 kB

	{
	"model_type": "fastconformer_rnnt",
	"source_model": "nvidia/nemotron-speech-streaming-en-0.6b",
	"format": "coreml",

	"audio": {
	"sample_rate": 16000,
	"sample_format": "S16_LE",
	"bytes_per_second": 32000
	},

	"preprocessor": {
	"n_mels": 128,
	"n_fft": 512,
	"hop_length": 160,
	"win_length": 400,
	"window": "hann",
	"preemph": 0.97,
	"dither": 1e-05,
	"normalize": null,
	"pad_to": 0,
	"mel_norm": "slaney",
	"mel_layout": "band_major",
	"_comment": "mel_layout=band_major means output shape is [n_mels, n_frames], not [n_frames, n_mels]. Mel preprocessing runs on the host (not in CoreML)."
	},

	"encoder": {
	"model_file": "encoder.mlmodelc",
	"layers": 24,
	"dim": 1024,
	"chunk_mel_frames": 56,
	"pre_encode_cache_frames": 9,
	"total_input_frames": 65,
	"compute_units": "CPU_AND_NE",
	"compute_precision": "FLOAT16",
	"cache_last_channel_shape": [1, 24, 70, 1024],
	"cache_last_time_shape": [1, 24, 1024, 8],
	"_comment_cache_layout": "batch-first [B, L, ...] for CoreML. NeMo uses layer-first [L, B, ...] internally; the wrapper transposes.",
	"inputs": {
	"audio_signal": {"dtype": "float32", "shape": [1, 128, 65]},
	"cache_last_channel": {"dtype": "float32", "shape": [1, 24, 70, 1024], "init": "zeros"},
	"cache_last_time": {"dtype": "float32", "shape": [1, 24, 1024, 8], "init": "zeros"},
	"cache_last_channel_len": {"dtype": "int32", "shape": [1], "init": "zeros"}
	},
	"outputs": {
	"encoded": {"dtype": "float16", "shape": [1, 1024, 7]},
	"encoded_length": {"dtype": "int32", "shape": [1]},
	"cache_channel_out": {"dtype": "float16", "shape": [1, 24, 70, 1024], "_comment": "feed back as cache_last_channel"},
	"cache_time_out": {"dtype": "float16", "shape": [1, 24, 1024, 8], "_comment": "feed back as cache_last_time"},
	"cache_len_out": {"dtype": "int32", "shape": [1], "_comment": "feed back as cache_last_channel_len"}
	},
	"_comment_strides": "CoreML output MLMultiArrays may have non-contiguous strides due to ANE alignment padding (e.g. a stride of 32 for a dimension of size 7). Callers must use a stride-aware copy, not a flat memcpy."
	},

	"decoder": {
	"model_file": "decoder.mlmodelc",
	"prediction_layers": 2,
	"prediction_hidden": 640,
	"vocab_size": 1025,
	"blank_id": 1024,
	"max_symbols_per_frame": 10,
	"compute_units": "CPU_ONLY",
	"compute_precision": "FLOAT16",
	"inputs": {
	"encoder_outputs": {"dtype": "float32", "shape": [1, 1024, 1], "_comment": "single encoder frame"},
	"targets": {"dtype": "int32", "shape": [1, 1], "_comment": "previously emitted non-blank token (use blank_id for the first decoding step)"},
	"target_length": {"dtype": "int32", "shape": [1], "value": 1},
	"input_states_1": {"dtype": "float32", "shape": [2, 1, 640], "init": "zeros"},
	"input_states_2": {"dtype": "float32", "shape": [2, 1, 640], "init": "zeros"}
	},
	"outputs": {
	"outputs": {"dtype": "float16", "shape": [1, 1025], "_comment": "logits over vocab, argmax to get token"},
	"output_states_1": {"dtype": "float16", "shape": [2, 1, 640], "_comment": "feed back as input_states_1"},
	"output_states_2": {"dtype": "float16", "shape": [2, 1, 640], "_comment": "feed back as input_states_2"}
	}
	},

	"streaming": {
	"chunk_duration_ms": 560,
	"chunk_audio_samples": 8960,
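	"algorithm": "Cache-aware streaming FastConformer. Feed 560 ms audio chunks (8960 samples at 16 kHz). For each chunk: compute the mel spectrogram (56 frames), prepend the 9 cached pre-encode frames (65 input frames total), and run the encoder. Then, for each encoder output frame, run RNNT greedy decoding: feed one encoder frame to the decoder and take the argmax of the logits; if the result is not blank, emit the token and repeat on the same frame (up to max_symbols_per_frame symbols); if it is blank, move to the next frame. Feed the encoder's cache outputs back as the next chunk's cache inputs, and feed the decoder's output states back as its input states for the next symbol/frame."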
	},

	"ane_profile": {
	"encoder_ane_ops": 1486,
	"encoder_cpu_ops": 111,
	"encoder_ane_pct": 93.0,
	"cpu_op_breakdown": "select (72, attention masking), softmax (24, one per conformer layer), expand_dims/logical/cast (15, control flow)",
	"decoder": "100% CPU (CPU_ONLY compute units)",
	"inference_speed": "~15.5x realtime on Apple Silicon (M-series)"
	}
	}