parakeet-tdt-ja-coreml / manifest.json
wangjazz's picture
Upload 18 files
43b5527 verified
{
"model_id": "nvidia/parakeet-tdt_ctc-0.6b-ja",
"language": "ja",
"vocab_size": 3072,
"hidden_size": 640,
"encoder_features": 1024,
"num_decoder_layers": 2,
"sample_rate": 16000,
"model_type": "EncDecHybridRNNTCTCBPEModel",
"architecture": "Hybrid FastConformer-TDT-CTC",
"encoder_output_format": "[batch, features, time]",
"fixed_audio_window_sec": 15.0,
"fixed_audio_samples": 240000,
"fixed_mel_frames": 1501,
"fixed_encoder_frames": 188,
"components": {
"preprocessor": "preprocessor.mlpackage",
"encoder": "encoder.mlpackage",
"decoder": "decoder.mlpackage",
"joint": "joint.mlpackage",
"mel_encoder": "mel_encoder.mlpackage",
"vocabulary": "vocab_ja.json"
},
"io_shapes": {
"preprocessor": {
"input": {"audio_signal": "[1, 240000]", "audio_length": "[1]"},
"output": {"mel": "[1, 80, T]", "mel_length": "[1]"}
},
"encoder": {
"input": {"mel": "[1, 80, 1501]", "mel_length": "[1]"},
"output": {"encoded": "[1, 1024, 188]", "encoded_length": "[1]"}
},
"decoder": {
"input": {"targets": "[1, 1]", "target_length": "[1]", "h_in": "[2, 1, 640]", "c_in": "[2, 1, 640]"},
"output": {"decoder_output": "[1, 640, 2]", "h_out": "[2, 1, 640]", "c_out": "[2, 1, 640]"}
},
"joint": {
"input": {"encoder_output": "[1, 1024, 188]", "decoder_output": "[1, 1, 640]"},
"output": {"logits": "[1, 188, 1, 3078]"}
},
"mel_encoder": {
"input": {"audio_signal": "[1, 240000]", "audio_length": "[1]"},
"output": {"encoded": "[1, 1024, 188]", "encoded_length": "[1]"}
}
},
"notes": {
"encoder_output_format": "The Japanese model's encoder outputs (B, 1024, T), not (B, T, 1024) like the v3 English model",
"fixed_shapes": "Models use fixed shapes for stability (mobius approach)",
"conversion_date": "2025-12-01",
"conversion_tool": "coremltools 9.0b1",
"torch_version": "2.7.0"
}
}