{
  "model_id": "nvidia/parakeet-tdt_ctc-0.6b-ja",
  "language": "ja",
  "vocab_size": 3072,
  "hidden_size": 640,
  "encoder_features": 1024,
  "num_decoder_layers": 2,
  "sample_rate": 16000,
  "model_type": "EncDecHybridRNNTCTCBPEModel",
  "architecture": "Hybrid FastConformer-TDT-CTC",
  "encoder_output_format": "[batch, features, time]",
  "fixed_audio_window_sec": 15.0,
  "fixed_audio_samples": 240000,
  "fixed_mel_frames": 1501,
  "fixed_encoder_frames": 188,
  "components": {
    "preprocessor": "preprocessor.mlpackage",
    "encoder": "encoder.mlpackage",
    "decoder": "decoder.mlpackage",
    "joint": "joint.mlpackage",
    "mel_encoder": "mel_encoder.mlpackage",
    "vocabulary": "vocab_ja.json"
  },
  "io_shapes": {
    "preprocessor": {
      "input": {"audio_signal": "[1, 240000]", "audio_length": "[1]"},
      "output": {"mel": "[1, 80, T]", "mel_length": "[1]"}
    },
    "encoder": {
      "input": {"mel": "[1, 80, 1501]", "mel_length": "[1]"},
      "output": {"encoded": "[1, 1024, 188]", "encoded_length": "[1]"}
    },
    "decoder": {
      "input": {"targets": "[1, 1]", "target_length": "[1]", "h_in": "[2, 1, 640]", "c_in": "[2, 1, 640]"},
      "output": {"decoder_output": "[1, 640, 2]", "h_out": "[2, 1, 640]", "c_out": "[2, 1, 640]"}
    },
    "joint": {
      "input": {"encoder_output": "[1, 1024, 188]", "decoder_output": "[1, 1, 640]"},
      "output": {"logits": "[1, 188, 1, 3078]"}
    },
    "mel_encoder": {
      "input": {"audio_signal": "[1, 240000]", "audio_length": "[1]"},
      "output": {"encoded": "[1, 1024, 188]", "encoded_length": "[1]"}
    }
  },
  "notes": {
    "encoder_output_format": "Japanese model outputs (B, 1024, T) not (B, T, 1024) like v3 English",
    "fixed_shapes": "Models use fixed shapes for stability (mobius approach)",
    "conversion_date": "2025-12-01",
    "conversion_tool": "coremltools 9.0b1",
    "torch_version": "2.7.0"
  }
}
|