Alex-Wengg committed on
Commit · 46daad3
1 Parent(s): 20c78b6
remove TDT decoder conversion script (not CTC)
convert/parakeet-tdt-ctc-110m/convert_tdt_decoder.py
DELETED
@@ -1,323 +0,0 @@
#!/usr/bin/env python3
"""
Convert Parakeet TDT-CTC 110M decoder components to CoreML.

This script exports the TDT decoder (prediction network) and joint network
with the SAME format as the working 0.6B model:
- JointDecision outputs token_id, token_prob, duration (argmax done inside)
- Uses shape [1, dim, 1] for encoder/decoder steps
- Matches the interface expected by TdtDecoderV3
"""

import argparse
import os
import torch
import torch.nn.functional as F
import coremltools as ct
import numpy as np
from pathlib import Path

# NeMo imports
import nemo.collections.asr as nemo_asr


def get_model_config(model):
    """Extract model configuration."""
    encoder_dim = None
    pred_hidden = 640  # Default for parakeet models
    num_layers = 1
    vocab_size = 1024
    num_durations = 5

    # Get encoder dimension
    if hasattr(model, 'encoder'):
        encoder = model.encoder
        if hasattr(encoder, 'd_model'):
            encoder_dim = encoder.d_model
        elif hasattr(encoder, '_feat_out'):
            encoder_dim = encoder._feat_out

    # Get decoder config
    if hasattr(model, 'decoder'):
        decoder = model.decoder
        if hasattr(decoder, 'pred_hidden'):
            pred_hidden = decoder.pred_hidden
        if hasattr(decoder, 'pred_rnn_layers'):
            num_layers = decoder.pred_rnn_layers

    # Get joint config
    if hasattr(model, 'joint'):
        joint = model.joint
        if hasattr(joint, 'num_extra_outputs'):
            num_durations = joint.num_extra_outputs
        if hasattr(joint, 'num_classes'):
            vocab_size = joint.num_classes - num_durations

    return {
        'encoder_dim': encoder_dim,
        'pred_hidden': pred_hidden,
        'num_layers': num_layers,
        'vocab_size': vocab_size,
        'num_durations': num_durations,
    }


class DecoderWrapper(torch.nn.Module):
    """
    Wrapper for the RNNT/TDT decoder (prediction network).

    Matches 0.6B format:
    - Input: targets[1,1], target_lengths[1], h_in[num_layers,1,pred_hidden], c_in[...]
    - Output: decoder_output[1,pred_hidden,2], h_out[...], c_out[...]
    """

    def __init__(self, decoder, pred_hidden):
        super().__init__()
        self.decoder = decoder
        self.pred_hidden = pred_hidden

    def forward(self, targets, target_lengths, h_in, c_in):
        """
        Args:
            targets: [1, 1] - previous token ID
            target_lengths: [1] - always 1
            h_in: [num_layers, 1, pred_hidden]
            c_in: [num_layers, 1, pred_hidden]
        Returns:
            decoder_output: [1, pred_hidden, 2] - prediction network output (transposed)
            h_out: [num_layers, 1, pred_hidden]
            c_out: [num_layers, 1, pred_hidden]
        """
        state = (h_in, c_in)
        # pred_output shape: [batch, time, pred_hidden] = [1, 1, pred_hidden]
        pred_output, new_state = self.decoder.predict(targets, state=state, add_sos=False)
        h_out, c_out = new_state

        # Transpose to [batch, pred_hidden, time] and concat two time steps
        # (0.6B outputs [1, 640, 2] - we match this by duplicating)
        pred_transposed = pred_output.transpose(1, 2)  # [1, pred_hidden, 1]
        decoder_output = torch.cat([pred_transposed, pred_transposed], dim=2)  # [1, pred_hidden, 2]

        return decoder_output, h_out, c_out


class JointWrapper(torch.nn.Module):
    """
    Wrapper for the TDT joint network with internal argmax.

    Matches 0.6B format:
    - Input: encoder_step[1,encoder_dim,1], decoder_step[1,pred_hidden,1]
    - Output: token_id[1,1,1], token_prob[1,1,1], duration[1,1,1]
    """

    def __init__(self, joint, vocab_size, num_durations=5):
        super().__init__()
        self.joint = joint
        self.vocab_size = vocab_size
        self.num_durations = num_durations

    def forward(self, encoder_step, decoder_step):
        """
        Args:
            encoder_step: [1, encoder_dim, 1]
            decoder_step: [1, pred_hidden, 1]
        Returns:
            token_id: [1, 1, 1] - argmax token ID
            token_prob: [1, 1, 1] - probability of selected token
            duration: [1, 1, 1] - argmax duration bin
        """
        # Transpose to [batch, 1, dim] for joint network
        enc = encoder_step.transpose(1, 2)  # [1, 1, encoder_dim]
        dec = decoder_step.transpose(1, 2)  # [1, 1, pred_hidden]

        # Run joint network
        # Joint output: [1, 1, 1, vocab_size + 1 (blank) + num_durations]
        joint_out = self.joint.joint(enc, dec)

        # Debug: print shape on first call
        if not hasattr(self, '_debug_printed'):
            self._debug_printed = True
            print(f"  Joint output shape: {joint_out.shape}")
            print(f"  Expected: vocab={self.vocab_size} + blank=1 + durations={self.num_durations} = {self.vocab_size + 1 + self.num_durations}")

        # Split: token logits include vocab + blank, durations are separate
        # vocab_size = 1024 tokens (0-1023), blank = index 1024, durations = indices 1025+
        num_tokens = self.vocab_size + 1  # Include blank at vocab_size
        logits = joint_out[..., :num_tokens]  # [1, 1, 1, vocab_size + 1]
        duration_logits = joint_out[..., num_tokens:]  # [1, 1, 1, num_durations]

        # Apply softmax and get probabilities
        probs = F.softmax(logits, dim=-1)

        # Argmax for token
        token_id = torch.argmax(logits, dim=-1, keepdim=True)  # [1, 1, 1, 1]
        token_id = token_id.squeeze(-1)  # [1, 1, 1]

        # Get probability of selected token
        token_prob = torch.gather(probs, -1, token_id.unsqueeze(-1))  # [1, 1, 1, 1]
        token_prob = token_prob.squeeze(-1)  # [1, 1, 1]

        # Argmax for duration
        duration = torch.argmax(duration_logits, dim=-1, keepdim=False)  # [1, 1, 1]

        return token_id.int(), token_prob, duration.int()


def convert_decoder(model, config, output_dir: Path):
    """Convert decoder to CoreML."""
    print("Converting Decoder...")
    print(f"  pred_hidden={config['pred_hidden']}, num_layers={config['num_layers']}")

    wrapper = DecoderWrapper(model.decoder, config['pred_hidden'])
    wrapper.eval()

    # Create example inputs
    targets = torch.zeros(1, 1, dtype=torch.long)
    target_lengths = torch.ones(1, dtype=torch.long)
    h_in = torch.zeros(config['num_layers'], 1, config['pred_hidden'])
    c_in = torch.zeros(config['num_layers'], 1, config['pred_hidden'])

    # Trace the model
    with torch.no_grad():
        traced = torch.jit.trace(wrapper, (targets, target_lengths, h_in, c_in))

    # Convert to CoreML
    mlmodel = ct.convert(
        traced,
        inputs=[
            ct.TensorType(name="targets", shape=(1, 1), dtype=np.int32),
            ct.TensorType(name="target_lengths", shape=(1,), dtype=np.int32),
            ct.TensorType(name="h_in", shape=(config['num_layers'], 1, config['pred_hidden']), dtype=np.float32),
            ct.TensorType(name="c_in", shape=(config['num_layers'], 1, config['pred_hidden']), dtype=np.float32),
        ],
        outputs=[
            ct.TensorType(name="decoder_output"),
            ct.TensorType(name="h_out"),
            ct.TensorType(name="c_out"),
        ],
        minimum_deployment_target=ct.target.iOS17,
        compute_precision=ct.precision.FLOAT16,
    )

    # Add metadata
    mlmodel.author = "Fluid Inference"
    mlmodel.short_description = "Hybrid TDT Decoder (110M)"

    # Save
    output_path = output_dir / "Decoder.mlpackage"
    mlmodel.save(str(output_path))
    print(f"  Saved to {output_path}")

    return mlmodel


def convert_joint(model, config, output_dir: Path):
    """Convert joint network to CoreML."""
    print("Converting JointDecision...")
    print(f"  encoder_dim={config['encoder_dim']}, pred_hidden={config['pred_hidden']}")
    print(f"  vocab_size={config['vocab_size']}, num_durations={config['num_durations']}")

    wrapper = JointWrapper(
        model.joint,
        vocab_size=config['vocab_size'],
        num_durations=config['num_durations']
    )
    wrapper.eval()

    # Create example inputs - shape [1, dim, 1]
    encoder_step = torch.randn(1, config['encoder_dim'], 1)
    decoder_step = torch.randn(1, config['pred_hidden'], 1)

    # Trace the model
    with torch.no_grad():
        traced = torch.jit.trace(wrapper, (encoder_step, decoder_step))

    # Convert to CoreML
    mlmodel = ct.convert(
        traced,
        inputs=[
            ct.TensorType(name="encoder_step", shape=(1, config['encoder_dim'], 1), dtype=np.float32),
            ct.TensorType(name="decoder_step", shape=(1, config['pred_hidden'], 1), dtype=np.float32),
        ],
        outputs=[
            ct.TensorType(name="token_id"),
            ct.TensorType(name="token_prob"),
            ct.TensorType(name="duration"),
        ],
        minimum_deployment_target=ct.target.iOS17,
        compute_precision=ct.precision.FLOAT16,
    )

    # Add metadata
    mlmodel.author = "Fluid Inference"
    mlmodel.short_description = "Hybrid Joint Decision (110M)"

    # Save
    output_path = output_dir / "JointDecision.mlpackage"
    mlmodel.save(str(output_path))
    print(f"  Saved to {output_path}")

    return mlmodel


def main():
    parser = argparse.ArgumentParser(description="Convert TDT decoder to CoreML (0.6B format)")
    parser.add_argument(
        "--model-name",
        default="nvidia/parakeet-tdt_ctc-110m",
        help="NeMo model name or path"
    )
    parser.add_argument(
        "--output-dir",
        type=Path,
        default=Path("./output"),
        help="Output directory for CoreML models"
    )
    args = parser.parse_args()

    # Create output directory
    args.output_dir.mkdir(parents=True, exist_ok=True)

    # Load model
    print(f"Loading model: {args.model_name}")
    model = nemo_asr.models.EncDecRNNTBPEModel.from_pretrained(args.model_name)
    model.eval()

    # Get model configuration
    config = get_model_config(model)

    # Auto-detect encoder dim if not found
    if config['encoder_dim'] is None:
        print("Auto-detecting encoder dimension...")
        dummy_audio = torch.randn(1, 16000)
        dummy_length = torch.tensor([16000])
        with torch.no_grad():
            enc_out, enc_len = model.encoder(
                audio_signal=dummy_audio,
                length=dummy_length
            )
        config['encoder_dim'] = enc_out.shape[-1]

    print("\nModel config:")
    for k, v in config.items():
        print(f"  {k}: {v}")

    # Convert components
    print()
    convert_decoder(model, config, args.output_dir)
    convert_joint(model, config, args.output_dir)

    print("\nConversion complete!")
    print(f"Models saved to: {args.output_dir}")
    print("\nNext steps:")
    print("1. Compile to .mlmodelc:")
    print(f"   cd {args.output_dir}")
    print("   xcrun coremlcompiler compile Decoder.mlpackage .")
    print("   xcrun coremlcompiler compile JointDecision.mlpackage .")
    print("2. Copy to model cache:")
    print("   cp -r Decoder.mlmodelc JointDecision.mlmodelc ~/Library/Application\\ Support/FluidAudio/Models/parakeet-ctc-110m-coreml/")
    print("3. Test with: swift run fluidaudio hybrid-earnings-benchmark --max-files 1")


if __name__ == "__main__":
    main()
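For concreteness, the logit split that JointWrapper.forward relies on can be worked through with the script's default sizes. This is a standalone sketch, not part of the deleted file:

# With vocab_size=1024 and num_durations=5 the joint emits 1030 logits per step:
#   indices 0..1023    -> subword tokens
#   index   1024       -> blank (so num_tokens = vocab_size + 1 = 1025)
#   indices 1025..1029 -> TDT duration bins
vocab_size, num_durations = 1024, 5
num_tokens = vocab_size + 1
assert num_tokens + num_durations == 1030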
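After running the script, one way to sanity-check the exported interface is to query the packages directly with coremltools. A minimal sketch, not part of the deleted file: it assumes the default ./output directory, macOS (CoreML prediction is not available elsewhere), and an encoder_dim of 512 for the 110M model, which is an assumption rather than a value taken from the script.

import numpy as np
import coremltools as ct

# Dims per the script's defaults; ENCODER_DIM=512 is an assumption.
NUM_LAYERS, PRED_HIDDEN, ENCODER_DIM = 1, 640, 512

decoder = ct.models.MLModel("output/Decoder.mlpackage")
joint = ct.models.MLModel("output/JointDecision.mlpackage")

# One prediction-network step from a zero LSTM state.
dec_out = decoder.predict({
    "targets": np.zeros((1, 1), dtype=np.int32),
    "target_lengths": np.ones((1,), dtype=np.int32),
    "h_in": np.zeros((NUM_LAYERS, 1, PRED_HIDDEN), dtype=np.float32),
    "c_in": np.zeros((NUM_LAYERS, 1, PRED_HIDDEN), dtype=np.float32),
})

# Joint step: take one time slice of decoder_output ([1, 640, 2] -> [1, 640, 1]).
out = joint.predict({
    "encoder_step": np.random.randn(1, ENCODER_DIM, 1).astype(np.float32),
    "decoder_step": dec_out["decoder_output"][:, :, :1].astype(np.float32),
})
print(out["token_id"], out["token_prob"], out["duration"])  # each [1, 1, 1]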