#!/usr/bin/env python3
# File size: 5,193 Bytes (commit 50731c6)
"""Export Cohere Transcribe encoder (with projection) to CoreML.
This exports the Conformer encoder + encoder_decoder_proj layer as a single model.
"""
import argparse
import sys
from pathlib import Path
import coremltools as ct
import numpy as np
import torch
import torch.nn as nn
from transformers import AutoModelForSpeechSeq2Seq
class EncoderWrapper(nn.Module):
    """Bundle the encoder and its optional output projection into one module.

    Exposing both steps behind a single ``forward`` lets them be traced and
    exported together as one graph.
    """

    def __init__(self, encoder, encoder_decoder_proj):
        super().__init__()
        self.encoder = encoder
        self.encoder_decoder_proj = encoder_decoder_proj

    def forward(self, input_features, feature_length):
        """Encode the features, then project them when a projection is set.

        Args:
            input_features: (batch, n_mels, n_frames) mel spectrogram
            feature_length: (batch,) int32 - actual length before padding

        Returns:
            (batch, encoded_frames, decoder_hidden_size) encoder output after
            projection; the raw encoder output when no projection is set.
        """
        encoded = self.encoder(
            input_features=input_features,
            lengths=feature_length,
            return_dict=True,
        ).last_hidden_state

        projection = self.encoder_decoder_proj
        if projection is None:
            # Nothing to project: hand back the encoder states untouched.
            return encoded
        return projection(encoded)
def export_encoder(output_dir: Path, precision: str = "float16"):
    """Export the Cohere encoder (with projection) to a CoreML package.

    Loads the pretrained model, wraps its encoder and ``encoder_decoder_proj``
    in ``EncoderWrapper``, traces the wrapper with fixed-shape example inputs,
    converts the trace with coremltools and saves it as
    ``cohere_encoder.mlpackage`` inside ``output_dir``.

    Args:
        output_dir: Destination directory; created (with parents) if missing.
        precision: Compute precision name, either "float16" or "float32".

    Raises:
        ValueError: If ``precision`` is not one of the supported names.
    """
    # Resolve precision up front: an unknown name used to fall through to
    # FLOAT32 silently, after the expensive model load and trace.
    precision_by_name = {
        "float16": ct.precision.FLOAT16,
        "float32": ct.precision.FLOAT32,
    }
    if precision not in precision_by_name:
        raise ValueError(
            f"Unsupported precision {precision!r}; "
            f"expected one of {sorted(precision_by_name)}"
        )
    compute_precision = precision_by_name[precision]

    print("="*70)
    print("Cohere Transcribe Encoder Export")
    print("="*70)

    # Create output directory
    output_dir.mkdir(parents=True, exist_ok=True)

    # Load full model
    print("\n[1/5] Loading model from HuggingFace...")
    model = AutoModelForSpeechSeq2Seq.from_pretrained(
        "CohereLabs/cohere-transcribe-03-2026",
        trust_remote_code=True,
        torch_dtype=torch.float32,
    )
    model.eval()
    print(" ✓ Model loaded")

    # Wrap encoder + projection
    print("\n[2/5] Wrapping encoder...")
    wrapped_encoder = EncoderWrapper(model.encoder, model.encoder_decoder_proj)
    wrapped_encoder.eval()
    print(" ✓ Encoder wrapped")

    # Create example inputs
    print("\n[3/5] Creating example inputs...")
    batch_size = 1
    n_mels = 128
    max_frames = 3001  # From manifest
    example_input_features = torch.randn(batch_size, n_mels, max_frames)
    example_feature_length = torch.tensor([max_frames], dtype=torch.int32)
    print(f" Input features: {example_input_features.shape}")
    print(f" Feature length: {example_feature_length.shape}")

    # Trace the model
    print("\n[4/5] Tracing encoder...")
    with torch.no_grad():
        traced_encoder = torch.jit.trace(
            wrapped_encoder,
            (example_input_features, example_feature_length),
            check_trace=False,  # Disable due to conditional logic
        )
        # Sanity-run the trace; only the resulting shape is used below.
        output = traced_encoder(example_input_features, example_feature_length)
    print(f" Output shape: {output.shape}")

    # Convert to CoreML
    print(f"\n[5/5] Converting to CoreML ({precision})...")
    inputs = [
        ct.TensorType(
            name="input_features",
            shape=tuple(example_input_features.shape),
            dtype=np.float32,
        ),
        ct.TensorType(
            name="feature_length",
            shape=tuple(example_feature_length.shape),
            dtype=np.int32,
        ),
    ]
    mlmodel = ct.convert(
        traced_encoder,
        inputs=inputs,
        outputs=[ct.TensorType(name="hidden_states")],
        minimum_deployment_target=ct.target.iOS17,
        compute_precision=compute_precision,
    )

    # Save
    output_path = output_dir / "cohere_encoder.mlpackage"
    mlmodel.save(str(output_path))
    print(f" ✓ Saved to: {output_path}")
    # An .mlpackage is a directory, so add up every file it contains.
    package_bytes = sum(
        f.stat().st_size for f in output_path.rglob('*') if f.is_file()
    )
    print(f" Model size: {package_bytes / 1024**3:.2f} GB")

    # Report the shapes actually traced rather than hard-coded values, so the
    # summary stays truthful if the example dimensions above change.
    input_shape = tuple(example_input_features.shape)
    length_shape = tuple(example_feature_length.shape)
    output_shape = tuple(output.shape)

    print("\n" + "="*70)
    print("ENCODER EXPORT COMPLETE")
    print("="*70)
    print(f"\nOutput: {output_path}")
    print("\nModel inputs:")
    print(f" - input_features: {input_shape} float32 - mel spectrogram")
    print(f" - feature_length: {length_shape} int32 - actual length before padding")
    print("\nModel output:")
    print(f" - hidden_states: {output_shape} float16/32 - encoder output after projection")
    print()
def main():
    """Parse the command line and run the export.

    Any exception raised by the export is reported on stderr together with
    its traceback, and the process exits with status 1.
    """
    cli = argparse.ArgumentParser(description="Export Cohere encoder to CoreML")

    # Flag name -> keyword arguments for ``add_argument``.
    option_specs = (
        (
            "--output-dir",
            {
                "type": Path,
                "default": Path("build"),
                "help": "Output directory for CoreML models",
            },
        ),
        (
            "--precision",
            {
                "choices": ["float16", "float32"],
                "default": "float16",
                "help": "Model precision (default: float16)",
            },
        ),
    )
    for flag, spec in option_specs:
        cli.add_argument(flag, **spec)

    options = cli.parse_args()

    try:
        export_encoder(options.output_dir, options.precision)
    except Exception as e:
        # Top-level boundary: show the short message first, then full details.
        print(f"\n❌ Export failed: {e}", file=sys.stderr)
        import traceback
        traceback.print_exc()
        sys.exit(1)
# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()
|