FluidInference
/

parakeet-ctc-110m-coreml

Model card Files Files and versions

xet

Community

alexwengg committed on Jan 2

Commit

07826d2

verified ·

1 Parent(s): b5cf390

Delete inference.py

Browse files

Files changed (1) hide show

inference.py +0 -304

inference.py DELETED Viewed

@@ -1,304 +0,0 @@
-#!/usr/bin/env python3
-"""
-Inference script for Parakeet-TDT-CTC-110M CoreML model.
-This script demonstrates how to run inference using the converted CoreML models
-on Apple Silicon. It supports both TDT (Token-Duration Transducer) decoding for
-full transcription and CTC decoding for keyword spotting.
-Usage:
-    uv run scripts/inference.py --audio audio.wav --mode tdt
-    uv run scripts/inference.py --audio audio.wav --mode ctc
-Requirements:
-    - macOS 13+ with Apple Silicon
-    - Python 3.10+
-    - coremltools
-"""
-import argparse
-import json
-from pathlib import Path
-import coremltools as ct
-import numpy as np
-class ParakeetCoreML:
-    """CoreML inference wrapper for Parakeet-TDT-CTC-110M."""
-    def __init__(self, model_dir: str):
-        """Load CoreML models from directory.
-        Args:
-            model_dir: Path to directory containing .mlpackage files
-        """
-        self.model_dir = Path(model_dir)
-        # Load metadata
-        with open(self.model_dir / "metadata.json") as f:
-            self.metadata = json.load(f)
-        # Load vocabulary
-        with open(self.model_dir / "vocab.json") as f:
-            vocab_dict = json.load(f)
-            self.vocab = {int(k): v for k, v in vocab_dict.items()}
-        self.blank_id = len(self.vocab)  # Blank token is last
-        # Load models
-        print("Loading CoreML models...")
-        self.preprocessor = ct.models.MLModel(
-            str(self.model_dir / "Preprocessor.mlpackage")
-        )
-        self.encoder = ct.models.MLModel(
-            str(self.model_dir / "Encoder.mlpackage")
-        )
-        self.ctc_head = ct.models.MLModel(
-            str(self.model_dir / "CTCHead.mlpackage")
-        )
-        self.decoder = ct.models.MLModel(
-            str(self.model_dir / "Decoder.mlpackage")
-        )
-        self.joint = ct.models.MLModel(
-            str(self.model_dir / "JointDecision.mlpackage")
-        )
-        print("Models loaded successfully.")
-    def load_audio(self, audio_path: str) -> np.ndarray:
-        """Load audio file and convert to 16kHz mono.
-        Args:
-            audio_path: Path to audio file (WAV, MP3, etc.)
-        Returns:
-            Audio samples as float32 numpy array
-        """
-        try:
-            import librosa
-            audio, sr = librosa.load(audio_path, sr=16000, mono=True)
-            return audio.astype(np.float32)
-        except ImportError:
-            # Fallback to scipy for WAV files
-            from scipy.io import wavfile
-            sr, audio = wavfile.read(audio_path)
-            # Convert to mono if stereo
-            if len(audio.shape) > 1:
-                audio = audio.mean(axis=1)
-            # Resample if needed
-            if sr != 16000:
-                from scipy import signal
-                num_samples = int(len(audio) * 16000 / sr)
-                audio = signal.resample(audio, num_samples)
-            # Normalize to float32 [-1, 1]
-            if audio.dtype == np.int16:
-                audio = audio.astype(np.float32) / 32768.0
-            elif audio.dtype == np.int32:
-                audio = audio.astype(np.float32) / 2147483648.0
-            return audio.astype(np.float32)
-    def preprocess(self, audio: np.ndarray) -> tuple[np.ndarray, int]:
-        """Convert audio to mel spectrogram.
-        Args:
-            audio: Audio samples as float32 array
-        Returns:
-            Tuple of (mel spectrogram, mel length)
-        """
-        audio_signal = audio.reshape(1, -1).astype(np.float32)
-        audio_length = np.array([len(audio)], dtype=np.int32)
-        result = self.preprocessor.predict({
-            "audio_signal": audio_signal,
-            "audio_length": audio_length
-        })
-        return result["mel"], int(result["mel_length"][0])
-    def encode(self, mel: np.ndarray, mel_length: int) -> tuple[np.ndarray, int]:
-        """Run encoder on mel spectrogram.
-        Args:
-            mel: Mel spectrogram from preprocessor
-            mel_length: Length of mel spectrogram
-        Returns:
-            Tuple of (encoder output, encoder length)
-        """
-        result = self.encoder.predict({
-            "mel": mel,
-            "mel_length": np.array([mel_length], dtype=np.int32)
-        })
-        return result["encoder"], int(result["encoder_length"][0])
-    def decode_ctc(self, encoder_output: np.ndarray) -> list[int]:
-        """CTC greedy decoding.
-        Args:
-            encoder_output: Output from encoder
-        Returns:
-            List of token IDs (with duplicates and blanks removed)
-        """
-        result = self.ctc_head.predict({"encoder_output": encoder_output})
-        log_probs = result["ctc_log_probs"]
-        # Greedy decoding: take argmax at each timestep
-        predictions = np.argmax(log_probs[0], axis=-1)
-        # Remove duplicates and blanks
-        tokens = []
-        prev_token = self.blank_id
-        for token in predictions:
-            if token != self.blank_id and token != prev_token:
-                tokens.append(int(token))
-            prev_token = token
-        return tokens
-    def decode_tdt(self, encoder_output: np.ndarray, encoder_length: int) -> list[int]:
-        """TDT (Token-Duration Transducer) decoding.
-        Args:
-            encoder_output: Output from encoder
-            encoder_length: Length of encoder output
-        Returns:
-            List of token IDs
-        """
-        hidden_size = self.metadata["decoder_hidden_dim"]
-        num_layers = self.metadata["decoder_num_layers"]
-        # Initialize decoder state
-        h = np.zeros((num_layers, 1, hidden_size), dtype=np.float32)
-        c = np.zeros((num_layers, 1, hidden_size), dtype=np.float32)
-        # Start with blank token
-        targets = np.zeros((1, 1), dtype=np.int32)
-        target_length = np.array([1], dtype=np.int32)
-        tokens = []
-        frame = 0
-        max_tokens = 1000  # Safety limit
-        while frame < encoder_length and len(tokens) < max_tokens:
-            # Get decoder output
-            decoder_result = self.decoder.predict({
-                "targets": targets,
-                "target_length": target_length,
-                "h_in": h,
-                "c_in": c
-            })
-            decoder_output = decoder_result["decoder"]
-            h = decoder_result["h_out"]
-            c = decoder_result["c_out"]
-            # Get encoder step
-            encoder_step = encoder_output[0, frame:frame+1, :].T.reshape(1, -1, 1)
-            decoder_step = decoder_output.T.reshape(1, -1, 1)
-            # Joint prediction
-            joint_result = self.joint.predict({
-                "encoder_step": encoder_step.astype(np.float32),
-                "decoder_step": decoder_step.astype(np.float32)
-            })
-            token_id = int(joint_result["token_id"])
-            duration_bin = int(joint_result["duration_bin"])
-            # Duration bins: 0=0, 1=1, 2=2, 3=3, 4=4+
-            durations = [0, 1, 2, 3, 4]
-            duration = durations[min(duration_bin, 4)]
-            if token_id != self.blank_id:
-                tokens.append(token_id)
-                # Update decoder input
-                targets = np.array([[token_id]], dtype=np.int32)
-            # Advance by duration (minimum 1 frame)
-            frame += max(1, duration)
-        return tokens
-    def tokens_to_text(self, tokens: list[int]) -> str:
-        """Convert token IDs to text.
-        Args:
-            tokens: List of token IDs
-        Returns:
-            Decoded text string
-        """
-        pieces = [self.vocab.get(t, "") for t in tokens]
-        # Join and handle SentencePiece encoding
-        text = "".join(pieces).replace("▁", " ").strip()
-        return text
-    def transcribe(self, audio_path: str, mode: str = "tdt") -> str:
-        """Transcribe audio file.
-        Args:
-            audio_path: Path to audio file
-            mode: Decoding mode - "tdt" for full transcription, "ctc" for keyword spotting
-        Returns:
-            Transcribed text
-        """
-        # Load and preprocess audio
-        audio = self.load_audio(audio_path)
-        mel, mel_length = self.preprocess(audio)
-        # Encode
-        encoder_output, encoder_length = self.encode(mel, mel_length)
-        # Decode
-        if mode == "ctc":
-            tokens = self.decode_ctc(encoder_output)
-        else:
-            tokens = self.decode_tdt(encoder_output, encoder_length)
-        # Convert to text
-        text = self.tokens_to_text(tokens)
-        return text
-def main():
-    parser = argparse.ArgumentParser(
-        description="Run inference with Parakeet-TDT-CTC-110M CoreML model"
-    )
-    parser.add_argument(
-        "--audio", type=str, required=True,
-        help="Path to audio file (WAV, MP3, etc.)"
-    )
-    parser.add_argument(
-        "--model-dir", type=str, default=".",
-        help="Directory containing CoreML model files"
-    )
-    parser.add_argument(
-        "--mode", type=str, choices=["tdt", "ctc"], default="tdt",
-        help="Decoding mode: 'tdt' for transcription, 'ctc' for keyword spotting"
-    )
-    args = parser.parse_args()
-    # Load model
-    model = ParakeetCoreML(args.model_dir)
-    # Transcribe
-    print(f"\nTranscribing: {args.audio}")
-    print(f"Mode: {args.mode.upper()}")
-    print("-" * 40)
-    text = model.transcribe(args.audio, mode=args.mode)
-    print(f"Result: {text}")
-if __name__ == "__main__":
-    main()