"""Transcribe a wav file with the exported Parakeet CTC Core ML pipeline."""
import json

import coremltools as ct
import numpy as np
import soundfile as sf

# Export-time metadata describing the models' fixed input contract.
with open("parakeet_ctc_coreml/metadata.json") as f:
    meta = json.load(f)

SAMPLE_RATE = meta["sample_rate"]        # required input sampling rate (Hz)
MAX_SAMPLES = meta["max_audio_samples"]  # fixed audio window length, in samples
BLANK_ID = meta["blank_id"]              # CTC blank token id
| |
|
| | |
# The pipeline ships as two Core ML packages: mel-spectrogram + acoustic
# encoder in one, and the CTC decoder head in the other.
mel_encoder_path = "parakeet_ctc_coreml/parakeet_ctc_mel_encoder.mlpackage"
ctc_decoder_path = "parakeet_ctc_coreml/parakeet_ctc_decoder.mlpackage"
mel_encoder = ct.models.MLModel(mel_encoder_path)
ctc_decoder = ct.models.MLModel(ctc_decoder_path)
| |
|
| | |
# Load the waveform and fit it to the model's fixed-size input window.
audio, sr = sf.read("yc_first_minute_16k_15s.wav", dtype="float32", always_2d=False)
if sr != SAMPLE_RATE:
    # Explicit raise instead of `assert`: asserts are stripped under `python -O`,
    # which would silently feed wrongly-sampled audio to the model.
    raise ValueError(f"Expected {SAMPLE_RATE}Hz, got {sr}Hz")
original_len = len(audio)
if len(audio) < MAX_SAMPLES:
    # Zero-pad short clips up to the fixed window length.
    audio = np.pad(audio, (0, MAX_SAMPLES - len(audio)))
else:
    # Truncate anything longer than the window.
    audio = audio[:MAX_SAMPLES]

# Add the batch dimension; audio_length carries the count of *valid* (unpadded)
# samples so the encoder can mask out the zero padding.
audio_signal = audio[np.newaxis, :].astype(np.float32)
audio_length = np.array([min(original_len, MAX_SAMPLES)], dtype=np.int32)
| |
|
| | |
# --- Stage 1: waveform -> mel features -> acoustic encoder states ---
encoder_inputs = {
    "audio_signal": audio_signal,
    "audio_length": audio_length,
}
enc_out = mel_encoder.predict(encoder_inputs)
encoder = enc_out["encoder"]
encoder_length = enc_out["encoder_length"]
print(f"Encoder output shape: {encoder.shape}")
| |
|
| | |
# --- Stage 2: encoder states -> per-frame log-probabilities over the vocab ---
dec_out = ctc_decoder.predict({"encoder": encoder})
log_probs = dec_out["log_probs"]
print(f"Log probs shape: {log_probs.shape}")

# Greedy decoding: the most likely token at every frame (batch item 0).
token_ids = np.argmax(log_probs[0], axis=-1)
| |
|
| | |
# Collapse the greedy CTC path: drop frame-to-frame repeats, then drop blanks.
# Comparing against the previous frame's token is equivalent to tracking `prev`
# explicitly, since repeats are only merged when they are adjacent.
decoded = [
    int(tok)
    for idx, tok in enumerate(token_ids)
    if tok != BLANK_ID and (idx == 0 or tok != token_ids[idx - 1])
]
| |
|
| | |
# Map token ids back to text using the exported vocabulary list.
with open("vocab.json") as f:
    vocab = json.load(f)

# BUG FIX: the original called .replace("", " ") — in Python an empty-pattern
# replace inserts a space between every character, garbling the output.
# The intended target is the SentencePiece word-boundary marker "▁" (U+2581),
# which Parakeet-style vocabularies use as the word prefix —
# TODO(review): confirm against the actual contents of vocab.json.
text = "".join(vocab[i] for i in decoded).replace("\u2581", " ").strip()
print(f"Transcription: {text}")
| |
|
| |
|