"""Transcribe a wav file with the Parakeet CTC CoreML model pair.

Pipeline: audio -> mel+encoder .mlpackage -> CTC decoder .mlpackage ->
greedy argmax -> CTC collapse -> SentencePiece detokenization.
"""

import json

import coremltools as ct
import numpy as np
import soundfile as sf


def _load_audio(path, sample_rate, max_samples):
    """Read a mono wav, validate its sample rate, and pad/trim to max_samples.

    Returns:
        audio_signal: float32 array of shape [1, max_samples].
        audio_length: int32 array of shape [1] holding the pre-padding length
            (clamped to max_samples) so the encoder can mask the padding.

    Raises:
        ValueError: if the file's sample rate does not match sample_rate.
    """
    audio, sr = sf.read(path, dtype="float32", always_2d=False)
    # Raise (not assert) so validation survives `python -O`.
    if sr != sample_rate:
        raise ValueError(f"Expected {sample_rate}Hz, got {sr}Hz")
    original_len = len(audio)
    if original_len < max_samples:
        audio = np.pad(audio, (0, max_samples - original_len))
    else:
        audio = audio[:max_samples]
    audio_signal = audio[np.newaxis, :].astype(np.float32)  # [1, N]
    audio_length = np.array([min(original_len, max_samples)], dtype=np.int32)
    return audio_signal, audio_length


def _ctc_collapse(token_ids, blank_id):
    """Collapse a greedy CTC path: drop blanks, merge consecutive repeats.

    `prev` is updated on every step — including blanks — so a blank between
    two identical tokens correctly separates them into two emissions.
    """
    decoded = []
    prev = None
    for t in token_ids:
        if t != blank_id and t != prev:
            decoded.append(int(t))
        prev = t
    return decoded


def main():
    # Model metadata: sample rate, fixed input length, and CTC blank index.
    with open("parakeet_ctc_coreml/metadata.json") as f:
        meta = json.load(f)
    sample_rate = meta["sample_rate"]
    max_samples = meta["max_audio_samples"]
    blank_id = meta["blank_id"]

    # Two-stage CoreML pipeline: mel spectrogram + encoder, then CTC head.
    mel_encoder = ct.models.MLModel(
        "parakeet_ctc_coreml/parakeet_ctc_mel_encoder.mlpackage"
    )
    ctc_decoder = ct.models.MLModel(
        "parakeet_ctc_coreml/parakeet_ctc_decoder.mlpackage"
    )

    audio_signal, audio_length = _load_audio(
        "yc_first_minute_16k_15s.wav", sample_rate, max_samples
    )

    # Stage 1: Mel + Encoder
    enc_out = mel_encoder.predict(
        {"audio_signal": audio_signal, "audio_length": audio_length}
    )
    encoder = enc_out["encoder"]
    encoder_length = enc_out["encoder_length"]
    print(f"Encoder output shape: {encoder.shape}")  # [1, hidden, T]

    # Stage 2: CTC Decoder
    dec_out = ctc_decoder.predict({"encoder": encoder})
    log_probs = dec_out["log_probs"]  # [1, T, vocab+1]
    print(f"Log probs shape: {log_probs.shape}")

    # Greedy decode then CTC collapse.
    token_ids = np.argmax(log_probs[0], axis=-1)  # [T]
    decoded = _ctc_collapse(token_ids, blank_id)

    # Detokenize: SentencePiece marks word boundaries with "▁" (U+2581).
    # BUGFIX: the original replaced the EMPTY string, which inserts a space
    # between every character; the intended target is the "▁" marker.
    with open("vocab.json") as f:
        vocab = json.load(f)
    text = "".join(vocab[i] for i in decoded).replace("\u2581", " ").strip()
    print(f"Transcription: {text}")


if __name__ == "__main__":
    main()