#!/usr/bin/env python3
# HuBERT-CTC speech-recognition on Neuron
import argparse
import logging
import time

import torch
from transformers import AutoProcessor, HubertForCTC
from datasets import load_dataset
import torch_neuronx  # ensures Neuron backend
from torch.nn.utils import parametrize
from torch.nn.utils import remove_weight_norm

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


def _strip_weight_norm(model: torch.nn.Module) -> int:
    """Fuse weight normalization into plain ``weight`` tensors, in place.

    Handles both flavours of weight norm that a module may carry:

    * the parametrization-based one (``torch.nn.utils.parametrizations``),
      which registers a parametrization on ``weight`` and exposes no
      ``weight_g`` / ``weight_v`` attributes;
    * the legacy one (``torch.nn.utils.weight_norm``), which exposes
      ``weight_g`` and ``weight_v``.

    Args:
        model: Module tree to walk; its submodules are modified in place.

    Returns:
        The number of modules whose weight norm was removed.
    """
    fused = 0
    # Snapshot the module list: removing a parametrization deletes the
    # module's ``parametrizations`` child while we are walking the tree.
    for module in list(model.modules()):
        if parametrize.is_parametrized(module, "weight"):
            # leave_parametrized=True keeps the current (normalized) weight
            # values, so the forward pass stays numerically the same.
            parametrize.remove_parametrizations(
                module, "weight", leave_parametrized=True
            )
            fused += 1
        elif hasattr(module, "weight_g") and hasattr(module, "weight_v"):
            remove_weight_norm(module)
            fused += 1
    return fused


def main() -> None:
    """Load a HuBERT-CTC model, compile it for Neuron, and transcribe a sample.

    Steps: parse ``--model``, load one utterance from the LibriSpeech demo
    set, strip weight norm from the model, run once eagerly, compile
    ``model.forward`` with the ``neuron`` backend, then time a warmup call
    and a second call and log the greedy-decoded transcription.
    """
    parser = argparse.ArgumentParser(description="Run HuBERT-CTC on Neuron")
    parser.add_argument(
        "--model",
        type=str,
        default="hf-internal-testing/tiny-random-HubertModel",
        help="HuBERT-CTC model name on Hugging Face Hub",
    )
    args = parser.parse_args()

    torch.set_default_dtype(torch.float32)
    torch.manual_seed(42)

    # load small speech snippet
    dataset = load_dataset(
        "hf-internal-testing/librispeech_asr_demo", "clean", split="validation"
    )
    sample = dataset[0]["audio"]["array"]  # 16 kHz numpy array

    # processor + HuBERT-CTC model
    processor = AutoProcessor.from_pretrained(args.model)
    model = HubertForCTC.from_pretrained(
        args.model, torch_dtype=torch.float32, attn_implementation="eager"
    ).eval()

    # The recorded compiler failure at the bottom of this file originates in
    # torch/nn/utils/parametrizations.py, i.e. the weight norm is applied as
    # a parametrization. Checking only for weight_g / weight_v misses that
    # form, so fuse both variants before compiling.
    fused = _strip_weight_norm(model)
    logger.info("Removed weight norm from %d module(s)", fused)

    # preprocess
    inputs = processor(
        sample, sampling_rate=16_000, return_tensors="pt", padding=True
    )

    # pre-run to lock shapes
    with torch.no_grad():
        _ = model(**inputs).logits

    # compile
    model.forward = torch.compile(model.forward, backend="neuron", fullgraph=True)

    # warmup (perf_counter is monotonic, unlike wall-clock time.time)
    warmup_start = time.perf_counter()
    with torch.no_grad():
        _ = model(**inputs)
    warmup_time = time.perf_counter() - warmup_start

    # benchmark run
    run_start = time.perf_counter()
    with torch.no_grad():
        logits = model(**inputs).logits
    run_time = time.perf_counter() - run_start

    # greedy decode
    predicted_ids = logits.argmax(dim=-1)
    transcription = processor.decode(predicted_ids[0])

    logger.info("Warmup: %.2f s, Run: %.4f s", warmup_time, run_time)
    logger.info("Transcription: %s", transcription)


if __name__ == "__main__":
    main()

# Compiler output recorded when the parametrized weight norm was still
# present in the traced graph; kept for reference.
"""
/usr/local/lib/python3.10/site-packages/torch/nn/utils/parametrizations.py:325:0: error: number of output elements (2048) doesn't match expected number of elements (16)
/usr/local/lib/python3.10/site-packages/torch/nn/utils/parametrize.py:303:0: note: called from
/usr/local/lib/python3.10/site-packages/torch/nn/utils/parametrize.py:407:0: note: called from
/usr/local/lib/python3.10/site-packages/transformers/models/hubert/modeling_hubert.py:92:0: note: called from
/usr/local/lib/python3.10/site-packages/transformers/models/hubert/modeling_hubert.py:448:0: note: called from
/usr/local/lib/python3.10/site-packages/transformers/models/hubert/modeling_hubert.py:986:0: note: called from
/usr/local/lib/python3.10/site-packages/transformers/models/hubert/modeling_hubert.py:1114:0: note: called from
"""