Ubuntu committed on
Commit
5ee43e9
·
1 Parent(s): 06d3040
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. torch_compile/flux/test_clip_text_encoder.py.py +80 -0
  2. torch_compile/flux/test_flux_transformer.py +73 -0
  3. torch_compile/flux/test_t5_text_encoder.py +72 -0
  4. torch_compile/flux/test_vae_decoder.py +84 -0
  5. torch_compile/run_albert.py +63 -0
  6. torch_compile/run_ast.py +77 -0
  7. torch_compile/run_beit.py +81 -0
  8. torch_compile/run_bert.py +62 -0
  9. torch_compile/run_camembert.py +71 -0
  10. torch_compile/run_clip.py +76 -0
  11. torch_compile/run_convbert.py +72 -0
  12. torch_compile/run_convnext.py +72 -0
  13. torch_compile/run_convnextv2.py +72 -0
  14. torch_compile/run_cvt.py +72 -0
  15. torch_compile/run_deberta.py +72 -0
  16. torch_compile/run_deberta_v3.py +72 -0
  17. torch_compile/run_deit.py +71 -0
  18. torch_compile/run_distillbert.py +67 -0
  19. torch_compile/run_donutswin.py +76 -0
  20. torch_compile/run_dpt.py +66 -0
  21. torch_compile/run_electra.py +67 -0
  22. torch_compile/run_esm.py +67 -0
  23. torch_compile/run_flaubert.py +97 -0
  24. torch_compile/run_hubert.py +85 -0
  25. torch_compile/run_levit.py +70 -0
  26. torch_compile/run_mobilebert.py +67 -0
  27. torch_compile/run_mobilenetv2.py +71 -0
  28. torch_compile/run_mobilevit.py +70 -0
  29. torch_compile/run_modernbert.py +66 -0
  30. torch_compile/run_mpnet.py +62 -0
  31. torch_compile/run_phi.py +77 -0
  32. torch_compile/run_phi3.py +86 -0
  33. torch_compile/run_roberta.py +67 -0
  34. torch_compile/run_roformer.py +67 -0
  35. torch_compile/run_sam2.py +56 -0
  36. torch_compile/run_swin.py +76 -0
  37. torch_compile/run_t5_decoder.py +66 -0
  38. torch_compile/run_t5_encoder.py +59 -0
  39. torch_compile/run_unispeech.py +62 -0
  40. torch_compile/run_unispeech_sat.py +67 -0
  41. torch_compile/run_vit.py +64 -0
  42. torch_compile/run_wav2vec2.py +73 -0
  43. torch_compile/run_whisper.py +91 -0
  44. torch_compile/run_xlm.py +0 -0
  45. torch_compile/run_xlm_roberta.py +0 -0
  46. torch_compile/run_yolos.py +83 -0
  47. torch_compile/torch_neuronx_dump/0123150520_241/offloaded_ops.txt +1 -0
  48. torch_compile/torch_neuronx_dump/0123150520_241/used_ops.txt +8 -0
  49. torch_compile/torch_neuronx_dump/0123154351_1091/offloaded_ops.txt +1 -0
  50. torch_compile/torch_neuronx_dump/0123154351_1091/used_ops.txt +8 -0
torch_compile/flux/test_clip_text_encoder.py.py ADDED
@@ -0,0 +1,80 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
#!/usr/bin/env python3
"""
CLIP (Flux variant) zero-shot image-classification on Neuron.
Flux pipeline uses: openai/clip-vit-large-patch14
"""
import argparse
import logging
import time

import torch
from transformers import CLIPProcessor, CLIPModel
from datasets import load_dataset
import torch_neuronx  # noqa: F401 guarantees Neuron backend

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


def main():
    """Score one sample image against three candidate captions on Neuron."""
    parser = argparse.ArgumentParser(
        description="CLIP (Flux checkpoint) zero-shot image classification with torch.compile on Neuron"
    )
    parser.add_argument(
        "--model",
        type=str,
        default="openai/clip-vit-large-patch14",  # Flux CLIP checkpoint
        help="CLIP model name on Hugging Face Hub",
    )
    parser.add_argument("--batch-size", type=int, default=1, help="Batch size")
    args = parser.parse_args()

    torch.set_default_dtype(torch.float32)
    torch.manual_seed(42)

    # One sample image from the cats-image dataset.
    image = load_dataset("huggingface/cats-image")["test"]["image"][0]

    # Processor + model (Flux CLIP checkpoint), fp32 / eager attention.
    processor = CLIPProcessor.from_pretrained(args.model)
    model = CLIPModel.from_pretrained(
        args.model, torch_dtype=torch.float32, attn_implementation="eager"
    ).eval()

    # Zero-shot candidate captions.
    texts = ["a photo of a cat", "a photo of a dog", "a photo of a bird"]
    inputs = processor(text=texts, images=image, return_tensors="pt", padding=True)

    # One eager pass freezes tensor shapes before compilation.
    with torch.no_grad():
        model(**inputs)

    # fullgraph=False: graph breaks are tolerated for this large model.
    model.forward = torch.compile(model.forward, backend="neuron", fullgraph=False)

    # Two timed calls: the first includes Neuron compilation (warmup),
    # the second measures steady-state latency.
    elapsed = []
    for _ in range(2):
        tick = time.time()
        with torch.no_grad():
            outputs = model(**inputs)
        elapsed.append(time.time() - tick)
    warmup_time, run_time = elapsed

    # Softmax over image-text similarities -> per-caption probabilities.
    probs = outputs.logits_per_image.softmax(dim=-1)  # [B, num_texts]
    best_label = texts[int(probs.argmax().item())]

    logger.info("Warmup: %.2f s, Run: %.4f s", warmup_time, run_time)
    logger.info("Probabilities: %s", probs.tolist())
    logger.info("Predicted label: %s", best_label)


if __name__ == "__main__":
    main()
torch_compile/flux/test_flux_transformer.py ADDED
@@ -0,0 +1,73 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
# torchrun --nproc_per_node=8 test_flux_transformer.py
"""Tensor-parallel Flux transformer latency test on Neuron.

Fixes vs. the original draft: ``Replicate`` and ``DTensor`` were referenced
without being imported (NameError at runtime); the timed forward now also
runs under ``no_grad``; the process group is destroyed on exit.
"""
import argparse
import logging
import time

import torch
import torch.distributed as dist
from torch.distributed.device_mesh import DeviceMesh
# DTensor/Replicate are used below; these imports were previously missing.
from torch.distributed.tensor import DTensor, Replicate
from torch.distributed.tensor.parallel import (
    ColwiseParallel, RowwiseParallel, parallelize_module
)
from diffusers import FluxTransformer2DModel
import torch_neuronx  # noqa: F401 guarantees Neuron backend

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


def apply_tp_flux(transformer: torch.nn.Module, tp_mesh: DeviceMesh):
    """Shard attention/FFN projections column/row-wise; keep norms replicated.

    Returns the same module, parallelized in place over ``tp_mesh``.
    """
    # embed & final-norm replicated
    plan = {"x_embedder": None, "norm_out": None}
    parallelize_module(transformer, tp_mesh, plan)

    # inside each transformer block
    for block in transformer.transformer_blocks:
        blk = {
            "norm1": None,
            "norm1_k": None,
            "attn.qkv": ColwiseParallel(),
            "attn.proj": RowwiseParallel(output_layouts=Replicate()),
            "attn.norm_q": None,
            "attn.norm_k": None,
            "ffn.net.0": ColwiseParallel(),  # gate
            "ffn.net.2": RowwiseParallel(output_layouts=Replicate()),
        }
        parallelize_module(block, tp_mesh, blk)
    return transformer


def main():
    dist.init_process_group(backend="neuron")
    rank = dist.get_rank()
    device = torch.device(f"neuron:{rank}")
    tp_mesh = DeviceMesh("neuron", list(range(dist.get_world_size())))

    parser = argparse.ArgumentParser()
    parser.add_argument("--model", default="black-forest-labs/FLUX.1-dev/transformer")
    parser.add_argument("--seq-len", type=int, default=4096)
    parser.add_argument("--dim", type=int, default=3072)
    args = parser.parse_args()

    # create on CPU, real tensors
    with torch.device("cpu"):
        transformer = FluxTransformer2DModel.from_pretrained(
            args.model, torch_dtype=torch.bfloat16, attn_implementation="eager"
        ).eval()

    transformer = apply_tp_flux(transformer, tp_mesh)
    # move local shards to Neuron (DTensor params hold only the local shard)
    for p in transformer.parameters():
        if isinstance(p, DTensor):
            p._local_tensor = p._local_tensor.to(device, dtype=torch.bfloat16)
        else:
            p.data = p.data.to(device, dtype=torch.bfloat16)

    transformer = torch.compile(transformer, backend="neuron", fullgraph=False)

    batch = 1
    hidden = torch.randn(batch, args.seq_len, args.dim, dtype=torch.bfloat16, device=device)
    encoder_hidden = torch.randn(batch, args.seq_len, 4096, dtype=torch.bfloat16, device=device)
    timestep = torch.tensor([500], dtype=torch.int64, device=device)

    with torch.no_grad():
        # first call triggers compilation; second measures steady-state latency
        _ = transformer(hidden=hidden, encoder_hidden=encoder_hidden, timestep=timestep)
        t0 = time.time()
        out = transformer(hidden=hidden, encoder_hidden=encoder_hidden, timestep=timestep)
    logger.info("Rank %d Flux-TFM latency: %.3f ms shape: %s",
                rank, (time.time() - t0) * 1000, out.sample.shape)

    dist.destroy_process_group()


if __name__ == "__main__":
    main()
torch_compile/flux/test_t5_text_encoder.py ADDED
@@ -0,0 +1,72 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
# torchrun --nproc_per_node=4 test_t5_text_encoder.py
"""Tensor-parallel T5-XXL text-encoder latency test on Neuron.

Fixes vs. the original draft: ``Replicate`` and ``DTensor`` were referenced
without being imported (NameError at runtime); an unused hard dependency on
``torchtitan`` was dropped; the timed forward now also runs under
``no_grad``; the process group is destroyed on exit.
"""
import argparse
import logging
import time

import torch
import torch.distributed as dist
from torch.distributed.device_mesh import DeviceMesh
# DTensor/Replicate are used below; these imports were previously missing.
from torch.distributed.tensor import DTensor, Replicate
from torch.distributed.tensor.parallel import (
    ColwiseParallel, RowwiseParallel, parallelize_module
)
from transformers import T5EncoderModel, AutoTokenizer
import torch_neuronx  # noqa: F401 guarantees Neuron backend

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


def apply_tp_t5(encoder: torch.nn.Module, tp_mesh: DeviceMesh):
    """Shard q/k/v/o and the FFN dense layers of every encoder block.

    Returns the same module, parallelized in place over ``tp_mesh``.
    """
    # encoder.embed_tokens already replicated
    plan = {
        "embed_tokens": None,   # replicate
        "encoder.block": None,  # we will loop inside
    }
    parallelize_module(encoder, tp_mesh, plan)

    # shard every dense layer inside each encoder block
    for layer in encoder.encoder.block:
        layer_plan = {
            "layer.0.SelfAttention.q": ColwiseParallel(),
            "layer.0.SelfAttention.k": ColwiseParallel(),
            "layer.0.SelfAttention.v": ColwiseParallel(),
            "layer.0.SelfAttention.o": RowwiseParallel(output_layouts=Replicate()),
            "layer.0.dense": ColwiseParallel(),
            "layer.1.dense": RowwiseParallel(output_layouts=Replicate()),
        }
        parallelize_module(layer, tp_mesh, layer_plan)
    return encoder


def main():
    dist.init_process_group(backend="neuron")
    rank = dist.get_rank()
    device = torch.device(f"neuron:{rank}")
    tp_mesh = DeviceMesh("neuron", list(range(dist.get_world_size())))

    parser = argparse.ArgumentParser()
    parser.add_argument("--model", default="google/t5-v1_1-xxl")
    parser.add_argument("--seq-len", type=int, default=512)
    args = parser.parse_args()

    tokenizer = AutoTokenizer.from_pretrained(args.model)
    # create model on CPU, real tensors
    with torch.device("cpu"):
        encoder = T5EncoderModel.from_pretrained(args.model, attn_implementation="eager").eval()

    encoder = apply_tp_t5(encoder, tp_mesh)
    # move local shards to Neuron (DTensor params hold only the local shard)
    for p in encoder.parameters():
        if isinstance(p, DTensor):
            p._local_tensor = p._local_tensor.to(device)
        else:
            p.data = p.data.to(device)

    encoder = torch.compile(encoder, backend="neuron", fullgraph=False)

    text = ["a photo of a cat"]
    txt_in = tokenizer(text, max_length=args.seq_len, padding="max_length", return_tensors="pt")
    input_ids = txt_in.input_ids.to(device)

    with torch.no_grad():
        _ = encoder(input_ids)  # compile
        t0 = time.time()
        out = encoder(input_ids).last_hidden_state
    logger.info("Rank %d T5-XXL enc latency: %.3f ms shape: %s",
                rank, (time.time() - t0) * 1000, out.shape)  # [1, seq_len, 4096]

    dist.destroy_process_group()


if __name__ == "__main__":
    main()
torch_compile/flux/test_vae_decoder.py ADDED
@@ -0,0 +1,84 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
#!/usr/bin/env python3
"""
Flux VAE decoder (16-ch latent → RGB image) on Neuron.
Checkpoint: black-forest-labs/FLUX.1-dev/vae
"""
import argparse
import logging
import time
from pathlib import Path

import torch
from diffusers import AutoencoderKL
import torch_neuronx  # noqa: F401 guarantees Neuron backend
from PIL import Image

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


def main():
    """Decode a random latent with the Flux VAE and save the RGB image."""
    parser = argparse.ArgumentParser(
        description="Flux VAE decoder (latent → image) with torch.compile on Neuron"
    )
    parser.add_argument(
        "--model",
        type=str,
        # default="black-forest-labs/FLUX.1-dev/vae",
        default="/workspace/flux_weight/",
        help="Flux VAE checkpoint on Hugging Face Hub",
    )
    parser.add_argument("--latent-ch", type=int, default=16, help="Latent channels (Flux=16)")
    parser.add_argument("--scale", type=int, default=32, help="Latent spatial size (256 px / 8)")
    parser.add_argument("--output", type=str, default="flux_vae_out.png", help="Output image path")
    args = parser.parse_args()

    torch.set_default_dtype(torch.float32)
    torch.manual_seed(42)

    # Load the Flux VAE in fp32 and switch to inference mode.
    vae = AutoencoderKL.from_pretrained(args.model, subfolder="vae", torch_dtype=torch.float32).eval()

    # Dummy float32 latent sampled from N(0, 1) - shape: [B, 16, H/8, W/8].
    latent = torch.randn(1, args.latent_ch, args.scale, args.scale, dtype=torch.float32)

    # One eager pass freezes tensor shapes before compilation.
    with torch.no_grad():
        vae.decode(latent).sample

    # Compile the decode path as a single graph (fullgraph=True).
    decode_fn = torch.compile(vae.decode, backend="neuron", fullgraph=True)

    def timed_decode():
        # Run one no-grad decode and return (result, elapsed seconds).
        start = time.time()
        with torch.no_grad():
            result = decode_fn(latent)
        return result, time.time() - start

    _, warmup_time = timed_decode()     # first call triggers Neuron compilation
    decoded, run_time = timed_decode()  # steady-state latency
    image = decoded.sample

    logger.info("Warmup: %.2f s, Run: %.4f s", warmup_time, run_time)
    logger.info("VAE output shape: %s", image.shape)  # [1, 3, H, W]

    # Map [-1, 1] output into [0, 1], then write an 8-bit PNG.
    image = (image / 2 + 0.5).clamp(0, 1).cpu().float()
    Image.fromarray((image[0].permute(1, 2, 0).numpy() * 255).astype("uint8")).save(args.output)
    logger.info("Saved decoded image to %s", Path(args.output).resolve())


if __name__ == "__main__":
    main()

"""
The compilation process took more than 2 hours.
/usr/local/lib/python3.10/site-packages/torch_mlir/dialects/stablehlo/__init__.py:24: UserWarning: Could not import StableHLO C++ extension: libStablehloUnifiedPythonCAPI.so.22.0git: cannot open shared object file: No such file or directory
  warnings.warn(f"Could not import StableHLO C++ extension: {e}")
INFO:__main__:Warmup: 4010.52 s, Run: 22.5420 s
INFO:__main__:VAE output shape: torch.Size([1, 3, 256, 256])
INFO:__main__:Saved decoded image to /workspace/torch_neuron_samples/torch-neuron-samples/scripts/torch_compile/flux/flux_vae_out.png
"""
torch_compile/run_albert.py ADDED
@@ -0,0 +1,63 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
"""ALBERT sequence classification with torch.compile on the Neuron backend."""
import argparse
import logging
import time

import torch
from transformers import AutoTokenizer, AlbertForSequenceClassification

import torch_neuronx  # ensure Neuron backend is available

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


def main():
    """Compile ALBERT's forward pass for Neuron and report timings."""
    parser = argparse.ArgumentParser(description="Run ALBERT on Neuron")
    parser.add_argument(
        "--model", type=str, default="albert-base-v2", help="ALBERT model name"
    )
    parser.add_argument("--batch-size", type=int, default=1, help="Batch size")
    args = parser.parse_args()

    torch.set_default_dtype(torch.float32)
    torch.manual_seed(42)

    # fp32 weights with eager attention (Neuron-friendly), inference mode.
    model = AlbertForSequenceClassification.from_pretrained(
        args.model, torch_dtype=torch.float32, attn_implementation="eager"
    ).eval()

    tokenizer = AutoTokenizer.from_pretrained(args.model)
    inputs = tokenizer(
        "Hamilton is considered to be the best musical of human history.",
        return_tensors="pt"
    )

    # One eager pass pins the input shapes before compiling.
    with torch.no_grad():
        model(**inputs).logits

    # Single-graph compilation of the forward pass.
    model.forward = torch.compile(model.forward, backend="neuron", fullgraph=True)

    # Two timed calls: the first includes Neuron compilation (warmup),
    # the second measures steady-state latency.
    elapsed = []
    for _ in range(2):
        tick = time.time()
        with torch.no_grad():
            logits = model(**inputs).logits
        elapsed.append(time.time() - tick)
    warmup_time, run_time = elapsed

    predicted_class_label = model.config.id2label[logits.argmax().item()]

    logger.info(f"Warmup: {warmup_time:.2f}s, Run: {run_time:.4f}s")
    logger.info(f"Output label: {predicted_class_label}")


if __name__ == "__main__":
    main()
torch_compile/run_ast.py ADDED
@@ -0,0 +1,77 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
"""AST (Audio Spectrogram Transformer) audio classification on Neuron."""
import argparse
import logging
import time

import torch
from transformers import AutoFeatureExtractor, ASTForAudioClassification
from datasets import load_dataset

import torch_neuronx  # ensure Neuron backend is available

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


def main():
    """Compile AST for Neuron and classify one LibriSpeech demo clip."""
    parser = argparse.ArgumentParser(description="Run AST (Audio Spectrogram Transformer) on Neuron")
    parser.add_argument(
        "--model",
        type=str,
        default="MIT/ast-finetuned-audioset-10-10-0.4593",
        help="AST model name on Hugging Face Hub",
    )
    parser.add_argument("--batch-size", type=int, default=1, help="Batch size")
    args = parser.parse_args()

    torch.set_default_dtype(torch.float32)
    torch.manual_seed(42)

    # Deterministic sample: sort the demo split by id, take the first clip.
    dataset = load_dataset("hf-internal-testing/librispeech_asr_demo", "clean", split="validation").sort("id")
    sampling_rate = dataset.features["audio"].sampling_rate

    feature_extractor = AutoFeatureExtractor.from_pretrained(args.model)
    inputs = feature_extractor(
        dataset[0]["audio"]["array"], sampling_rate=sampling_rate, return_tensors="pt"
    )

    # fp32 weights with eager attention, inference mode.
    model = ASTForAudioClassification.from_pretrained(
        args.model, torch_dtype=torch.float32, attn_implementation="eager"
    ).eval()

    # One eager pass pins the input shapes before compiling.
    with torch.no_grad():
        model(**inputs).logits

    # Single-graph compilation of the forward pass.
    model.forward = torch.compile(model.forward, backend="neuron", fullgraph=True)

    # Two timed calls: the first includes Neuron compilation (warmup),
    # the second measures steady-state latency.
    elapsed = []
    for _ in range(2):
        tick = time.time()
        with torch.no_grad():
            logits = model(**inputs).logits
        elapsed.append(time.time() - tick)
    warmup_time, run_time = elapsed

    predicted_label = model.config.id2label[torch.argmax(logits, dim=-1).item()]

    logger.info("Warmup: %.2f s, Run: %.4f s", warmup_time, run_time)
    logger.info("Predicted label: %s", predicted_label)


if __name__ == "__main__":
    main()

"""
Works
"""
torch_compile/run_beit.py ADDED
@@ -0,0 +1,81 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
"""BEiT image classification with torch.compile on the Neuron backend."""
import argparse
import logging
import time

import torch
from transformers import AutoImageProcessor, BeitForImageClassification
from datasets import load_dataset

import torch_neuronx  # ensure Neuron backend is available

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


def main():
    """Compile BEiT for Neuron and classify one sample image."""
    parser = argparse.ArgumentParser(description="Run BEiT on Neuron")
    parser.add_argument(
        "--model",
        type=str,
        default="microsoft/beit-base-patch16-224-pt22k",
        help="BEiT model name on Hugging Face Hub",
    )
    parser.add_argument("--batch-size", type=int, default=1, help="Batch size")
    args = parser.parse_args()

    torch.set_default_dtype(torch.float32)
    torch.manual_seed(42)

    # One sample image from the cats-image dataset.
    image = load_dataset("huggingface/cats-image")["test"]["image"][0]

    # Processor + model: fp32 weights with eager attention, inference mode.
    image_processor = AutoImageProcessor.from_pretrained(args.model)
    model = BeitForImageClassification.from_pretrained(
        args.model, torch_dtype=torch.float32, attn_implementation="eager"
    ).eval()

    inputs = image_processor(image, return_tensors="pt")

    # One eager pass pins the input shapes before compiling.
    with torch.no_grad():
        model(**inputs).logits

    # Single-graph compilation of the forward pass.
    model.forward = torch.compile(model.forward, backend="neuron", fullgraph=True)

    # Two timed calls: the first includes Neuron compilation (warmup),
    # the second measures steady-state latency.
    elapsed = []
    for _ in range(2):
        tick = time.time()
        with torch.no_grad():
            logits = model(**inputs).logits
        elapsed.append(time.time() - tick)
    warmup_time, run_time = elapsed

    # Predicted ImageNet class.
    label_str = model.config.id2label[logits.argmax(-1).item()]

    logger.info("Warmup: %.2f s, Run: %.4f s", warmup_time, run_time)
    logger.info("Predicted label: %s", label_str)


if __name__ == "__main__":
    main()
"""
root@d90ba90f3d81:/workspace/torch_neuron_samples/torch-neuron-samples/scripts/tests# torch-mlir-opt -pass-pipeline='builtin.module(torch-backend-to-stablehlo-backend-pipeline)' /tmp/UnnammedModule.mlir
/usr/local/lib/python3.10/site-packages/transformers/models/beit/modeling_beit.py:593:0: error: failed to legalize operation 'torch.aten.fill.Tensor'
/usr/local/lib/python3.10/site-packages/transformers/pytorch_utils.py:361:0: note: called from
/usr/local/lib/python3.10/site-packages/transformers/models/beit/modeling_beit.py:625:0: note: called from
/usr/local/lib/python3.10/site-packages/transformers/models/beit/modeling_beit.py:688:0: note: called from
/usr/local/lib/python3.10/site-packages/transformers/models/beit/modeling_beit.py:824:0: note: called from
/usr/local/lib/python3.10/site-packages/transformers/models/beit/modeling_beit.py:1007:0: note: called from
/usr/local/lib/python3.10/site-packages/transformers/models/beit/modeling_beit.py:593:0: note: see current operation: %733 = "torch.aten.fill.Tensor"(%732, %524) : (!torch.vtensor<[197],si64>, !torch.vtensor<[],si64>) -> !torch.vtensor<[197],si64>
"""
torch_compile/run_bert.py ADDED
@@ -0,0 +1,62 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
"""BERT sequence classification with torch.compile on the Neuron backend."""
import argparse
import logging
import time

import torch
from transformers import AutoTokenizer, BertForSequenceClassification

import torch_neuronx

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


def main():
    """Compile BERT's forward pass for Neuron and report timings."""
    parser = argparse.ArgumentParser(description="Run Bert on Neuron")
    parser.add_argument(
        "--model", type=str, default="google-bert/bert-base-uncased", help="Bert model name"
    )
    parser.add_argument("--batch-size", type=int, default=1, help="Batch size")
    args = parser.parse_args()

    torch.set_default_dtype(torch.float32)
    torch.manual_seed(42)

    # fp32 weights with eager attention, inference mode.
    model = BertForSequenceClassification.from_pretrained(
        args.model, torch_dtype=torch.float32, attn_implementation="eager"
    ).eval()

    tokenizer = AutoTokenizer.from_pretrained(args.model)
    inputs = tokenizer("Hamilton is considered to be the best musical of human history.", return_tensors="pt")

    # Run once to establish shapes before compile.
    with torch.no_grad():
        model(**inputs).logits

    model.forward = torch.compile(model.forward, backend="neuron", fullgraph=True)

    # Two timed calls: the first includes Neuron compilation (warmup),
    # the second measures steady-state latency.
    elapsed = []
    for _ in range(2):
        tick = time.time()
        with torch.no_grad():
            logits = model(**inputs).logits
        elapsed.append(time.time() - tick)
    warmup_time, run_time = elapsed

    predicted_class_label = model.config.id2label[logits.argmax().item()]

    logger.info(f"Warmup: {warmup_time:.2f}s, Run: {run_time:.4f}s")
    logger.info(f"Output label: {predicted_class_label}")


if __name__ == "__main__":
    main()

"""
Works
"""
torch_compile/run_camembert.py ADDED
@@ -0,0 +1,71 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
"""CamemBERT sequence classification with torch.compile on the Neuron backend."""
import argparse
import logging
import time

import torch
from transformers import AutoTokenizer, CamembertForSequenceClassification

import torch_neuronx  # ensure Neuron backend is available

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


def main():
    """Compile CamemBERT for Neuron and classify one French sentence."""
    parser = argparse.ArgumentParser(description="Run CamemBERT on Neuron")
    parser.add_argument(
        "--model",
        type=str,
        default="camembert-base",
        help="CamemBERT model name on Hugging Face Hub",
    )
    parser.add_argument("--batch-size", type=int, default=1, help="Batch size")
    args = parser.parse_args()

    torch.set_default_dtype(torch.float32)
    torch.manual_seed(42)

    # Tokenizer + model: fp32 weights with eager attention, inference mode.
    tokenizer = AutoTokenizer.from_pretrained(args.model)
    model = CamembertForSequenceClassification.from_pretrained(
        args.model, torch_dtype=torch.float32, attn_implementation="eager"
    ).eval()

    # Sample French sentence.
    inputs = tokenizer("CamemBERT est un modèle de langue français.", return_tensors="pt")

    # One eager pass pins the input shapes before compiling.
    with torch.no_grad():
        model(**inputs).logits

    # Single-graph compilation of the forward pass.
    model.forward = torch.compile(model.forward, backend="neuron", fullgraph=True)

    # Two timed calls: the first includes Neuron compilation (warmup),
    # the second measures steady-state latency.
    elapsed = []
    for _ in range(2):
        tick = time.time()
        with torch.no_grad():
            logits = model(**inputs).logits
        elapsed.append(time.time() - tick)
    warmup_time, run_time = elapsed

    predicted_label = model.config.id2label[logits.argmax().item()]

    logger.info("Warmup: %.2f s, Run: %.4f s", warmup_time, run_time)
    logger.info("Predicted label: %s", predicted_label)


if __name__ == "__main__":
    main()

"""
Works
"""
torch_compile/run_clip.py ADDED
@@ -0,0 +1,76 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
"""CLIP zero-shot image classification with torch.compile on the Neuron backend."""
import argparse
import logging
import time

import torch
from transformers import CLIPProcessor, CLIPModel
from datasets import load_dataset
import torch_neuronx  # ensures Neuron backend is available

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


def main():
    """Score one sample image against three candidate captions on Neuron."""
    parser = argparse.ArgumentParser(
        description="CLIP zero-shot image classification with torch.compile on Neuron"
    )
    parser.add_argument(
        "--model",
        type=str,
        default="openai/clip-vit-base-patch32",
        help="CLIP model name on Hugging Face Hub",
    )
    parser.add_argument("--batch-size", type=int, default=1, help="Batch size")
    args = parser.parse_args()

    torch.set_default_dtype(torch.float32)
    torch.manual_seed(42)

    # One sample image from the cats-image dataset.
    image = load_dataset("huggingface/cats-image")["test"]["image"][0]

    # Processor + model: fp32 weights with eager attention, inference mode.
    processor = CLIPProcessor.from_pretrained(args.model)
    model = CLIPModel.from_pretrained(
        args.model, torch_dtype=torch.float32, attn_implementation="eager"
    ).eval()

    # Zero-shot candidate captions.
    texts = ["a photo of a cat", "a photo of a dog", "a photo of a bird"]
    inputs = processor(text=texts, images=image, return_tensors="pt", padding=True)

    # One eager pass pins the input shapes before compiling.
    with torch.no_grad():
        model(**inputs)

    # fullgraph=False tolerates graph breaks (avoids the instruction limit).
    model.forward = torch.compile(model.forward, backend="neuron", fullgraph=False)

    # Two timed calls: the first includes Neuron compilation (warmup),
    # the second measures steady-state latency.
    elapsed = []
    for _ in range(2):
        tick = time.time()
        with torch.no_grad():
            outputs = model(**inputs)
        elapsed.append(time.time() - tick)
    warmup_time, run_time = elapsed

    # Softmax over image-text similarities -> per-caption probabilities.
    probs = outputs.logits_per_image.softmax(dim=-1)  # [batch_size, num_texts]
    best_label = texts[int(probs.argmax())]

    logger.info("Warmup: %.2f s, Run: %.4f s", warmup_time, run_time)
    logger.info("Probabilities: %s", probs.tolist())
    logger.info("Predicted label: %s", best_label)


if __name__ == "__main__":
    main()
torch_compile/run_convbert.py ADDED
@@ -0,0 +1,72 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
"""ConvBERT sequence classification with torch.compile on the Neuron backend."""
import argparse
import logging
import time

import torch
from transformers import AutoTokenizer, ConvBertForSequenceClassification

import torch_neuronx  # ensure Neuron backend is available

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


def main():
    """Compile ConvBERT for Neuron and classify one sample sentence."""
    parser = argparse.ArgumentParser(description="Run ConvBERT on Neuron")
    parser.add_argument(
        "--model",
        type=str,
        default="YituTech/conv-bert-base",
        help="ConvBERT model name on Hugging Face Hub",
    )
    parser.add_argument("--batch-size", type=int, default=1, help="Batch size")
    args = parser.parse_args()

    torch.set_default_dtype(torch.float32)
    torch.manual_seed(42)

    # Tokenizer + model: fp32 weights with eager attention, inference mode.
    tokenizer = AutoTokenizer.from_pretrained(args.model)
    model = ConvBertForSequenceClassification.from_pretrained(
        args.model, torch_dtype=torch.float32, attn_implementation="eager"
    ).eval()

    # Sample text, padded/truncated by the tokenizer.
    inputs = tokenizer(
        "ConvBERT combines self-attention and lightweight convolutions.",
        return_tensors="pt", padding=True, truncation=True,
    )

    # One eager pass pins the input shapes before compiling.
    with torch.no_grad():
        model(**inputs).logits

    # Single-graph compilation of the forward pass.
    model.forward = torch.compile(model.forward, backend="neuron", fullgraph=True)

    # Two timed calls: the first includes Neuron compilation (warmup),
    # the second measures steady-state latency.
    elapsed = []
    for _ in range(2):
        tick = time.time()
        with torch.no_grad():
            logits = model(**inputs).logits
        elapsed.append(time.time() - tick)
    warmup_time, run_time = elapsed

    predicted_label = model.config.id2label[logits.argmax().item()]

    logger.info("Warmup: %.2f s, Run: %.4f s", warmup_time, run_time)
    logger.info("Predicted label: %s", predicted_label)


if __name__ == "__main__":
    main()

"""
<unknown>:0: error: failed to legalize operation 'torch.constant.int'
<unknown>:0: note: see current operation: %0 = "torch.constant.int"() <{value = 9 : i64}> : () -> !torch.int
"""
torch_compile/run_convnext.py ADDED
@@ -0,0 +1,72 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import argparse
import logging
import time

import torch
from transformers import AutoImageProcessor, ConvNextForImageClassification
from datasets import load_dataset
import torch_neuronx  # ensures Neuron backend is available

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


def main():
    """Benchmark ConvNeXt image classification compiled with the Neuron backend.

    Loads a sample image, runs one eager pass to fix shapes, compiles the
    forward pass for Neuron, and logs warmup/run latency plus the top-1 label.
    """
    parser = argparse.ArgumentParser(
        description="ConvNeXt image-classification with torch.compile on Neuron"
    )
    parser.add_argument(
        "--model",
        type=str,
        default="facebook/convnext-tiny-224",
        help="ConvNeXT model name on Hugging Face Hub",
    )
    parser.add_argument("--batch-size", type=int, default=1, help="Batch size")
    args = parser.parse_args()

    torch.set_default_dtype(torch.float32)
    torch.manual_seed(42)

    # Load dataset and pick an image
    dataset = load_dataset("huggingface/cats-image")
    image = dataset["test"]["image"][0]

    # Load processor and model
    processor = AutoImageProcessor.from_pretrained(args.model)
    model = ConvNextForImageClassification.from_pretrained(
        args.model, torch_dtype=torch.float32, attn_implementation="eager"
    )
    model.eval()

    # Preprocess image, replicated to honour --batch-size (previously ignored)
    inputs = processor(images=[image] * args.batch_size, return_tensors="pt")

    # Pre-run once to fix shapes before compilation
    with torch.no_grad():
        outputs = model(**inputs)

    # Compile forward pass (allow graph breaks to avoid instruction-limit)
    model.forward = torch.compile(model.forward, backend="neuron", fullgraph=False)

    # Warmup (first compiled call triggers Neuron compilation)
    warmup_start = time.time()
    with torch.no_grad():
        _ = model(**inputs)
    warmup_time = time.time() - warmup_start

    # Timed steady-state run
    run_start = time.time()
    with torch.no_grad():
        outputs = model(**inputs)
    run_time = time.time() - run_start

    # Predicted ImageNet class for the first batch element (valid for any batch)
    predicted_class_idx = outputs.logits[0].argmax(-1).item()
    predicted_label = model.config.id2label[predicted_class_idx]

    logger.info("Warmup: %.2f s, Run: %.4f s", warmup_time, run_time)
    logger.info("Predicted label: %s", predicted_label)


if __name__ == "__main__":
    main()
torch_compile/run_convnextv2.py ADDED
@@ -0,0 +1,72 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import argparse
import logging
import time

import torch
from transformers import AutoImageProcessor, ConvNextV2ForImageClassification
from datasets import load_dataset
import torch_neuronx  # ensures Neuron backend is available

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


def main():
    """Benchmark ConvNeXt-V2 image classification compiled with the Neuron backend.

    Loads a sample image, runs one eager pass to fix shapes, compiles the
    forward pass for Neuron, and logs warmup/run latency plus the top-1 label.
    """
    parser = argparse.ArgumentParser(
        description="ConvNeXt-V2 image-classification with torch.compile on Neuron"
    )
    parser.add_argument(
        "--model",
        type=str,
        default="facebook/convnextv2-tiny-1k-224",
        help="ConvNeXt-V2 model name on Hugging Face Hub",
    )
    parser.add_argument("--batch-size", type=int, default=1, help="Batch size")
    args = parser.parse_args()

    torch.set_default_dtype(torch.float32)
    torch.manual_seed(42)

    # Load dataset and pick an image
    dataset = load_dataset("huggingface/cats-image")
    image = dataset["test"]["image"][0]

    # Load processor and model
    processor = AutoImageProcessor.from_pretrained(args.model)
    model = ConvNextV2ForImageClassification.from_pretrained(
        args.model, torch_dtype=torch.float32, attn_implementation="eager"
    )
    model.eval()

    # Preprocess image, replicated to honour --batch-size (previously ignored)
    inputs = processor(images=[image] * args.batch_size, return_tensors="pt")

    # Pre-run once to fix shapes before compilation
    with torch.no_grad():
        outputs = model(**inputs)

    # Compile forward pass (allow graph breaks to avoid instruction-limit)
    model.forward = torch.compile(model.forward, backend="neuron", fullgraph=False)

    # Warmup (first compiled call triggers Neuron compilation)
    warmup_start = time.time()
    with torch.no_grad():
        _ = model(**inputs)
    warmup_time = time.time() - warmup_start

    # Timed steady-state run
    run_start = time.time()
    with torch.no_grad():
        outputs = model(**inputs)
    run_time = time.time() - run_start

    # Predicted ImageNet class for the first batch element (valid for any batch)
    predicted_class_idx = outputs.logits[0].argmax(-1).item()
    predicted_label = model.config.id2label[predicted_class_idx]

    logger.info("Warmup: %.2f s, Run: %.4f s", warmup_time, run_time)
    logger.info("Predicted label: %s", predicted_label)


if __name__ == "__main__":
    main()
torch_compile/run_cvt.py ADDED
@@ -0,0 +1,72 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import argparse
import logging
import time

import torch
from transformers import AutoImageProcessor, CvtForImageClassification
from datasets import load_dataset
import torch_neuronx  # ensures Neuron backend is available

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


def main():
    """Benchmark CvT image classification compiled with the Neuron backend.

    Loads a sample image, runs one eager pass to fix shapes, compiles the
    forward pass for Neuron, and logs warmup/run latency plus the top-1 label.
    """
    parser = argparse.ArgumentParser(
        description="CvT image-classification with torch.compile on Neuron"
    )
    parser.add_argument(
        "--model",
        type=str,
        default="microsoft/cvt-13",
        help="CvT model name on Hugging Face Hub",
    )
    parser.add_argument("--batch-size", type=int, default=1, help="Batch size")
    args = parser.parse_args()

    torch.set_default_dtype(torch.float32)
    torch.manual_seed(42)

    # Load dataset and pick an image
    dataset = load_dataset("huggingface/cats-image")
    image = dataset["test"]["image"][0]

    # Load processor and model
    processor = AutoImageProcessor.from_pretrained(args.model)
    model = CvtForImageClassification.from_pretrained(
        args.model, torch_dtype=torch.float32, attn_implementation="eager"
    )
    model.eval()

    # Preprocess image, replicated to honour --batch-size (previously ignored)
    inputs = processor(images=[image] * args.batch_size, return_tensors="pt")

    # Pre-run once to fix shapes before compilation
    with torch.no_grad():
        outputs = model(**inputs)

    # Compile forward pass (allow graph breaks to avoid instruction-limit)
    model.forward = torch.compile(model.forward, backend="neuron", fullgraph=False)

    # Warmup (first compiled call triggers Neuron compilation)
    warmup_start = time.time()
    with torch.no_grad():
        _ = model(**inputs)
    warmup_time = time.time() - warmup_start

    # Timed steady-state run
    run_start = time.time()
    with torch.no_grad():
        outputs = model(**inputs)
    run_time = time.time() - run_start

    # Predicted ImageNet class for the first batch element (valid for any batch)
    predicted_class_idx = outputs.logits[0].argmax(-1).item()
    predicted_label = model.config.id2label[predicted_class_idx]

    logger.info("Warmup: %.2f s, Run: %.4f s", warmup_time, run_time)
    logger.info("Predicted label: %s", predicted_label)


if __name__ == "__main__":
    main()
torch_compile/run_deberta.py ADDED
@@ -0,0 +1,72 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import argparse
import logging
import time

import torch
from transformers import AutoTokenizer, DebertaForSequenceClassification
import torch_neuronx  # ensures Neuron backend is available

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


def main():
    """Benchmark DeBERTa sequence classification compiled with the Neuron backend.

    Tokenizes a sample sentence, runs one eager pass to fix shapes, compiles
    the forward pass for Neuron, and logs warmup/run latency plus the label.
    """
    parser = argparse.ArgumentParser(
        description="DeBERTa sequence-classification with torch.compile on Neuron"
    )
    parser.add_argument(
        "--model",
        type=str,
        default="microsoft/deberta-base",
        help="DeBERTa model name on Hugging Face Hub",
    )
    parser.add_argument("--batch-size", type=int, default=1, help="Batch size")
    args = parser.parse_args()

    torch.set_default_dtype(torch.float32)
    torch.manual_seed(42)

    # Load tokenizer and model
    tokenizer = AutoTokenizer.from_pretrained(args.model)
    model = DebertaForSequenceClassification.from_pretrained(
        args.model, torch_dtype=torch.float32, attn_implementation="eager"
    )
    model.eval()

    # Tokenize sample text, replicated to honour --batch-size (previously ignored)
    text = "DeBERTa improves BERT and RoBERTa using disentangled attention."
    inputs = tokenizer(
        [text] * args.batch_size, return_tensors="pt", padding=True, truncation=True
    )

    # Pre-run once to fix shapes before compilation
    with torch.no_grad():
        logits = model(**inputs).logits

    # Compile forward pass (allow graph breaks to avoid instruction-limit)
    model.forward = torch.compile(model.forward, backend="neuron", fullgraph=False)

    # Warmup (first compiled call triggers Neuron compilation)
    warmup_start = time.time()
    with torch.no_grad():
        _ = model(**inputs)
    warmup_time = time.time() - warmup_start

    # Timed steady-state run
    run_start = time.time()
    with torch.no_grad():
        logits = model(**inputs).logits
    run_time = time.time() - run_start

    # Decode result for the first batch element (valid for any batch size)
    predicted_class_id = logits[0].argmax().item()
    predicted_label = model.config.id2label[predicted_class_id]

    logger.info("Warmup: %.2f s, Run: %.4f s", warmup_time, run_time)
    logger.info("Predicted label: %s", predicted_label)


if __name__ == "__main__":
    main()
69
+
70
+ """
71
+ torch._dynamo.exc.TorchRuntimeError: Dynamo failed to run FX node with fake tensors: call_function <built-in function linear>(*(FakeTensor(..., device='neuron:0', size=(1, 18, 768)), Parameter(FakeTensor(..., size=(2304, 768), requires_grad=True)), None), **{}): got RuntimeError('Unhandled FakeTensor Device Propagation for aten.mm.default, found two different devices neuron:0, cpu')
72
+ """
torch_compile/run_deberta_v3.py ADDED
@@ -0,0 +1,72 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import argparse
import logging
import time

import torch
from transformers import AutoTokenizer, DebertaV2ForSequenceClassification
import torch_neuronx  # ensures Neuron backend is available

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


def main():
    """Benchmark DeBERTa-v3 sequence classification compiled with the Neuron backend.

    Tokenizes a sample sentence, runs one eager pass to fix shapes, compiles
    the forward pass for Neuron, and logs warmup/run latency plus the label.
    """
    parser = argparse.ArgumentParser(
        description="DeBERTa-v3 sequence-classification with torch.compile on Neuron"
    )
    parser.add_argument(
        "--model",
        type=str,
        default="microsoft/deberta-v3-base",
        help="DeBERTa-v3 model name on Hugging Face Hub",
    )
    parser.add_argument("--batch-size", type=int, default=1, help="Batch size")
    args = parser.parse_args()

    torch.set_default_dtype(torch.float32)
    torch.manual_seed(42)

    # Load tokenizer and model
    tokenizer = AutoTokenizer.from_pretrained(args.model)
    model = DebertaV2ForSequenceClassification.from_pretrained(
        args.model, torch_dtype=torch.float32, attn_implementation="eager"
    )
    model.eval()

    # Tokenize sample text, replicated to honour --batch-size (previously ignored)
    text = "DeBERTa-v3 achieves stronger performance with improved pre-training."
    inputs = tokenizer(
        [text] * args.batch_size, return_tensors="pt", padding=True, truncation=True
    )

    # Pre-run once to fix shapes before compilation
    with torch.no_grad():
        logits = model(**inputs).logits

    # Compile forward pass (allow graph breaks to avoid instruction-limit)
    model.forward = torch.compile(model.forward, backend="neuron", fullgraph=False)

    # Warmup (first compiled call triggers Neuron compilation)
    warmup_start = time.time()
    with torch.no_grad():
        _ = model(**inputs)
    warmup_time = time.time() - warmup_start

    # Timed steady-state run
    run_start = time.time()
    with torch.no_grad():
        logits = model(**inputs).logits
    run_time = time.time() - run_start

    # Decode result for the first batch element (valid for any batch size)
    predicted_class_id = logits[0].argmax().item()
    predicted_label = model.config.id2label[predicted_class_id]

    logger.info("Warmup: %.2f s, Run: %.4f s", warmup_time, run_time)
    logger.info("Predicted label: %s", predicted_label)


if __name__ == "__main__":
    main()
69
+
70
+ """
71
+ Works
72
+ """
torch_compile/run_deit.py ADDED
@@ -0,0 +1,71 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/usr/bin/env python3
# DeiT (Vision Transformer) image-classification on Neuron
import argparse
import logging
import time

import torch
from transformers import AutoImageProcessor, DeiTForImageClassification
from datasets import load_dataset
import torch_neuronx  # ensures Neuron backend

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


def main():
    """Benchmark DeiT image classification compiled with the Neuron backend.

    Loads a sample image, runs one eager pass to fix shapes, compiles the
    forward pass for Neuron, and logs warmup/run latency plus the top-1 label.
    """
    parser = argparse.ArgumentParser(description="Run DeiT on Neuron")
    parser.add_argument(
        "--model",
        type=str,
        default="facebook/deit-base-distilled-patch16-224",
        help="DeiT model name on Hugging Face Hub",
    )
    parser.add_argument("--batch-size", type=int, default=1, help="Batch size")
    args = parser.parse_args()

    torch.set_default_dtype(torch.float32)
    torch.manual_seed(42)

    # load dataset image
    dataset = load_dataset("huggingface/cats-image")
    image = dataset["test"]["image"][0]

    # load processor & distilled DeiT model
    processor = AutoImageProcessor.from_pretrained(args.model)
    model = DeiTForImageClassification.from_pretrained(
        args.model, torch_dtype=torch.float32, attn_implementation="eager"
    ).eval()

    # preprocess, replicating the image to honour --batch-size (was ignored)
    inputs = processor(images=[image] * args.batch_size, return_tensors="pt")

    # pre-run to lock shapes
    with torch.no_grad():
        _ = model(**inputs).logits

    # compile
    model.forward = torch.compile(model.forward, backend="neuron", fullgraph=False)

    # warmup (first compiled call triggers Neuron compilation)
    warmup_start = time.time()
    with torch.no_grad():
        _ = model(**inputs)
    warmup_time = time.time() - warmup_start

    # benchmark run
    run_start = time.time()
    with torch.no_grad():
        logits = model(**inputs).logits
    run_time = time.time() - run_start

    # top-1 ImageNet class for the first batch element (valid for any batch)
    predicted_class_idx = logits[0].argmax(-1).item()
    predicted_label = model.config.id2label[predicted_class_idx]

    logger.info("Warmup: %.2f s, Run: %.4f s", warmup_time, run_time)
    logger.info("Predicted label: %s", predicted_label)


if __name__ == "__main__":
    main()
torch_compile/run_distillbert.py ADDED
@@ -0,0 +1,67 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/usr/bin/env python3
# DistilBERT text-classification on Neuron
import argparse
import logging
import time

import torch
from transformers import AutoTokenizer, DistilBertForSequenceClassification
import torch_neuronx  # ensures Neuron backend

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


def main():
    """Benchmark DistilBERT text classification compiled with the Neuron backend.

    Tokenizes a sample sentence, runs one eager pass to fix shapes, compiles
    the forward pass for Neuron, and logs warmup/run latency plus the label.
    """
    parser = argparse.ArgumentParser(description="Run DistilBERT on Neuron")
    parser.add_argument(
        "--model",
        type=str,
        default="distilbert-base-uncased-finetuned-sst-2-english",
        help="DistilBERT model name on Hugging Face Hub",
    )
    parser.add_argument("--batch-size", type=int, default=1, help="Batch size")
    args = parser.parse_args()

    torch.set_default_dtype(torch.float32)
    torch.manual_seed(42)

    # load tokenizer & model
    tokenizer = AutoTokenizer.from_pretrained(args.model)
    model = DistilBertForSequenceClassification.from_pretrained(
        args.model, torch_dtype=torch.float32, attn_implementation="eager"
    ).eval()

    # tokenize sample, replicated to honour --batch-size (was ignored)
    text = "DistilBERT is a compact, fast variant of BERT."
    inputs = tokenizer(
        [text] * args.batch_size, return_tensors="pt", padding=True, truncation=True
    )

    # pre-run to lock shapes
    with torch.no_grad():
        _ = model(**inputs).logits

    # compile
    model.forward = torch.compile(model.forward, backend="neuron", fullgraph=False)

    # warmup (first compiled call triggers Neuron compilation)
    warmup_start = time.time()
    with torch.no_grad():
        _ = model(**inputs)
    warmup_time = time.time() - warmup_start

    # benchmark run
    run_start = time.time()
    with torch.no_grad():
        logits = model(**inputs).logits
    run_time = time.time() - run_start

    # top-1 label for the first batch element (valid for any batch size)
    predicted_class_id = logits[0].argmax().item()
    predicted_label = model.config.id2label[predicted_class_id]

    logger.info("Warmup: %.2f s, Run: %.4f s", warmup_time, run_time)
    logger.info("Predicted label: %s", predicted_label)


if __name__ == "__main__":
    main()
torch_compile/run_donutswin.py ADDED
@@ -0,0 +1,76 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/usr/bin/env python3
# DonutSwin image-encoder on Neuron (no decoder, pure vision)
import argparse
import logging
import time

import torch
from transformers import DonutImageProcessor, DonutSwinModel
from datasets import load_dataset
import torch_neuronx  # ensures Neuron backend

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


def main():
    """Run the DonutSwin vision encoder once on Neuron and report timings."""
    parser = argparse.ArgumentParser(description="Run DonutSwin encoder on Neuron")
    parser.add_argument(
        "--model",
        type=str,
        default="naver-clova-ix/donut-base",
        help="DonutSwin model name on Hugging Face Hub",
    )
    args = parser.parse_args()

    torch.set_default_dtype(torch.float32)
    torch.manual_seed(42)

    # One sample image from a tiny public dataset
    image = load_dataset("huggingface/cats-image")["test"]["image"][0]

    # Only the vision encoder is loaded -- no text decoder
    processor = DonutImageProcessor.from_pretrained(args.model)
    model = DonutSwinModel.from_pretrained(
        args.model, torch_dtype=torch.float32, attn_implementation="eager"
    ).eval()

    inputs = processor(images=image, return_tensors="pt")

    # A single eager pass locks the input shapes before compilation
    with torch.no_grad():
        _ = model(**inputs).last_hidden_state

    model.forward = torch.compile(model.forward, backend="neuron", fullgraph=False)

    # First compiled call -- includes Neuron compilation time
    t_warm = time.time()
    with torch.no_grad():
        _ = model(**inputs)
    warmup_time = time.time() - t_warm

    # Steady-state timing
    t_run = time.time()
    with torch.no_grad():
        hidden = model(**inputs).last_hidden_state
    run_time = time.time() - t_run

    logger.info("Warmup: %.2f s, Run: %.4f s", warmup_time, run_time)
    logger.info("Output hidden shape: %s", hidden.shape)  # [B, seq_len, hidden_size]


if __name__ == "__main__":
    main()
67
+
68
+ """
69
+ /usr/local/lib/python3.10/site-packages/transformers/models/donut/modeling_donut_swin.py:586:0: error: failed to legalize operation 'torch.aten.fill.Tensor'
70
+ /usr/local/lib/python3.10/site-packages/transformers/models/donut/modeling_donut_swin.py:637:0: note: called from
71
+ /usr/local/lib/python3.10/site-packages/transformers/models/donut/modeling_donut_swin.py:712:0: note: called from
72
+ /usr/local/lib/python3.10/site-packages/transformers/modeling_layers.py:94:0: note: called from
73
+ /usr/local/lib/python3.10/site-packages/transformers/models/donut/modeling_donut_swin.py:783:0: note: called from
74
+ /usr/local/lib/python3.10/site-packages/transformers/models/donut/modeling_donut_swin.py:922:0: note: called from
75
+ /usr/local/lib/python3.10/site-packages/transformers/models/donut/modeling_donut_swin.py:586:0: note: see current operation: %1327 = "torch.aten.fill.Tensor"(%1326, %1091) : (!torch.vtensor<[1,630,470,1],f32>, !torch.vtensor<[],f32>) -> !torch.vtensor<[1,630,470,1],f32>
76
+ """
torch_compile/run_dpt.py ADDED
@@ -0,0 +1,66 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/usr/bin/env python3
# DPT (Dense Prediction Transformer) monocular depth estimation on Neuron
import argparse
import logging
import time

import torch
from transformers import DPTImageProcessor, DPTForDepthEstimation
from datasets import load_dataset
import torch_neuronx  # ensures Neuron backend

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


def main():
    """Run DPT depth estimation once on Neuron and report timings."""
    parser = argparse.ArgumentParser(description="Run DPT depth estimation on Neuron")
    parser.add_argument(
        "--model",
        type=str,
        default="Intel/dpt-large",
        help="DPT model name on Hugging Face Hub",
    )
    args = parser.parse_args()

    torch.set_default_dtype(torch.float32)
    torch.manual_seed(42)

    # One sample image from a tiny public dataset
    image = load_dataset("huggingface/cats-image")["test"]["image"][0]

    # Processor and depth-estimation model
    processor = DPTImageProcessor.from_pretrained(args.model)
    model = DPTForDepthEstimation.from_pretrained(
        args.model, torch_dtype=torch.float32, attn_implementation="eager"
    ).eval()

    inputs = processor(images=image, return_tensors="pt")

    # A single eager pass locks the input shapes before compilation
    with torch.no_grad():
        _ = model(**inputs).predicted_depth

    model.forward = torch.compile(model.forward, backend="neuron", fullgraph=False)

    # First compiled call -- includes Neuron compilation time
    t_warm = time.time()
    with torch.no_grad():
        _ = model(**inputs)
    warmup_time = time.time() - t_warm

    # Steady-state timing
    t_run = time.time()
    with torch.no_grad():
        depth = model(**inputs).predicted_depth
    run_time = time.time() - t_run

    logger.info("Warmup: %.2f s, Run: %.4f s", warmup_time, run_time)
    logger.info("Output depth shape: %s", depth.shape)  # [B, 1, H, W]


if __name__ == "__main__":
    main()
torch_compile/run_electra.py ADDED
@@ -0,0 +1,67 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/usr/bin/env python3
# ELECTRA (discriminator) text-classification on Neuron
import argparse
import logging
import time

import torch
from transformers import AutoTokenizer, ElectraForSequenceClassification
import torch_neuronx  # ensures Neuron backend

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


def main():
    """Benchmark ELECTRA text classification compiled with the Neuron backend.

    Tokenizes a sample sentence, runs one eager pass to fix shapes, compiles
    the forward pass for Neuron, and logs warmup/run latency plus the label.
    """
    parser = argparse.ArgumentParser(description="Run ELECTRA on Neuron")
    parser.add_argument(
        "--model",
        type=str,
        default="google/electra-base-discriminator",
        help="ELECTRA model name on Hugging Face Hub",
    )
    parser.add_argument("--batch-size", type=int, default=1, help="Batch size")
    args = parser.parse_args()

    torch.set_default_dtype(torch.float32)
    torch.manual_seed(42)

    # load tokenizer & model
    tokenizer = AutoTokenizer.from_pretrained(args.model)
    model = ElectraForSequenceClassification.from_pretrained(
        args.model, torch_dtype=torch.float32, attn_implementation="eager"
    ).eval()

    # tokenize sample, replicated to honour --batch-size (was ignored)
    text = "ELECTRA pre-trains a discriminator to detect replaced tokens."
    inputs = tokenizer(
        [text] * args.batch_size, return_tensors="pt", padding=True, truncation=True
    )

    # pre-run to lock shapes
    with torch.no_grad():
        _ = model(**inputs).logits

    # compile
    model.forward = torch.compile(model.forward, backend="neuron", fullgraph=True)

    # warmup (first compiled call triggers Neuron compilation)
    warmup_start = time.time()
    with torch.no_grad():
        _ = model(**inputs)
    warmup_time = time.time() - warmup_start

    # benchmark run
    run_start = time.time()
    with torch.no_grad():
        logits = model(**inputs).logits
    run_time = time.time() - run_start

    # top-1 label for the first batch element (valid for any batch size)
    predicted_class_id = logits[0].argmax().item()
    predicted_label = model.config.id2label[predicted_class_id]

    logger.info("Warmup: %.2f s, Run: %.4f s", warmup_time, run_time)
    logger.info("Predicted label: %s", predicted_label)


if __name__ == "__main__":
    main()
torch_compile/run_esm.py ADDED
@@ -0,0 +1,67 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/usr/bin/env python3
# ESM (Evolutionary Scale Modeling) protein-sequence classification on Neuron
import argparse
import logging
import time

import torch
from transformers import EsmTokenizer, EsmForSequenceClassification
import torch_neuronx  # ensures Neuron backend

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


def main():
    """Benchmark ESM protein-sequence classification compiled for Neuron.

    Tokenizes a sample protein sequence, runs one eager pass to fix shapes,
    compiles the forward pass, and logs warmup/run latency plus the label.
    """
    parser = argparse.ArgumentParser(description="Run ESM on Neuron")
    parser.add_argument(
        "--model",
        type=str,
        default="facebook/esm2_t33_650M_UR50D",
        help="ESM model name on Hugging Face Hub",
    )
    parser.add_argument("--batch-size", type=int, default=1, help="Batch size")
    args = parser.parse_args()

    torch.set_default_dtype(torch.float32)
    torch.manual_seed(42)

    # load tokenizer & model
    tokenizer = EsmTokenizer.from_pretrained(args.model)
    model = EsmForSequenceClassification.from_pretrained(
        args.model, torch_dtype=torch.float32, attn_implementation="eager"
    ).eval()

    # tokenize protein sequence, replicated to honour --batch-size (was ignored)
    sequence = "MKTVRQERLKSIVRILERSKEPVSGAQLAEELSVSRQVIVQDIAYLRSLGYNIVATPRGYVLAGG"
    inputs = tokenizer(
        [sequence] * args.batch_size, return_tensors="pt", padding=True, truncation=True
    )

    # pre-run to lock shapes
    with torch.no_grad():
        _ = model(**inputs).logits

    # compile
    model.forward = torch.compile(model.forward, backend="neuron", fullgraph=True)

    # warmup (first compiled call triggers Neuron compilation)
    warmup_start = time.time()
    with torch.no_grad():
        _ = model(**inputs)
    warmup_time = time.time() - warmup_start

    # benchmark run
    run_start = time.time()
    with torch.no_grad():
        logits = model(**inputs).logits
    run_time = time.time() - run_start

    # top-1 label for the first batch element (valid for any batch size)
    predicted_class_id = logits[0].argmax().item()
    predicted_label = model.config.id2label[predicted_class_id]

    logger.info("Warmup: %.2f s, Run: %.4f s", warmup_time, run_time)
    logger.info("Predicted label: %s", predicted_label)


if __name__ == "__main__":
    main()
torch_compile/run_flaubert.py ADDED
@@ -0,0 +1,97 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/usr/bin/env python3
# FlauBERT text-classification on Neuron
import argparse
import logging
import time

import torch
from transformers import FlaubertTokenizer, FlaubertForSequenceClassification
import torch_neuronx  # ensures Neuron backend

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


def main():
    """Benchmark FlauBERT text classification compiled with the Neuron backend.

    Tokenizes a sample sentence, runs one eager pass to fix shapes, compiles
    the forward pass for Neuron, and logs warmup/run latency plus the label.
    """
    parser = argparse.ArgumentParser(description="Run FlauBERT on Neuron")
    parser.add_argument(
        "--model",
        type=str,
        default="flaubert/flaubert_base_cased",
        help="FlauBERT model name on Hugging Face Hub",
    )
    parser.add_argument("--batch-size", type=int, default=1, help="Batch size")
    args = parser.parse_args()

    torch.set_default_dtype(torch.float32)
    torch.manual_seed(42)

    # FlauBERT's transformer asserts on `lengths.max().item()`; Dynamo can only
    # trace that when scalar-output capture is enabled (see recorded traceback
    # hint: "Set torch._dynamo.config.capture_scalar_outputs = True").
    torch._dynamo.config.capture_scalar_outputs = True

    # load tokenizer & model
    tokenizer = FlaubertTokenizer.from_pretrained(args.model)
    model = FlaubertForSequenceClassification.from_pretrained(
        args.model, torch_dtype=torch.float32, attn_implementation="eager"
    ).eval()

    # tokenize sample, replicated to honour --batch-size (was ignored)
    text = "FlauBERT est un modèle de langue français performant."
    inputs = tokenizer(
        [text] * args.batch_size, return_tensors="pt", padding=True, truncation=True
    )

    # pre-run to lock shapes
    with torch.no_grad():
        _ = model(**inputs).logits

    # compile
    model.forward = torch.compile(model.forward, backend="neuron", fullgraph=True)

    # warmup (first compiled call triggers Neuron compilation)
    warmup_start = time.time()
    with torch.no_grad():
        _ = model(**inputs)
    warmup_time = time.time() - warmup_start

    # benchmark run
    run_start = time.time()
    with torch.no_grad():
        logits = model(**inputs).logits
    run_time = time.time() - run_start

    # top-1 label for the first batch element (valid for any batch size)
    predicted_class_id = logits[0].argmax().item()
    predicted_label = model.config.id2label[predicted_class_id]

    logger.info("Warmup: %.2f s, Run: %.4f s", warmup_time, run_time)
    logger.info("Predicted label: %s", predicted_label)


if __name__ == "__main__":
    main()
68
+
69
+
70
+ """
71
+ Traceback (most recent call last):
72
+ File "/workspace/torch_neuron_sample/torch-neuron-samples/scripts/torch_compile/run_flaubert.py", line 67, in <module>
73
+ main()
74
+ File "/workspace/torch_neuron_sample/torch-neuron-samples/scripts/torch_compile/run_flaubert.py", line 49, in main
75
+ _ = model(**inputs)
76
+ File "/usr/local/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1775, in _wrapped_call_impl
77
+ return self._call_impl(*args, **kwargs)
78
+ File "/usr/local/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1786, in _call_impl
79
+ return forward_call(*args, **kwargs)
80
+ File "/usr/local/lib/python3.10/site-packages/torch/_dynamo/eval_frame.py", line 841, in compile_wrapper
81
+ raise e.with_traceback(None) from e.__cause__ # User compiler error
82
+ torch._dynamo.exc.Unsupported: Unsupported Tensor.item() call with capture_scalar_outputs=False
83
+ Explanation: Dynamo does not support tracing `Tensor.item()` with config.capture_scalar_outputs=False.
84
+ Hint: Set `torch._dynamo.config.capture_scalar_outputs = True` or `export TORCHDYNAMO_CAPTURE_SCALAR_OUTPUTS=1` to include these operations in the captured graph.
85
+
86
+ Developer debug context: call_method TensorVariable() item () {}
87
+
88
+ For more details about this graph break, please visit: https://meta-pytorch.github.io/compile-graph-break-site/gb/gb0124.html
89
+
90
+ from user code:
91
+ File "/usr/local/lib/python3.10/site-packages/transformers/models/flaubert/modeling_flaubert.py", line 1156, in forward
92
+ transformer_outputs = self.transformer(
93
+ File "/usr/local/lib/python3.10/site-packages/transformers/models/flaubert/modeling_flaubert.py", line 873, in forward
94
+ assert lengths.max().item() <= slen
95
+
96
+ Set TORCHDYNAMO_VERBOSE=1 for the internal stack trace (please do this especially if you're reporting a bug to PyTorch). For even more developer context, set TORCH_LOGS="+dynamo"
97
+ """
torch_compile/run_hubert.py ADDED
@@ -0,0 +1,85 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/usr/bin/env python3
# HuBERT-CTC speech-recognition on Neuron
import argparse
import logging
import time

import torch
from transformers import AutoProcessor, HubertForCTC
from datasets import load_dataset
import torch_neuronx  # ensures Neuron backend
from torch.nn.utils import remove_weight_norm
from torch.nn.utils import parametrize

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


def _strip_weight_norm(module):
    """Fold weight-norm back into plain weight tensors on every submodule.

    Recent torch versions register weight_norm as a *parametrization*
    (torch.nn.utils.parametrizations.weight_norm), so ``weight_g`` /
    ``weight_v`` no longer exist as plain attributes and the old attribute
    check matched nothing — which is why the Neuron compile previously died
    inside parametrizations.py ("number of output elements (2048) doesn't
    match expected number of elements (16)").  Handle both layouts.
    """
    for m in module.modules():
        if parametrize.is_parametrized(m, "weight"):
            # bake the re-parametrized weight into one static tensor
            parametrize.remove_parametrizations(m, "weight")
        elif hasattr(m, "weight_g") and hasattr(m, "weight_v"):
            # legacy (pre-parametrization) weight_norm layout
            remove_weight_norm(m)


def main():
    """Compile HuBERT-CTC with torch.compile(backend="neuron") and transcribe one clip."""
    parser = argparse.ArgumentParser(description="Run HuBERT-CTC on Neuron")
    parser.add_argument(
        "--model",
        type=str,
        default="hf-internal-testing/tiny-random-HubertModel",
        help="HuBERT-CTC model name on Hugging Face Hub",
    )
    args = parser.parse_args()

    torch.set_default_dtype(torch.float32)
    torch.manual_seed(42)

    # load small speech snippet
    dataset = load_dataset("hf-internal-testing/librispeech_asr_demo", "clean", split="validation")
    sample = dataset[0]["audio"]["array"]  # 16 kHz numpy array

    # processor + HuBERT-CTC model (eager attention keeps the graph traceable)
    processor = AutoProcessor.from_pretrained(args.model)
    model = HubertForCTC.from_pretrained(
        args.model, torch_dtype=torch.float32, attn_implementation="eager"
    ).eval()
    # weight_norm must be removed or the Neuron compiler rejects the conv layers
    _strip_weight_norm(model)

    # preprocess
    inputs = processor(sample, sampling_rate=16_000, return_tensors="pt", padding=True)

    # pre-run to lock shapes
    with torch.no_grad():
        _ = model(**inputs).logits

    # compile
    model.forward = torch.compile(model.forward, backend="neuron", fullgraph=True)

    # warmup (first compiled call pays the Neuron compilation cost)
    warmup_start = time.time()
    with torch.no_grad():
        _ = model(**inputs)
    warmup_time = time.time() - warmup_start

    # benchmark run
    run_start = time.time()
    with torch.no_grad():
        logits = model(**inputs).logits
    run_time = time.time() - run_start

    # greedy decode
    predicted_ids = logits.argmax(dim=-1)
    transcription = processor.decode(predicted_ids[0])

    logger.info("Warmup: %.2f s, Run: %.4f s", warmup_time, run_time)
    logger.info("Transcription: %s", transcription)


if __name__ == "__main__":
    main()
75
+
76
+ """
77
+ /usr/local/lib/python3.10/site-packages/torch/nn/utils/parametrizations.py:325:0: error: number of output elements (2048) doesn't match expected number of elements (16)
78
+ /usr/local/lib/python3.10/site-packages/torch/nn/utils/parametrize.py:303:0: note: called from
79
+ /usr/local/lib/python3.10/site-packages/torch/nn/utils/parametrize.py:407:0: note: called from
80
+ /usr/local/lib/python3.10/site-packages/transformers/models/hubert/modeling_hubert.py:92:0: note: called from
81
+ /usr/local/lib/python3.10/site-packages/transformers/models/hubert/modeling_hubert.py:448:0: note: called from
82
+ /usr/local/lib/python3.10/site-packages/transformers/models/hubert/modeling_hubert.py:986:0: note: called from
83
+ /usr/local/lib/python3.10/site-packages/transformers/models/hubert/modeling_hubert.py:1114:0: note: called from
84
+
85
+ """
torch_compile/run_levit.py ADDED
@@ -0,0 +1,70 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/usr/bin/env python3
# LeViT vision-classification on Neuron
import argparse
import logging
import time

import torch
from transformers import AutoImageProcessor, LevitForImageClassification
from datasets import load_dataset
import torch_neuronx  # ensures Neuron backend

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


def main():
    """Run one LeViT classification pass through torch.compile's Neuron backend and time it."""
    cli = argparse.ArgumentParser(description="Run LeViT on Neuron")
    cli.add_argument(
        "--model",
        type=str,
        default="facebook/levit-128S",
        help="LeViT model name on Hugging Face Hub",
    )
    opts = cli.parse_args()

    torch.set_default_dtype(torch.float32)
    torch.manual_seed(42)

    # one sample image from the cats-image dataset
    image = load_dataset("huggingface/cats-image")["test"]["image"][0]

    # processor & model (eager attention keeps the graph traceable)
    processor = AutoImageProcessor.from_pretrained(opts.model)
    model = LevitForImageClassification.from_pretrained(
        opts.model, torch_dtype=torch.float32, attn_implementation="eager"
    ).eval()

    inputs = processor(images=image, return_tensors="pt")

    # eager pre-run fixes tensor shapes before compilation
    with torch.no_grad():
        _ = model(**inputs).logits

    model.forward = torch.compile(model.forward, backend="neuron", fullgraph=True)

    # first compiled call pays the Neuron compilation cost
    tic = time.time()
    with torch.no_grad():
        _ = model(**inputs)
    warmup_time = time.time() - tic

    tic = time.time()
    with torch.no_grad():
        logits = model(**inputs).logits
    run_time = time.time() - tic

    # argmax logit -> ImageNet label
    top1_idx = logits.argmax(-1).item()
    predicted_label = model.config.id2label[top1_idx]

    logger.info("Warmup: %.2f s, Run: %.4f s", warmup_time, run_time)
    logger.info("Predicted label: %s", predicted_label)


if __name__ == "__main__":
    main()
torch_compile/run_mobilebert.py ADDED
@@ -0,0 +1,67 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/usr/bin/env python3
# MobileBERT text-classification on Neuron
import argparse
import logging
import time

import torch
from transformers import AutoTokenizer, MobileBertForSequenceClassification
import torch_neuronx  # ensures Neuron backend

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


def main():
    """Classify one sentence with MobileBERT compiled for the Neuron backend."""
    cli = argparse.ArgumentParser(description="Run MobileBERT on Neuron")
    cli.add_argument(
        "--model",
        type=str,
        default="google/mobilebert-uncased",
        help="MobileBERT model name on Hugging Face Hub",
    )
    cli.add_argument("--batch-size", type=int, default=1, help="Batch size")
    opts = cli.parse_args()

    torch.set_default_dtype(torch.float32)
    torch.manual_seed(42)

    # tokenizer & model (eager attention keeps the graph traceable)
    tokenizer = AutoTokenizer.from_pretrained(opts.model)
    model = MobileBertForSequenceClassification.from_pretrained(
        opts.model, torch_dtype=torch.float32, attn_implementation="eager"
    ).eval()

    # tokenize sample sentence
    text = "MobileBERT is a compact BERT for on-device NLP."
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True)

    # eager pre-run fixes tensor shapes before compilation
    with torch.no_grad():
        _ = model(**inputs).logits

    model.forward = torch.compile(model.forward, backend="neuron", fullgraph=True)

    # first compiled call pays the Neuron compilation cost
    tic = time.time()
    with torch.no_grad():
        _ = model(**inputs)
    warmup_time = time.time() - tic

    tic = time.time()
    with torch.no_grad():
        logits = model(**inputs).logits
    run_time = time.time() - tic

    # argmax logit -> class label
    predicted_class_id = logits.argmax().item()
    predicted_label = model.config.id2label[predicted_class_id]

    logger.info("Warmup: %.2f s, Run: %.4f s", warmup_time, run_time)
    logger.info("Predicted label: %s", predicted_label)


if __name__ == "__main__":
    main()
torch_compile/run_mobilenetv2.py ADDED
@@ -0,0 +1,71 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/usr/bin/env python3
# MobileNetV2 image-classification on Neuron
import argparse
import logging
import time

import torch
from transformers import AutoImageProcessor, MobileNetV2ForImageClassification
from datasets import load_dataset
import torch_neuronx  # ensures Neuron backend

# NOTE: dropped the unused `from torchvision import transforms` import — the
# preprocessing is done entirely by AutoImageProcessor, so the script no
# longer requires torchvision at all.

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


def main():
    """Classify one image with MobileNetV2 compiled for the Neuron backend.

    Flow: eager pre-run (locks shapes) -> torch.compile(backend="neuron") ->
    timed warmup (includes compile) -> timed steady-state run -> top-1 label.
    """
    parser = argparse.ArgumentParser(description="Run MobileNetV2 on Neuron")
    parser.add_argument(
        "--model",
        type=str,
        default="google/mobilenet_v2_1.0_224",
        help="MobileNetV2 model name on Hugging Face Hub",
    )
    args = parser.parse_args()

    torch.set_default_dtype(torch.float32)
    torch.manual_seed(42)

    # load dataset image
    dataset = load_dataset("huggingface/cats-image")
    image = dataset["test"]["image"][0]

    # load processor & MobileNetV2 model (eager attention keeps the graph traceable)
    processor = AutoImageProcessor.from_pretrained(args.model)
    model = MobileNetV2ForImageClassification.from_pretrained(
        args.model, torch_dtype=torch.float32, attn_implementation="eager"
    ).eval()

    # preprocess
    inputs = processor(images=image, return_tensors="pt")

    # pre-run to lock shapes
    with torch.no_grad():
        _ = model(**inputs).logits

    # compile
    model.forward = torch.compile(model.forward, backend="neuron", fullgraph=True)

    # warmup (first compiled call pays the Neuron compilation cost)
    warmup_start = time.time()
    with torch.no_grad():
        _ = model(**inputs)
    warmup_time = time.time() - warmup_start

    # benchmark run
    run_start = time.time()
    with torch.no_grad():
        logits = model(**inputs).logits
    run_time = time.time() - run_start

    # top-1 ImageNet class
    predicted_class_idx = logits.argmax(-1).item()
    predicted_label = model.config.id2label[predicted_class_idx]

    logger.info("Warmup: %.2f s, Run: %.4f s", warmup_time, run_time)
    logger.info("Predicted label: %s", predicted_label)


if __name__ == "__main__":
    main()
torch_compile/run_mobilevit.py ADDED
@@ -0,0 +1,70 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/usr/bin/env python3
# MobileViT image-classification on Neuron
import argparse
import logging
import time

import torch
from transformers import AutoImageProcessor, MobileViTForImageClassification
from datasets import load_dataset
import torch_neuronx  # ensures Neuron backend

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


def main():
    """Classify one image with MobileViT compiled for the Neuron backend."""
    cli = argparse.ArgumentParser(description="Run MobileViT on Neuron")
    cli.add_argument(
        "--model",
        type=str,
        default="apple/mobilevit-small",
        help="MobileViT model name on Hugging Face Hub",
    )
    opts = cli.parse_args()

    torch.set_default_dtype(torch.float32)
    torch.manual_seed(42)

    # one sample image from the cats-image dataset
    image = load_dataset("huggingface/cats-image")["test"]["image"][0]

    # processor & model (eager attention keeps the graph traceable)
    processor = AutoImageProcessor.from_pretrained(opts.model)
    model = MobileViTForImageClassification.from_pretrained(
        opts.model, torch_dtype=torch.float32, attn_implementation="eager"
    ).eval()

    inputs = processor(images=image, return_tensors="pt")

    # eager pre-run fixes tensor shapes before compilation
    with torch.no_grad():
        _ = model(**inputs).logits

    model.forward = torch.compile(model.forward, backend="neuron", fullgraph=True)

    # first compiled call pays the Neuron compilation cost
    tic = time.time()
    with torch.no_grad():
        _ = model(**inputs)
    warmup_time = time.time() - tic

    tic = time.time()
    with torch.no_grad():
        logits = model(**inputs).logits
    run_time = time.time() - tic

    # argmax logit -> ImageNet label
    top1_idx = logits.argmax(-1).item()
    predicted_label = model.config.id2label[top1_idx]

    logger.info("Warmup: %.2f s, Run: %.4f s", warmup_time, run_time)
    logger.info("Predicted label: %s", predicted_label)


if __name__ == "__main__":
    main()
torch_compile/run_modernbert.py ADDED
@@ -0,0 +1,66 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/usr/bin/env python3
# ModernBERT-base text-classification on Neuron
import argparse
import logging
import time

import torch
from transformers import AutoTokenizer, ModernBertForSequenceClassification
import torch_neuronx  # ensures Neuron backend

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


def main():
    """Classify one sentence with ModernBERT compiled (full graph) for Neuron."""
    cli = argparse.ArgumentParser(description="Run ModernBERT on Neuron")
    cli.add_argument(
        "--model",
        type=str,
        default="answerdotai/ModernBERT-base",
        help="ModernBERT model name on Hugging Face Hub",
    )
    opts = cli.parse_args()

    torch.set_default_dtype(torch.float32)
    torch.manual_seed(42)

    # tokenizer & model (eager attention keeps the graph traceable)
    tokenizer = AutoTokenizer.from_pretrained(opts.model)
    model = ModernBertForSequenceClassification.from_pretrained(
        opts.model, torch_dtype=torch.float32, attn_implementation="eager"
    ).eval()

    # tokenize sample sentence
    text = "Hello, my dog is cute"
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True)

    # eager pre-run fixes tensor shapes before compilation
    with torch.no_grad():
        _ = model(**inputs).logits

    # single encoder -> one full graph
    model.forward = torch.compile(model.forward, backend="neuron", fullgraph=True)

    # first compiled call pays the Neuron compilation cost
    tic = time.time()
    with torch.no_grad():
        _ = model(**inputs)
    warmup_time = time.time() - tic

    tic = time.time()
    with torch.no_grad():
        logits = model(**inputs).logits
    run_time = time.time() - tic

    # argmax logit -> class label
    predicted_class_id = logits.argmax().item()
    predicted_label = model.config.id2label[predicted_class_id]

    logger.info("Warmup: %.2f s, Run: %.4f s", warmup_time, run_time)
    logger.info("Predicted label: %s", predicted_label)


if __name__ == "__main__":
    main()
torch_compile/run_mpnet.py ADDED
@@ -0,0 +1,62 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/usr/bin/env python3
# MPNet sentence-embedding on Neuron
import argparse
import logging
import time

import torch
from transformers import AutoTokenizer, MPNetModel
import torch_neuronx  # ensures Neuron backend

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


def main():
    """Embed one sentence with the MPNet encoder compiled for Neuron."""
    cli = argparse.ArgumentParser(description="Run MPNet encoder on Neuron")
    cli.add_argument(
        "--model",
        type=str,
        default="microsoft/mpnet-base",
        help="MPNet model name on Hugging Face Hub",
    )
    opts = cli.parse_args()

    torch.set_default_dtype(torch.float32)
    torch.manual_seed(42)

    # tokenizer & model (eager attention keeps the graph traceable)
    tokenizer = AutoTokenizer.from_pretrained(opts.model)
    model = MPNetModel.from_pretrained(
        opts.model, torch_dtype=torch.float32, attn_implementation="eager"
    ).eval()

    # tokenize sample sentence
    text = "MPNet is a variant of BERT with permutation language modeling."
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True)

    # eager pre-run fixes tensor shapes before compilation
    with torch.no_grad():
        _ = model(**inputs).pooler_output

    model.forward = torch.compile(model.forward, backend="neuron", fullgraph=True)

    # first compiled call pays the Neuron compilation cost
    tic = time.time()
    with torch.no_grad():
        _ = model(**inputs)
    warmup_time = time.time() - tic

    tic = time.time()
    with torch.no_grad():
        embeddings = model(**inputs).pooler_output
    run_time = time.time() - tic

    logger.info("Warmup: %.2f s, Run: %.4f s", warmup_time, run_time)
    logger.info("Output embedding shape: %s", embeddings.shape)  # [1, hidden]


if __name__ == "__main__":
    main()
torch_compile/run_phi.py ADDED
@@ -0,0 +1,77 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ # Phi (Phi-2 default) forward-trace + manual greedy on Neuron – fixed pad token
3
+ import argparse
4
+ import logging
5
+ import time
6
+ import torch
7
+ from transformers import AutoTokenizer, AutoModelForCausalLM
8
+ import torch_neuronx # guarantees Neuron backend
9
+
10
+ logging.basicConfig(level=logging.INFO)
11
+ logger = logging.getLogger(__name__)
12
+
13
+
14
@torch.no_grad()
def greedy_generate(model_forward, tokenizer, input_ids, max_new_tokens):
    """Greedy decoding driven by repeated calls to the compiled forward.

    The token window keeps a constant length so every call hits the same
    compiled graph: the oldest token falls off the left as each new one is
    appended on the right.  `tokenizer` is accepted for interface parity but
    is not consulted here.
    """
    batch, window = input_ids.shape
    position_ids = (
        torch.arange(window, dtype=torch.long, device=input_ids.device)
        .unsqueeze(0)
        .expand(batch, -1)
    )

    step = 0
    while step < max_new_tokens:
        outputs = model_forward(input_ids, position_ids)  # tuple; logits first
        chosen = outputs[0][:, -1, :].argmax(dim=-1, keepdim=True)
        input_ids = torch.cat([input_ids, chosen], dim=1)[:, -window:]
        step += 1
    return input_ids
26
+
27
+
28
def main():
    """Compile the Phi forward pass (full graph) and drive a manual greedy loop.

    KV-cache is disabled and the prompt is padded to a fixed --seq-len so the
    compiled graph keeps one static shape throughout generation.
    """
    cli = argparse.ArgumentParser(description="Phi forward-compile + manual greedy on Neuron")
    cli.add_argument("--model", default="microsoft/phi-2")
    cli.add_argument("--seq-len", type=int, default=128, help="Fixed context length")
    cli.add_argument("--new-tokens", type=int, default=20, help="Tokens to generate")
    opts = cli.parse_args()

    torch.manual_seed(42)
    torch.set_default_dtype(torch.float32)

    tokenizer = AutoTokenizer.from_pretrained(opts.model, trust_remote_code=True)
    # Phi ships without a pad token; reuse EOS so max_length padding works
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    model = AutoModelForCausalLM.from_pretrained(
        opts.model,
        torch_dtype=torch.float32,
        attn_implementation="eager",
        use_cache=False,  # static shapes
    ).eval()

    # fixed-shape prompt
    prompt = "The future of AI is"
    encoded = tokenizer(prompt, max_length=opts.seq_len, padding="max_length", truncation=True, return_tensors="pt")
    input_ids = encoded.input_ids
    batch, ctx_len = input_ids.shape
    position_ids = torch.arange(ctx_len, dtype=torch.long).unsqueeze(0).expand(batch, -1)

    # shape-locking eager pass, then compile forward only (full graph)
    with torch.no_grad():
        _ = model(input_ids, position_ids)
    model.forward = torch.compile(model.forward, backend="neuron", fullgraph=True)

    # warmup (first compiled call pays the Neuron compilation cost)
    tic = time.time()
    with torch.no_grad():
        _ = model(input_ids, position_ids)
    logger.info("Warmup (forward): %.3f s", time.time() - tic)

    # manual greedy generation
    tic = time.time()
    final_ids = greedy_generate(model.forward, tokenizer, input_ids, opts.new_tokens)
    logger.info("Generate (manual loop): %.3f s", time.time() - tic)

    text = tokenizer.decode(final_ids[0], skip_special_tokens=True)
    logger.info("Output: %s", text)


if __name__ == "__main__":
    main()
torch_compile/run_phi3.py ADDED
@@ -0,0 +1,86 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ # Phi-3-mini – compile model.forward only, manual greedy loop on Neuron
3
+ import argparse
4
+ import logging
5
+ import time
6
+ import torch
7
+ from transformers import AutoTokenizer, AutoModelForCausalLM
8
+ import torch_neuronx # guarantees Neuron backend
9
+
10
+ logging.basicConfig(level=logging.INFO)
11
+ logger = logging.getLogger(__name__)
12
+
13
+
14
@torch.no_grad()
def greedy_generate(model_forward, tokenizer, input_ids, max_new_tokens):
    """Greedy decoding loop over a fixed-length rolling token window.

    Every iteration calls the compiled forward with the same shapes: the
    window slides one token right per step, so `position_ids` never changes.
    `tokenizer` is accepted for interface parity but is not consulted here.
    """
    batch, window = input_ids.shape
    position_ids = (
        torch.arange(window, dtype=torch.long, device=input_ids.device)
        .unsqueeze(0)
        .expand(batch, -1)
    )

    step = 0
    while step < max_new_tokens:
        outputs = model_forward(input_ids, position_ids)  # tuple; logits first
        chosen = outputs[0][:, -1, :].argmax(dim=-1, keepdim=True)
        input_ids = torch.cat([input_ids, chosen], dim=1)[:, -window:]
        step += 1
    return input_ids
26
+
27
+
28
def main():
    """Compile Phi-3-mini's forward (full graph) and drive a manual greedy loop.

    KV-cache is disabled and the prompt padded to a fixed --seq-len so the
    compiled graph keeps one static shape for every decode step.
    """
    cli = argparse.ArgumentParser(description="Phi-3-mini forward-compile + manual greedy on Neuron")
    cli.add_argument("--model", default="microsoft/Phi-3-mini-4k-instruct")
    cli.add_argument("--seq-len", type=int, default=128, help="Fixed context length")
    cli.add_argument("--new-tokens", type=int, default=20, help="Tokens to generate")
    opts = cli.parse_args()

    torch.manual_seed(42)
    torch.set_default_dtype(torch.float32)

    tokenizer = AutoTokenizer.from_pretrained(opts.model, trust_remote_code=True)
    model = AutoModelForCausalLM.from_pretrained(
        opts.model,
        torch_dtype=torch.float32,
        attn_implementation="eager",
        use_cache=False,  # static shapes
    ).eval()

    # fixed-shape prompt
    prompt = "The future of AI is"
    encoded = tokenizer(prompt, max_length=opts.seq_len, padding="max_length", truncation=True, return_tensors="pt")
    input_ids = encoded.input_ids
    batch, ctx_len = input_ids.shape
    position_ids = torch.arange(ctx_len, dtype=torch.long).unsqueeze(0).expand(batch, -1)

    # shape-locking eager pass, then compile forward only (full graph)
    with torch.no_grad():
        _ = model(input_ids, position_ids)
    model.forward = torch.compile(model.forward, backend="neuron", fullgraph=True)

    # warmup (first compiled call pays the Neuron compilation cost)
    tic = time.time()
    with torch.no_grad():
        _ = model(input_ids, position_ids)
    logger.info("Warmup (forward): %.3f s", time.time() - tic)

    # manual greedy generation
    tic = time.time()
    final_ids = greedy_generate(model.forward, tokenizer, input_ids, opts.new_tokens)
    logger.info("Generate (manual loop): %.3f s", time.time() - tic)

    text = tokenizer.decode(final_ids[0], skip_special_tokens=True)
    logger.info("Output: %s", text)


if __name__ == "__main__":
    main()
75
+
76
+ """
77
+ /usr/local/lib/python3.10/site-packages/torch_mlir/dialects/stablehlo/__init__.py:24: UserWarning: Could not import StableHLO C++ extension: libStablehloUnifiedPythonCAPI.so.22.0git: cannot open shared object file: No such file or directory
78
+ warnings.warn(f"Could not import StableHLO C++ extension: {e}")
79
+ `torch_dtype` is deprecated! Use `dtype` instead!
80
+ Loading checkpoint shards: 100%|█████████████████████████████████████████████████████████████████████████| 2/2 [00:01<00:00, 1.90it/s]
81
+ INFO:__main__:Warmup (forward): 19.975 s
82
+ INFO:__main__:Generate (manual loop): 271.678 s
83
+ INFO:__main__:Output: The future of AI is
84
+ : 1iewer
85
+ I'melissa'
86
+ """
torch_compile/run_roberta.py ADDED
@@ -0,0 +1,67 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/usr/bin/env python3
# RoBERTa text-classification on Neuron – full graph compile
import argparse
import logging
import time

import torch
from transformers import AutoTokenizer, RobertaForSequenceClassification
import torch_neuronx  # guarantees Neuron backend

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


def main():
    """Classify one sentence with RoBERTa compiled (full graph) for Neuron."""
    cli = argparse.ArgumentParser(description="RoBERTa on Neuron (full graph)")
    cli.add_argument(
        "--model",
        type=str,
        default="roberta-base",
        help="RoBERTa model name on Hugging Face Hub",
    )
    cli.add_argument("--batch-size", type=int, default=1, help="Batch size")
    opts = cli.parse_args()

    torch.set_default_dtype(torch.float32)
    torch.manual_seed(42)

    # tokenizer & model (eager attention keeps the graph traceable)
    tokenizer = AutoTokenizer.from_pretrained(opts.model)
    model = RobertaForSequenceClassification.from_pretrained(
        opts.model, torch_dtype=torch.float32, attn_implementation="eager"
    ).eval()

    # tokenize sample sentence
    text = "RoBERTa is a robustly optimized BERT pretraining approach."
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True)

    # eager pre-run fixes tensor shapes before compilation
    with torch.no_grad():
        _ = model(**inputs).logits

    model.forward = torch.compile(model.forward, backend="neuron", fullgraph=True)

    # first compiled call pays the Neuron compilation cost
    tic = time.time()
    with torch.no_grad():
        _ = model(**inputs)
    warmup_time = time.time() - tic

    tic = time.time()
    with torch.no_grad():
        logits = model(**inputs).logits
    run_time = time.time() - tic

    # argmax logit -> class label
    predicted_class_id = logits.argmax().item()
    predicted_label = model.config.id2label[predicted_class_id]

    logger.info("Warmup: %.2f s, Run: %.4f s", warmup_time, run_time)
    logger.info("Predicted label: %s", predicted_label)


if __name__ == "__main__":
    main()
torch_compile/run_roformer.py ADDED
@@ -0,0 +1,67 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/usr/bin/env python3
# RoFormer (Rotary-position Transformer) text-classification on Neuron – full graph
import argparse
import logging
import time

import torch
from transformers import AutoTokenizer, RoFormerForSequenceClassification
import torch_neuronx  # guarantees Neuron backend

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


def main():
    """Classify one sentence with RoFormer compiled (full graph) for Neuron."""
    cli = argparse.ArgumentParser(description="RoFormer on Neuron (full graph)")
    cli.add_argument(
        "--model",
        type=str,
        default="junnyu/roformer_chinese_base",
        help="RoFormer model name on Hugging Face Hub",
    )
    cli.add_argument("--batch-size", type=int, default=1, help="Batch size")
    opts = cli.parse_args()

    torch.set_default_dtype(torch.float32)
    torch.manual_seed(42)

    # tokenizer & model (eager attention keeps the graph traceable)
    tokenizer = AutoTokenizer.from_pretrained(opts.model)
    model = RoFormerForSequenceClassification.from_pretrained(
        opts.model, torch_dtype=torch.float32, attn_implementation="eager"
    ).eval()

    # tokenize sample sentence
    text = "RoFormer uses rotary position embeddings."
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True)

    # eager pre-run fixes tensor shapes before compilation
    with torch.no_grad():
        _ = model(**inputs).logits

    model.forward = torch.compile(model.forward, backend="neuron", fullgraph=True)

    # first compiled call pays the Neuron compilation cost
    tic = time.time()
    with torch.no_grad():
        _ = model(**inputs)
    warmup_time = time.time() - tic

    tic = time.time()
    with torch.no_grad():
        logits = model(**inputs).logits
    run_time = time.time() - tic

    # argmax logit -> class label
    predicted_class_id = logits.argmax().item()
    predicted_label = model.config.id2label[predicted_class_id]

    logger.info("Warmup: %.2f s, Run: %.4f s", warmup_time, run_time)
    logger.info("Predicted label: %s", predicted_label)


if __name__ == "__main__":
    main()
torch_compile/run_sam2.py ADDED
@@ -0,0 +1,56 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/usr/bin/env python3
# SAM encoder on Neuron – constant-shape, no lambda
import argparse
import logging
import time
import torch
from transformers import SamProcessor, SamModel
from PIL import Image
import torch_neuronx  # guarantees Neuron backend

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


def main():
    """Compile SAM's image encoder (get_image_embeddings) for Neuron and time it."""
    cli = argparse.ArgumentParser(description="SAM encoder on Neuron (full graph)")
    cli.add_argument("--model", default="facebook/sam-vit-base")
    opts = cli.parse_args()

    torch.manual_seed(42)
    torch.set_default_dtype(torch.float32)

    # processor & model (eager attention keeps the graph traceable)
    processor = SamProcessor.from_pretrained(opts.model)
    model = SamModel.from_pretrained(opts.model, attn_implementation="eager").eval()

    # synthetic 224×224 RGB image; no prompt points → encoder path only
    dummy_image = Image.new("RGB", (224, 224), color="red")
    inputs = processor(images=dummy_image, return_tensors="pt")

    # eager pre-run fixes tensor shapes before compilation
    with torch.no_grad():
        _ = model.get_image_embeddings(**inputs)

    # compile the encoder entry point as one full graph
    model.get_image_embeddings = torch.compile(
        model.get_image_embeddings, backend="neuron", fullgraph=True
    )

    # first compiled call pays the Neuron compilation cost
    tic = time.time()
    with torch.no_grad():
        _ = model.get_image_embeddings(**inputs)
    logger.info("Warmup: %.3f s", time.time() - tic)

    tic = time.time()
    with torch.no_grad():
        embeddings = model.get_image_embeddings(**inputs)
    logger.info("Run: %.3f s", time.time() - tic)
    logger.info("Embedding shape: %s", embeddings.shape)  # [1, 256, 64, 64]


if __name__ == "__main__":
    main()
+ main()
torch_compile/run_swin.py ADDED
@@ -0,0 +1,76 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/usr/bin/env python3
# Swin Transformer image-classification on Neuron – full graph
import argparse
import logging
import time

import torch
from transformers import AutoImageProcessor, SwinForImageClassification
from datasets import load_dataset
import torch_neuronx  # guarantees Neuron backend

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


def main():
    """Classify one image with Swin compiled (full graph) for Neuron."""
    cli = argparse.ArgumentParser(description="Swin on Neuron (full graph)")
    cli.add_argument("--model", default="microsoft/swin-tiny-patch4-window7-224")
    opts = cli.parse_args()

    torch.manual_seed(42)
    torch.set_default_dtype(torch.float32)

    # one sample image from the cats-image dataset
    image = load_dataset("huggingface/cats-image")["test"]["image"][0]

    # processor & model (eager attention keeps the graph traceable)
    processor = AutoImageProcessor.from_pretrained(opts.model)
    model = SwinForImageClassification.from_pretrained(
        opts.model, torch_dtype=torch.float32, attn_implementation="eager"
    ).eval()

    inputs = processor(images=image, return_tensors="pt")

    # eager pre-run fixes tensor shapes before compilation
    with torch.no_grad():
        _ = model(**inputs).logits

    model.forward = torch.compile(model.forward, backend="neuron", fullgraph=True)

    # first compiled call pays the Neuron compilation cost
    tic = time.time()
    with torch.no_grad():
        _ = model(**inputs)
    logger.info("Warmup: %.3f s", time.time() - tic)

    tic = time.time()
    with torch.no_grad():
        logits = model(**inputs).logits
    run_time = time.time() - tic

    # argmax logit -> ImageNet label
    top1_idx = logits.argmax(-1).item()
    predicted_label = model.config.id2label[top1_idx]

    logger.info("Run: %.3f s", run_time)
    logger.info("Predicted label: %s", predicted_label)


if __name__ == "__main__":
    main()
66
+
67
+ """
68
+ /usr/local/lib/python3.10/site-packages/transformers/models/swin/modeling_swin.py:611:0: error: failed to legalize operation 'torch.aten.fill.Tensor'
69
+ /usr/local/lib/python3.10/site-packages/transformers/models/swin/modeling_swin.py:662:0: note: called from
70
+ /usr/local/lib/python3.10/site-packages/transformers/models/swin/modeling_swin.py:736:0: note: called from
71
+ /usr/local/lib/python3.10/site-packages/transformers/modeling_layers.py:94:0: note: called from
72
+ /usr/local/lib/python3.10/site-packages/transformers/models/swin/modeling_swin.py:806:0: note: called from
73
+ /usr/local/lib/python3.10/site-packages/transformers/models/swin/modeling_swin.py:945:0: note: called from
74
+ /usr/local/lib/python3.10/site-packages/transformers/models/swin/modeling_swin.py:1139:0: note: called from
75
+ /usr/local/lib/python3.10/site-packages/transformers/models/swin/modeling_swin.py:611:0: note: see current operation: %1014 = "torch.aten.fill.Tensor"(%1013, %778) : (!torch.vtensor<[1,49,49,1],f32>, !torch.vtensor<[],f32>) -> !torch.vtensor<[1,49,49,1],f32>
76
+ """
torch_compile/run_t5_decoder.py ADDED
@@ -0,0 +1,66 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ # T5 decoder (no cache) on Neuron – constant shapes, full graph, no Apex
3
+ import os
4
+ os.environ["USE_FUSED_LAYER_NORM"] = "0" # MUST be before any transformers import
5
+
6
+ import argparse
7
+ import logging
8
+ import time
9
+ import torch
10
+ from transformers import T5Tokenizer, T5Model
11
+ import torch_neuronx # guarantees Neuron backend
12
+
13
+ logging.basicConfig(level=logging.INFO)
14
+ logger = logging.getLogger(__name__)
15
+
16
+
17
def main():
    """Benchmark a cache-free T5 decoder pass compiled as one Neuron graph."""
    parser = argparse.ArgumentParser(description="T5 decoder on Neuron (full graph, no cache)")
    parser.add_argument("--model", default="t5-small")
    parser.add_argument("--seq-len", type=int, default=128, help="Fixed seq length")
    args = parser.parse_args()

    torch.manual_seed(42)
    torch.set_default_dtype(torch.float32)

    tokenizer = T5Tokenizer.from_pretrained(args.model)
    # use_cache=False -> no DynamicCache, so every tensor shape stays constant
    model = T5Model.from_pretrained(
        args.model,
        torch_dtype=torch.float32,
        attn_implementation="eager",
        use_cache=False,
    ).eval()

    # Build a constant-shape encoder output once; it is reused for every decode.
    enc_tok = tokenizer("hello", max_length=args.seq_len, padding="max_length", truncation=True, return_tensors="pt")
    with torch.no_grad():
        enc_out = model.encoder(input_ids=enc_tok.input_ids).last_hidden_state.detach()

    dec_tok = tokenizer("<pad>", max_length=args.seq_len, padding="max_length", return_tensors="pt")

    # One eager pass pins the shapes before compilation.
    with torch.no_grad():
        _ = model.decoder(input_ids=dec_tok.input_ids, encoder_hidden_states=enc_out).last_hidden_state

    # Compile only the decoder forward, as a single full graph.
    def _decode(inp, enc):
        return model.decoder(input_ids=inp, encoder_hidden_states=enc).last_hidden_state

    decode_fn = torch.compile(_decode, backend="neuron", fullgraph=True)

    # First compiled call pays the compilation cost.
    t0 = time.time()
    with torch.no_grad():
        _ = decode_fn(dec_tok.input_ids, enc_out)
    logger.info("Warmup: %.3f s", time.time() - t0)

    # Steady-state timing.
    t0 = time.time()
    with torch.no_grad():
        hidden = decode_fn(dec_tok.input_ids, enc_out)
    logger.info("Run: %.3f s", time.time() - t0)
    logger.info("Hidden shape: %s", hidden.shape)  # [B, seq_len, d_model]
63
+
64
+
65
+ if __name__ == "__main__":
66
+ main()
torch_compile/run_t5_encoder.py ADDED
@@ -0,0 +1,59 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ # T5 encoder on Neuron – no Apex, full graph, constant shapes
3
+ import os
4
+ os.environ["USE_FUSED_LAYER_NORM"] = "0" # <── disable Apex
5
+
6
+ import argparse
7
+ import logging
8
+ import time
9
+ import torch
10
+ from transformers import T5Tokenizer, T5Model # use T5Model (no LM head)
11
+ from datasets import load_dataset
12
+ import torch_neuronx # guarantees Neuron backend
13
+
14
+ logging.basicConfig(level=logging.INFO)
15
+ logger = logging.getLogger(__name__)
16
+
17
+
18
def main():
    """Benchmark the T5 encoder compiled as one full graph on Neuron."""
    parser = argparse.ArgumentParser(description="T5 encoder on Neuron (full graph)")
    parser.add_argument("--model", default="t5-small")
    parser.add_argument("--seq-len", type=int, default=128, help="Fixed seq length")
    args = parser.parse_args()

    torch.manual_seed(42)
    torch.set_default_dtype(torch.float32)

    tokenizer = T5Tokenizer.from_pretrained(args.model)
    model = T5Model.from_pretrained(
        args.model, torch_dtype=torch.float32, attn_implementation="eager"
    ).eval()

    # Pad/truncate to a fixed length so the compiled shapes never change.
    inputs = tokenizer(
        "translate English to French: The cat is on the mat.",
        max_length=args.seq_len,
        padding="max_length",
        truncation=True,
        return_tensors="pt",
    )

    # Eager pre-run locks the shapes before compilation.
    with torch.no_grad():
        _ = model.encoder(**inputs).last_hidden_state

    # Compile only the encoder forward (full graph).
    def _encode(**kw):
        return model.encoder(**kw).last_hidden_state

    encode_fn = torch.compile(_encode, backend="neuron", fullgraph=True)

    # First compiled call pays the compilation cost.
    t0 = time.time()
    with torch.no_grad():
        _ = encode_fn(**inputs)
    logger.info("Warmup: %.3f s", time.time() - t0)

    # Steady-state timing.
    t0 = time.time()
    with torch.no_grad():
        hidden = encode_fn(**inputs)
    logger.info("Run: %.3f s", time.time() - t0)
    logger.info("Hidden shape: %s", hidden.shape)  # [B, seq_len, d_model]
56
+
57
+
58
+ if __name__ == "__main__":
59
+ main()
torch_compile/run_unispeech.py ADDED
@@ -0,0 +1,62 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ # UniSpeech (non-SAT) CTC speech-recognition on Neuron – constant shapes, full graph
3
+ import argparse
4
+ import logging
5
+ import time
6
+ import torch
7
+ from transformers import AutoProcessor, UniSpeechForCTC
8
+ from datasets import load_dataset
9
+ import torch_neuronx # guarantees Neuron backend
10
+
11
+ logging.basicConfig(level=logging.INFO)
12
+ logger = logging.getLogger(__name__)
13
+
14
+
15
def main():
    """Transcribe a short LibriSpeech clip with UniSpeech CTC compiled for Neuron.

    Flow: load audio -> pad to a fixed 4 s window -> eager pre-run to pin
    shapes -> torch.compile(backend="neuron", fullgraph=True) -> warmup ->
    timed run -> greedy CTC decode.
    """
    parser = argparse.ArgumentParser(description="UniSpeech CTC on Neuron (full graph)")
    parser.add_argument("--model", default="microsoft/unispeech-large-1500h-cv")
    args = parser.parse_args()

    torch.manual_seed(42)
    torch.set_default_dtype(torch.float32)

    # load small speech snippet
    dataset = load_dataset("hf-internal-testing/librispeech_asr_demo", "clean", split="validation")
    sample = dataset[0]["audio"]["array"]  # 16 kHz numpy array
    sampling_rate = dataset.features["audio"].sampling_rate

    # processor + CTC model (non-SAT)
    processor = AutoProcessor.from_pretrained(args.model)
    model = UniSpeechForCTC.from_pretrained(
        args.model, torch_dtype=torch.float32, attn_implementation="eager"
    ).eval()

    # preprocess – fixed-length audio (4 s) so the compiled input shape stays constant
    inputs = processor(sample, sampling_rate=sampling_rate, max_length=4 * 16_000, padding="max_length", return_tensors="pt")

    # pre-run to lock shapes before compilation
    with torch.no_grad():
        _ = model(**inputs).logits

    # compile forward (full graph)
    model.forward = torch.compile(model.forward, backend="neuron", fullgraph=True)

    # warmup — first compiled call pays the Neuron compilation cost
    start = time.time()
    with torch.no_grad():
        _ = model(**inputs)
    logger.info("Warmup: %.3f s", time.time() - start)

    # benchmark + decode
    start = time.time()
    with torch.no_grad():
        logits = model(**inputs).logits
    logger.info("Run: %.3f s", time.time() - start)

    # greedy CTC decode: argmax over the vocab; batch_decode collapses repeats/blanks
    predicted_ids = torch.argmax(logits, dim=-1)
    transcription = processor.batch_decode(predicted_ids)[0]
    logger.info("Transcription: %s", transcription)
59
+
60
+
61
+ if __name__ == "__main__":
62
+ main()
torch_compile/run_unispeech_sat.py ADDED
@@ -0,0 +1,67 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ # UniSpeech-SAT encoder on Neuron – full graph, constant shapes
3
+ import argparse
4
+ import logging
5
+ import time
6
+ import torch
7
+ from transformers import Wav2Vec2Processor, UniSpeechSatModel
8
+ from datasets import load_dataset
9
+ import torch_neuronx # guarantees Neuron backend
10
+
11
+ logging.basicConfig(level=logging.INFO)
12
+ logger = logging.getLogger(__name__)
13
+
14
+
15
def main():
    """Run the UniSpeech-SAT encoder (no LM head) on Neuron with fixed shapes.

    NOTE(review): the error log appended at the bottom of this file shows this
    script failing Neuron compilation inside the weight-norm parametrization of
    the feature encoder ("number of output elements ... doesn't match expected
    number of elements") — presumably a shape issue in the parametrized conv
    weights; confirm before relying on the timings below.
    """
    parser = argparse.ArgumentParser(description="UniSpeech-SAT encoder on Neuron (full graph)")
    parser.add_argument("--model", default="microsoft/unispeech-sat-base-100h-libri-ft")
    args = parser.parse_args()

    torch.manual_seed(42)
    torch.set_default_dtype(torch.float32)

    # load small speech snippet
    dataset = load_dataset("hf-internal-testing/librispeech_asr_demo", "clean", split="validation")
    sample = dataset[0]["audio"]["array"]  # 16 kHz numpy array

    # processor + UniSpeech-SAT encoder (no LM head)
    processor = Wav2Vec2Processor.from_pretrained(args.model)
    model = UniSpeechSatModel.from_pretrained(
        args.model, torch_dtype=torch.float32, attn_implementation="eager"
    ).eval()

    # preprocess – fixed-length audio (pad to 4 s) so compiled shapes stay constant
    inputs = processor(sample, sampling_rate=16_000, max_length=4 * 16_000, padding="max_length", return_tensors="pt")

    # pre-run to lock shapes before compilation
    with torch.no_grad():
        _ = model(**inputs).last_hidden_state

    # compile encoder forward (full graph)
    model.forward = torch.compile(model.forward, backend="neuron", fullgraph=True)

    # warmup — first compiled call pays the Neuron compilation cost
    start = time.time()
    with torch.no_grad():
        _ = model(**inputs)
    logger.info("Warmup: %.3f s", time.time() - start)

    # benchmark run
    start = time.time()
    with torch.no_grad():
        hidden = model(**inputs).last_hidden_state
    logger.info("Run: %.3f s", time.time() - start)
    logger.info("Output hidden shape: %s", hidden.shape)  # [B, T, hidden]
55
+
56
+
57
+ if __name__ == "__main__":
58
+ main()
59
+
60
+ """
61
+ /usr/local/lib/python3.10/site-packages/torch/nn/utils/parametrizations.py:325:0: error: number of output elements (4718592) doesn't match expected number of elements (128)
62
+ /usr/local/lib/python3.10/site-packages/torch/nn/utils/parametrize.py:303:0: note: called from
63
+ /usr/local/lib/python3.10/site-packages/torch/nn/utils/parametrize.py:407:0: note: called from
64
+ /usr/local/lib/python3.10/site-packages/transformers/models/unispeech_sat/modeling_unispeech_sat.py:140:0: note: called from
65
+ /usr/local/lib/python3.10/site-packages/transformers/models/unispeech_sat/modeling_unispeech_sat.py:485:0: note: called from
66
+ /usr/local/lib/python3.10/site-packages/transformers/models/unispeech_sat/modeling_unispeech_sat.py:1078:0: note: called from
67
+ """
torch_compile/run_vit.py ADDED
@@ -0,0 +1,64 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ # Vision Transformer (ViT) image-classification on Neuron – full graph, constant shapes
3
+ import argparse
4
+ import logging
5
+ import time
6
+ import torch
7
+ from transformers import AutoImageProcessor, ViTForImageClassification
8
+ from datasets import load_dataset
9
+ import torch_neuronx # guarantees Neuron backend
10
+
11
+ logging.basicConfig(level=logging.INFO)
12
+ logger = logging.getLogger(__name__)
13
+
14
+
15
def main():
    """Classify one image with ViT on Neuron, timing warmup vs. steady state."""
    parser = argparse.ArgumentParser(description="ViT on Neuron (full graph)")
    parser.add_argument("--model", default="google/vit-base-patch16-224")
    args = parser.parse_args()

    torch.manual_seed(42)
    torch.set_default_dtype(torch.float32)

    # Single sample image from the hub.
    image = load_dataset("huggingface/cats-image")["test"]["image"][0]

    # Processor and classifier.
    processor = AutoImageProcessor.from_pretrained(args.model)
    model = ViTForImageClassification.from_pretrained(
        args.model, torch_dtype=torch.float32, attn_implementation="eager"
    ).eval()

    inputs = processor(images=image, return_tensors="pt")

    # Eager pass first so compile sees fixed shapes.
    with torch.no_grad():
        _ = model(**inputs).logits

    # Whole forward as a single Neuron graph.
    model.forward = torch.compile(model.forward, backend="neuron", fullgraph=True)

    # Warmup — includes Neuron compilation.
    t_warm = time.time()
    with torch.no_grad():
        _ = model(**inputs)
    logger.info("Warmup: %.3f s", time.time() - t_warm)

    # Timed inference.
    t_run = time.time()
    with torch.no_grad():
        logits = model(**inputs).logits
    elapsed = time.time() - t_run

    # Map the top-1 logit to its ImageNet label.
    top_idx = logits.argmax(-1).item()
    label = model.config.id2label[top_idx]

    logger.info("Run: %.3f s", elapsed)
    logger.info("Predicted label: %s", label)
61
+
62
+
63
+ if __name__ == "__main__":
64
+ main()
torch_compile/run_wav2vec2.py ADDED
@@ -0,0 +1,73 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import argparse
2
+ import logging
3
+ import time
4
+
5
+ import torch
6
+ from datasets import load_dataset
7
+ from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC
8
+
9
+ import torch_neuronx
10
+
11
+ logging.basicConfig(level=logging.INFO)
12
+ logger = logging.getLogger(__name__)
13
+
14
+
15
def main():
    """Run Wav2Vec2 CTC speech recognition on Neuron and log timings.

    Bug fix: the previous version applied torch.sigmoid + a 0.5 threshold to
    the CTC logits — that is multi-label classification decoding and is
    meaningless for a Wav2Vec2ForCTC head. CTC output is decoded greedily
    with an argmax over the vocabulary followed by processor.batch_decode,
    matching run_unispeech.py in this directory.
    """
    parser = argparse.ArgumentParser(description="Run Wav2Vec2 on Neuron")
    parser.add_argument(
        "--model", type=str, default="facebook/wav2vec2-base-960h", help="Wav2Vec2 model name"
    )
    parser.add_argument("--batch-size", type=int, default=1, help="Batch size")
    args = parser.parse_args()

    torch.set_default_dtype(torch.float32)
    torch.manual_seed(42)

    processor = Wav2Vec2Processor.from_pretrained(args.model)
    model = Wav2Vec2ForCTC.from_pretrained(
        args.model, torch_dtype=torch.float32, attn_implementation="eager"
    )
    model.eval()

    # Small LibriSpeech sample; audio length is whatever the clip provides.
    dataset = load_dataset("hf-internal-testing/librispeech_asr_demo", "clean", split="validation")
    dataset = dataset.sort("id")
    sampling_rate = dataset.features["audio"].sampling_rate
    inputs = processor(dataset[0]["audio"]["array"], sampling_rate=sampling_rate, return_tensors="pt")

    # Run once to establish shapes before compile
    with torch.no_grad():
        logits = model(**inputs).logits

    # fullgraph=False: per the error log appended at the bottom of this file,
    # the feature encoder's parametrized conv weights fail full-graph Neuron
    # compilation, so graph breaks must remain allowed.
    model.forward = torch.compile(model.forward, backend="neuron", fullgraph=False)

    # Warmup — first compiled call pays the Neuron compilation cost
    warmup_start = time.time()
    with torch.no_grad():
        logits = model(**inputs).logits
    warmup_time = time.time() - warmup_start

    # Timed run
    run_start = time.time()
    with torch.no_grad():
        logits = model(**inputs).logits
    run_time = time.time() - run_start

    # Greedy CTC decode: argmax over the vocab; batch_decode collapses
    # repeats and blank tokens into text.
    predicted_ids = torch.argmax(logits, dim=-1)
    transcription = processor.batch_decode(predicted_ids)[0]

    logger.info(f"Warmup: {warmup_time:.2f}s, Run: {run_time:.4f}s")
    logger.info(f"Transcription: {transcription}")
58
+
59
+
60
+ if __name__ == "__main__":
61
+ main()
62
+
63
+ """
64
+ /usr/local/lib/python3.10/site-packages/torch/nn/utils/parametrizations.py:325:0: error: number of output elements (4718592) doesn't match expected number of elements (128)
65
+ /usr/local/lib/python3.10/site-packages/torch/nn/utils/parametrize.py:303:0: note: called from
66
+ /usr/local/lib/python3.10/site-packages/torch/nn/utils/parametrize.py:407:0: note: called from
67
+ /usr/local/lib/python3.10/site-packages/transformers/models/wav2vec2/modeling_wav2vec2.py:372:0: note: called from
68
+ /usr/local/lib/python3.10/site-packages/transformers/models/wav2vec2/modeling_wav2vec2.py:713:0: note: called from
69
+ /usr/local/lib/python3.10/site-packages/transformers/models/wav2vec2/modeling_wav2vec2.py:1462:0: note: called from
70
+ /usr/local/lib/python3.10/site-packages/transformers/models/wav2vec2/modeling_wav2vec2.py:1862:0: note: called from
71
+
72
+ # Dynamic shapes of intermediate tensors lead to a static-shape error while running the traced artifact.
73
+ """
torch_compile/run_whisper.py ADDED
@@ -0,0 +1,91 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import argparse
2
+ import logging
3
+ import time
4
+
5
+ import torch
6
+ from transformers import AutoTokenizer, WhisperForConditionalGeneration
7
+
8
+ import torch_neuronx
9
+
10
+ logging.basicConfig(level=logging.INFO)
11
+ logger = logging.getLogger(__name__)
12
+
13
+
14
def main():
    """Benchmark Whisper generate() on Neuron with a static KV cache.

    Uses random mel features (not real audio). eos_token_id=-1 never matches
    any generated token, so every call emits the full max_new_tokens=64 —
    this keeps the warmup and timed runs shape-identical.

    NOTE(review): the traceback appended at the bottom of this file shows
    generation failing inside Whisper's automatic language detection (a
    f32[51865, 51865] tensor exceeds the 4 GB Neuron per-tensor limit);
    passing an explicit language/task to generate() may skip that path —
    confirm.
    """
    parser = argparse.ArgumentParser(description="Run Whisper on Neuron")
    parser.add_argument(
        "--model", type=str, default="openai/whisper-tiny", help="Whisper model name"
    )
    parser.add_argument("--batch-size", type=int, default=1, help="Batch size")
    args = parser.parse_args()

    torch.set_default_dtype(torch.float32)
    torch.manual_seed(42)

    model = WhisperForConditionalGeneration.from_pretrained(
        args.model, torch_dtype=torch.float32, attn_implementation="eager"
    )
    model.eval()

    tokenizer = AutoTokenizer.from_pretrained(args.model)

    # Random log-mel features with Whisper's fixed 3000-frame window.
    num_mel_bins = model.config.num_mel_bins
    input_features = torch.randn(args.batch_size, num_mel_bins, 3000, dtype=torch.float32)
    gen_kwargs = {
        "max_new_tokens": 64,
        "do_sample": False,  # greedy decoding for deterministic timing
        "cache_implementation": "static",  # static cache keeps decode shapes constant
        "eos_token_id": -1,  # never hit EOS -> always generate 64 tokens
    }

    # Run once to establish shapes before compile
    with torch.no_grad():
        _ = model.generate(input_features=input_features, **gen_kwargs)

    model.forward = torch.compile(model.forward, backend="neuron", fullgraph=True)

    # Warmup — first compiled call pays the Neuron compilation cost
    warmup_start = time.time()
    with torch.no_grad():
        output = model.generate(input_features=input_features, **gen_kwargs)
    warmup_time = time.time() - warmup_start

    # Timed run
    run_start = time.time()
    with torch.no_grad():
        output = model.generate(input_features=input_features, **gen_kwargs)
    run_time = time.time() - run_start

    logger.info(f"Warmup: {warmup_time:.2f}s, Run: {run_time:.4f}s")
    logger.info(f"Output: {tokenizer.batch_decode(output, skip_special_tokens=True)}")
61
+
62
+
63
+ if __name__ == "__main__":
64
+ main()
65
+
66
+ """
67
+ Traceback (most recent call last):
68
+ File "/workspace/torch-neuron-sample/scripts/tests/run_whisper.py", line 64, in <module>
69
+ main()
70
+ File "/workspace/torch-neuron-sample/scripts/tests/run_whisper.py", line 50, in main
71
+ output = model.generate(input_features=input_features, **gen_kwargs)
72
+ File "/usr/local/lib/python3.10/site-packages/transformers/models/whisper/generation_whisper.py", line 704, in generate
73
+ init_tokens = self._retrieve_init_tokens(
74
+ File "/usr/local/lib/python3.10/site-packages/transformers/models/whisper/generation_whisper.py", line 1572, in _retrieve_init_tokens
75
+ lang_ids = self.detect_language(
76
+ File "/usr/local/lib/python3.10/site-packages/transformers/models/whisper/generation_whisper.py", line 1683, in detect_language
77
+ lang_ids = logits.argmax(-1)
78
+ File "/torch-neuronx/torch_neuronx/python_ops/auto_registration.py", line 306, in wrapper
79
+ result = operation(*args, **kwargs)
80
+ File "/torch-neuronx/torch_neuronx/python_ops/base.py", line 712, in __call__
81
+ result = impl.execute(*args, **kwargs)
82
+ File "/torch-neuronx/torch_neuronx/python_ops/base.py", line 109, in execute
83
+ result = self._execute_impl(*args2, **kwargs2)
84
+ File "/torch-neuronx/torch_neuronx/python_ops/to_copy.py", line 102, in _execute_impl
85
+ cpu_dst = copy_neuron_to_cpu(
86
+ File "/torch-neuronx/torch_neuronx/python_ops/cast_policy.py", line 102, in copy_neuron_to_cpu
87
+ _C._nrt_copy_neuron_to_cpu_tensor(neuron_src, cpu_tmp, non_blocking=non_blocking)
88
+ RuntimeError: Compilation error occurred on Neuron for operation=aten::_index_put_impl_;
89
+ error message="COMPILATION FAILED: Error: 2026-01-16T11:49:13Z 2026-01-16 11:49:13.062190: E hilo/hlo_passes/NeuronHloVerifier.cc:647] [ERROR] [NCC_EVRF024] Output tensor size of 10,759,912,900 bytes with shape of f32[51865,51865] exceeds 4GB limit for individual tensor size. TIP: Consider applying model parallelism or tensor parallelism per https://awsdocs-neuron.readthedocs-hosted.com/en/latest/libraries/nxd-training/app_notes/nxd-training-tp-appnote.html."
90
+ python stack trace=
91
+ """
torch_compile/run_xlm.py ADDED
File without changes
torch_compile/run_xlm_roberta.py ADDED
File without changes
torch_compile/run_yolos.py ADDED
@@ -0,0 +1,83 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import argparse
2
+ import logging
3
+ import time
4
+ import os
5
+
6
+ import torch
7
+ from transformers import AutoImageProcessor, YolosForObjectDetection
8
+ from datasets import load_dataset
9
+ import torch_neuronx # ensure Neuron backend is available
10
+
11
+ logging.basicConfig(level=logging.INFO)
12
+ logger = logging.getLogger(__name__)
13
+
14
+
15
def main():
    """Run YOLOS object detection on Neuron and log the top detection.

    The model is too large to compile as one Neuron graph (the instruction
    count exceeds the compiler limit — see the inline error below), so CPU
    fallback is enabled for ops beyond the unimplemented set.
    """
    # Allow CPU fallback
    # ERROR:torch_neuronx.neuron_dynamo_backend.backend:Execution failed: Compilation error occurred on Neuron for operation=torch_compile;
    # error message="COMPILATION FAILED: Error: 2026-01-20T12:06:37Z tensor_op_name: _gather.577 | hlo_id: 577 | [ERROR] [NCC_EXTP003] Instructions generated by compiler 290400 exceeds the typical limit of 150000. Input computation graph is too big due to large operators - Consider using smaller batches or sequence length, or applying tensor parellelism. For further troubleshooting visit https://awsdocs-neuron.readthedocs-hosted.com/en/latest/libraries/nxd-training/app_notes/nxd-training-tp-appnote.html"
    # python stack trace=
    os.environ["TORCH_NEURONX_FALLBACK_ONLY_FOR_UNIMPLEMENTED_OPS"] = "0"

    parser = argparse.ArgumentParser(description="Run YOLOS object detection on Neuron")
    parser.add_argument(
        "--model",
        type=str,
        default="hustvl/yolos-base",
        help="YOLOS model name on Hugging Face Hub",
    )
    parser.add_argument("--batch-size", type=int, default=1, help="Batch size")
    args = parser.parse_args()

    torch.set_default_dtype(torch.float32)
    torch.manual_seed(42)

    # Load dataset and pick an image
    dataset = load_dataset("huggingface/cats-image")
    image = dataset["test"]["image"][0]

    # Load processor and model
    image_processor = AutoImageProcessor.from_pretrained(args.model)
    model = YolosForObjectDetection.from_pretrained(
        args.model, torch_dtype=torch.float32, attn_implementation="eager"
    )
    model.eval()

    # Preprocess image
    inputs = image_processor(images=image, return_tensors="pt")

    # Pre-run once to fix shapes before compilation
    with torch.no_grad():
        outputs = model(**inputs)

    # Compile forward pass
    model.forward = torch.compile(model.forward, backend="neuron", fullgraph=True)

    # Warmup — first compiled call pays the Neuron compilation cost
    warmup_start = time.time()
    with torch.no_grad():
        _ = model(**inputs)
    warmup_time = time.time() - warmup_start

    # Actual run
    run_start = time.time()
    with torch.no_grad():
        outputs = model(**inputs)
    run_time = time.time() - run_start

    # Post-process: keep only top detection
    logits = outputs.logits  # [B, num_queries, num_classes + 1]
    probs = logits.softmax(dim=-1)[0, :, :-1]  # drop "no-object"
    scores, labels = probs.max(dim=-1)  # CPU fallback allowed
    best_idx = scores.argmax().item()

    logger.info("Warmup: %.2f s, Run: %.4f s", warmup_time, run_time)
    logger.info("Top detection: class=%d, score=%.3f", labels[best_idx].item(), scores[best_idx].item())
76
+
77
+
78
+ if __name__ == "__main__":
79
+ main()
80
+
81
+ """
82
+ Need to fall back to CPU.
83
+ """
torch_compile/torch_neuronx_dump/0123150520_241/offloaded_ops.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ Operator ' aten::argmax.out ' fell back to CPU
torch_compile/torch_neuronx_dump/0123150520_241/used_ops.txt ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ Operator 'torch_compile' executed on Neuron
2
+ Operator 'neuron::memory::alloc' executed on Neuron
3
+ Operator 'neuron::copy::cpu_to_neuron' executed on Neuron
4
+ Operator '_to_copy' executed on Neuron
5
+ Operator 'model_default' executed on Neuron
6
+ Operator 'neuron::memory::dealloc' executed on Neuron
7
+ Operator 'neuron::copy::neuron_to_cpu' executed on Neuron
8
+ Operator 'copy_' executed on Neuron
torch_compile/torch_neuronx_dump/0123154351_1091/offloaded_ops.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ Operator ' aten::argmax.out ' fell back to CPU
torch_compile/torch_neuronx_dump/0123154351_1091/used_ops.txt ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ Operator 'torch_compile' executed on Neuron
2
+ Operator 'neuron::memory::alloc' executed on Neuron
3
+ Operator 'neuron::copy::cpu_to_neuron' executed on Neuron
4
+ Operator '_to_copy' executed on Neuron
5
+ Operator 'model_default' executed on Neuron
6
+ Operator 'neuron::memory::dealloc' executed on Neuron
7
+ Operator 'neuron::copy::neuron_to_cpu' executed on Neuron
8
+ Operator 'copy_' executed on Neuron