Ubuntu committed on
Commit
5ee43e9
·
1 Parent(s): 06d3040
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. torch_compile/flux/test_clip_text_encoder.py.py +80 -0
  2. torch_compile/flux/test_flux_transformer.py +73 -0
  3. torch_compile/flux/test_t5_text_encoder.py +72 -0
  4. torch_compile/flux/test_vae_decoder.py +84 -0
  5. torch_compile/run_albert.py +63 -0
  6. torch_compile/run_ast.py +77 -0
  7. torch_compile/run_beit.py +81 -0
  8. torch_compile/run_bert.py +62 -0
  9. torch_compile/run_camembert.py +71 -0
  10. torch_compile/run_clip.py +76 -0
  11. torch_compile/run_convbert.py +72 -0
  12. torch_compile/run_convnext.py +72 -0
  13. torch_compile/run_convnextv2.py +72 -0
  14. torch_compile/run_cvt.py +72 -0
  15. torch_compile/run_deberta.py +72 -0
  16. torch_compile/run_deberta_v3.py +72 -0
  17. torch_compile/run_deit.py +71 -0
  18. torch_compile/run_distillbert.py +67 -0
  19. torch_compile/run_donutswin.py +76 -0
  20. torch_compile/run_dpt.py +66 -0
  21. torch_compile/run_electra.py +67 -0
  22. torch_compile/run_esm.py +67 -0
  23. torch_compile/run_flaubert.py +97 -0
  24. torch_compile/run_hubert.py +85 -0
  25. torch_compile/run_levit.py +70 -0
  26. torch_compile/run_mobilebert.py +67 -0
  27. torch_compile/run_mobilenetv2.py +71 -0
  28. torch_compile/run_mobilevit.py +70 -0
  29. torch_compile/run_modernbert.py +66 -0
  30. torch_compile/run_mpnet.py +62 -0
  31. torch_compile/run_phi.py +77 -0
  32. torch_compile/run_phi3.py +86 -0
  33. torch_compile/run_roberta.py +67 -0
  34. torch_compile/run_roformer.py +67 -0
  35. torch_compile/run_sam2.py +56 -0
  36. torch_compile/run_swin.py +76 -0
  37. torch_compile/run_t5_decoder.py +66 -0
  38. torch_compile/run_t5_encoder.py +59 -0
  39. torch_compile/run_unispeech.py +62 -0
  40. torch_compile/run_unispeech_sat.py +67 -0
  41. torch_compile/run_vit.py +64 -0
  42. torch_compile/run_wav2vec2.py +73 -0
  43. torch_compile/run_whisper.py +91 -0
  44. torch_compile/run_xlm.py +0 -0
  45. torch_compile/run_xlm_roberta.py +0 -0
  46. torch_compile/run_yolos.py +83 -0
  47. torch_compile/torch_neuronx_dump/0123150520_241/offloaded_ops.txt +1 -0
  48. torch_compile/torch_neuronx_dump/0123150520_241/used_ops.txt +8 -0
  49. torch_compile/torch_neuronx_dump/0123154351_1091/offloaded_ops.txt +1 -0
  50. torch_compile/torch_neuronx_dump/0123154351_1091/used_ops.txt +8 -0
torch_compile/flux/test_clip_text_encoder.py.py ADDED
@@ -0,0 +1,80 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
#!/usr/bin/env python3
"""
CLIP (Flux variant) zero-shot image-classification on Neuron.
Flux pipeline uses: openai/clip-vit-large-patch14
"""
import argparse
import logging
import time

import torch
from transformers import CLIPProcessor, CLIPModel
from datasets import load_dataset
import torch_neuronx  # noqa: F401 guarantees Neuron backend

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


def main():
    """Score one sample image against three candidate captions on Neuron."""
    parser = argparse.ArgumentParser(
        description="CLIP (Flux checkpoint) zero-shot image classification with torch.compile on Neuron"
    )
    parser.add_argument(
        "--model",
        type=str,
        default="openai/clip-vit-large-patch14",  # Flux CLIP checkpoint
        help="CLIP model name on Hugging Face Hub",
    )
    parser.add_argument("--batch-size", type=int, default=1, help="Batch size")
    args = parser.parse_args()

    torch.set_default_dtype(torch.float32)
    torch.manual_seed(42)

    # One sample image from the cats-image dataset.
    image = load_dataset("huggingface/cats-image")["test"]["image"][0]

    # Processor + model (Flux CLIP checkpoint), fp32 / eager attention.
    processor = CLIPProcessor.from_pretrained(args.model)
    model = CLIPModel.from_pretrained(
        args.model, torch_dtype=torch.float32, attn_implementation="eager"
    ).eval()

    # Zero-shot candidate captions.
    texts = ["a photo of a cat", "a photo of a dog", "a photo of a bird"]
    inputs = processor(text=texts, images=image, return_tensors="pt", padding=True)

    # One eager pass freezes tensor shapes before compilation.
    with torch.no_grad():
        model(**inputs)

    # fullgraph=False: graph breaks are tolerated for this large model.
    model.forward = torch.compile(model.forward, backend="neuron", fullgraph=False)

    # Two timed calls: the first includes Neuron compilation (warmup),
    # the second measures steady-state latency.
    elapsed = []
    for _ in range(2):
        tick = time.time()
        with torch.no_grad():
            outputs = model(**inputs)
        elapsed.append(time.time() - tick)
    warmup_time, run_time = elapsed

    # Softmax over image-text similarities -> per-caption probabilities.
    probs = outputs.logits_per_image.softmax(dim=-1)  # [B, num_texts]
    best_label = texts[int(probs.argmax().item())]

    logger.info("Warmup: %.2f s, Run: %.4f s", warmup_time, run_time)
    logger.info("Probabilities: %s", probs.tolist())
    logger.info("Predicted label: %s", best_label)


if __name__ == "__main__":
    main()
torch_compile/flux/test_flux_transformer.py ADDED
@@ -0,0 +1,73 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
# torchrun --nproc_per_node=8 test_flux_transformer.py
"""Tensor-parallel Flux transformer latency test on Neuron.

Fixes vs. the original draft: ``Replicate`` and ``DTensor`` were referenced
without being imported (NameError at runtime); the timed forward now also
runs under ``no_grad``; the process group is destroyed on exit.
"""
import argparse
import logging
import time

import torch
import torch.distributed as dist
from torch.distributed.device_mesh import DeviceMesh
# DTensor/Replicate are used below; these imports were previously missing.
from torch.distributed.tensor import DTensor, Replicate
from torch.distributed.tensor.parallel import (
    ColwiseParallel, RowwiseParallel, parallelize_module
)
from diffusers import FluxTransformer2DModel
import torch_neuronx  # noqa: F401 guarantees Neuron backend

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


def apply_tp_flux(transformer: torch.nn.Module, tp_mesh: DeviceMesh):
    """Shard attention/FFN projections column/row-wise; keep norms replicated.

    Returns the same module, parallelized in place over ``tp_mesh``.
    """
    # embed & final-norm replicated
    plan = {"x_embedder": None, "norm_out": None}
    parallelize_module(transformer, tp_mesh, plan)

    # inside each transformer block
    for block in transformer.transformer_blocks:
        blk = {
            "norm1": None,
            "norm1_k": None,
            "attn.qkv": ColwiseParallel(),
            "attn.proj": RowwiseParallel(output_layouts=Replicate()),
            "attn.norm_q": None,
            "attn.norm_k": None,
            "ffn.net.0": ColwiseParallel(),  # gate
            "ffn.net.2": RowwiseParallel(output_layouts=Replicate()),
        }
        parallelize_module(block, tp_mesh, blk)
    return transformer


def main():
    dist.init_process_group(backend="neuron")
    rank = dist.get_rank()
    device = torch.device(f"neuron:{rank}")
    tp_mesh = DeviceMesh("neuron", list(range(dist.get_world_size())))

    parser = argparse.ArgumentParser()
    parser.add_argument("--model", default="black-forest-labs/FLUX.1-dev/transformer")
    parser.add_argument("--seq-len", type=int, default=4096)
    parser.add_argument("--dim", type=int, default=3072)
    args = parser.parse_args()

    # create on CPU, real tensors
    with torch.device("cpu"):
        transformer = FluxTransformer2DModel.from_pretrained(
            args.model, torch_dtype=torch.bfloat16, attn_implementation="eager"
        ).eval()

    transformer = apply_tp_flux(transformer, tp_mesh)
    # move local shards to Neuron (DTensor params hold only the local shard)
    for p in transformer.parameters():
        if isinstance(p, DTensor):
            p._local_tensor = p._local_tensor.to(device, dtype=torch.bfloat16)
        else:
            p.data = p.data.to(device, dtype=torch.bfloat16)

    transformer = torch.compile(transformer, backend="neuron", fullgraph=False)

    batch = 1
    hidden = torch.randn(batch, args.seq_len, args.dim, dtype=torch.bfloat16, device=device)
    encoder_hidden = torch.randn(batch, args.seq_len, 4096, dtype=torch.bfloat16, device=device)
    timestep = torch.tensor([500], dtype=torch.int64, device=device)

    with torch.no_grad():
        # first call triggers compilation; second measures steady-state latency
        _ = transformer(hidden=hidden, encoder_hidden=encoder_hidden, timestep=timestep)
        t0 = time.time()
        out = transformer(hidden=hidden, encoder_hidden=encoder_hidden, timestep=timestep)
    logger.info("Rank %d Flux-TFM latency: %.3f ms shape: %s",
                rank, (time.time() - t0) * 1000, out.sample.shape)

    dist.destroy_process_group()


if __name__ == "__main__":
    main()
torch_compile/flux/test_t5_text_encoder.py ADDED
@@ -0,0 +1,72 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
# torchrun --nproc_per_node=4 test_t5_text_encoder.py
"""Tensor-parallel T5-XXL text-encoder latency test on Neuron.

Fixes vs. the original draft: ``Replicate`` and ``DTensor`` were referenced
without being imported (NameError at runtime); an unused hard dependency on
``torchtitan`` was dropped; the timed forward now also runs under
``no_grad``; the process group is destroyed on exit.
"""
import argparse
import logging
import time

import torch
import torch.distributed as dist
from torch.distributed.device_mesh import DeviceMesh
# DTensor/Replicate are used below; these imports were previously missing.
from torch.distributed.tensor import DTensor, Replicate
from torch.distributed.tensor.parallel import (
    ColwiseParallel, RowwiseParallel, parallelize_module
)
from transformers import T5EncoderModel, AutoTokenizer
import torch_neuronx  # noqa: F401 guarantees Neuron backend

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


def apply_tp_t5(encoder: torch.nn.Module, tp_mesh: DeviceMesh):
    """Shard q/k/v/o and the FFN dense layers of every encoder block.

    Returns the same module, parallelized in place over ``tp_mesh``.
    """
    # encoder.embed_tokens already replicated
    plan = {
        "embed_tokens": None,   # replicate
        "encoder.block": None,  # we will loop inside
    }
    parallelize_module(encoder, tp_mesh, plan)

    # shard every dense layer inside each encoder block
    for layer in encoder.encoder.block:
        layer_plan = {
            "layer.0.SelfAttention.q": ColwiseParallel(),
            "layer.0.SelfAttention.k": ColwiseParallel(),
            "layer.0.SelfAttention.v": ColwiseParallel(),
            "layer.0.SelfAttention.o": RowwiseParallel(output_layouts=Replicate()),
            "layer.0.dense": ColwiseParallel(),
            "layer.1.dense": RowwiseParallel(output_layouts=Replicate()),
        }
        parallelize_module(layer, tp_mesh, layer_plan)
    return encoder


def main():
    dist.init_process_group(backend="neuron")
    rank = dist.get_rank()
    device = torch.device(f"neuron:{rank}")
    tp_mesh = DeviceMesh("neuron", list(range(dist.get_world_size())))

    parser = argparse.ArgumentParser()
    parser.add_argument("--model", default="google/t5-v1_1-xxl")
    parser.add_argument("--seq-len", type=int, default=512)
    args = parser.parse_args()

    tokenizer = AutoTokenizer.from_pretrained(args.model)
    # create model on CPU, real tensors
    with torch.device("cpu"):
        encoder = T5EncoderModel.from_pretrained(args.model, attn_implementation="eager").eval()

    encoder = apply_tp_t5(encoder, tp_mesh)
    # move local shards to Neuron (DTensor params hold only the local shard)
    for p in encoder.parameters():
        if isinstance(p, DTensor):
            p._local_tensor = p._local_tensor.to(device)
        else:
            p.data = p.data.to(device)

    encoder = torch.compile(encoder, backend="neuron", fullgraph=False)

    text = ["a photo of a cat"]
    txt_in = tokenizer(text, max_length=args.seq_len, padding="max_length", return_tensors="pt")
    input_ids = txt_in.input_ids.to(device)

    with torch.no_grad():
        _ = encoder(input_ids)  # compile
        t0 = time.time()
        out = encoder(input_ids).last_hidden_state
    logger.info("Rank %d T5-XXL enc latency: %.3f ms shape: %s",
                rank, (time.time() - t0) * 1000, out.shape)  # [1, seq_len, 4096]

    dist.destroy_process_group()


if __name__ == "__main__":
    main()
torch_compile/flux/test_vae_decoder.py ADDED
@@ -0,0 +1,84 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
#!/usr/bin/env python3
"""
Flux VAE decoder (16-ch latent → RGB image) on Neuron.
Checkpoint: black-forest-labs/FLUX.1-dev/vae
"""
import argparse
import logging
import time
from pathlib import Path

import torch
from diffusers import AutoencoderKL
import torch_neuronx  # noqa: F401 guarantees Neuron backend
from PIL import Image

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


def main():
    """Decode a random latent with the Flux VAE and save the RGB image."""
    parser = argparse.ArgumentParser(
        description="Flux VAE decoder (latent → image) with torch.compile on Neuron"
    )
    parser.add_argument(
        "--model",
        type=str,
        # default="black-forest-labs/FLUX.1-dev/vae",
        default="/workspace/flux_weight/",
        help="Flux VAE checkpoint on Hugging Face Hub",
    )
    parser.add_argument("--latent-ch", type=int, default=16, help="Latent channels (Flux=16)")
    parser.add_argument("--scale", type=int, default=32, help="Latent spatial size (256 px / 8)")
    parser.add_argument("--output", type=str, default="flux_vae_out.png", help="Output image path")
    args = parser.parse_args()

    torch.set_default_dtype(torch.float32)
    torch.manual_seed(42)

    # Load the Flux VAE in fp32 and switch to inference mode.
    vae = AutoencoderKL.from_pretrained(args.model, subfolder="vae", torch_dtype=torch.float32).eval()

    # Dummy float32 latent sampled from N(0, 1) - shape: [B, 16, H/8, W/8].
    latent = torch.randn(1, args.latent_ch, args.scale, args.scale, dtype=torch.float32)

    # One eager pass freezes tensor shapes before compilation.
    with torch.no_grad():
        vae.decode(latent).sample

    # Compile the decode path as a single graph (fullgraph=True).
    decode_fn = torch.compile(vae.decode, backend="neuron", fullgraph=True)

    def timed_decode():
        # Run one no-grad decode and return (result, elapsed seconds).
        start = time.time()
        with torch.no_grad():
            result = decode_fn(latent)
        return result, time.time() - start

    _, warmup_time = timed_decode()     # first call triggers Neuron compilation
    decoded, run_time = timed_decode()  # steady-state latency
    image = decoded.sample

    logger.info("Warmup: %.2f s, Run: %.4f s", warmup_time, run_time)
    logger.info("VAE output shape: %s", image.shape)  # [1, 3, H, W]

    # Map [-1, 1] output into [0, 1], then write an 8-bit PNG.
    image = (image / 2 + 0.5).clamp(0, 1).cpu().float()
    Image.fromarray((image[0].permute(1, 2, 0).numpy() * 255).astype("uint8")).save(args.output)
    logger.info("Saved decoded image to %s", Path(args.output).resolve())


if __name__ == "__main__":
    main()

"""
The compilation process took more than 2 hours.
/usr/local/lib/python3.10/site-packages/torch_mlir/dialects/stablehlo/__init__.py:24: UserWarning: Could not import StableHLO C++ extension: libStablehloUnifiedPythonCAPI.so.22.0git: cannot open shared object file: No such file or directory
  warnings.warn(f"Could not import StableHLO C++ extension: {e}")
INFO:__main__:Warmup: 4010.52 s, Run: 22.5420 s
INFO:__main__:VAE output shape: torch.Size([1, 3, 256, 256])
INFO:__main__:Saved decoded image to /workspace/torch_neuron_samples/torch-neuron-samples/scripts/torch_compile/flux/flux_vae_out.png
"""
torch_compile/run_albert.py ADDED
@@ -0,0 +1,63 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
"""ALBERT sequence classification with torch.compile on the Neuron backend."""
import argparse
import logging
import time

import torch
from transformers import AutoTokenizer, AlbertForSequenceClassification

import torch_neuronx  # ensure Neuron backend is available

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


def main():
    """Compile ALBERT's forward pass for Neuron and report timings."""
    parser = argparse.ArgumentParser(description="Run ALBERT on Neuron")
    parser.add_argument(
        "--model", type=str, default="albert-base-v2", help="ALBERT model name"
    )
    parser.add_argument("--batch-size", type=int, default=1, help="Batch size")
    args = parser.parse_args()

    torch.set_default_dtype(torch.float32)
    torch.manual_seed(42)

    # fp32 weights with eager attention (Neuron-friendly), inference mode.
    model = AlbertForSequenceClassification.from_pretrained(
        args.model, torch_dtype=torch.float32, attn_implementation="eager"
    ).eval()

    tokenizer = AutoTokenizer.from_pretrained(args.model)
    inputs = tokenizer(
        "Hamilton is considered to be the best musical of human history.",
        return_tensors="pt"
    )

    # One eager pass pins the input shapes before compiling.
    with torch.no_grad():
        model(**inputs).logits

    # Single-graph compilation of the forward pass.
    model.forward = torch.compile(model.forward, backend="neuron", fullgraph=True)

    # Two timed calls: the first includes Neuron compilation (warmup),
    # the second measures steady-state latency.
    elapsed = []
    for _ in range(2):
        tick = time.time()
        with torch.no_grad():
            logits = model(**inputs).logits
        elapsed.append(time.time() - tick)
    warmup_time, run_time = elapsed

    predicted_class_label = model.config.id2label[logits.argmax().item()]

    logger.info(f"Warmup: {warmup_time:.2f}s, Run: {run_time:.4f}s")
    logger.info(f"Output label: {predicted_class_label}")


if __name__ == "__main__":
    main()
torch_compile/run_ast.py ADDED
@@ -0,0 +1,77 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
"""AST (Audio Spectrogram Transformer) audio classification on Neuron."""
import argparse
import logging
import time

import torch
from transformers import AutoFeatureExtractor, ASTForAudioClassification
from datasets import load_dataset

import torch_neuronx  # ensure Neuron backend is available

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


def main():
    """Compile AST for Neuron and classify one LibriSpeech demo clip."""
    parser = argparse.ArgumentParser(description="Run AST (Audio Spectrogram Transformer) on Neuron")
    parser.add_argument(
        "--model",
        type=str,
        default="MIT/ast-finetuned-audioset-10-10-0.4593",
        help="AST model name on Hugging Face Hub",
    )
    parser.add_argument("--batch-size", type=int, default=1, help="Batch size")
    args = parser.parse_args()

    torch.set_default_dtype(torch.float32)
    torch.manual_seed(42)

    # Deterministic sample: sort the demo split by id, take the first clip.
    dataset = load_dataset("hf-internal-testing/librispeech_asr_demo", "clean", split="validation").sort("id")
    sampling_rate = dataset.features["audio"].sampling_rate

    feature_extractor = AutoFeatureExtractor.from_pretrained(args.model)
    inputs = feature_extractor(
        dataset[0]["audio"]["array"], sampling_rate=sampling_rate, return_tensors="pt"
    )

    # fp32 weights with eager attention, inference mode.
    model = ASTForAudioClassification.from_pretrained(
        args.model, torch_dtype=torch.float32, attn_implementation="eager"
    ).eval()

    # One eager pass pins the input shapes before compiling.
    with torch.no_grad():
        model(**inputs).logits

    # Single-graph compilation of the forward pass.
    model.forward = torch.compile(model.forward, backend="neuron", fullgraph=True)

    # Two timed calls: the first includes Neuron compilation (warmup),
    # the second measures steady-state latency.
    elapsed = []
    for _ in range(2):
        tick = time.time()
        with torch.no_grad():
            logits = model(**inputs).logits
        elapsed.append(time.time() - tick)
    warmup_time, run_time = elapsed

    predicted_label = model.config.id2label[torch.argmax(logits, dim=-1).item()]

    logger.info("Warmup: %.2f s, Run: %.4f s", warmup_time, run_time)
    logger.info("Predicted label: %s", predicted_label)


if __name__ == "__main__":
    main()

"""
Works
"""
torch_compile/run_beit.py ADDED
@@ -0,0 +1,81 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
"""BEiT image classification with torch.compile on the Neuron backend."""
import argparse
import logging
import time

import torch
from transformers import AutoImageProcessor, BeitForImageClassification
from datasets import load_dataset

import torch_neuronx  # ensure Neuron backend is available

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


def main():
    """Compile BEiT for Neuron and classify one sample image."""
    parser = argparse.ArgumentParser(description="Run BEiT on Neuron")
    parser.add_argument(
        "--model",
        type=str,
        default="microsoft/beit-base-patch16-224-pt22k",
        help="BEiT model name on Hugging Face Hub",
    )
    parser.add_argument("--batch-size", type=int, default=1, help="Batch size")
    args = parser.parse_args()

    torch.set_default_dtype(torch.float32)
    torch.manual_seed(42)

    # One sample image from the cats-image dataset.
    image = load_dataset("huggingface/cats-image")["test"]["image"][0]

    # Processor + model: fp32 weights with eager attention, inference mode.
    image_processor = AutoImageProcessor.from_pretrained(args.model)
    model = BeitForImageClassification.from_pretrained(
        args.model, torch_dtype=torch.float32, attn_implementation="eager"
    ).eval()

    inputs = image_processor(image, return_tensors="pt")

    # One eager pass pins the input shapes before compiling.
    with torch.no_grad():
        model(**inputs).logits

    # Single-graph compilation of the forward pass.
    model.forward = torch.compile(model.forward, backend="neuron", fullgraph=True)

    # Two timed calls: the first includes Neuron compilation (warmup),
    # the second measures steady-state latency.
    elapsed = []
    for _ in range(2):
        tick = time.time()
        with torch.no_grad():
            logits = model(**inputs).logits
        elapsed.append(time.time() - tick)
    warmup_time, run_time = elapsed

    # Predicted ImageNet class.
    label_str = model.config.id2label[logits.argmax(-1).item()]

    logger.info("Warmup: %.2f s, Run: %.4f s", warmup_time, run_time)
    logger.info("Predicted label: %s", label_str)


if __name__ == "__main__":
    main()
"""
root@d90ba90f3d81:/workspace/torch_neuron_samples/torch-neuron-samples/scripts/tests# torch-mlir-opt -pass-pipeline='builtin.module(torch-backend-to-stablehlo-backend-pipeline)' /tmp/UnnammedModule.mlir
/usr/local/lib/python3.10/site-packages/transformers/models/beit/modeling_beit.py:593:0: error: failed to legalize operation 'torch.aten.fill.Tensor'
/usr/local/lib/python3.10/site-packages/transformers/pytorch_utils.py:361:0: note: called from
/usr/local/lib/python3.10/site-packages/transformers/models/beit/modeling_beit.py:625:0: note: called from
/usr/local/lib/python3.10/site-packages/transformers/models/beit/modeling_beit.py:688:0: note: called from
/usr/local/lib/python3.10/site-packages/transformers/models/beit/modeling_beit.py:824:0: note: called from
/usr/local/lib/python3.10/site-packages/transformers/models/beit/modeling_beit.py:1007:0: note: called from
/usr/local/lib/python3.10/site-packages/transformers/models/beit/modeling_beit.py:593:0: note: see current operation: %733 = "torch.aten.fill.Tensor"(%732, %524) : (!torch.vtensor<[197],si64>, !torch.vtensor<[],si64>) -> !torch.vtensor<[197],si64>
"""
torch_compile/run_bert.py ADDED
@@ -0,0 +1,62 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
"""BERT sequence classification with torch.compile on the Neuron backend."""
import argparse
import logging
import time

import torch
from transformers import AutoTokenizer, BertForSequenceClassification

import torch_neuronx

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


def main():
    """Compile BERT's forward pass for Neuron and report timings."""
    parser = argparse.ArgumentParser(description="Run Bert on Neuron")
    parser.add_argument(
        "--model", type=str, default="google-bert/bert-base-uncased", help="Bert model name"
    )
    parser.add_argument("--batch-size", type=int, default=1, help="Batch size")
    args = parser.parse_args()

    torch.set_default_dtype(torch.float32)
    torch.manual_seed(42)

    # fp32 weights with eager attention, inference mode.
    model = BertForSequenceClassification.from_pretrained(
        args.model, torch_dtype=torch.float32, attn_implementation="eager"
    ).eval()

    tokenizer = AutoTokenizer.from_pretrained(args.model)
    inputs = tokenizer("Hamilton is considered to be the best musical of human history.", return_tensors="pt")

    # Run once to establish shapes before compile.
    with torch.no_grad():
        model(**inputs).logits

    model.forward = torch.compile(model.forward, backend="neuron", fullgraph=True)

    # Two timed calls: the first includes Neuron compilation (warmup),
    # the second measures steady-state latency.
    elapsed = []
    for _ in range(2):
        tick = time.time()
        with torch.no_grad():
            logits = model(**inputs).logits
        elapsed.append(time.time() - tick)
    warmup_time, run_time = elapsed

    predicted_class_label = model.config.id2label[logits.argmax().item()]

    logger.info(f"Warmup: {warmup_time:.2f}s, Run: {run_time:.4f}s")
    logger.info(f"Output label: {predicted_class_label}")


if __name__ == "__main__":
    main()

"""
Works
"""
torch_compile/run_camembert.py ADDED
@@ -0,0 +1,71 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
"""CamemBERT sequence classification with torch.compile on the Neuron backend."""
import argparse
import logging
import time

import torch
from transformers import AutoTokenizer, CamembertForSequenceClassification

import torch_neuronx  # ensure Neuron backend is available

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


def main():
    """Compile CamemBERT for Neuron and classify one French sentence."""
    parser = argparse.ArgumentParser(description="Run CamemBERT on Neuron")
    parser.add_argument(
        "--model",
        type=str,
        default="camembert-base",
        help="CamemBERT model name on Hugging Face Hub",
    )
    parser.add_argument("--batch-size", type=int, default=1, help="Batch size")
    args = parser.parse_args()

    torch.set_default_dtype(torch.float32)
    torch.manual_seed(42)

    # Tokenizer + model: fp32 weights with eager attention, inference mode.
    tokenizer = AutoTokenizer.from_pretrained(args.model)
    model = CamembertForSequenceClassification.from_pretrained(
        args.model, torch_dtype=torch.float32, attn_implementation="eager"
    ).eval()

    # Sample French sentence.
    inputs = tokenizer("CamemBERT est un modèle de langue français.", return_tensors="pt")

    # One eager pass pins the input shapes before compiling.
    with torch.no_grad():
        model(**inputs).logits

    # Single-graph compilation of the forward pass.
    model.forward = torch.compile(model.forward, backend="neuron", fullgraph=True)

    # Two timed calls: the first includes Neuron compilation (warmup),
    # the second measures steady-state latency.
    elapsed = []
    for _ in range(2):
        tick = time.time()
        with torch.no_grad():
            logits = model(**inputs).logits
        elapsed.append(time.time() - tick)
    warmup_time, run_time = elapsed

    predicted_label = model.config.id2label[logits.argmax().item()]

    logger.info("Warmup: %.2f s, Run: %.4f s", warmup_time, run_time)
    logger.info("Predicted label: %s", predicted_label)


if __name__ == "__main__":
    main()

"""
Works
"""
torch_compile/run_clip.py ADDED
@@ -0,0 +1,76 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
"""CLIP zero-shot image classification with torch.compile on the Neuron backend."""
import argparse
import logging
import time

import torch
from transformers import CLIPProcessor, CLIPModel
from datasets import load_dataset
import torch_neuronx  # ensures Neuron backend is available

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


def main():
    """Score one sample image against three candidate captions on Neuron."""
    parser = argparse.ArgumentParser(
        description="CLIP zero-shot image classification with torch.compile on Neuron"
    )
    parser.add_argument(
        "--model",
        type=str,
        default="openai/clip-vit-base-patch32",
        help="CLIP model name on Hugging Face Hub",
    )
    parser.add_argument("--batch-size", type=int, default=1, help="Batch size")
    args = parser.parse_args()

    torch.set_default_dtype(torch.float32)
    torch.manual_seed(42)

    # One sample image from the cats-image dataset.
    image = load_dataset("huggingface/cats-image")["test"]["image"][0]

    # Processor + model: fp32 weights with eager attention, inference mode.
    processor = CLIPProcessor.from_pretrained(args.model)
    model = CLIPModel.from_pretrained(
        args.model, torch_dtype=torch.float32, attn_implementation="eager"
    ).eval()

    # Zero-shot candidate captions.
    texts = ["a photo of a cat", "a photo of a dog", "a photo of a bird"]
    inputs = processor(text=texts, images=image, return_tensors="pt", padding=True)

    # One eager pass pins the input shapes before compiling.
    with torch.no_grad():
        model(**inputs)

    # fullgraph=False tolerates graph breaks (avoids the instruction limit).
    model.forward = torch.compile(model.forward, backend="neuron", fullgraph=False)

    # Two timed calls: the first includes Neuron compilation (warmup),
    # the second measures steady-state latency.
    elapsed = []
    for _ in range(2):
        tick = time.time()
        with torch.no_grad():
            outputs = model(**inputs)
        elapsed.append(time.time() - tick)
    warmup_time, run_time = elapsed

    # Softmax over image-text similarities -> per-caption probabilities.
    probs = outputs.logits_per_image.softmax(dim=-1)  # [batch_size, num_texts]
    best_label = texts[int(probs.argmax())]

    logger.info("Warmup: %.2f s, Run: %.4f s", warmup_time, run_time)
    logger.info("Probabilities: %s", probs.tolist())
    logger.info("Predicted label: %s", best_label)


if __name__ == "__main__":
    main()
torch_compile/run_convbert.py ADDED
@@ -0,0 +1,72 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
"""ConvBERT sequence classification with torch.compile on the Neuron backend."""
import argparse
import logging
import time

import torch
from transformers import AutoTokenizer, ConvBertForSequenceClassification

import torch_neuronx  # ensure Neuron backend is available

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


def main():
    """Compile ConvBERT for Neuron and classify one sample sentence."""
    parser = argparse.ArgumentParser(description="Run ConvBERT on Neuron")
    parser.add_argument(
        "--model",
        type=str,
        default="YituTech/conv-bert-base",
        help="ConvBERT model name on Hugging Face Hub",
    )
    parser.add_argument("--batch-size", type=int, default=1, help="Batch size")
    args = parser.parse_args()

    torch.set_default_dtype(torch.float32)
    torch.manual_seed(42)

    # Tokenizer + model: fp32 weights with eager attention, inference mode.
    tokenizer = AutoTokenizer.from_pretrained(args.model)
    model = ConvBertForSequenceClassification.from_pretrained(
        args.model, torch_dtype=torch.float32, attn_implementation="eager"
    ).eval()

    # Sample text, padded/truncated by the tokenizer.
    inputs = tokenizer(
        "ConvBERT combines self-attention and lightweight convolutions.",
        return_tensors="pt", padding=True, truncation=True,
    )

    # One eager pass pins the input shapes before compiling.
    with torch.no_grad():
        model(**inputs).logits

    # Single-graph compilation of the forward pass.
    model.forward = torch.compile(model.forward, backend="neuron", fullgraph=True)

    # Two timed calls: the first includes Neuron compilation (warmup),
    # the second measures steady-state latency.
    elapsed = []
    for _ in range(2):
        tick = time.time()
        with torch.no_grad():
            logits = model(**inputs).logits
        elapsed.append(time.time() - tick)
    warmup_time, run_time = elapsed

    predicted_label = model.config.id2label[logits.argmax().item()]

    logger.info("Warmup: %.2f s, Run: %.4f s", warmup_time, run_time)
    logger.info("Predicted label: %s", predicted_label)


if __name__ == "__main__":
    main()

"""
<unknown>:0: error: failed to legalize operation 'torch.constant.int'
<unknown>:0: note: see current operation: %0 = "torch.constant.int"() <{value = 9 : i64}> : () -> !torch.int
"""
torch_compile/run_convnext.py ADDED
@@ -0,0 +1,72 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import argparse
import logging
import time

import torch
from transformers import AutoImageProcessor, ConvNextForImageClassification
from datasets import load_dataset
import torch_neuronx  # ensures Neuron backend is available

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


def main():
    """Benchmark ConvNeXt image classification compiled with the Neuron backend.

    Loads a sample image, runs one eager pass to fix shapes, compiles the
    forward pass for Neuron, and logs warmup/run latency plus the top-1 label.
    """
    parser = argparse.ArgumentParser(
        description="ConvNeXt image-classification with torch.compile on Neuron"
    )
    parser.add_argument(
        "--model",
        type=str,
        default="facebook/convnext-tiny-224",
        help="ConvNeXT model name on Hugging Face Hub",
    )
    parser.add_argument("--batch-size", type=int, default=1, help="Batch size")
    args = parser.parse_args()

    torch.set_default_dtype(torch.float32)
    torch.manual_seed(42)

    # Load dataset and pick an image
    dataset = load_dataset("huggingface/cats-image")
    image = dataset["test"]["image"][0]

    # Load processor and model
    processor = AutoImageProcessor.from_pretrained(args.model)
    model = ConvNextForImageClassification.from_pretrained(
        args.model, torch_dtype=torch.float32, attn_implementation="eager"
    )
    model.eval()

    # Preprocess image, replicated to honour --batch-size (previously ignored)
    inputs = processor(images=[image] * args.batch_size, return_tensors="pt")

    # Pre-run once to fix shapes before compilation
    with torch.no_grad():
        outputs = model(**inputs)

    # Compile forward pass (allow graph breaks to avoid instruction-limit)
    model.forward = torch.compile(model.forward, backend="neuron", fullgraph=False)

    # Warmup (first compiled call triggers Neuron compilation)
    warmup_start = time.time()
    with torch.no_grad():
        _ = model(**inputs)
    warmup_time = time.time() - warmup_start

    # Timed steady-state run
    run_start = time.time()
    with torch.no_grad():
        outputs = model(**inputs)
    run_time = time.time() - run_start

    # Predicted ImageNet class for the first batch element (valid for any batch)
    predicted_class_idx = outputs.logits[0].argmax(-1).item()
    predicted_label = model.config.id2label[predicted_class_idx]

    logger.info("Warmup: %.2f s, Run: %.4f s", warmup_time, run_time)
    logger.info("Predicted label: %s", predicted_label)


if __name__ == "__main__":
    main()
torch_compile/run_convnextv2.py ADDED
@@ -0,0 +1,72 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import argparse
import logging
import time

import torch
from transformers import AutoImageProcessor, ConvNextV2ForImageClassification
from datasets import load_dataset
import torch_neuronx  # ensures Neuron backend is available

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


def main():
    """Benchmark ConvNeXt-V2 image classification compiled with the Neuron backend.

    Loads a sample image, runs one eager pass to fix shapes, compiles the
    forward pass for Neuron, and logs warmup/run latency plus the top-1 label.
    """
    parser = argparse.ArgumentParser(
        description="ConvNeXt-V2 image-classification with torch.compile on Neuron"
    )
    parser.add_argument(
        "--model",
        type=str,
        default="facebook/convnextv2-tiny-1k-224",
        help="ConvNeXt-V2 model name on Hugging Face Hub",
    )
    parser.add_argument("--batch-size", type=int, default=1, help="Batch size")
    args = parser.parse_args()

    torch.set_default_dtype(torch.float32)
    torch.manual_seed(42)

    # Load dataset and pick an image
    dataset = load_dataset("huggingface/cats-image")
    image = dataset["test"]["image"][0]

    # Load processor and model
    processor = AutoImageProcessor.from_pretrained(args.model)
    model = ConvNextV2ForImageClassification.from_pretrained(
        args.model, torch_dtype=torch.float32, attn_implementation="eager"
    )
    model.eval()

    # Preprocess image, replicated to honour --batch-size (previously ignored)
    inputs = processor(images=[image] * args.batch_size, return_tensors="pt")

    # Pre-run once to fix shapes before compilation
    with torch.no_grad():
        outputs = model(**inputs)

    # Compile forward pass (allow graph breaks to avoid instruction-limit)
    model.forward = torch.compile(model.forward, backend="neuron", fullgraph=False)

    # Warmup (first compiled call triggers Neuron compilation)
    warmup_start = time.time()
    with torch.no_grad():
        _ = model(**inputs)
    warmup_time = time.time() - warmup_start

    # Timed steady-state run
    run_start = time.time()
    with torch.no_grad():
        outputs = model(**inputs)
    run_time = time.time() - run_start

    # Predicted ImageNet class for the first batch element (valid for any batch)
    predicted_class_idx = outputs.logits[0].argmax(-1).item()
    predicted_label = model.config.id2label[predicted_class_idx]

    logger.info("Warmup: %.2f s, Run: %.4f s", warmup_time, run_time)
    logger.info("Predicted label: %s", predicted_label)


if __name__ == "__main__":
    main()
torch_compile/run_cvt.py ADDED
@@ -0,0 +1,72 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import argparse
import logging
import time

import torch
from transformers import AutoImageProcessor, CvtForImageClassification
from datasets import load_dataset
import torch_neuronx  # ensures Neuron backend is available

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


def main():
    """Benchmark CvT image classification compiled with the Neuron backend.

    Loads a sample image, runs one eager pass to fix shapes, compiles the
    forward pass for Neuron, and logs warmup/run latency plus the top-1 label.
    """
    parser = argparse.ArgumentParser(
        description="CvT image-classification with torch.compile on Neuron"
    )
    parser.add_argument(
        "--model",
        type=str,
        default="microsoft/cvt-13",
        help="CvT model name on Hugging Face Hub",
    )
    parser.add_argument("--batch-size", type=int, default=1, help="Batch size")
    args = parser.parse_args()

    torch.set_default_dtype(torch.float32)
    torch.manual_seed(42)

    # Load dataset and pick an image
    dataset = load_dataset("huggingface/cats-image")
    image = dataset["test"]["image"][0]

    # Load processor and model
    processor = AutoImageProcessor.from_pretrained(args.model)
    model = CvtForImageClassification.from_pretrained(
        args.model, torch_dtype=torch.float32, attn_implementation="eager"
    )
    model.eval()

    # Preprocess image, replicated to honour --batch-size (previously ignored)
    inputs = processor(images=[image] * args.batch_size, return_tensors="pt")

    # Pre-run once to fix shapes before compilation
    with torch.no_grad():
        outputs = model(**inputs)

    # Compile forward pass (allow graph breaks to avoid instruction-limit)
    model.forward = torch.compile(model.forward, backend="neuron", fullgraph=False)

    # Warmup (first compiled call triggers Neuron compilation)
    warmup_start = time.time()
    with torch.no_grad():
        _ = model(**inputs)
    warmup_time = time.time() - warmup_start

    # Timed steady-state run
    run_start = time.time()
    with torch.no_grad():
        outputs = model(**inputs)
    run_time = time.time() - run_start

    # Predicted ImageNet class for the first batch element (valid for any batch)
    predicted_class_idx = outputs.logits[0].argmax(-1).item()
    predicted_label = model.config.id2label[predicted_class_idx]

    logger.info("Warmup: %.2f s, Run: %.4f s", warmup_time, run_time)
    logger.info("Predicted label: %s", predicted_label)


if __name__ == "__main__":
    main()
torch_compile/run_deberta.py ADDED
@@ -0,0 +1,72 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import argparse
import logging
import time

import torch
from transformers import AutoTokenizer, DebertaForSequenceClassification
import torch_neuronx  # ensures Neuron backend is available

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


def main():
    """Benchmark DeBERTa sequence classification compiled with the Neuron backend.

    Tokenizes a sample sentence, runs one eager pass to fix shapes, compiles
    the forward pass for Neuron, and logs warmup/run latency plus the label.
    """
    parser = argparse.ArgumentParser(
        description="DeBERTa sequence-classification with torch.compile on Neuron"
    )
    parser.add_argument(
        "--model",
        type=str,
        default="microsoft/deberta-base",
        help="DeBERTa model name on Hugging Face Hub",
    )
    parser.add_argument("--batch-size", type=int, default=1, help="Batch size")
    args = parser.parse_args()

    torch.set_default_dtype(torch.float32)
    torch.manual_seed(42)

    # Load tokenizer and model
    tokenizer = AutoTokenizer.from_pretrained(args.model)
    model = DebertaForSequenceClassification.from_pretrained(
        args.model, torch_dtype=torch.float32, attn_implementation="eager"
    )
    model.eval()

    # Tokenize sample text, replicated to honour --batch-size (previously ignored)
    text = "DeBERTa improves BERT and RoBERTa using disentangled attention."
    inputs = tokenizer(
        [text] * args.batch_size, return_tensors="pt", padding=True, truncation=True
    )

    # Pre-run once to fix shapes before compilation
    with torch.no_grad():
        logits = model(**inputs).logits

    # Compile forward pass (allow graph breaks to avoid instruction-limit)
    model.forward = torch.compile(model.forward, backend="neuron", fullgraph=False)

    # Warmup (first compiled call triggers Neuron compilation)
    warmup_start = time.time()
    with torch.no_grad():
        _ = model(**inputs)
    warmup_time = time.time() - warmup_start

    # Timed steady-state run
    run_start = time.time()
    with torch.no_grad():
        logits = model(**inputs).logits
    run_time = time.time() - run_start

    # Decode result for the first batch element (valid for any batch size)
    predicted_class_id = logits[0].argmax().item()
    predicted_label = model.config.id2label[predicted_class_id]

    logger.info("Warmup: %.2f s, Run: %.4f s", warmup_time, run_time)
    logger.info("Predicted label: %s", predicted_label)


if __name__ == "__main__":
    main()
69
+
70
+ """
71
+ torch._dynamo.exc.TorchRuntimeError: Dynamo failed to run FX node with fake tensors: call_function <built-in function linear>(*(FakeTensor(..., device='neuron:0', size=(1, 18, 768)), Parameter(FakeTensor(..., size=(2304, 768), requires_grad=True)), None), **{}): got RuntimeError('Unhandled FakeTensor Device Propagation for aten.mm.default, found two different devices neuron:0, cpu')
72
+ """
torch_compile/run_deberta_v3.py ADDED
@@ -0,0 +1,72 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import argparse
import logging
import time

import torch
from transformers import AutoTokenizer, DebertaV2ForSequenceClassification
import torch_neuronx  # ensures Neuron backend is available

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


def main():
    """Benchmark DeBERTa-v3 sequence classification compiled with the Neuron backend.

    Tokenizes a sample sentence, runs one eager pass to fix shapes, compiles
    the forward pass for Neuron, and logs warmup/run latency plus the label.
    """
    parser = argparse.ArgumentParser(
        description="DeBERTa-v3 sequence-classification with torch.compile on Neuron"
    )
    parser.add_argument(
        "--model",
        type=str,
        default="microsoft/deberta-v3-base",
        help="DeBERTa-v3 model name on Hugging Face Hub",
    )
    parser.add_argument("--batch-size", type=int, default=1, help="Batch size")
    args = parser.parse_args()

    torch.set_default_dtype(torch.float32)
    torch.manual_seed(42)

    # Load tokenizer and model
    tokenizer = AutoTokenizer.from_pretrained(args.model)
    model = DebertaV2ForSequenceClassification.from_pretrained(
        args.model, torch_dtype=torch.float32, attn_implementation="eager"
    )
    model.eval()

    # Tokenize sample text, replicated to honour --batch-size (previously ignored)
    text = "DeBERTa-v3 achieves stronger performance with improved pre-training."
    inputs = tokenizer(
        [text] * args.batch_size, return_tensors="pt", padding=True, truncation=True
    )

    # Pre-run once to fix shapes before compilation
    with torch.no_grad():
        logits = model(**inputs).logits

    # Compile forward pass (allow graph breaks to avoid instruction-limit)
    model.forward = torch.compile(model.forward, backend="neuron", fullgraph=False)

    # Warmup (first compiled call triggers Neuron compilation)
    warmup_start = time.time()
    with torch.no_grad():
        _ = model(**inputs)
    warmup_time = time.time() - warmup_start

    # Timed steady-state run
    run_start = time.time()
    with torch.no_grad():
        logits = model(**inputs).logits
    run_time = time.time() - run_start

    # Decode result for the first batch element (valid for any batch size)
    predicted_class_id = logits[0].argmax().item()
    predicted_label = model.config.id2label[predicted_class_id]

    logger.info("Warmup: %.2f s, Run: %.4f s", warmup_time, run_time)
    logger.info("Predicted label: %s", predicted_label)


if __name__ == "__main__":
    main()
69
+
70
+ """
71
+ Works
72
+ """
torch_compile/run_deit.py ADDED
@@ -0,0 +1,71 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/usr/bin/env python3
# DeiT (Vision Transformer) image-classification on Neuron
import argparse
import logging
import time

import torch
from transformers import AutoImageProcessor, DeiTForImageClassification
from datasets import load_dataset
import torch_neuronx  # ensures Neuron backend

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


def main():
    """Benchmark DeiT image classification compiled with the Neuron backend.

    Loads a sample image, runs one eager pass to fix shapes, compiles the
    forward pass for Neuron, and logs warmup/run latency plus the top-1 label.
    """
    parser = argparse.ArgumentParser(description="Run DeiT on Neuron")
    parser.add_argument(
        "--model",
        type=str,
        default="facebook/deit-base-distilled-patch16-224",
        help="DeiT model name on Hugging Face Hub",
    )
    parser.add_argument("--batch-size", type=int, default=1, help="Batch size")
    args = parser.parse_args()

    torch.set_default_dtype(torch.float32)
    torch.manual_seed(42)

    # load dataset image
    dataset = load_dataset("huggingface/cats-image")
    image = dataset["test"]["image"][0]

    # load processor & distilled DeiT model
    processor = AutoImageProcessor.from_pretrained(args.model)
    model = DeiTForImageClassification.from_pretrained(
        args.model, torch_dtype=torch.float32, attn_implementation="eager"
    ).eval()

    # preprocess, replicating the image to honour --batch-size (was ignored)
    inputs = processor(images=[image] * args.batch_size, return_tensors="pt")

    # pre-run to lock shapes
    with torch.no_grad():
        _ = model(**inputs).logits

    # compile
    model.forward = torch.compile(model.forward, backend="neuron", fullgraph=False)

    # warmup (first compiled call triggers Neuron compilation)
    warmup_start = time.time()
    with torch.no_grad():
        _ = model(**inputs)
    warmup_time = time.time() - warmup_start

    # benchmark run
    run_start = time.time()
    with torch.no_grad():
        logits = model(**inputs).logits
    run_time = time.time() - run_start

    # top-1 ImageNet class for the first batch element (valid for any batch)
    predicted_class_idx = logits[0].argmax(-1).item()
    predicted_label = model.config.id2label[predicted_class_idx]

    logger.info("Warmup: %.2f s, Run: %.4f s", warmup_time, run_time)
    logger.info("Predicted label: %s", predicted_label)


if __name__ == "__main__":
    main()
torch_compile/run_distillbert.py ADDED
@@ -0,0 +1,67 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/usr/bin/env python3
# DistilBERT text-classification on Neuron
import argparse
import logging
import time

import torch
from transformers import AutoTokenizer, DistilBertForSequenceClassification
import torch_neuronx  # ensures Neuron backend

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


def main():
    """Benchmark DistilBERT text classification compiled with the Neuron backend.

    Tokenizes a sample sentence, runs one eager pass to fix shapes, compiles
    the forward pass for Neuron, and logs warmup/run latency plus the label.
    """
    parser = argparse.ArgumentParser(description="Run DistilBERT on Neuron")
    parser.add_argument(
        "--model",
        type=str,
        default="distilbert-base-uncased-finetuned-sst-2-english",
        help="DistilBERT model name on Hugging Face Hub",
    )
    parser.add_argument("--batch-size", type=int, default=1, help="Batch size")
    args = parser.parse_args()

    torch.set_default_dtype(torch.float32)
    torch.manual_seed(42)

    # load tokenizer & model
    tokenizer = AutoTokenizer.from_pretrained(args.model)
    model = DistilBertForSequenceClassification.from_pretrained(
        args.model, torch_dtype=torch.float32, attn_implementation="eager"
    ).eval()

    # tokenize sample, replicated to honour --batch-size (was ignored)
    text = "DistilBERT is a compact, fast variant of BERT."
    inputs = tokenizer(
        [text] * args.batch_size, return_tensors="pt", padding=True, truncation=True
    )

    # pre-run to lock shapes
    with torch.no_grad():
        _ = model(**inputs).logits

    # compile
    model.forward = torch.compile(model.forward, backend="neuron", fullgraph=False)

    # warmup (first compiled call triggers Neuron compilation)
    warmup_start = time.time()
    with torch.no_grad():
        _ = model(**inputs)
    warmup_time = time.time() - warmup_start

    # benchmark run
    run_start = time.time()
    with torch.no_grad():
        logits = model(**inputs).logits
    run_time = time.time() - run_start

    # top-1 label for the first batch element (valid for any batch size)
    predicted_class_id = logits[0].argmax().item()
    predicted_label = model.config.id2label[predicted_class_id]

    logger.info("Warmup: %.2f s, Run: %.4f s", warmup_time, run_time)
    logger.info("Predicted label: %s", predicted_label)


if __name__ == "__main__":
    main()
torch_compile/run_donutswin.py ADDED
@@ -0,0 +1,76 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/usr/bin/env python3
# DonutSwin image-encoder on Neuron (no decoder, pure vision)
import argparse
import logging
import time

import torch
from transformers import DonutImageProcessor, DonutSwinModel
from datasets import load_dataset
import torch_neuronx  # ensures Neuron backend

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


def main():
    """Run the DonutSwin vision encoder once on Neuron and report timings."""
    parser = argparse.ArgumentParser(description="Run DonutSwin encoder on Neuron")
    parser.add_argument(
        "--model",
        type=str,
        default="naver-clova-ix/donut-base",
        help="DonutSwin model name on Hugging Face Hub",
    )
    args = parser.parse_args()

    torch.set_default_dtype(torch.float32)
    torch.manual_seed(42)

    # One sample image from a tiny public dataset
    image = load_dataset("huggingface/cats-image")["test"]["image"][0]

    # Only the vision encoder is loaded -- no text decoder
    processor = DonutImageProcessor.from_pretrained(args.model)
    model = DonutSwinModel.from_pretrained(
        args.model, torch_dtype=torch.float32, attn_implementation="eager"
    ).eval()

    inputs = processor(images=image, return_tensors="pt")

    # A single eager pass locks the input shapes before compilation
    with torch.no_grad():
        _ = model(**inputs).last_hidden_state

    model.forward = torch.compile(model.forward, backend="neuron", fullgraph=False)

    # First compiled call -- includes Neuron compilation time
    t_warm = time.time()
    with torch.no_grad():
        _ = model(**inputs)
    warmup_time = time.time() - t_warm

    # Steady-state timing
    t_run = time.time()
    with torch.no_grad():
        hidden = model(**inputs).last_hidden_state
    run_time = time.time() - t_run

    logger.info("Warmup: %.2f s, Run: %.4f s", warmup_time, run_time)
    logger.info("Output hidden shape: %s", hidden.shape)  # [B, seq_len, hidden_size]


if __name__ == "__main__":
    main()
67
+
68
+ """
69
+ /usr/local/lib/python3.10/site-packages/transformers/models/donut/modeling_donut_swin.py:586:0: error: failed to legalize operation 'torch.aten.fill.Tensor'
70
+ /usr/local/lib/python3.10/site-packages/transformers/models/donut/modeling_donut_swin.py:637:0: note: called from
71
+ /usr/local/lib/python3.10/site-packages/transformers/models/donut/modeling_donut_swin.py:712:0: note: called from
72
+ /usr/local/lib/python3.10/site-packages/transformers/modeling_layers.py:94:0: note: called from
73
+ /usr/local/lib/python3.10/site-packages/transformers/models/donut/modeling_donut_swin.py:783:0: note: called from
74
+ /usr/local/lib/python3.10/site-packages/transformers/models/donut/modeling_donut_swin.py:922:0: note: called from
75
+ /usr/local/lib/python3.10/site-packages/transformers/models/donut/modeling_donut_swin.py:586:0: note: see current operation: %1327 = "torch.aten.fill.Tensor"(%1326, %1091) : (!torch.vtensor<[1,630,470,1],f32>, !torch.vtensor<[],f32>) -> !torch.vtensor<[1,630,470,1],f32>
76
+ """
torch_compile/run_dpt.py ADDED
@@ -0,0 +1,66 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/usr/bin/env python3
# DPT (Dense Prediction Transformer) monocular depth estimation on Neuron
import argparse
import logging
import time

import torch
from transformers import DPTImageProcessor, DPTForDepthEstimation
from datasets import load_dataset
import torch_neuronx  # ensures Neuron backend

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


def main():
    """Run DPT depth estimation once on Neuron and report timings."""
    parser = argparse.ArgumentParser(description="Run DPT depth estimation on Neuron")
    parser.add_argument(
        "--model",
        type=str,
        default="Intel/dpt-large",
        help="DPT model name on Hugging Face Hub",
    )
    args = parser.parse_args()

    torch.set_default_dtype(torch.float32)
    torch.manual_seed(42)

    # One sample image from a tiny public dataset
    image = load_dataset("huggingface/cats-image")["test"]["image"][0]

    # Processor and depth-estimation model
    processor = DPTImageProcessor.from_pretrained(args.model)
    model = DPTForDepthEstimation.from_pretrained(
        args.model, torch_dtype=torch.float32, attn_implementation="eager"
    ).eval()

    inputs = processor(images=image, return_tensors="pt")

    # A single eager pass locks the input shapes before compilation
    with torch.no_grad():
        _ = model(**inputs).predicted_depth

    model.forward = torch.compile(model.forward, backend="neuron", fullgraph=False)

    # First compiled call -- includes Neuron compilation time
    t_warm = time.time()
    with torch.no_grad():
        _ = model(**inputs)
    warmup_time = time.time() - t_warm

    # Steady-state timing
    t_run = time.time()
    with torch.no_grad():
        depth = model(**inputs).predicted_depth
    run_time = time.time() - t_run

    logger.info("Warmup: %.2f s, Run: %.4f s", warmup_time, run_time)
    logger.info("Output depth shape: %s", depth.shape)  # [B, 1, H, W]


if __name__ == "__main__":
    main()
torch_compile/run_electra.py ADDED
@@ -0,0 +1,67 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/usr/bin/env python3
# ELECTRA (discriminator) text-classification on Neuron
import argparse
import logging
import time

import torch
from transformers import AutoTokenizer, ElectraForSequenceClassification
import torch_neuronx  # ensures Neuron backend

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


def main():
    """Benchmark ELECTRA text classification compiled with the Neuron backend.

    Tokenizes a sample sentence, runs one eager pass to fix shapes, compiles
    the forward pass for Neuron, and logs warmup/run latency plus the label.
    """
    parser = argparse.ArgumentParser(description="Run ELECTRA on Neuron")
    parser.add_argument(
        "--model",
        type=str,
        default="google/electra-base-discriminator",
        help="ELECTRA model name on Hugging Face Hub",
    )
    parser.add_argument("--batch-size", type=int, default=1, help="Batch size")
    args = parser.parse_args()

    torch.set_default_dtype(torch.float32)
    torch.manual_seed(42)

    # load tokenizer & model
    tokenizer = AutoTokenizer.from_pretrained(args.model)
    model = ElectraForSequenceClassification.from_pretrained(
        args.model, torch_dtype=torch.float32, attn_implementation="eager"
    ).eval()

    # tokenize sample, replicated to honour --batch-size (was ignored)
    text = "ELECTRA pre-trains a discriminator to detect replaced tokens."
    inputs = tokenizer(
        [text] * args.batch_size, return_tensors="pt", padding=True, truncation=True
    )

    # pre-run to lock shapes
    with torch.no_grad():
        _ = model(**inputs).logits

    # compile
    model.forward = torch.compile(model.forward, backend="neuron", fullgraph=True)

    # warmup (first compiled call triggers Neuron compilation)
    warmup_start = time.time()
    with torch.no_grad():
        _ = model(**inputs)
    warmup_time = time.time() - warmup_start

    # benchmark run
    run_start = time.time()
    with torch.no_grad():
        logits = model(**inputs).logits
    run_time = time.time() - run_start

    # top-1 label for the first batch element (valid for any batch size)
    predicted_class_id = logits[0].argmax().item()
    predicted_label = model.config.id2label[predicted_class_id]

    logger.info("Warmup: %.2f s, Run: %.4f s", warmup_time, run_time)
    logger.info("Predicted label: %s", predicted_label)


if __name__ == "__main__":
    main()
torch_compile/run_esm.py ADDED
@@ -0,0 +1,67 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/usr/bin/env python3
# ESM (Evolutionary Scale Modeling) protein-sequence classification on Neuron
import argparse
import logging
import time

import torch
from transformers import EsmTokenizer, EsmForSequenceClassification
import torch_neuronx  # ensures Neuron backend

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


def main():
    """Benchmark ESM protein-sequence classification compiled for Neuron.

    Tokenizes a sample protein sequence, runs one eager pass to fix shapes,
    compiles the forward pass, and logs warmup/run latency plus the label.
    """
    parser = argparse.ArgumentParser(description="Run ESM on Neuron")
    parser.add_argument(
        "--model",
        type=str,
        default="facebook/esm2_t33_650M_UR50D",
        help="ESM model name on Hugging Face Hub",
    )
    parser.add_argument("--batch-size", type=int, default=1, help="Batch size")
    args = parser.parse_args()

    torch.set_default_dtype(torch.float32)
    torch.manual_seed(42)

    # load tokenizer & model
    tokenizer = EsmTokenizer.from_pretrained(args.model)
    model = EsmForSequenceClassification.from_pretrained(
        args.model, torch_dtype=torch.float32, attn_implementation="eager"
    ).eval()

    # tokenize protein sequence, replicated to honour --batch-size (was ignored)
    sequence = "MKTVRQERLKSIVRILERSKEPVSGAQLAEELSVSRQVIVQDIAYLRSLGYNIVATPRGYVLAGG"
    inputs = tokenizer(
        [sequence] * args.batch_size, return_tensors="pt", padding=True, truncation=True
    )

    # pre-run to lock shapes
    with torch.no_grad():
        _ = model(**inputs).logits

    # compile
    model.forward = torch.compile(model.forward, backend="neuron", fullgraph=True)

    # warmup (first compiled call triggers Neuron compilation)
    warmup_start = time.time()
    with torch.no_grad():
        _ = model(**inputs)
    warmup_time = time.time() - warmup_start

    # benchmark run
    run_start = time.time()
    with torch.no_grad():
        logits = model(**inputs).logits
    run_time = time.time() - run_start

    # top-1 label for the first batch element (valid for any batch size)
    predicted_class_id = logits[0].argmax().item()
    predicted_label = model.config.id2label[predicted_class_id]

    logger.info("Warmup: %.2f s, Run: %.4f s", warmup_time, run_time)
    logger.info("Predicted label: %s", predicted_label)


if __name__ == "__main__":
    main()
torch_compile/run_flaubert.py ADDED
@@ -0,0 +1,97 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/usr/bin/env python3
# FlauBERT text-classification on Neuron
import argparse
import logging
import time

import torch
from transformers import FlaubertTokenizer, FlaubertForSequenceClassification
import torch_neuronx  # ensures Neuron backend

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


def main():
    """Benchmark FlauBERT text classification compiled with the Neuron backend.

    Tokenizes a sample sentence, runs one eager pass to fix shapes, compiles
    the forward pass for Neuron, and logs warmup/run latency plus the label.
    """
    parser = argparse.ArgumentParser(description="Run FlauBERT on Neuron")
    parser.add_argument(
        "--model",
        type=str,
        default="flaubert/flaubert_base_cased",
        help="FlauBERT model name on Hugging Face Hub",
    )
    parser.add_argument("--batch-size", type=int, default=1, help="Batch size")
    args = parser.parse_args()

    torch.set_default_dtype(torch.float32)
    torch.manual_seed(42)

    # FlauBERT's transformer asserts on `lengths.max().item()`; Dynamo can only
    # trace that when scalar-output capture is enabled (see recorded traceback
    # hint: "Set torch._dynamo.config.capture_scalar_outputs = True").
    torch._dynamo.config.capture_scalar_outputs = True

    # load tokenizer & model
    tokenizer = FlaubertTokenizer.from_pretrained(args.model)
    model = FlaubertForSequenceClassification.from_pretrained(
        args.model, torch_dtype=torch.float32, attn_implementation="eager"
    ).eval()

    # tokenize sample, replicated to honour --batch-size (was ignored)
    text = "FlauBERT est un modèle de langue français performant."
    inputs = tokenizer(
        [text] * args.batch_size, return_tensors="pt", padding=True, truncation=True
    )

    # pre-run to lock shapes
    with torch.no_grad():
        _ = model(**inputs).logits

    # compile
    model.forward = torch.compile(model.forward, backend="neuron", fullgraph=True)

    # warmup (first compiled call triggers Neuron compilation)
    warmup_start = time.time()
    with torch.no_grad():
        _ = model(**inputs)
    warmup_time = time.time() - warmup_start

    # benchmark run
    run_start = time.time()
    with torch.no_grad():
        logits = model(**inputs).logits
    run_time = time.time() - run_start

    # top-1 label for the first batch element (valid for any batch size)
    predicted_class_id = logits[0].argmax().item()
    predicted_label = model.config.id2label[predicted_class_id]

    logger.info("Warmup: %.2f s, Run: %.4f s", warmup_time, run_time)
    logger.info("Predicted label: %s", predicted_label)


if __name__ == "__main__":
    main()
68
+
69
+
70
+ """
71
+ Traceback (most recent call last):
72
+ File "/workspace/torch_neuron_sample/torch-neuron-samples/scripts/torch_compile/run_flaubert.py", line 67, in <module>
73
+ main()
74
+ File "/workspace/torch_neuron_sample/torch-neuron-samples/scripts/torch_compile/run_flaubert.py", line 49, in main
75
+ _ = model(**inputs)
76
+ File "/usr/local/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1775, in _wrapped_call_impl
77
+ return self._call_impl(*args, **kwargs)
78
+ File "/usr/local/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1786, in _call_impl
79
+ return forward_call(*args, **kwargs)
80
+ File "/usr/local/lib/python3.10/site-packages/torch/_dynamo/eval_frame.py", line 841, in compile_wrapper
81
+ raise e.with_traceback(None) from e.__cause__ # User compiler error
82
+ torch._dynamo.exc.Unsupported: Unsupported Tensor.item() call with capture_scalar_outputs=False
83
+ Explanation: Dynamo does not support tracing `Tensor.item()` with config.capture_scalar_outputs=False.
84
+ Hint: Set `torch._dynamo.config.capture_scalar_outputs = True` or `export TORCHDYNAMO_CAPTURE_SCALAR_OUTPUTS=1` to include these operations in the captured graph.
85
+
86
+ Developer debug context: call_method TensorVariable() item () {}
87
+
88
+ For more details about this graph break, please visit: https://meta-pytorch.github.io/compile-graph-break-site/gb/gb0124.html
89
+
90
+ from user code:
91
+ File "/usr/local/lib/python3.10/site-packages/transformers/models/flaubert/modeling_flaubert.py", line 1156, in forward
92
+ transformer_outputs = self.transformer(
93
+ File "/usr/local/lib/python3.10/site-packages/transformers/models/flaubert/modeling_flaubert.py", line 873, in forward
94
+ assert lengths.max().item() <= slen
95
+
96
+ Set TORCHDYNAMO_VERBOSE=1 for the internal stack trace (please do this especially if you're reporting a bug to PyTorch). For even more developer context, set TORCH_LOGS="+dynamo"
97
+ """
torch_compile/run_hubert.py ADDED
@@ -0,0 +1,85 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/usr/bin/env python3
# HuBERT-CTC speech-recognition on Neuron
import argparse
import logging
import time

import torch
from transformers import AutoProcessor, HubertForCTC
from datasets import load_dataset
import torch_neuronx  # ensures Neuron backend
from torch.nn.utils import remove_weight_norm
from torch.nn.utils import parametrize

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


def _strip_weight_norm(module):
    """Fold weight-norm back into plain weight tensors on every submodule.

    Recent torch versions register weight_norm as a *parametrization*
    (torch.nn.utils.parametrizations.weight_norm), so ``weight_g`` /
    ``weight_v`` no longer exist as plain attributes and the old attribute
    check matched nothing — which is why the Neuron compile previously died
    inside parametrizations.py ("number of output elements (2048) doesn't
    match expected number of elements (16)").  Handle both layouts.
    """
    for m in module.modules():
        if parametrize.is_parametrized(m, "weight"):
            # bake the re-parametrized weight into one static tensor
            parametrize.remove_parametrizations(m, "weight")
        elif hasattr(m, "weight_g") and hasattr(m, "weight_v"):
            # legacy (pre-parametrization) weight_norm layout
            remove_weight_norm(m)


def main():
    """Compile HuBERT-CTC with torch.compile(backend="neuron") and transcribe one clip."""
    parser = argparse.ArgumentParser(description="Run HuBERT-CTC on Neuron")
    parser.add_argument(
        "--model",
        type=str,
        default="hf-internal-testing/tiny-random-HubertModel",
        help="HuBERT-CTC model name on Hugging Face Hub",
    )
    args = parser.parse_args()

    torch.set_default_dtype(torch.float32)
    torch.manual_seed(42)

    # load small speech snippet
    dataset = load_dataset("hf-internal-testing/librispeech_asr_demo", "clean", split="validation")
    sample = dataset[0]["audio"]["array"]  # 16 kHz numpy array

    # processor + HuBERT-CTC model (eager attention keeps the graph traceable)
    processor = AutoProcessor.from_pretrained(args.model)
    model = HubertForCTC.from_pretrained(
        args.model, torch_dtype=torch.float32, attn_implementation="eager"
    ).eval()
    # weight_norm must be removed or the Neuron compiler rejects the conv layers
    _strip_weight_norm(model)

    # preprocess
    inputs = processor(sample, sampling_rate=16_000, return_tensors="pt", padding=True)

    # pre-run to lock shapes
    with torch.no_grad():
        _ = model(**inputs).logits

    # compile
    model.forward = torch.compile(model.forward, backend="neuron", fullgraph=True)

    # warmup (first compiled call pays the Neuron compilation cost)
    warmup_start = time.time()
    with torch.no_grad():
        _ = model(**inputs)
    warmup_time = time.time() - warmup_start

    # benchmark run
    run_start = time.time()
    with torch.no_grad():
        logits = model(**inputs).logits
    run_time = time.time() - run_start

    # greedy decode
    predicted_ids = logits.argmax(dim=-1)
    transcription = processor.decode(predicted_ids[0])

    logger.info("Warmup: %.2f s, Run: %.4f s", warmup_time, run_time)
    logger.info("Transcription: %s", transcription)


if __name__ == "__main__":
    main()
75
+
76
+ """
77
+ /usr/local/lib/python3.10/site-packages/torch/nn/utils/parametrizations.py:325:0: error: number of output elements (2048) doesn't match expected number of elements (16)
78
+ /usr/local/lib/python3.10/site-packages/torch/nn/utils/parametrize.py:303:0: note: called from
79
+ /usr/local/lib/python3.10/site-packages/torch/nn/utils/parametrize.py:407:0: note: called from
80
+ /usr/local/lib/python3.10/site-packages/transformers/models/hubert/modeling_hubert.py:92:0: note: called from
81
+ /usr/local/lib/python3.10/site-packages/transformers/models/hubert/modeling_hubert.py:448:0: note: called from
82
+ /usr/local/lib/python3.10/site-packages/transformers/models/hubert/modeling_hubert.py:986:0: note: called from
83
+ /usr/local/lib/python3.10/site-packages/transformers/models/hubert/modeling_hubert.py:1114:0: note: called from
84
+
85
+ """
torch_compile/run_levit.py ADDED
@@ -0,0 +1,70 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/usr/bin/env python3
# LeViT vision-classification on Neuron
import argparse
import logging
import time

import torch
from transformers import AutoImageProcessor, LevitForImageClassification
from datasets import load_dataset
import torch_neuronx  # ensures Neuron backend

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


def main():
    """Run one LeViT classification pass through torch.compile's Neuron backend and time it."""
    cli = argparse.ArgumentParser(description="Run LeViT on Neuron")
    cli.add_argument(
        "--model",
        type=str,
        default="facebook/levit-128S",
        help="LeViT model name on Hugging Face Hub",
    )
    opts = cli.parse_args()

    torch.set_default_dtype(torch.float32)
    torch.manual_seed(42)

    # one sample image from the cats-image dataset
    image = load_dataset("huggingface/cats-image")["test"]["image"][0]

    # processor & model (eager attention keeps the graph traceable)
    processor = AutoImageProcessor.from_pretrained(opts.model)
    model = LevitForImageClassification.from_pretrained(
        opts.model, torch_dtype=torch.float32, attn_implementation="eager"
    ).eval()

    inputs = processor(images=image, return_tensors="pt")

    # eager pre-run fixes tensor shapes before compilation
    with torch.no_grad():
        _ = model(**inputs).logits

    model.forward = torch.compile(model.forward, backend="neuron", fullgraph=True)

    # first compiled call pays the Neuron compilation cost
    tic = time.time()
    with torch.no_grad():
        _ = model(**inputs)
    warmup_time = time.time() - tic

    tic = time.time()
    with torch.no_grad():
        logits = model(**inputs).logits
    run_time = time.time() - tic

    # argmax logit -> ImageNet label
    top1_idx = logits.argmax(-1).item()
    predicted_label = model.config.id2label[top1_idx]

    logger.info("Warmup: %.2f s, Run: %.4f s", warmup_time, run_time)
    logger.info("Predicted label: %s", predicted_label)


if __name__ == "__main__":
    main()
torch_compile/run_mobilebert.py ADDED
@@ -0,0 +1,67 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/usr/bin/env python3
# MobileBERT text-classification on Neuron
import argparse
import logging
import time

import torch
from transformers import AutoTokenizer, MobileBertForSequenceClassification
import torch_neuronx  # ensures Neuron backend

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


def main():
    """Classify one sentence with MobileBERT compiled for the Neuron backend."""
    cli = argparse.ArgumentParser(description="Run MobileBERT on Neuron")
    cli.add_argument(
        "--model",
        type=str,
        default="google/mobilebert-uncased",
        help="MobileBERT model name on Hugging Face Hub",
    )
    cli.add_argument("--batch-size", type=int, default=1, help="Batch size")
    opts = cli.parse_args()

    torch.set_default_dtype(torch.float32)
    torch.manual_seed(42)

    # tokenizer & model (eager attention keeps the graph traceable)
    tokenizer = AutoTokenizer.from_pretrained(opts.model)
    model = MobileBertForSequenceClassification.from_pretrained(
        opts.model, torch_dtype=torch.float32, attn_implementation="eager"
    ).eval()

    # tokenize sample sentence
    text = "MobileBERT is a compact BERT for on-device NLP."
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True)

    # eager pre-run fixes tensor shapes before compilation
    with torch.no_grad():
        _ = model(**inputs).logits

    model.forward = torch.compile(model.forward, backend="neuron", fullgraph=True)

    # first compiled call pays the Neuron compilation cost
    tic = time.time()
    with torch.no_grad():
        _ = model(**inputs)
    warmup_time = time.time() - tic

    tic = time.time()
    with torch.no_grad():
        logits = model(**inputs).logits
    run_time = time.time() - tic

    # argmax logit -> class label
    predicted_class_id = logits.argmax().item()
    predicted_label = model.config.id2label[predicted_class_id]

    logger.info("Warmup: %.2f s, Run: %.4f s", warmup_time, run_time)
    logger.info("Predicted label: %s", predicted_label)


if __name__ == "__main__":
    main()
torch_compile/run_mobilenetv2.py ADDED
@@ -0,0 +1,71 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/usr/bin/env python3
# MobileNetV2 image-classification on Neuron
import argparse
import logging
import time

import torch
from transformers import AutoImageProcessor, MobileNetV2ForImageClassification
from datasets import load_dataset
import torch_neuronx  # ensures Neuron backend

# NOTE: dropped the unused `from torchvision import transforms` import — the
# preprocessing is done entirely by AutoImageProcessor, so the script no
# longer requires torchvision at all.

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


def main():
    """Classify one image with MobileNetV2 compiled for the Neuron backend.

    Flow: eager pre-run (locks shapes) -> torch.compile(backend="neuron") ->
    timed warmup (includes compile) -> timed steady-state run -> top-1 label.
    """
    parser = argparse.ArgumentParser(description="Run MobileNetV2 on Neuron")
    parser.add_argument(
        "--model",
        type=str,
        default="google/mobilenet_v2_1.0_224",
        help="MobileNetV2 model name on Hugging Face Hub",
    )
    args = parser.parse_args()

    torch.set_default_dtype(torch.float32)
    torch.manual_seed(42)

    # load dataset image
    dataset = load_dataset("huggingface/cats-image")
    image = dataset["test"]["image"][0]

    # load processor & MobileNetV2 model (eager attention keeps the graph traceable)
    processor = AutoImageProcessor.from_pretrained(args.model)
    model = MobileNetV2ForImageClassification.from_pretrained(
        args.model, torch_dtype=torch.float32, attn_implementation="eager"
    ).eval()

    # preprocess
    inputs = processor(images=image, return_tensors="pt")

    # pre-run to lock shapes
    with torch.no_grad():
        _ = model(**inputs).logits

    # compile
    model.forward = torch.compile(model.forward, backend="neuron", fullgraph=True)

    # warmup (first compiled call pays the Neuron compilation cost)
    warmup_start = time.time()
    with torch.no_grad():
        _ = model(**inputs)
    warmup_time = time.time() - warmup_start

    # benchmark run
    run_start = time.time()
    with torch.no_grad():
        logits = model(**inputs).logits
    run_time = time.time() - run_start

    # top-1 ImageNet class
    predicted_class_idx = logits.argmax(-1).item()
    predicted_label = model.config.id2label[predicted_class_idx]

    logger.info("Warmup: %.2f s, Run: %.4f s", warmup_time, run_time)
    logger.info("Predicted label: %s", predicted_label)


if __name__ == "__main__":
    main()
torch_compile/run_mobilevit.py ADDED
@@ -0,0 +1,70 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/usr/bin/env python3
# MobileViT image-classification on Neuron
import argparse
import logging
import time

import torch
from transformers import AutoImageProcessor, MobileViTForImageClassification
from datasets import load_dataset
import torch_neuronx  # ensures Neuron backend

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


def main():
    """Classify one image with MobileViT compiled for the Neuron backend."""
    cli = argparse.ArgumentParser(description="Run MobileViT on Neuron")
    cli.add_argument(
        "--model",
        type=str,
        default="apple/mobilevit-small",
        help="MobileViT model name on Hugging Face Hub",
    )
    opts = cli.parse_args()

    torch.set_default_dtype(torch.float32)
    torch.manual_seed(42)

    # one sample image from the cats-image dataset
    image = load_dataset("huggingface/cats-image")["test"]["image"][0]

    # processor & model (eager attention keeps the graph traceable)
    processor = AutoImageProcessor.from_pretrained(opts.model)
    model = MobileViTForImageClassification.from_pretrained(
        opts.model, torch_dtype=torch.float32, attn_implementation="eager"
    ).eval()

    inputs = processor(images=image, return_tensors="pt")

    # eager pre-run fixes tensor shapes before compilation
    with torch.no_grad():
        _ = model(**inputs).logits

    model.forward = torch.compile(model.forward, backend="neuron", fullgraph=True)

    # first compiled call pays the Neuron compilation cost
    tic = time.time()
    with torch.no_grad():
        _ = model(**inputs)
    warmup_time = time.time() - tic

    tic = time.time()
    with torch.no_grad():
        logits = model(**inputs).logits
    run_time = time.time() - tic

    # argmax logit -> ImageNet label
    top1_idx = logits.argmax(-1).item()
    predicted_label = model.config.id2label[top1_idx]

    logger.info("Warmup: %.2f s, Run: %.4f s", warmup_time, run_time)
    logger.info("Predicted label: %s", predicted_label)


if __name__ == "__main__":
    main()
torch_compile/run_modernbert.py ADDED
@@ -0,0 +1,66 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/usr/bin/env python3
# ModernBERT-base text-classification on Neuron
import argparse
import logging
import time

import torch
from transformers import AutoTokenizer, ModernBertForSequenceClassification
import torch_neuronx  # ensures Neuron backend

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


def main():
    """Classify one sentence with ModernBERT compiled (full graph) for Neuron."""
    cli = argparse.ArgumentParser(description="Run ModernBERT on Neuron")
    cli.add_argument(
        "--model",
        type=str,
        default="answerdotai/ModernBERT-base",
        help="ModernBERT model name on Hugging Face Hub",
    )
    opts = cli.parse_args()

    torch.set_default_dtype(torch.float32)
    torch.manual_seed(42)

    # tokenizer & model (eager attention keeps the graph traceable)
    tokenizer = AutoTokenizer.from_pretrained(opts.model)
    model = ModernBertForSequenceClassification.from_pretrained(
        opts.model, torch_dtype=torch.float32, attn_implementation="eager"
    ).eval()

    # tokenize sample sentence
    text = "Hello, my dog is cute"
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True)

    # eager pre-run fixes tensor shapes before compilation
    with torch.no_grad():
        _ = model(**inputs).logits

    # single encoder -> one full graph
    model.forward = torch.compile(model.forward, backend="neuron", fullgraph=True)

    # first compiled call pays the Neuron compilation cost
    tic = time.time()
    with torch.no_grad():
        _ = model(**inputs)
    warmup_time = time.time() - tic

    tic = time.time()
    with torch.no_grad():
        logits = model(**inputs).logits
    run_time = time.time() - tic

    # argmax logit -> class label
    predicted_class_id = logits.argmax().item()
    predicted_label = model.config.id2label[predicted_class_id]

    logger.info("Warmup: %.2f s, Run: %.4f s", warmup_time, run_time)
    logger.info("Predicted label: %s", predicted_label)


if __name__ == "__main__":
    main()
torch_compile/run_mpnet.py ADDED
@@ -0,0 +1,62 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/usr/bin/env python3
# MPNet sentence-embedding on Neuron
import argparse
import logging
import time

import torch
from transformers import AutoTokenizer, MPNetModel
import torch_neuronx  # ensures Neuron backend

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


def main():
    """Embed one sentence with the MPNet encoder compiled for Neuron."""
    cli = argparse.ArgumentParser(description="Run MPNet encoder on Neuron")
    cli.add_argument(
        "--model",
        type=str,
        default="microsoft/mpnet-base",
        help="MPNet model name on Hugging Face Hub",
    )
    opts = cli.parse_args()

    torch.set_default_dtype(torch.float32)
    torch.manual_seed(42)

    # tokenizer & model (eager attention keeps the graph traceable)
    tokenizer = AutoTokenizer.from_pretrained(opts.model)
    model = MPNetModel.from_pretrained(
        opts.model, torch_dtype=torch.float32, attn_implementation="eager"
    ).eval()

    # tokenize sample sentence
    text = "MPNet is a variant of BERT with permutation language modeling."
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True)

    # eager pre-run fixes tensor shapes before compilation
    with torch.no_grad():
        _ = model(**inputs).pooler_output

    model.forward = torch.compile(model.forward, backend="neuron", fullgraph=True)

    # first compiled call pays the Neuron compilation cost
    tic = time.time()
    with torch.no_grad():
        _ = model(**inputs)
    warmup_time = time.time() - tic

    tic = time.time()
    with torch.no_grad():
        embeddings = model(**inputs).pooler_output
    run_time = time.time() - tic

    logger.info("Warmup: %.2f s, Run: %.4f s", warmup_time, run_time)
    logger.info("Output embedding shape: %s", embeddings.shape)  # [1, hidden]


if __name__ == "__main__":
    main()
torch_compile/run_phi.py ADDED
@@ -0,0 +1,77 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ # Phi (Phi-2 default) forward-trace + manual greedy on Neuron – fixed pad token
3
+ import argparse
4
+ import logging
5
+ import time
6
+ import torch
7
+ from transformers import AutoTokenizer, AutoModelForCausalLM
8
+ import torch_neuronx # guarantees Neuron backend
9
+
10
+ logging.basicConfig(level=logging.INFO)
11
+ logger = logging.getLogger(__name__)
12
+
13
+
14
@torch.no_grad()
def greedy_generate(model_forward, tokenizer, input_ids, max_new_tokens):
    """Greedy decoding driven by repeated calls to the compiled forward.

    The token window keeps a constant length so every call hits the same
    compiled graph: the oldest token falls off the left as each new one is
    appended on the right.  `tokenizer` is accepted for interface parity but
    is not consulted here.
    """
    batch, window = input_ids.shape
    position_ids = (
        torch.arange(window, dtype=torch.long, device=input_ids.device)
        .unsqueeze(0)
        .expand(batch, -1)
    )

    step = 0
    while step < max_new_tokens:
        outputs = model_forward(input_ids, position_ids)  # tuple; logits first
        chosen = outputs[0][:, -1, :].argmax(dim=-1, keepdim=True)
        input_ids = torch.cat([input_ids, chosen], dim=1)[:, -window:]
        step += 1
    return input_ids
26
+
27
+
28
def main():
    """Compile the Phi forward pass (full graph) and drive a manual greedy loop.

    KV-cache is disabled and the prompt is padded to a fixed --seq-len so the
    compiled graph keeps one static shape throughout generation.
    """
    cli = argparse.ArgumentParser(description="Phi forward-compile + manual greedy on Neuron")
    cli.add_argument("--model", default="microsoft/phi-2")
    cli.add_argument("--seq-len", type=int, default=128, help="Fixed context length")
    cli.add_argument("--new-tokens", type=int, default=20, help="Tokens to generate")
    opts = cli.parse_args()

    torch.manual_seed(42)
    torch.set_default_dtype(torch.float32)

    tokenizer = AutoTokenizer.from_pretrained(opts.model, trust_remote_code=True)
    # Phi ships without a pad token; reuse EOS so max_length padding works
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    model = AutoModelForCausalLM.from_pretrained(
        opts.model,
        torch_dtype=torch.float32,
        attn_implementation="eager",
        use_cache=False,  # static shapes
    ).eval()

    # fixed-shape prompt
    prompt = "The future of AI is"
    encoded = tokenizer(prompt, max_length=opts.seq_len, padding="max_length", truncation=True, return_tensors="pt")
    input_ids = encoded.input_ids
    batch, ctx_len = input_ids.shape
    position_ids = torch.arange(ctx_len, dtype=torch.long).unsqueeze(0).expand(batch, -1)

    # shape-locking eager pass, then compile forward only (full graph)
    with torch.no_grad():
        _ = model(input_ids, position_ids)
    model.forward = torch.compile(model.forward, backend="neuron", fullgraph=True)

    # warmup (first compiled call pays the Neuron compilation cost)
    tic = time.time()
    with torch.no_grad():
        _ = model(input_ids, position_ids)
    logger.info("Warmup (forward): %.3f s", time.time() - tic)

    # manual greedy generation
    tic = time.time()
    final_ids = greedy_generate(model.forward, tokenizer, input_ids, opts.new_tokens)
    logger.info("Generate (manual loop): %.3f s", time.time() - tic)

    text = tokenizer.decode(final_ids[0], skip_special_tokens=True)
    logger.info("Output: %s", text)


if __name__ == "__main__":
    main()
torch_compile/run_phi3.py ADDED
@@ -0,0 +1,86 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ # Phi-3-mini – compile model.forward only, manual greedy loop on Neuron
3
+ import argparse
4
+ import logging
5
+ import time
6
+ import torch
7
+ from transformers import AutoTokenizer, AutoModelForCausalLM
8
+ import torch_neuronx # guarantees Neuron backend
9
+
10
+ logging.basicConfig(level=logging.INFO)
11
+ logger = logging.getLogger(__name__)
12
+
13
+
14
@torch.no_grad()
def greedy_generate(model_forward, tokenizer, input_ids, max_new_tokens):
    """Greedy decoding loop over a fixed-length rolling token window.

    Every iteration calls the compiled forward with the same shapes: the
    window slides one token right per step, so `position_ids` never changes.
    `tokenizer` is accepted for interface parity but is not consulted here.
    """
    batch, window = input_ids.shape
    position_ids = (
        torch.arange(window, dtype=torch.long, device=input_ids.device)
        .unsqueeze(0)
        .expand(batch, -1)
    )

    step = 0
    while step < max_new_tokens:
        outputs = model_forward(input_ids, position_ids)  # tuple; logits first
        chosen = outputs[0][:, -1, :].argmax(dim=-1, keepdim=True)
        input_ids = torch.cat([input_ids, chosen], dim=1)[:, -window:]
        step += 1
    return input_ids
26
+
27
+
28
def main():
    """Compile Phi-3-mini's forward (full graph) and drive a manual greedy loop.

    KV-cache is disabled and the prompt padded to a fixed --seq-len so the
    compiled graph keeps one static shape for every decode step.
    """
    cli = argparse.ArgumentParser(description="Phi-3-mini forward-compile + manual greedy on Neuron")
    cli.add_argument("--model", default="microsoft/Phi-3-mini-4k-instruct")
    cli.add_argument("--seq-len", type=int, default=128, help="Fixed context length")
    cli.add_argument("--new-tokens", type=int, default=20, help="Tokens to generate")
    opts = cli.parse_args()

    torch.manual_seed(42)
    torch.set_default_dtype(torch.float32)

    tokenizer = AutoTokenizer.from_pretrained(opts.model, trust_remote_code=True)
    model = AutoModelForCausalLM.from_pretrained(
        opts.model,
        torch_dtype=torch.float32,
        attn_implementation="eager",
        use_cache=False,  # static shapes
    ).eval()

    # fixed-shape prompt
    prompt = "The future of AI is"
    encoded = tokenizer(prompt, max_length=opts.seq_len, padding="max_length", truncation=True, return_tensors="pt")
    input_ids = encoded.input_ids
    batch, ctx_len = input_ids.shape
    position_ids = torch.arange(ctx_len, dtype=torch.long).unsqueeze(0).expand(batch, -1)

    # shape-locking eager pass, then compile forward only (full graph)
    with torch.no_grad():
        _ = model(input_ids, position_ids)
    model.forward = torch.compile(model.forward, backend="neuron", fullgraph=True)

    # warmup (first compiled call pays the Neuron compilation cost)
    tic = time.time()
    with torch.no_grad():
        _ = model(input_ids, position_ids)
    logger.info("Warmup (forward): %.3f s", time.time() - tic)

    # manual greedy generation
    tic = time.time()
    final_ids = greedy_generate(model.forward, tokenizer, input_ids, opts.new_tokens)
    logger.info("Generate (manual loop): %.3f s", time.time() - tic)

    text = tokenizer.decode(final_ids[0], skip_special_tokens=True)
    logger.info("Output: %s", text)


if __name__ == "__main__":
    main()
75
+
76
+ """
77
+ /usr/local/lib/python3.10/site-packages/torch_mlir/dialects/stablehlo/__init__.py:24: UserWarning: Could not import StableHLO C++ extension: libStablehloUnifiedPythonCAPI.so.22.0git: cannot open shared object file: No such file or directory
78
+ warnings.warn(f"Could not import StableHLO C++ extension: {e}")
79
+ `torch_dtype` is deprecated! Use `dtype` instead!
80
+ Loading checkpoint shards: 100%|█████████████████████████████████████████████████████████████████████████| 2/2 [00:01<00:00, 1.90it/s]
81
+ INFO:__main__:Warmup (forward): 19.975 s
82
+ INFO:__main__:Generate (manual loop): 271.678 s
83
+ INFO:__main__:Output: The future of AI is
84
+ : 1iewer
85
+ I'melissa'
86
+ """
torch_compile/run_roberta.py ADDED
@@ -0,0 +1,67 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/usr/bin/env python3
# RoBERTa text-classification on Neuron – full graph compile
import argparse
import logging
import time

import torch
from transformers import AutoTokenizer, RobertaForSequenceClassification
import torch_neuronx  # guarantees Neuron backend

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


def main():
    """Classify one sentence with RoBERTa compiled (full graph) for Neuron."""
    cli = argparse.ArgumentParser(description="RoBERTa on Neuron (full graph)")
    cli.add_argument(
        "--model",
        type=str,
        default="roberta-base",
        help="RoBERTa model name on Hugging Face Hub",
    )
    cli.add_argument("--batch-size", type=int, default=1, help="Batch size")
    opts = cli.parse_args()

    torch.set_default_dtype(torch.float32)
    torch.manual_seed(42)

    # tokenizer & model (eager attention keeps the graph traceable)
    tokenizer = AutoTokenizer.from_pretrained(opts.model)
    model = RobertaForSequenceClassification.from_pretrained(
        opts.model, torch_dtype=torch.float32, attn_implementation="eager"
    ).eval()

    # tokenize sample sentence
    text = "RoBERTa is a robustly optimized BERT pretraining approach."
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True)

    # eager pre-run fixes tensor shapes before compilation
    with torch.no_grad():
        _ = model(**inputs).logits

    model.forward = torch.compile(model.forward, backend="neuron", fullgraph=True)

    # first compiled call pays the Neuron compilation cost
    tic = time.time()
    with torch.no_grad():
        _ = model(**inputs)
    warmup_time = time.time() - tic

    tic = time.time()
    with torch.no_grad():
        logits = model(**inputs).logits
    run_time = time.time() - tic

    # argmax logit -> class label
    predicted_class_id = logits.argmax().item()
    predicted_label = model.config.id2label[predicted_class_id]

    logger.info("Warmup: %.2f s, Run: %.4f s", warmup_time, run_time)
    logger.info("Predicted label: %s", predicted_label)


if __name__ == "__main__":
    main()
torch_compile/run_roformer.py ADDED
@@ -0,0 +1,67 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/usr/bin/env python3
# RoFormer (Rotary-position Transformer) text-classification on Neuron – full graph
import argparse
import logging
import time

import torch
from transformers import AutoTokenizer, RoFormerForSequenceClassification
import torch_neuronx  # guarantees Neuron backend

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


def main():
    """Classify one sentence with RoFormer compiled (full graph) for Neuron."""
    cli = argparse.ArgumentParser(description="RoFormer on Neuron (full graph)")
    cli.add_argument(
        "--model",
        type=str,
        default="junnyu/roformer_chinese_base",
        help="RoFormer model name on Hugging Face Hub",
    )
    cli.add_argument("--batch-size", type=int, default=1, help="Batch size")
    opts = cli.parse_args()

    torch.set_default_dtype(torch.float32)
    torch.manual_seed(42)

    # tokenizer & model (eager attention keeps the graph traceable)
    tokenizer = AutoTokenizer.from_pretrained(opts.model)
    model = RoFormerForSequenceClassification.from_pretrained(
        opts.model, torch_dtype=torch.float32, attn_implementation="eager"
    ).eval()

    # tokenize sample sentence
    text = "RoFormer uses rotary position embeddings."
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True)

    # eager pre-run fixes tensor shapes before compilation
    with torch.no_grad():
        _ = model(**inputs).logits

    model.forward = torch.compile(model.forward, backend="neuron", fullgraph=True)

    # first compiled call pays the Neuron compilation cost
    tic = time.time()
    with torch.no_grad():
        _ = model(**inputs)
    warmup_time = time.time() - tic

    tic = time.time()
    with torch.no_grad():
        logits = model(**inputs).logits
    run_time = time.time() - tic

    # argmax logit -> class label
    predicted_class_id = logits.argmax().item()
    predicted_label = model.config.id2label[predicted_class_id]

    logger.info("Warmup: %.2f s, Run: %.4f s", warmup_time, run_time)
    logger.info("Predicted label: %s", predicted_label)


if __name__ == "__main__":
    main()
torch_compile/run_sam2.py ADDED
@@ -0,0 +1,56 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/usr/bin/env python3
# SAM encoder on Neuron – constant-shape, no lambda
import argparse
import logging
import time
import torch
from transformers import SamProcessor, SamModel
from PIL import Image
import torch_neuronx  # guarantees Neuron backend

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


def main():
    """Compile SAM's image encoder (get_image_embeddings) for Neuron and time it."""
    cli = argparse.ArgumentParser(description="SAM encoder on Neuron (full graph)")
    cli.add_argument("--model", default="facebook/sam-vit-base")
    opts = cli.parse_args()

    torch.manual_seed(42)
    torch.set_default_dtype(torch.float32)

    # processor & model (eager attention keeps the graph traceable)
    processor = SamProcessor.from_pretrained(opts.model)
    model = SamModel.from_pretrained(opts.model, attn_implementation="eager").eval()

    # synthetic 224×224 RGB image; no prompt points → encoder path only
    dummy_image = Image.new("RGB", (224, 224), color="red")
    inputs = processor(images=dummy_image, return_tensors="pt")

    # eager pre-run fixes tensor shapes before compilation
    with torch.no_grad():
        _ = model.get_image_embeddings(**inputs)

    # compile the encoder entry point as one full graph
    model.get_image_embeddings = torch.compile(
        model.get_image_embeddings, backend="neuron", fullgraph=True
    )

    # first compiled call pays the Neuron compilation cost
    tic = time.time()
    with torch.no_grad():
        _ = model.get_image_embeddings(**inputs)
    logger.info("Warmup: %.3f s", time.time() - tic)

    tic = time.time()
    with torch.no_grad():
        embeddings = model.get_image_embeddings(**inputs)
    logger.info("Run: %.3f s", time.time() - tic)
    logger.info("Embedding shape: %s", embeddings.shape)  # [1, 256, 64, 64]


if __name__ == "__main__":
    main()
+ main()
torch_compile/run_swin.py ADDED
@@ -0,0 +1,76 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/usr/bin/env python3
# Swin Transformer image-classification on Neuron – full graph
import argparse
import logging
import time

import torch
from transformers import AutoImageProcessor, SwinForImageClassification
from datasets import load_dataset
import torch_neuronx  # guarantees Neuron backend

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


def main():
    """Classify one image with Swin compiled (full graph) for Neuron."""
    cli = argparse.ArgumentParser(description="Swin on Neuron (full graph)")
    cli.add_argument("--model", default="microsoft/swin-tiny-patch4-window7-224")
    opts = cli.parse_args()

    torch.manual_seed(42)
    torch.set_default_dtype(torch.float32)

    # one sample image from the cats-image dataset
    image = load_dataset("huggingface/cats-image")["test"]["image"][0]

    # processor & model (eager attention keeps the graph traceable)
    processor = AutoImageProcessor.from_pretrained(opts.model)
    model = SwinForImageClassification.from_pretrained(
        opts.model, torch_dtype=torch.float32, attn_implementation="eager"
    ).eval()

    inputs = processor(images=image, return_tensors="pt")

    # eager pre-run fixes tensor shapes before compilation
    with torch.no_grad():
        _ = model(**inputs).logits

    model.forward = torch.compile(model.forward, backend="neuron", fullgraph=True)

    # first compiled call pays the Neuron compilation cost
    tic = time.time()
    with torch.no_grad():
        _ = model(**inputs)
    logger.info("Warmup: %.3f s", time.time() - tic)

    tic = time.time()
    with torch.no_grad():
        logits = model(**inputs).logits
    run_time = time.time() - tic

    # argmax logit -> ImageNet label
    top1_idx = logits.argmax(-1).item()
    predicted_label = model.config.id2label[top1_idx]

    logger.info("Run: %.3f s", run_time)
    logger.info("Predicted label: %s", predicted_label)


if __name__ == "__main__":
    main()
66
+
67
+ """
68
+ /usr/local/lib/python3.10/site-packages/transformers/models/swin/modeling_swin.py:611:0: error: failed to legalize operation 'torch.aten.fill.Tensor'
69
+ /usr/local/lib/python3.10/site-packages/transformers/models/swin/modeling_swin.py:662:0: note: called from
70
+ /usr/local/lib/python3.10/site-packages/transformers/models/swin/modeling_swin.py:736:0: note: called from
71
+ /usr/local/lib/python3.10/site-packages/transformers/modeling_layers.py:94:0: note: called from
72
+ /usr/local/lib/python3.10/site-packages/transformers/models/swin/modeling_swin.py:806:0: note: called from
73
+ /usr/local/lib/python3.10/site-packages/transformers/models/swin/modeling_swin.py:945:0: note: called from
74
+ /usr/local/lib/python3.10/site-packages/transformers/models/swin/modeling_swin.py:1139:0: note: called from
75
+ /usr/local/lib/python3.10/site-packages/transformers/models/swin/modeling_swin.py:611:0: note: see current operation: %1014 = "torch.aten.fill.Tensor"(%1013, %778) : (!torch.vtensor<[1,49,49,1],f32>, !torch.vtensor<[],f32>) -> !torch.vtensor<[1,49,49,1],f32>
76
+ """
torch_compile/run_t5_decoder.py ADDED
@@ -0,0 +1,66 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ # T5 decoder (no cache) on Neuron – constant shapes, full graph, no Apex
3
+ import os
4
+ os.environ["USE_FUSED_LAYER_NORM"] = "0" # MUST be before any transformers import
5
+
6
+ import argparse
7
+ import logging
8
+ import time
9
+ import torch
10
+ from transformers import T5Tokenizer, T5Model
11
+ import torch_neuronx # guarantees Neuron backend
12
+
13
+ logging.basicConfig(level=logging.INFO)
14
+ logger = logging.getLogger(__name__)
15
+
16
+
17
def main():
    """Benchmark a cache-free T5 decoder pass compiled as one Neuron graph."""
    parser = argparse.ArgumentParser(description="T5 decoder on Neuron (full graph, no cache)")
    parser.add_argument("--model", default="t5-small")
    parser.add_argument("--seq-len", type=int, default=128, help="Fixed seq length")
    args = parser.parse_args()

    torch.manual_seed(42)
    torch.set_default_dtype(torch.float32)

    tokenizer = T5Tokenizer.from_pretrained(args.model)
    # use_cache=False -> no DynamicCache, so every tensor shape stays constant
    model = T5Model.from_pretrained(
        args.model,
        torch_dtype=torch.float32,
        attn_implementation="eager",
        use_cache=False,
    ).eval()

    # Build a constant-shape encoder output once; it is reused for every decode.
    enc_tok = tokenizer("hello", max_length=args.seq_len, padding="max_length", truncation=True, return_tensors="pt")
    with torch.no_grad():
        enc_out = model.encoder(input_ids=enc_tok.input_ids).last_hidden_state.detach()

    dec_tok = tokenizer("<pad>", max_length=args.seq_len, padding="max_length", return_tensors="pt")

    # One eager pass pins the shapes before compilation.
    with torch.no_grad():
        _ = model.decoder(input_ids=dec_tok.input_ids, encoder_hidden_states=enc_out).last_hidden_state

    # Compile only the decoder forward, as a single full graph.
    def _decode(inp, enc):
        return model.decoder(input_ids=inp, encoder_hidden_states=enc).last_hidden_state

    decode_fn = torch.compile(_decode, backend="neuron", fullgraph=True)

    # First compiled call pays the compilation cost.
    t0 = time.time()
    with torch.no_grad():
        _ = decode_fn(dec_tok.input_ids, enc_out)
    logger.info("Warmup: %.3f s", time.time() - t0)

    # Steady-state timing.
    t0 = time.time()
    with torch.no_grad():
        hidden = decode_fn(dec_tok.input_ids, enc_out)
    logger.info("Run: %.3f s", time.time() - t0)
    logger.info("Hidden shape: %s", hidden.shape)  # [B, seq_len, d_model]
63
+
64
+
65
+ if __name__ == "__main__":
66
+ main()
torch_compile/run_t5_encoder.py ADDED
@@ -0,0 +1,59 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ # T5 encoder on Neuron – no Apex, full graph, constant shapes
3
+ import os
4
+ os.environ["USE_FUSED_LAYER_NORM"] = "0" # <── disable Apex
5
+
6
+ import argparse
7
+ import logging
8
+ import time
9
+ import torch
10
+ from transformers import T5Tokenizer, T5Model # use T5Model (no LM head)
11
+ from datasets import load_dataset
12
+ import torch_neuronx # guarantees Neuron backend
13
+
14
+ logging.basicConfig(level=logging.INFO)
15
+ logger = logging.getLogger(__name__)
16
+
17
+
18
def main():
    """Benchmark the T5 encoder compiled as one full graph on Neuron."""
    parser = argparse.ArgumentParser(description="T5 encoder on Neuron (full graph)")
    parser.add_argument("--model", default="t5-small")
    parser.add_argument("--seq-len", type=int, default=128, help="Fixed seq length")
    args = parser.parse_args()

    torch.manual_seed(42)
    torch.set_default_dtype(torch.float32)

    tokenizer = T5Tokenizer.from_pretrained(args.model)
    model = T5Model.from_pretrained(
        args.model, torch_dtype=torch.float32, attn_implementation="eager"
    ).eval()

    # Pad/truncate to a fixed length so the compiled shapes never change.
    inputs = tokenizer(
        "translate English to French: The cat is on the mat.",
        max_length=args.seq_len,
        padding="max_length",
        truncation=True,
        return_tensors="pt",
    )

    # Eager pre-run locks the shapes before compilation.
    with torch.no_grad():
        _ = model.encoder(**inputs).last_hidden_state

    # Compile only the encoder forward (full graph).
    def _encode(**kw):
        return model.encoder(**kw).last_hidden_state

    encode_fn = torch.compile(_encode, backend="neuron", fullgraph=True)

    # First compiled call pays the compilation cost.
    t0 = time.time()
    with torch.no_grad():
        _ = encode_fn(**inputs)
    logger.info("Warmup: %.3f s", time.time() - t0)

    # Steady-state timing.
    t0 = time.time()
    with torch.no_grad():
        hidden = encode_fn(**inputs)
    logger.info("Run: %.3f s", time.time() - t0)
    logger.info("Hidden shape: %s", hidden.shape)  # [B, seq_len, d_model]
56
+
57
+
58
+ if __name__ == "__main__":
59
+ main()
torch_compile/run_unispeech.py ADDED
@@ -0,0 +1,62 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ # UniSpeech (non-SAT) CTC speech-recognition on Neuron – constant shapes, full graph
3
+ import argparse
4
+ import logging
5
+ import time
6
+ import torch
7
+ from transformers import AutoProcessor, UniSpeechForCTC
8
+ from datasets import load_dataset
9
+ import torch_neuronx # guarantees Neuron backend
10
+
11
+ logging.basicConfig(level=logging.INFO)
12
+ logger = logging.getLogger(__name__)
13
+
14
+
15
def main():
    """Transcribe a short LibriSpeech clip with UniSpeech CTC compiled for Neuron.

    Flow: load audio -> pad to a fixed 4 s window -> eager pre-run to pin
    shapes -> torch.compile(backend="neuron", fullgraph=True) -> warmup ->
    timed run -> greedy CTC decode.
    """
    parser = argparse.ArgumentParser(description="UniSpeech CTC on Neuron (full graph)")
    parser.add_argument("--model", default="microsoft/unispeech-large-1500h-cv")
    args = parser.parse_args()

    torch.manual_seed(42)
    torch.set_default_dtype(torch.float32)

    # load small speech snippet
    dataset = load_dataset("hf-internal-testing/librispeech_asr_demo", "clean", split="validation")
    sample = dataset[0]["audio"]["array"]  # 16 kHz numpy array
    sampling_rate = dataset.features["audio"].sampling_rate

    # processor + CTC model (non-SAT)
    processor = AutoProcessor.from_pretrained(args.model)
    model = UniSpeechForCTC.from_pretrained(
        args.model, torch_dtype=torch.float32, attn_implementation="eager"
    ).eval()

    # preprocess – fixed-length audio (4 s) so the compiled input shape stays constant
    inputs = processor(sample, sampling_rate=sampling_rate, max_length=4 * 16_000, padding="max_length", return_tensors="pt")

    # pre-run to lock shapes before compilation
    with torch.no_grad():
        _ = model(**inputs).logits

    # compile forward (full graph)
    model.forward = torch.compile(model.forward, backend="neuron", fullgraph=True)

    # warmup — first compiled call pays the Neuron compilation cost
    start = time.time()
    with torch.no_grad():
        _ = model(**inputs)
    logger.info("Warmup: %.3f s", time.time() - start)

    # benchmark + decode
    start = time.time()
    with torch.no_grad():
        logits = model(**inputs).logits
    logger.info("Run: %.3f s", time.time() - start)

    # greedy CTC decode: argmax over the vocab; batch_decode collapses repeats/blanks
    predicted_ids = torch.argmax(logits, dim=-1)
    transcription = processor.batch_decode(predicted_ids)[0]
    logger.info("Transcription: %s", transcription)
59
+
60
+
61
+ if __name__ == "__main__":
62
+ main()
torch_compile/run_unispeech_sat.py ADDED
@@ -0,0 +1,67 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ # UniSpeech-SAT encoder on Neuron – full graph, constant shapes
3
+ import argparse
4
+ import logging
5
+ import time
6
+ import torch
7
+ from transformers import Wav2Vec2Processor, UniSpeechSatModel
8
+ from datasets import load_dataset
9
+ import torch_neuronx # guarantees Neuron backend
10
+
11
+ logging.basicConfig(level=logging.INFO)
12
+ logger = logging.getLogger(__name__)
13
+
14
+
15
def main():
    """Run the UniSpeech-SAT encoder (no LM head) on Neuron with fixed shapes.

    NOTE(review): the error log appended at the bottom of this file shows this
    script failing Neuron compilation inside the weight-norm parametrization of
    the feature encoder ("number of output elements ... doesn't match expected
    number of elements") — presumably a shape issue in the parametrized conv
    weights; confirm before relying on the timings below.
    """
    parser = argparse.ArgumentParser(description="UniSpeech-SAT encoder on Neuron (full graph)")
    parser.add_argument("--model", default="microsoft/unispeech-sat-base-100h-libri-ft")
    args = parser.parse_args()

    torch.manual_seed(42)
    torch.set_default_dtype(torch.float32)

    # load small speech snippet
    dataset = load_dataset("hf-internal-testing/librispeech_asr_demo", "clean", split="validation")
    sample = dataset[0]["audio"]["array"]  # 16 kHz numpy array

    # processor + UniSpeech-SAT encoder (no LM head)
    processor = Wav2Vec2Processor.from_pretrained(args.model)
    model = UniSpeechSatModel.from_pretrained(
        args.model, torch_dtype=torch.float32, attn_implementation="eager"
    ).eval()

    # preprocess – fixed-length audio (pad to 4 s) so compiled shapes stay constant
    inputs = processor(sample, sampling_rate=16_000, max_length=4 * 16_000, padding="max_length", return_tensors="pt")

    # pre-run to lock shapes before compilation
    with torch.no_grad():
        _ = model(**inputs).last_hidden_state

    # compile encoder forward (full graph)
    model.forward = torch.compile(model.forward, backend="neuron", fullgraph=True)

    # warmup — first compiled call pays the Neuron compilation cost
    start = time.time()
    with torch.no_grad():
        _ = model(**inputs)
    logger.info("Warmup: %.3f s", time.time() - start)

    # benchmark run
    start = time.time()
    with torch.no_grad():
        hidden = model(**inputs).last_hidden_state
    logger.info("Run: %.3f s", time.time() - start)
    logger.info("Output hidden shape: %s", hidden.shape)  # [B, T, hidden]
55
+
56
+
57
+ if __name__ == "__main__":
58
+ main()
59
+
60
+ """
61
+ /usr/local/lib/python3.10/site-packages/torch/nn/utils/parametrizations.py:325:0: error: number of output elements (4718592) doesn't match expected number of elements (128)
62
+ /usr/local/lib/python3.10/site-packages/torch/nn/utils/parametrize.py:303:0: note: called from
63
+ /usr/local/lib/python3.10/site-packages/torch/nn/utils/parametrize.py:407:0: note: called from
64
+ /usr/local/lib/python3.10/site-packages/transformers/models/unispeech_sat/modeling_unispeech_sat.py:140:0: note: called from
65
+ /usr/local/lib/python3.10/site-packages/transformers/models/unispeech_sat/modeling_unispeech_sat.py:485:0: note: called from
66
+ /usr/local/lib/python3.10/site-packages/transformers/models/unispeech_sat/modeling_unispeech_sat.py:1078:0: note: called from
67
+ """
torch_compile/run_vit.py ADDED
@@ -0,0 +1,64 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ # Vision Transformer (ViT) image-classification on Neuron – full graph, constant shapes
3
+ import argparse
4
+ import logging
5
+ import time
6
+ import torch
7
+ from transformers import AutoImageProcessor, ViTForImageClassification
8
+ from datasets import load_dataset
9
+ import torch_neuronx # guarantees Neuron backend
10
+
11
+ logging.basicConfig(level=logging.INFO)
12
+ logger = logging.getLogger(__name__)
13
+
14
+
15
def main():
    """Classify one image with ViT on Neuron, timing warmup vs. steady state."""
    parser = argparse.ArgumentParser(description="ViT on Neuron (full graph)")
    parser.add_argument("--model", default="google/vit-base-patch16-224")
    args = parser.parse_args()

    torch.manual_seed(42)
    torch.set_default_dtype(torch.float32)

    # Single sample image from the hub.
    image = load_dataset("huggingface/cats-image")["test"]["image"][0]

    # Processor and classifier.
    processor = AutoImageProcessor.from_pretrained(args.model)
    model = ViTForImageClassification.from_pretrained(
        args.model, torch_dtype=torch.float32, attn_implementation="eager"
    ).eval()

    inputs = processor(images=image, return_tensors="pt")

    # Eager pass first so compile sees fixed shapes.
    with torch.no_grad():
        _ = model(**inputs).logits

    # Whole forward as a single Neuron graph.
    model.forward = torch.compile(model.forward, backend="neuron", fullgraph=True)

    # Warmup — includes Neuron compilation.
    t_warm = time.time()
    with torch.no_grad():
        _ = model(**inputs)
    logger.info("Warmup: %.3f s", time.time() - t_warm)

    # Timed inference.
    t_run = time.time()
    with torch.no_grad():
        logits = model(**inputs).logits
    elapsed = time.time() - t_run

    # Map the top-1 logit to its ImageNet label.
    top_idx = logits.argmax(-1).item()
    label = model.config.id2label[top_idx]

    logger.info("Run: %.3f s", elapsed)
    logger.info("Predicted label: %s", label)
61
+
62
+
63
+ if __name__ == "__main__":
64
+ main()
torch_compile/run_wav2vec2.py ADDED
@@ -0,0 +1,73 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import argparse
2
+ import logging
3
+ import time
4
+
5
+ import torch
6
+ from datasets import load_dataset
7
+ from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC
8
+
9
+ import torch_neuronx
10
+
11
+ logging.basicConfig(level=logging.INFO)
12
+ logger = logging.getLogger(__name__)
13
+
14
+
15
def main():
    """Run Wav2Vec2 CTC speech recognition on Neuron and log timings.

    Bug fix: the previous version applied torch.sigmoid + a 0.5 threshold to
    the CTC logits — that is multi-label classification decoding and is
    meaningless for a Wav2Vec2ForCTC head. CTC output is decoded greedily
    with an argmax over the vocabulary followed by processor.batch_decode,
    matching run_unispeech.py in this directory.
    """
    parser = argparse.ArgumentParser(description="Run Wav2Vec2 on Neuron")
    parser.add_argument(
        "--model", type=str, default="facebook/wav2vec2-base-960h", help="Wav2Vec2 model name"
    )
    parser.add_argument("--batch-size", type=int, default=1, help="Batch size")
    args = parser.parse_args()

    torch.set_default_dtype(torch.float32)
    torch.manual_seed(42)

    processor = Wav2Vec2Processor.from_pretrained(args.model)
    model = Wav2Vec2ForCTC.from_pretrained(
        args.model, torch_dtype=torch.float32, attn_implementation="eager"
    )
    model.eval()

    # Small LibriSpeech sample; audio length is whatever the clip provides.
    dataset = load_dataset("hf-internal-testing/librispeech_asr_demo", "clean", split="validation")
    dataset = dataset.sort("id")
    sampling_rate = dataset.features["audio"].sampling_rate
    inputs = processor(dataset[0]["audio"]["array"], sampling_rate=sampling_rate, return_tensors="pt")

    # Run once to establish shapes before compile
    with torch.no_grad():
        logits = model(**inputs).logits

    # fullgraph=False: per the error log appended at the bottom of this file,
    # the feature encoder's parametrized conv weights fail full-graph Neuron
    # compilation, so graph breaks must remain allowed.
    model.forward = torch.compile(model.forward, backend="neuron", fullgraph=False)

    # Warmup — first compiled call pays the Neuron compilation cost
    warmup_start = time.time()
    with torch.no_grad():
        logits = model(**inputs).logits
    warmup_time = time.time() - warmup_start

    # Timed run
    run_start = time.time()
    with torch.no_grad():
        logits = model(**inputs).logits
    run_time = time.time() - run_start

    # Greedy CTC decode: argmax over the vocab; batch_decode collapses
    # repeats and blank tokens into text.
    predicted_ids = torch.argmax(logits, dim=-1)
    transcription = processor.batch_decode(predicted_ids)[0]

    logger.info(f"Warmup: {warmup_time:.2f}s, Run: {run_time:.4f}s")
    logger.info(f"Transcription: {transcription}")
58
+
59
+
60
+ if __name__ == "__main__":
61
+ main()
62
+
63
+ """
64
+ /usr/local/lib/python3.10/site-packages/torch/nn/utils/parametrizations.py:325:0: error: number of output elements (4718592) doesn't match expected number of elements (128)
65
+ /usr/local/lib/python3.10/site-packages/torch/nn/utils/parametrize.py:303:0: note: called from
66
+ /usr/local/lib/python3.10/site-packages/torch/nn/utils/parametrize.py:407:0: note: called from
67
+ /usr/local/lib/python3.10/site-packages/transformers/models/wav2vec2/modeling_wav2vec2.py:372:0: note: called from
68
+ /usr/local/lib/python3.10/site-packages/transformers/models/wav2vec2/modeling_wav2vec2.py:713:0: note: called from
69
+ /usr/local/lib/python3.10/site-packages/transformers/models/wav2vec2/modeling_wav2vec2.py:1462:0: note: called from
70
+ /usr/local/lib/python3.10/site-packages/transformers/models/wav2vec2/modeling_wav2vec2.py:1862:0: note: called from
71
+
72
+ # Dynamic shapes of intermediate tensors lead to a static-shape error while running the traced artifact.
73
+ """
torch_compile/run_whisper.py ADDED
@@ -0,0 +1,91 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import argparse
2
+ import logging
3
+ import time
4
+
5
+ import torch
6
+ from transformers import AutoTokenizer, WhisperForConditionalGeneration
7
+
8
+ import torch_neuronx
9
+
10
+ logging.basicConfig(level=logging.INFO)
11
+ logger = logging.getLogger(__name__)
12
+
13
+
14
def main():
    """Benchmark Whisper generate() on Neuron with a static KV cache.

    Uses random mel features (not real audio). eos_token_id=-1 never matches
    any generated token, so every call emits the full max_new_tokens=64 —
    this keeps the warmup and timed runs shape-identical.

    NOTE(review): the traceback appended at the bottom of this file shows
    generation failing inside Whisper's automatic language detection (a
    f32[51865, 51865] tensor exceeds the 4 GB Neuron per-tensor limit);
    passing an explicit language/task to generate() may skip that path —
    confirm.
    """
    parser = argparse.ArgumentParser(description="Run Whisper on Neuron")
    parser.add_argument(
        "--model", type=str, default="openai/whisper-tiny", help="Whisper model name"
    )
    parser.add_argument("--batch-size", type=int, default=1, help="Batch size")
    args = parser.parse_args()

    torch.set_default_dtype(torch.float32)
    torch.manual_seed(42)

    model = WhisperForConditionalGeneration.from_pretrained(
        args.model, torch_dtype=torch.float32, attn_implementation="eager"
    )
    model.eval()

    tokenizer = AutoTokenizer.from_pretrained(args.model)

    # Random log-mel features with Whisper's fixed 3000-frame window.
    num_mel_bins = model.config.num_mel_bins
    input_features = torch.randn(args.batch_size, num_mel_bins, 3000, dtype=torch.float32)
    gen_kwargs = {
        "max_new_tokens": 64,
        "do_sample": False,  # greedy decoding for deterministic timing
        "cache_implementation": "static",  # static cache keeps decode shapes constant
        "eos_token_id": -1,  # never hit EOS -> always generate 64 tokens
    }

    # Run once to establish shapes before compile
    with torch.no_grad():
        _ = model.generate(input_features=input_features, **gen_kwargs)

    model.forward = torch.compile(model.forward, backend="neuron", fullgraph=True)

    # Warmup — first compiled call pays the Neuron compilation cost
    warmup_start = time.time()
    with torch.no_grad():
        output = model.generate(input_features=input_features, **gen_kwargs)
    warmup_time = time.time() - warmup_start

    # Timed run
    run_start = time.time()
    with torch.no_grad():
        output = model.generate(input_features=input_features, **gen_kwargs)
    run_time = time.time() - run_start

    logger.info(f"Warmup: {warmup_time:.2f}s, Run: {run_time:.4f}s")
    logger.info(f"Output: {tokenizer.batch_decode(output, skip_special_tokens=True)}")
61
+
62
+
63
+ if __name__ == "__main__":
64
+ main()
65
+
66
+ """
67
+ Traceback (most recent call last):
68
+ File "/workspace/torch-neuron-sample/scripts/tests/run_whisper.py", line 64, in <module>
69
+ main()
70
+ File "/workspace/torch-neuron-sample/scripts/tests/run_whisper.py", line 50, in main
71
+ output = model.generate(input_features=input_features, **gen_kwargs)
72
+ File "/usr/local/lib/python3.10/site-packages/transformers/models/whisper/generation_whisper.py", line 704, in generate
73
+ init_tokens = self._retrieve_init_tokens(
74
+ File "/usr/local/lib/python3.10/site-packages/transformers/models/whisper/generation_whisper.py", line 1572, in _retrieve_init_tokens
75
+ lang_ids = self.detect_language(
76
+ File "/usr/local/lib/python3.10/site-packages/transformers/models/whisper/generation_whisper.py", line 1683, in detect_language
77
+ lang_ids = logits.argmax(-1)
78
+ File "/torch-neuronx/torch_neuronx/python_ops/auto_registration.py", line 306, in wrapper
79
+ result = operation(*args, **kwargs)
80
+ File "/torch-neuronx/torch_neuronx/python_ops/base.py", line 712, in __call__
81
+ result = impl.execute(*args, **kwargs)
82
+ File "/torch-neuronx/torch_neuronx/python_ops/base.py", line 109, in execute
83
+ result = self._execute_impl(*args2, **kwargs2)
84
+ File "/torch-neuronx/torch_neuronx/python_ops/to_copy.py", line 102, in _execute_impl
85
+ cpu_dst = copy_neuron_to_cpu(
86
+ File "/torch-neuronx/torch_neuronx/python_ops/cast_policy.py", line 102, in copy_neuron_to_cpu
87
+ _C._nrt_copy_neuron_to_cpu_tensor(neuron_src, cpu_tmp, non_blocking=non_blocking)
88
+ RuntimeError: Compilation error occurred on Neuron for operation=aten::_index_put_impl_;
89
+ error message="COMPILATION FAILED: Error: 2026-01-16T11:49:13Z 2026-01-16 11:49:13.062190: E hilo/hlo_passes/NeuronHloVerifier.cc:647] [ERROR] [NCC_EVRF024] Output tensor size of 10,759,912,900 bytes with shape of f32[51865,51865] exceeds 4GB limit for individual tensor size. TIP: Consider applying model parallelism or tensor parallelism per https://awsdocs-neuron.readthedocs-hosted.com/en/latest/libraries/nxd-training/app_notes/nxd-training-tp-appnote.html."
90
+ python stack trace=
91
+ """
torch_compile/run_xlm.py ADDED
File without changes
torch_compile/run_xlm_roberta.py ADDED
File without changes
torch_compile/run_yolos.py ADDED
@@ -0,0 +1,83 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import argparse
2
+ import logging
3
+ import time
4
+ import os
5
+
6
+ import torch
7
+ from transformers import AutoImageProcessor, YolosForObjectDetection
8
+ from datasets import load_dataset
9
+ import torch_neuronx # ensure Neuron backend is available
10
+
11
+ logging.basicConfig(level=logging.INFO)
12
+ logger = logging.getLogger(__name__)
13
+
14
+
15
def main():
    """Run YOLOS object detection on Neuron and log the top detection.

    The model is too large to compile as one Neuron graph (the instruction
    count exceeds the compiler limit — see the inline error below), so CPU
    fallback is enabled for ops beyond the unimplemented set.
    """
    # Allow CPU fallback
    # ERROR:torch_neuronx.neuron_dynamo_backend.backend:Execution failed: Compilation error occurred on Neuron for operation=torch_compile;
    # error message="COMPILATION FAILED: Error: 2026-01-20T12:06:37Z tensor_op_name: _gather.577 | hlo_id: 577 | [ERROR] [NCC_EXTP003] Instructions generated by compiler 290400 exceeds the typical limit of 150000. Input computation graph is too big due to large operators - Consider using smaller batches or sequence length, or applying tensor parellelism. For further troubleshooting visit https://awsdocs-neuron.readthedocs-hosted.com/en/latest/libraries/nxd-training/app_notes/nxd-training-tp-appnote.html"
    # python stack trace=
    os.environ["TORCH_NEURONX_FALLBACK_ONLY_FOR_UNIMPLEMENTED_OPS"] = "0"

    parser = argparse.ArgumentParser(description="Run YOLOS object detection on Neuron")
    parser.add_argument(
        "--model",
        type=str,
        default="hustvl/yolos-base",
        help="YOLOS model name on Hugging Face Hub",
    )
    parser.add_argument("--batch-size", type=int, default=1, help="Batch size")
    args = parser.parse_args()

    torch.set_default_dtype(torch.float32)
    torch.manual_seed(42)

    # Load dataset and pick an image
    dataset = load_dataset("huggingface/cats-image")
    image = dataset["test"]["image"][0]

    # Load processor and model
    image_processor = AutoImageProcessor.from_pretrained(args.model)
    model = YolosForObjectDetection.from_pretrained(
        args.model, torch_dtype=torch.float32, attn_implementation="eager"
    )
    model.eval()

    # Preprocess image
    inputs = image_processor(images=image, return_tensors="pt")

    # Pre-run once to fix shapes before compilation
    with torch.no_grad():
        outputs = model(**inputs)

    # Compile forward pass
    model.forward = torch.compile(model.forward, backend="neuron", fullgraph=True)

    # Warmup — first compiled call pays the Neuron compilation cost
    warmup_start = time.time()
    with torch.no_grad():
        _ = model(**inputs)
    warmup_time = time.time() - warmup_start

    # Actual run
    run_start = time.time()
    with torch.no_grad():
        outputs = model(**inputs)
    run_time = time.time() - run_start

    # Post-process: keep only top detection
    logits = outputs.logits  # [B, num_queries, num_classes + 1]
    probs = logits.softmax(dim=-1)[0, :, :-1]  # drop "no-object"
    scores, labels = probs.max(dim=-1)  # CPU fallback allowed
    best_idx = scores.argmax().item()

    logger.info("Warmup: %.2f s, Run: %.4f s", warmup_time, run_time)
    logger.info("Top detection: class=%d, score=%.3f", labels[best_idx].item(), scores[best_idx].item())
76
+
77
+
78
+ if __name__ == "__main__":
79
+ main()
80
+
81
+ """
82
+ Need to fall back to CPU.
83
+ """
torch_compile/torch_neuronx_dump/0123150520_241/offloaded_ops.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ Operator ' aten::argmax.out ' fell back to CPU
torch_compile/torch_neuronx_dump/0123150520_241/used_ops.txt ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ Operator 'torch_compile' executed on Neuron
2
+ Operator 'neuron::memory::alloc' executed on Neuron
3
+ Operator 'neuron::copy::cpu_to_neuron' executed on Neuron
4
+ Operator '_to_copy' executed on Neuron
5
+ Operator 'model_default' executed on Neuron
6
+ Operator 'neuron::memory::dealloc' executed on Neuron
7
+ Operator 'neuron::copy::neuron_to_cpu' executed on Neuron
8
+ Operator 'copy_' executed on Neuron
torch_compile/torch_neuronx_dump/0123154351_1091/offloaded_ops.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ Operator ' aten::argmax.out ' fell back to CPU
torch_compile/torch_neuronx_dump/0123154351_1091/used_ops.txt ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ Operator 'torch_compile' executed on Neuron
2
+ Operator 'neuron::memory::alloc' executed on Neuron
3
+ Operator 'neuron::copy::cpu_to_neuron' executed on Neuron
4
+ Operator '_to_copy' executed on Neuron
5
+ Operator 'model_default' executed on Neuron
6
+ Operator 'neuron::memory::dealloc' executed on Neuron
7
+ Operator 'neuron::copy::neuron_to_cpu' executed on Neuron
8
+ Operator 'copy_' executed on Neuron