diff --git a/torch_compile/flux/test_clip_text_encoder.py b/torch_compile/flux/test_clip_text_encoder.py new file mode 100644 index 0000000000000000000000000000000000000000..3b4b03a3058ab124725ed5065cdc866096ef29ee --- /dev/null +++ b/torch_compile/flux/test_clip_text_encoder.py @@ -0,0 +1,80 @@ +#!/usr/bin/env python3 +""" +CLIP (Flux variant) zero-shot image-classification on Neuron. +Flux pipeline uses: openai/clip-vit-large-patch14 +""" +import argparse +import logging +import time + +import torch +from transformers import CLIPProcessor, CLIPModel +from datasets import load_dataset +import torch_neuronx # noqa: F401 guarantees Neuron backend + +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger(__name__) + + +def main(): + parser = argparse.ArgumentParser( + description="CLIP (Flux checkpoint) zero-shot image classification with torch.compile on Neuron" + ) + parser.add_argument( + "--model", + type=str, + default="openai/clip-vit-large-patch14", # Flux CLIP checkpoint + help="CLIP model name on Hugging Face Hub", + ) + parser.add_argument("--batch-size", type=int, default=1, help="Batch size") + args = parser.parse_args() + + torch.set_default_dtype(torch.float32) + torch.manual_seed(42) + + # Load dataset and pick an image + dataset = load_dataset("huggingface/cats-image") + image = dataset["test"]["image"][0] + + # Load processor and model (Flux CLIP checkpoint) + processor = CLIPProcessor.from_pretrained(args.model) + model = CLIPModel.from_pretrained( + args.model, torch_dtype=torch.float32, attn_implementation="eager" + ).eval() + + # Zero-shot labels + texts = ["a photo of a cat", "a photo of a dog", "a photo of a bird"] + inputs = processor(text=texts, images=image, return_tensors="pt", padding=True) + + # Pre-run once to freeze shapes before compilation + with torch.no_grad(): + outputs = model(**inputs) + + # Compile forward pass (allow graph breaks for big model) + model.forward = torch.compile(model.forward, 
backend="neuron", fullgraph=False) + + # Warmup + warmup_start = time.time() + with torch.no_grad(): + _ = model(**inputs) + warmup_time = time.time() - warmup_start + + # Actual run + run_start = time.time() + with torch.no_grad(): + outputs = model(**inputs) + run_time = time.time() - run_start + + # Compute probabilities + logits_per_image = outputs.logits_per_image # [B, num_texts] + probs = logits_per_image.softmax(dim=-1) + best_idx = int(probs.argmax().item()) + best_label = texts[best_idx] + + logger.info("Warmup: %.2f s, Run: %.4f s", warmup_time, run_time) + logger.info("Probabilities: %s", probs.tolist()) + logger.info("Predicted label: %s", best_label) + + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/torch_compile/flux/test_flux_transformer.py b/torch_compile/flux/test_flux_transformer.py new file mode 100644 index 0000000000000000000000000000000000000000..800103c5089b30c1a70c7160388bc67381a6c6ce --- /dev/null +++ b/torch_compile/flux/test_flux_transformer.py @@ -0,0 +1,74 @@ +# torchrun --nproc_per_node=8 test_flux_transformer.py +import os, time, argparse, logging, torch, torch.distributed as dist +from torch.distributed.device_mesh import DeviceMesh +from torch.distributed.tensor.parallel import ( + ColwiseParallel, RowwiseParallel, PrepareModuleInput, parallelize_module +) +from torch.distributed.tensor import DTensor, Replicate +from diffusers import FluxTransformer2DModel +import torch_neuronx +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger(__name__) + +def apply_tp_flux(transformer: torch.nn.Module, tp_mesh: DeviceMesh): + # embed & final-norm replicated + plan = {"x_embedder": None, "norm_out": None} + parallelize_module(transformer, tp_mesh, plan) + + # inside each transformer block + for block in transformer.transformer_blocks: + blk = { + "norm1": None, + "norm1_k": None, + "attn.qkv": ColwiseParallel(), + "attn.proj": RowwiseParallel(output_layouts=Replicate()), + "attn.norm_q": None, + "attn.norm_k": None, + "ffn.net.0": ColwiseParallel(), # 
gate + "ffn.net.2": RowwiseParallel(output_layouts=Replicate()), + } + parallelize_module(block, tp_mesh, blk) + return transformer + +def main(): + dist.init_process_group(backend="neuron") + rank = dist.get_rank() + device = torch.device(f"neuron:{rank}") + tp_mesh = DeviceMesh("neuron", list(range(dist.get_world_size()))) + + parser = argparse.ArgumentParser() + parser.add_argument("--model", default="black-forest-labs/FLUX.1-dev/transformer") + parser.add_argument("--seq-len", type=int, default=4096) + parser.add_argument("--dim", type=int, default=3072) + args = parser.parse_args() + + # create on CPU, real tensors + with torch.device("cpu"): + transformer = FluxTransformer2DModel.from_pretrained( + args.model, torch_dtype=torch.bfloat16, attn_implementation="eager" + ).eval() + + transformer = apply_tp_flux(transformer, tp_mesh) + # move local shards to Neuron + for p in transformer.parameters(): + if isinstance(p, DTensor): + p._local_tensor = p._local_tensor.to(device, dtype=torch.bfloat16) + else: + p.data = p.data.to(device, dtype=torch.bfloat16) + + transformer = torch.compile(transformer, backend="neuron", fullgraph=False) + + batch = 1 + hidden = torch.randn(batch, args.seq_len, args.dim, dtype=torch.bfloat16, device=device) + encoder_hidden = torch.randn(batch, args.seq_len, 4096, dtype=torch.bfloat16, device=device) + timestep = torch.tensor([500], dtype=torch.int64, device=device) + + with torch.no_grad(): + _ = transformer(hidden=hidden, encoder_hidden=encoder_hidden, timestep=timestep) + t0 = time.time() + out = transformer(hidden=hidden, encoder_hidden=encoder_hidden, timestep=timestep) + logger.info("Rank %d Flux-TFM latency: %.3f ms shape: %s", + rank, (time.time()-t0)*1000, out.sample.shape) + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/torch_compile/flux/test_t5_text_encoder.py b/torch_compile/flux/test_t5_text_encoder.py new file mode 100644 index 
0000000000000000000000000000000000000000..2e47fe100c877969ae32db82ec0d306a70908036 --- /dev/null +++ b/torch_compile/flux/test_t5_text_encoder.py @@ -0,0 +1,73 @@ +# torchrun --nproc_per_node=4 test_t5_text_encoder.py +import os, time, argparse, logging, torch, torch.distributed as dist +from torch.distributed.device_mesh import DeviceMesh +from torch.distributed.tensor.parallel import ( + ColwiseParallel, RowwiseParallel, PrepareModuleInput, parallelize_module +) +from torch.distributed.tensor import DTensor, Replicate +from transformers import T5EncoderModel, AutoTokenizer +from torchtitan.models.t5 import T5Encoder # or transformers T5EncoderModel +import torch_neuronx +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger(__name__) + +def apply_tp_t5(encoder: torch.nn.Module, tp_mesh: DeviceMesh): + # encoder.embed_tokens already replicated + plan = { + "embed_tokens": None, # replicate + "encoder.block": None, # we will loop inside + } + parallelize_module(encoder, tp_mesh, plan) + + # shard every dense layer inside each encoder block + for layer in encoder.encoder.block: + layer_plan = { + "layer.0.SelfAttention.q": ColwiseParallel(), + "layer.0.SelfAttention.k": ColwiseParallel(), + "layer.0.SelfAttention.v": ColwiseParallel(), + "layer.0.SelfAttention.o": RowwiseParallel(output_layouts=Replicate()), + "layer.0.dense": ColwiseParallel(), + "layer.1.dense": RowwiseParallel(output_layouts=Replicate()), + } + parallelize_module(layer, tp_mesh, layer_plan) + return encoder + +def main(): + dist.init_process_group(backend="neuron") + rank = dist.get_rank() + device = torch.device(f"neuron:{rank}") + tp_mesh = DeviceMesh("neuron", list(range(dist.get_world_size()))) + + parser = argparse.ArgumentParser() + parser.add_argument("--model", default="google/t5-v1_1-xxl") + parser.add_argument("--seq-len", type=int, default=512) + args = parser.parse_args() + + tokenizer = AutoTokenizer.from_pretrained(args.model) + # create model on CPU, real tensors + with torch.device("cpu"): + encoder = 
T5EncoderModel.from_pretrained(args.model, attn_implementation="eager").eval() + + encoder = apply_tp_t5(encoder, tp_mesh) + # move local shards to Neuron + for p in encoder.parameters(): + if isinstance(p, DTensor): + p._local_tensor = p._local_tensor.to(device) + else: + p.data = p.data.to(device) + + encoder = torch.compile(encoder, backend="neuron", fullgraph=False) + + text = ["a photo of a cat"] + txt_in = tokenizer(text, max_length=args.seq_len, padding="max_length", return_tensors="pt") + input_ids = txt_in.input_ids.to(device) + + with torch.no_grad(): + _ = encoder(input_ids) # compile + t0 = time.time() + out = encoder(input_ids).last_hidden_state + logger.info("Rank %d T5-XXL enc latency: %.3f ms shape: %s", + rank, (time.time()-t0)*1000, out.shape) # [1, seq_len, 4096] + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/torch_compile/flux/test_vae_decoder.py b/torch_compile/flux/test_vae_decoder.py new file mode 100644 index 0000000000000000000000000000000000000000..47e6dc883e056af5084f6060871654cc12037eef --- /dev/null +++ b/torch_compile/flux/test_vae_decoder.py @@ -0,0 +1,84 @@ +#!/usr/bin/env python3 +""" +Flux VAE decoder (16-ch latent → RGB image) on Neuron. 
+Checkpoint: black-forest-labs/FLUX.1-dev/vae +""" +import argparse +import logging +import time +from pathlib import Path + +import torch +from diffusers import AutoencoderKL +import torch_neuronx # noqa: F401 guarantees Neuron backend +from PIL import Image + +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger(__name__) + + +def main(): + parser = argparse.ArgumentParser( + description="Flux VAE decoder (latent → image) with torch.compile on Neuron" + ) + parser.add_argument( + "--model", + type=str, + # default="black-forest-labs/FLUX.1-dev/vae", + default="/workspace/flux_weight/", + help="Flux VAE checkpoint on Hugging Face Hub", + ) + parser.add_argument("--latent-ch", type=int, default=16, help="Latent channels (Flux=16)") + parser.add_argument("--scale", type=int, default=32, help="Latent spatial size (256 px / 8)") + parser.add_argument("--output", type=str, default="flux_vae_out.png", help="Output image path") + args = parser.parse_args() + + torch.set_default_dtype(torch.float32) + torch.manual_seed(42) + + # Load Flux VAE decoder + vae = AutoencoderKL.from_pretrained(args.model, subfolder="vae", torch_dtype=torch.float32).eval() + + # Create dummy latent (bfloat16, N(0,1)) - shape: [B, 16, H/8, W/8] + latent = torch.randn(1, args.latent_ch, args.scale, args.scale, dtype=torch.float32) + + # Pre-run once to freeze shapes before compilation + with torch.no_grad(): + _ = vae.decode(latent).sample + + # Compile decode function (allow graph breaks for big kernels) + decode_fn = torch.compile(vae.decode, backend="neuron", fullgraph=True) + + # Warmup + warmup_start = time.time() + with torch.no_grad(): + _ = decode_fn(latent) + warmup_time = time.time() - warmup_start + + # Actual run + run_start = time.time() + with torch.no_grad(): + image = decode_fn(latent).sample + run_time = time.time() - run_start + + logger.info("Warmup: %.2f s, Run: %.4f s", warmup_time, run_time) + logger.info("VAE output shape: %s", image.shape) # [1, 3, H, W] + + 
# Convert to PIL and save + image = (image / 2 + 0.5).clamp(0, 1) # scale to [0,1] + image = image.cpu().float() + Image.fromarray((image[0].permute(1, 2, 0).numpy() * 255).astype("uint8")).save(args.output) + logger.info("Saved decoded image to %s", Path(args.output).resolve()) + + +if __name__ == "__main__": + main() + +""" +The compilation process took more than 2 hours. +/usr/local/lib/python3.10/site-packages/torch_mlir/dialects/stablehlo/__init__.py:24: UserWarning: Could not import StableHLO C++ extension: libStablehloUnifiedPythonCAPI.so.22.0git: cannot open shared object file: No such file or directory + warnings.warn(f"Could not import StableHLO C++ extension: {e}") +INFO:__main__:Warmup: 4010.52 s, Run: 22.5420 s +INFO:__main__:VAE output shape: torch.Size([1, 3, 256, 256]) +INFO:__main__:Saved decoded image to /workspace/torch_neuron_samples/torch-neuron-samples/scripts/torch_compile/flux/flux_vae_out.png +""" \ No newline at end of file diff --git a/torch_compile/run_albert.py b/torch_compile/run_albert.py new file mode 100644 index 0000000000000000000000000000000000000000..ddd28676021486147b0c4d496e8e56474608f9ca --- /dev/null +++ b/torch_compile/run_albert.py @@ -0,0 +1,63 @@ +import argparse +import logging +import time + +import torch +from transformers import AutoTokenizer, AlbertForSequenceClassification + +import torch_neuronx # ensure Neuron backend is available + +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger(__name__) + + +def main(): + parser = argparse.ArgumentParser(description="Run ALBERT on Neuron") + parser.add_argument( + "--model", type=str, default="albert-base-v2", help="ALBERT model name" + ) + parser.add_argument("--batch-size", type=int, default=1, help="Batch size") + args = parser.parse_args() + + torch.set_default_dtype(torch.float32) + torch.manual_seed(42) + + # Load ALBERT model and tokenizer + model = AlbertForSequenceClassification.from_pretrained( + args.model, torch_dtype=torch.float32, 
attn_implementation="eager" + ) + model.eval() + + tokenizer = AutoTokenizer.from_pretrained(args.model) + inputs = tokenizer( + "Hamilton is considered to be the best musical of human history.", + return_tensors="pt" + ) + + # Pre-run once to fix shapes before compilation + with torch.no_grad(): + _ = model(**inputs).logits + + # Compile forward pass + model.forward = torch.compile(model.forward, backend="neuron", fullgraph=True) + + # Warmup + warmup_start = time.time() + with torch.no_grad(): + _ = model(**inputs) + warmup_time = time.time() - warmup_start + + # Actual run + run_start = time.time() + with torch.no_grad(): + logits = model(**inputs).logits + run_time = time.time() - run_start + predicted_class_id = logits.argmax().item() + predicted_class_label = model.config.id2label[predicted_class_id] + + logger.info(f"Warmup: {warmup_time:.2f}s, Run: {run_time:.4f}s") + logger.info(f"Output label: {predicted_class_label}") + + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/torch_compile/run_ast.py b/torch_compile/run_ast.py new file mode 100644 index 0000000000000000000000000000000000000000..7b6579ed169ac54504e6c4f14f5e0111a8bbe8c2 --- /dev/null +++ b/torch_compile/run_ast.py @@ -0,0 +1,77 @@ +import argparse +import logging +import time + +import torch +from transformers import AutoFeatureExtractor, ASTForAudioClassification +from datasets import load_dataset + +import torch_neuronx # ensure Neuron backend is available + +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger(__name__) + + +def main(): + parser = argparse.ArgumentParser(description="Run AST (Audio Spectrogram Transformer) on Neuron") + parser.add_argument( + "--model", + type=str, + default="MIT/ast-finetuned-audioset-10-10-0.4593", + help="AST model name on Hugging Face Hub", + ) + parser.add_argument("--batch-size", type=int, default=1, help="Batch size") + args = parser.parse_args() + + torch.set_default_dtype(torch.float32) + 
torch.manual_seed(42) + + # Load dataset and extract features + dataset = load_dataset("hf-internal-testing/librispeech_asr_demo", "clean", split="validation") + dataset = dataset.sort("id") + sampling_rate = dataset.features["audio"].sampling_rate + + feature_extractor = AutoFeatureExtractor.from_pretrained(args.model) + inputs = feature_extractor( + dataset[0]["audio"]["array"], sampling_rate=sampling_rate, return_tensors="pt" + ) + + # Load AST model + model = ASTForAudioClassification.from_pretrained( + args.model, torch_dtype=torch.float32, attn_implementation="eager" + ) + model.eval() + + # Pre-run once to fix shapes before compilation + with torch.no_grad(): + logits = model(**inputs).logits + + # Compile forward pass + model.forward = torch.compile(model.forward, backend="neuron", fullgraph=True) + + # Warmup + warmup_start = time.time() + with torch.no_grad(): + _ = model(**inputs) + warmup_time = time.time() - warmup_start + + # Actual run + run_start = time.time() + with torch.no_grad(): + logits = model(**inputs).logits + run_time = time.time() - run_start + + # Decode result + predicted_class_ids = torch.argmax(logits, dim=-1).item() + predicted_label = model.config.id2label[predicted_class_ids] + + logger.info("Warmup: %.2f s, Run: %.4f s", warmup_time, run_time) + logger.info("Predicted label: %s", predicted_label) + + +if __name__ == "__main__": + main() + +""" +Works +""" \ No newline at end of file diff --git a/torch_compile/run_beit.py b/torch_compile/run_beit.py new file mode 100644 index 0000000000000000000000000000000000000000..c6ce0a3c0f8d85f7195b84d710921736b10c4183 --- /dev/null +++ b/torch_compile/run_beit.py @@ -0,0 +1,81 @@ +import argparse +import logging +import time + +import torch +from transformers import AutoImageProcessor, BeitForImageClassification +from datasets import load_dataset + +import torch_neuronx # ensure Neuron backend is available + +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger(__name__) + + 
+def main(): + parser = argparse.ArgumentParser(description="Run BEiT on Neuron") + parser.add_argument( + "--model", + type=str, + default="microsoft/beit-base-patch16-224-pt22k", + help="BEiT model name on Hugging Face Hub", + ) + parser.add_argument("--batch-size", type=int, default=1, help="Batch size") + args = parser.parse_args() + + torch.set_default_dtype(torch.float32) + torch.manual_seed(42) + + # Load image + dataset = load_dataset("huggingface/cats-image") + image = dataset["test"]["image"][0] + + # Load processor and model + image_processor = AutoImageProcessor.from_pretrained(args.model) + model = BeitForImageClassification.from_pretrained( + args.model, torch_dtype=torch.float32, attn_implementation="eager" + ) + model.eval() + + # Preprocess + inputs = image_processor(image, return_tensors="pt") + + # Pre-run once to fix shapes before compilation + with torch.no_grad(): + logits = model(**inputs).logits + + # Compile forward pass + model.forward = torch.compile(model.forward, backend="neuron", fullgraph=True) + + # Warmup + warmup_start = time.time() + with torch.no_grad(): + _ = model(**inputs) + warmup_time = time.time() - warmup_start + + # Actual run + run_start = time.time() + with torch.no_grad(): + logits = model(**inputs).logits + run_time = time.time() - run_start + + # Predicted ImageNet class + predicted_label = logits.argmax(-1).item() + label_str = model.config.id2label[predicted_label] + + logger.info("Warmup: %.2f s, Run: %.4f s", warmup_time, run_time) + logger.info("Predicted label: %s", label_str) + + +if __name__ == "__main__": + main() +""" +root@d90ba90f3d81:/workspace/torch_neuron_samples/torch-neuron-samples/scripts/tests# torch-mlir-opt -pass-pipeline='builtin.module(torch-backend-to-stablehlo-backend-pipeline)' /tmp/UnnammedModule.mlir +/usr/local/lib/python3.10/site-packages/transformers/models/beit/modeling_beit.py:593:0: error: failed to legalize operation 'torch.aten.fill.Tensor' 
+/usr/local/lib/python3.10/site-packages/transformers/pytorch_utils.py:361:0: note: called from +/usr/local/lib/python3.10/site-packages/transformers/models/beit/modeling_beit.py:625:0: note: called from +/usr/local/lib/python3.10/site-packages/transformers/models/beit/modeling_beit.py:688:0: note: called from +/usr/local/lib/python3.10/site-packages/transformers/models/beit/modeling_beit.py:824:0: note: called from +/usr/local/lib/python3.10/site-packages/transformers/models/beit/modeling_beit.py:1007:0: note: called from +/usr/local/lib/python3.10/site-packages/transformers/models/beit/modeling_beit.py:593:0: note: see current operation: %733 = "torch.aten.fill.Tensor"(%732, %524) : (!torch.vtensor<[197],si64>, !torch.vtensor<[],si64>) -> !torch.vtensor<[197],si64> +""" diff --git a/torch_compile/run_bert.py b/torch_compile/run_bert.py new file mode 100644 index 0000000000000000000000000000000000000000..78ad06df306fd5ef8a297c641d72acc9b9a38bc5 --- /dev/null +++ b/torch_compile/run_bert.py @@ -0,0 +1,62 @@ +import argparse +import logging +import time + +import torch +from transformers import AutoTokenizer, BertForSequenceClassification + +import torch_neuronx + +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger(__name__) + + +def main(): + parser = argparse.ArgumentParser(description="Run Bert on Neuron") + parser.add_argument( + "--model", type=str, default="google-bert/bert-base-uncased", help="Bert model name" + ) + parser.add_argument("--batch-size", type=int, default=1, help="Batch size") + args = parser.parse_args() + + torch.set_default_dtype(torch.float32) + torch.manual_seed(42) + + model = BertForSequenceClassification.from_pretrained( + args.model, torch_dtype=torch.float32, attn_implementation="eager" + ) + model.eval() + + tokenizer = AutoTokenizer.from_pretrained(args.model) + inputs = tokenizer("Hamilton is considered to be the best musical of human history.", return_tensors="pt") + + # Run once to establish shapes before compile 
+ with torch.no_grad(): + logits = model(**inputs).logits + + model.forward = torch.compile(model.forward, backend="neuron", fullgraph=True) + + # Warmup + warmup_start = time.time() + with torch.no_grad(): + logits = model(**inputs) + warmup_time = time.time() - warmup_start + + # Run + run_start = time.time() + with torch.no_grad(): + logits = model(**inputs).logits + run_time = time.time() - run_start + predicted_class_id = logits.argmax().item() + predicted_class_label = model.config.id2label[predicted_class_id] + + logger.info(f"Warmup: {warmup_time:.2f}s, Run: {run_time:.4f}s") + logger.info(f"Output label: {predicted_class_label}") + + +if __name__ == "__main__": + main() + +""" +Works +""" diff --git a/torch_compile/run_camembert.py b/torch_compile/run_camembert.py new file mode 100644 index 0000000000000000000000000000000000000000..8f403832465c1bf98a228e3b42bac48327f9fbf6 --- /dev/null +++ b/torch_compile/run_camembert.py @@ -0,0 +1,71 @@ +import argparse +import logging +import time + +import torch +from transformers import AutoTokenizer, CamembertForSequenceClassification + +import torch_neuronx # ensure Neuron backend is available + +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger(__name__) + + +def main(): + parser = argparse.ArgumentParser(description="Run CamemBERT on Neuron") + parser.add_argument( + "--model", + type=str, + default="camembert-base", + help="CamemBERT model name on Hugging Face Hub", + ) + parser.add_argument("--batch-size", type=int, default=1, help="Batch size") + args = parser.parse_args() + + torch.set_default_dtype(torch.float32) + torch.manual_seed(42) + + # Load tokenizer and model + tokenizer = AutoTokenizer.from_pretrained(args.model) + model = CamembertForSequenceClassification.from_pretrained( + args.model, torch_dtype=torch.float32, attn_implementation="eager" + ) + model.eval() + + # Tokenize sample text + text = "CamemBERT est un modèle de langue français." 
+ inputs = tokenizer(text, return_tensors="pt") + + # Pre-run once to fix shapes before compilation + with torch.no_grad(): + logits = model(**inputs).logits + + # Compile forward pass + model.forward = torch.compile(model.forward, backend="neuron", fullgraph=True) + + # Warmup + warmup_start = time.time() + with torch.no_grad(): + _ = model(**inputs) + warmup_time = time.time() - warmup_start + + # Actual run + run_start = time.time() + with torch.no_grad(): + logits = model(**inputs).logits + run_time = time.time() - run_start + + # Decode result + predicted_class_id = logits.argmax().item() + predicted_label = model.config.id2label[predicted_class_id] + + logger.info("Warmup: %.2f s, Run: %.4f s", warmup_time, run_time) + logger.info("Predicted label: %s", predicted_label) + + +if __name__ == "__main__": + main() + +""" +Works +""" diff --git a/torch_compile/run_clip.py b/torch_compile/run_clip.py new file mode 100644 index 0000000000000000000000000000000000000000..ae16631c38634c2400ec35e000de6b4cef3d14d1 --- /dev/null +++ b/torch_compile/run_clip.py @@ -0,0 +1,76 @@ +import argparse +import logging +import time + +import torch +from transformers import CLIPProcessor, CLIPModel +from datasets import load_dataset +import torch_neuronx # ensures Neuron backend is available + +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger(__name__) + + +def main(): + parser = argparse.ArgumentParser( + description="CLIP zero-shot image classification with torch.compile on Neuron" + ) + parser.add_argument( + "--model", + type=str, + default="openai/clip-vit-base-patch32", + help="CLIP model name on Hugging Face Hub", + ) + parser.add_argument("--batch-size", type=int, default=1, help="Batch size") + args = parser.parse_args() + + torch.set_default_dtype(torch.float32) + torch.manual_seed(42) + + # Load dataset and pick an image + dataset = load_dataset("huggingface/cats-image") + image = dataset["test"]["image"][0] + + # Load processor and model + processor = 
CLIPProcessor.from_pretrained(args.model) + model = CLIPModel.from_pretrained( + args.model, torch_dtype=torch.float32, attn_implementation="eager" + ) + model.eval() + + # Build zero-shot inputs + texts = ["a photo of a cat", "a photo of a dog", "a photo of a bird"] + inputs = processor(text=texts, images=image, return_tensors="pt", padding=True) + + # Pre-run once to fix shapes before compilation + with torch.no_grad(): + outputs = model(**inputs) + + # Compile forward pass (allow graph breaks to avoid instruction-limit) + model.forward = torch.compile(model.forward, backend="neuron", fullgraph=False) + + # Warmup + warmup_start = time.time() + with torch.no_grad(): + _ = model(**inputs) + warmup_time = time.time() - warmup_start + + # Actual run + run_start = time.time() + with torch.no_grad(): + outputs = model(**inputs) + run_time = time.time() - run_start + + # Compute probabilities + logits_per_image = outputs.logits_per_image # [batch_size, num_texts] + probs = logits_per_image.softmax(dim=-1) + best_idx = int(probs.argmax()) + best_label = texts[best_idx] + + logger.info("Warmup: %.2f s, Run: %.4f s", warmup_time, run_time) + logger.info("Probabilities: %s", probs.tolist()) + logger.info("Predicted label: %s", best_label) + + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/torch_compile/run_convbert.py b/torch_compile/run_convbert.py new file mode 100644 index 0000000000000000000000000000000000000000..b6ab68e4ac50f9f1a1415ef350ebf5d2b624edde --- /dev/null +++ b/torch_compile/run_convbert.py @@ -0,0 +1,72 @@ +import argparse +import logging +import time + +import torch +from transformers import AutoTokenizer, ConvBertForSequenceClassification + +import torch_neuronx # ensure Neuron backend is available + +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger(__name__) + + +def main(): + parser = argparse.ArgumentParser(description="Run ConvBERT on Neuron") + parser.add_argument( + "--model", + type=str, + 
default="YituTech/conv-bert-base", + help="ConvBERT model name on Hugging Face Hub", + ) + parser.add_argument("--batch-size", type=int, default=1, help="Batch size") + args = parser.parse_args() + + torch.set_default_dtype(torch.float32) + torch.manual_seed(42) + + # Load tokenizer and model + tokenizer = AutoTokenizer.from_pretrained(args.model) + model = ConvBertForSequenceClassification.from_pretrained( + args.model, torch_dtype=torch.float32, attn_implementation="eager" + ) + model.eval() + + # Tokenize sample text + text = "ConvBERT combines self-attention and lightweight convolutions." + inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True) + + # Pre-run once to fix shapes before compilation + with torch.no_grad(): + logits = model(**inputs).logits + + # Compile forward pass + model.forward = torch.compile(model.forward, backend="neuron", fullgraph=True) + + # Warmup + warmup_start = time.time() + with torch.no_grad(): + _ = model(**inputs) + warmup_time = time.time() - warmup_start + + # Actual run + run_start = time.time() + with torch.no_grad(): + logits = model(**inputs).logits + run_time = time.time() - run_start + + # Decode result + predicted_class_id = logits.argmax().item() + predicted_label = model.config.id2label[predicted_class_id] + + logger.info("Warmup: %.2f s, Run: %.4f s", warmup_time, run_time) + logger.info("Predicted label: %s", predicted_label) + + +if __name__ == "__main__": + main() + +""" +:0: error: failed to legalize operation 'torch.constant.int' +:0: note: see current operation: %0 = "torch.constant.int"() <{value = 9 : i64}> : () -> !torch.int +""" \ No newline at end of file diff --git a/torch_compile/run_convnext.py b/torch_compile/run_convnext.py new file mode 100644 index 0000000000000000000000000000000000000000..34f3def380db8270de3d05b9ca670a19a546b541 --- /dev/null +++ b/torch_compile/run_convnext.py @@ -0,0 +1,72 @@ +import argparse +import logging +import time + +import torch +from transformers 
import AutoImageProcessor, ConvNextForImageClassification +from datasets import load_dataset +import torch_neuronx # ensures Neuron backend is available + +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger(__name__) + + +def main(): + parser = argparse.ArgumentParser( + description="ConvNeXt image-classification with torch.compile on Neuron" + ) + parser.add_argument( + "--model", + type=str, + default="facebook/convnext-tiny-224", + help="ConvNeXT model name on Hugging Face Hub", + ) + parser.add_argument("--batch-size", type=int, default=1, help="Batch size") + args = parser.parse_args() + + torch.set_default_dtype(torch.float32) + torch.manual_seed(42) + + # Load dataset and pick an image + dataset = load_dataset("huggingface/cats-image") + image = dataset["test"]["image"][0] + + # Load processor and model + processor = AutoImageProcessor.from_pretrained(args.model) + model = ConvNextForImageClassification.from_pretrained( + args.model, torch_dtype=torch.float32, attn_implementation="eager" + ) + model.eval() + + # Preprocess image + inputs = processor(images=image, return_tensors="pt") + + # Pre-run once to fix shapes before compilation + with torch.no_grad(): + outputs = model(**inputs) + + # Compile forward pass (allow graph breaks to avoid instruction-limit) + model.forward = torch.compile(model.forward, backend="neuron", fullgraph=False) + + # Warmup + warmup_start = time.time() + with torch.no_grad(): + _ = model(**inputs) + warmup_time = time.time() - warmup_start + + # Actual run + run_start = time.time() + with torch.no_grad(): + outputs = model(**inputs) + run_time = time.time() - run_start + + # Predicted ImageNet class + predicted_class_idx = outputs.logits.argmax(-1).item() + predicted_label = model.config.id2label[predicted_class_idx] + + logger.info("Warmup: %.2f s, Run: %.4f s", warmup_time, run_time) + logger.info("Predicted label: %s", predicted_label) + + +if __name__ == "__main__": + main() \ No newline at end of file diff 
--git a/torch_compile/run_convnextv2.py b/torch_compile/run_convnextv2.py new file mode 100644 index 0000000000000000000000000000000000000000..c2d6e7ac88f031c80178fd5b8dec8339314d63df --- /dev/null +++ b/torch_compile/run_convnextv2.py @@ -0,0 +1,72 @@ +import argparse +import logging +import time + +import torch +from transformers import AutoImageProcessor, ConvNextV2ForImageClassification +from datasets import load_dataset +import torch_neuronx # ensures Neuron backend is available + +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger(__name__) + + +def main(): + parser = argparse.ArgumentParser( + description="ConvNeXt-V2 image-classification with torch.compile on Neuron" + ) + parser.add_argument( + "--model", + type=str, + default="facebook/convnextv2-tiny-1k-224", + help="ConvNeXt-V2 model name on Hugging Face Hub", + ) + parser.add_argument("--batch-size", type=int, default=1, help="Batch size") + args = parser.parse_args() + + torch.set_default_dtype(torch.float32) + torch.manual_seed(42) + + # Load dataset and pick an image + dataset = load_dataset("huggingface/cats-image") + image = dataset["test"]["image"][0] + + # Load processor and model + processor = AutoImageProcessor.from_pretrained(args.model) + model = ConvNextV2ForImageClassification.from_pretrained( + args.model, torch_dtype=torch.float32, attn_implementation="eager" + ) + model.eval() + + # Preprocess image + inputs = processor(images=image, return_tensors="pt") + + # Pre-run once to fix shapes before compilation + with torch.no_grad(): + outputs = model(**inputs) + + # Compile forward pass (allow graph breaks to avoid instruction-limit) + model.forward = torch.compile(model.forward, backend="neuron", fullgraph=False) + + # Warmup + warmup_start = time.time() + with torch.no_grad(): + _ = model(**inputs) + warmup_time = time.time() - warmup_start + + # Actual run + run_start = time.time() + with torch.no_grad(): + outputs = model(**inputs) + run_time = time.time() - run_start + 
import argparse
import logging
import time

import torch
from transformers import AutoImageProcessor, CvtForImageClassification
from datasets import load_dataset
import torch_neuronx  # ensures Neuron backend is available

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


def main():
    """Benchmark a single CvT image-classification pass compiled for Neuron."""
    parser = argparse.ArgumentParser(
        description="CvT image-classification with torch.compile on Neuron"
    )
    parser.add_argument(
        "--model",
        type=str,
        default="microsoft/cvt-13",
        help="CvT model name on Hugging Face Hub",
    )
    parser.add_argument("--batch-size", type=int, default=1, help="Batch size")
    args = parser.parse_args()

    torch.set_default_dtype(torch.float32)
    torch.manual_seed(42)

    # One sample image from the cats-image dataset
    image = load_dataset("huggingface/cats-image")["test"]["image"][0]

    # Processor + model (fp32, eager attention)
    processor = AutoImageProcessor.from_pretrained(args.model)
    model = CvtForImageClassification.from_pretrained(
        args.model, torch_dtype=torch.float32, attn_implementation="eager"
    ).eval()

    inputs = processor(images=image, return_tensors="pt")

    # Eager pass first so input shapes are frozen before compilation
    with torch.no_grad():
        model(**inputs)

    # Compile forward; graph breaks allowed to stay under the instruction limit
    model.forward = torch.compile(model.forward, backend="neuron", fullgraph=False)

    # First compiled call doubles as warmup (includes compilation)
    t0 = time.time()
    with torch.no_grad():
        model(**inputs)
    warmup_time = time.time() - t0

    # Timed steady-state pass
    t1 = time.time()
    with torch.no_grad():
        outputs = model(**inputs)
    run_time = time.time() - t1

    # Map the top-1 logit to its ImageNet label
    top1 = outputs.logits.argmax(-1).item()
    predicted_label = model.config.id2label[top1]

    logger.info("Warmup: %.2f s, Run: %.4f s", warmup_time, run_time)
    logger.info("Predicted label: %s", predicted_label)


if __name__ == "__main__":
    main()
+ inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True) + + # Pre-run once to fix shapes before compilation + with torch.no_grad(): + logits = model(**inputs).logits + + # Compile forward pass (allow graph breaks to avoid instruction-limit) + model.forward = torch.compile(model.forward, backend="neuron", fullgraph=False) + + # Warmup + warmup_start = time.time() + with torch.no_grad(): + _ = model(**inputs) + warmup_time = time.time() - warmup_start + + # Actual run + run_start = time.time() + with torch.no_grad(): + logits = model(**inputs).logits + run_time = time.time() - run_start + + # Decode result + predicted_class_id = logits.argmax().item() + predicted_label = model.config.id2label[predicted_class_id] + + logger.info("Warmup: %.2f s, Run: %.4f s", warmup_time, run_time) + logger.info("Predicted label: %s", predicted_label) + + +if __name__ == "__main__": + main() + +""" +torch._dynamo.exc.TorchRuntimeError: Dynamo failed to run FX node with fake tensors: call_function (*(FakeTensor(..., device='neuron:0', size=(1, 18, 768)), Parameter(FakeTensor(..., size=(2304, 768), requires_grad=True)), None), **{}): got RuntimeError('Unhandled FakeTensor Device Propagation for aten.mm.default, found two different devices neuron:0, cpu') +""" \ No newline at end of file diff --git a/torch_compile/run_deberta_v3.py b/torch_compile/run_deberta_v3.py new file mode 100644 index 0000000000000000000000000000000000000000..dbcef18108b05f6545cf858cf13c58b518b7442e --- /dev/null +++ b/torch_compile/run_deberta_v3.py @@ -0,0 +1,72 @@ +import argparse +import logging +import time + +import torch +from transformers import AutoTokenizer, DebertaV2ForSequenceClassification +import torch_neuronx # ensures Neuron backend is available + +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger(__name__) + + +def main(): + parser = argparse.ArgumentParser( + description="DeBERTa-v3 sequence-classification with torch.compile on Neuron" + ) + 
parser.add_argument( + "--model", + type=str, + default="microsoft/deberta-v3-base", + help="DeBERTa-v3 model name on Hugging Face Hub", + ) + parser.add_argument("--batch-size", type=int, default=1, help="Batch size") + args = parser.parse_args() + + torch.set_default_dtype(torch.float32) + torch.manual_seed(42) + + # Load tokenizer and model + tokenizer = AutoTokenizer.from_pretrained(args.model) + model = DebertaV2ForSequenceClassification.from_pretrained( + args.model, torch_dtype=torch.float32, attn_implementation="eager" + ) + model.eval() + + # Tokenize sample text + text = "DeBERTa-v3 achieves stronger performance with improved pre-training." + inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True) + + # Pre-run once to fix shapes before compilation + with torch.no_grad(): + logits = model(**inputs).logits + + # Compile forward pass (allow graph breaks to avoid instruction-limit) + model.forward = torch.compile(model.forward, backend="neuron", fullgraph=False) + + # Warmup + warmup_start = time.time() + with torch.no_grad(): + _ = model(**inputs) + warmup_time = time.time() - warmup_start + + # Actual run + run_start = time.time() + with torch.no_grad(): + logits = model(**inputs).logits + run_time = time.time() - run_start + + # Decode result + predicted_class_id = logits.argmax().item() + predicted_label = model.config.id2label[predicted_class_id] + + logger.info("Warmup: %.2f s, Run: %.4f s", warmup_time, run_time) + logger.info("Predicted label: %s", predicted_label) + + +if __name__ == "__main__": + main() + +""" +Works +""" \ No newline at end of file diff --git a/torch_compile/run_deit.py b/torch_compile/run_deit.py new file mode 100644 index 0000000000000000000000000000000000000000..e7d0b3a0787ba70af2cc0c1691853e8148c9d1ff --- /dev/null +++ b/torch_compile/run_deit.py @@ -0,0 +1,71 @@ +#!/usr/bin/env python3 +# DeiT (Vision Transformer) image-classification on Neuron +import argparse +import logging +import time + +import torch 
+from transformers import AutoImageProcessor, DeiTForImageClassification +from datasets import load_dataset +import torch_neuronx # ensures Neuron backend + +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger(__name__) + + +def main(): + parser = argparse.ArgumentParser(description="Run DeiT on Neuron") + parser.add_argument( + "--model", + type=str, + default="facebook/deit-base-distilled-patch16-224", + help="DeiT model name on Hugging Face Hub", + ) + parser.add_argument("--batch-size", type=int, default=1, help="Batch size") + args = parser.parse_args() + + torch.set_default_dtype(torch.float32) + torch.manual_seed(42) + + # load dataset image + dataset = load_dataset("huggingface/cats-image") + image = dataset["test"]["image"][0] + + # load processor & distilled DeiT model + processor = AutoImageProcessor.from_pretrained(args.model) + model = DeiTForImageClassification.from_pretrained( + args.model, torch_dtype=torch.float32, attn_implementation="eager" + ).eval() + + # preprocess + inputs = processor(images=image, return_tensors="pt") + + # pre-run to lock shapes + with torch.no_grad(): + _ = model(**inputs).logits + + # compile + model.forward = torch.compile(model.forward, backend="neuron", fullgraph=False) + + # warmup + warmup_start = time.time() + with torch.no_grad(): + _ = model(**inputs) + warmup_time = time.time() - warmup_start + + # benchmark run + run_start = time.time() + with torch.no_grad(): + logits = model(**inputs).logits + run_time = time.time() - run_start + + # top-1 ImageNet class + predicted_class_idx = logits.argmax(-1).item() + predicted_label = model.config.id2label[predicted_class_idx] + + logger.info("Warmup: %.2f s, Run: %.4f s", warmup_time, run_time) + logger.info("Predicted label: %s", predicted_label) + + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/torch_compile/run_distillbert.py b/torch_compile/run_distillbert.py new file mode 100644 index 
0000000000000000000000000000000000000000..78e439afc93559e85fbf515c93e09e6731cef7e5 --- /dev/null +++ b/torch_compile/run_distillbert.py @@ -0,0 +1,67 @@ +#!/usr/bin/env python3 +# DistilBERT text-classification on Neuron +import argparse +import logging +import time + +import torch +from transformers import AutoTokenizer, DistilBertForSequenceClassification +import torch_neuronx # ensures Neuron backend + +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger(__name__) + + +def main(): + parser = argparse.ArgumentParser(description="Run DistilBERT on Neuron") + parser.add_argument( + "--model", + type=str, + default="distilbert-base-uncased-finetuned-sst-2-english", + help="DistilBERT model name on Hugging Face Hub", + ) + parser.add_argument("--batch-size", type=int, default=1, help="Batch size") + args = parser.parse_args() + + torch.set_default_dtype(torch.float32) + torch.manual_seed(42) + + # load tokenizer & model + tokenizer = AutoTokenizer.from_pretrained(args.model) + model = DistilBertForSequenceClassification.from_pretrained( + args.model, torch_dtype=torch.float32, attn_implementation="eager" + ).eval() + + # tokenize sample + text = "DistilBERT is a compact, fast variant of BERT." 
#!/usr/bin/env python3
# DonutSwin image-encoder on Neuron (no decoder, pure vision)
import argparse
import logging
import time

import torch
from transformers import DonutImageProcessor, DonutSwinModel
from datasets import load_dataset
import torch_neuronx  # ensures Neuron backend

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


def main():
    """Benchmark the DonutSwin vision encoder (no decoder) compiled for Neuron."""
    parser = argparse.ArgumentParser(description="Run DonutSwin encoder on Neuron")
    parser.add_argument(
        "--model",
        type=str,
        default="naver-clova-ix/donut-base",
        help="DonutSwin model name on Hugging Face Hub",
    )
    args = parser.parse_args()

    torch.set_default_dtype(torch.float32)
    torch.manual_seed(42)

    # one sample image from the cats-image dataset
    image = load_dataset("huggingface/cats-image")["test"]["image"][0]

    # processor + vision encoder only (fp32, eager attention)
    processor = DonutImageProcessor.from_pretrained(args.model)
    model = DonutSwinModel.from_pretrained(
        args.model, torch_dtype=torch.float32, attn_implementation="eager"
    ).eval()

    inputs = processor(images=image, return_tensors="pt")

    # eager pass first so input shapes are locked before compilation
    with torch.no_grad():
        model(**inputs)

    # compile forward; graph breaks allowed
    model.forward = torch.compile(model.forward, backend="neuron", fullgraph=False)

    # first compiled call doubles as warmup (includes compilation)
    t0 = time.time()
    with torch.no_grad():
        model(**inputs)
    warmup_time = time.time() - t0

    # timed steady-state pass
    t1 = time.time()
    with torch.no_grad():
        hidden = model(**inputs).last_hidden_state
    run_time = time.time() - t1

    logger.info("Warmup: %.2f s, Run: %.4f s", warmup_time, run_time)
    logger.info("Output hidden shape: %s", hidden.shape)  # [B, seq_len, hidden_size]


if __name__ == "__main__":
    main()

"""
/usr/local/lib/python3.10/site-packages/transformers/models/donut/modeling_donut_swin.py:586:0: error: failed to legalize operation 'torch.aten.fill.Tensor'
/usr/local/lib/python3.10/site-packages/transformers/models/donut/modeling_donut_swin.py:637:0: note: called from
/usr/local/lib/python3.10/site-packages/transformers/models/donut/modeling_donut_swin.py:712:0: note: called from
/usr/local/lib/python3.10/site-packages/transformers/modeling_layers.py:94:0: note: called from
/usr/local/lib/python3.10/site-packages/transformers/models/donut/modeling_donut_swin.py:783:0: note: called from
/usr/local/lib/python3.10/site-packages/transformers/models/donut/modeling_donut_swin.py:922:0: note: called from
/usr/local/lib/python3.10/site-packages/transformers/models/donut/modeling_donut_swin.py:586:0: note: see current operation: %1327 = "torch.aten.fill.Tensor"(%1326, %1091) : (!torch.vtensor<[1,630,470,1],f32>, !torch.vtensor<[],f32>) -> !torch.vtensor<[1,630,470,1],f32>
"""
b/torch_compile/run_dpt.py new file mode 100644 index 0000000000000000000000000000000000000000..af7b5b58d965af5c1eb26c5aceaf4b4f6ca65b04 --- /dev/null +++ b/torch_compile/run_dpt.py @@ -0,0 +1,66 @@ +#!/usr/bin/env python3 +# DPT (Dense Prediction Transformer) monocular depth estimation on Neuron +import argparse +import logging +import time + +import torch +from transformers import DPTImageProcessor, DPTForDepthEstimation +from datasets import load_dataset +import torch_neuronx # ensures Neuron backend + +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger(__name__) + + +def main(): + parser = argparse.ArgumentParser(description="Run DPT depth estimation on Neuron") + parser.add_argument( + "--model", + type=str, + default="Intel/dpt-large", + help="DPT model name on Hugging Face Hub", + ) + args = parser.parse_args() + + torch.set_default_dtype(torch.float32) + torch.manual_seed(42) + + # load dataset image + dataset = load_dataset("huggingface/cats-image") + image = dataset["test"]["image"][0] + + # load processor & DPT model + processor = DPTImageProcessor.from_pretrained(args.model) + model = DPTForDepthEstimation.from_pretrained( + args.model, torch_dtype=torch.float32, attn_implementation="eager" + ).eval() + + # preprocess + inputs = processor(images=image, return_tensors="pt") + + # pre-run to lock shapes + with torch.no_grad(): + _ = model(**inputs).predicted_depth + + # compile + model.forward = torch.compile(model.forward, backend="neuron", fullgraph=False) + + # warmup + warmup_start = time.time() + with torch.no_grad(): + _ = model(**inputs) + warmup_time = time.time() - warmup_start + + # benchmark run + run_start = time.time() + with torch.no_grad(): + depth = model(**inputs).predicted_depth + run_time = time.time() - run_start + + logger.info("Warmup: %.2f s, Run: %.4f s", warmup_time, run_time) + logger.info("Output depth shape: %s", depth.shape) # [B, 1, H, W] + + +if __name__ == "__main__": + main() \ No newline at end of file 
diff --git a/torch_compile/run_electra.py b/torch_compile/run_electra.py new file mode 100644 index 0000000000000000000000000000000000000000..4a08356e4c2ff43ad6c449784a2afc687f0a862e --- /dev/null +++ b/torch_compile/run_electra.py @@ -0,0 +1,67 @@ +#!/usr/bin/env python3 +# ELECTRA (discriminator) text-classification on Neuron +import argparse +import logging +import time + +import torch +from transformers import AutoTokenizer, ElectraForSequenceClassification +import torch_neuronx # ensures Neuron backend + +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger(__name__) + + +def main(): + parser = argparse.ArgumentParser(description="Run ELECTRA on Neuron") + parser.add_argument( + "--model", + type=str, + default="google/electra-base-discriminator", + help="ELECTRA model name on Hugging Face Hub", + ) + parser.add_argument("--batch-size", type=int, default=1, help="Batch size") + args = parser.parse_args() + + torch.set_default_dtype(torch.float32) + torch.manual_seed(42) + + # load tokenizer & model + tokenizer = AutoTokenizer.from_pretrained(args.model) + model = ElectraForSequenceClassification.from_pretrained( + args.model, torch_dtype=torch.float32, attn_implementation="eager" + ).eval() + + # tokenize sample + text = "ELECTRA pre-trains a discriminator to detect replaced tokens." 
+ inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True) + + # pre-run to lock shapes + with torch.no_grad(): + _ = model(**inputs).logits + + # compile + model.forward = torch.compile(model.forward, backend="neuron", fullgraph=True) + + # warmup + warmup_start = time.time() + with torch.no_grad(): + _ = model(**inputs) + warmup_time = time.time() - warmup_start + + # benchmark run + run_start = time.time() + with torch.no_grad(): + logits = model(**inputs).logits + run_time = time.time() - run_start + + # top-1 label + predicted_class_id = logits.argmax().item() + predicted_label = model.config.id2label[predicted_class_id] + + logger.info("Warmup: %.2f s, Run: %.4f s", warmup_time, run_time) + logger.info("Predicted label: %s", predicted_label) + + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/torch_compile/run_esm.py b/torch_compile/run_esm.py new file mode 100644 index 0000000000000000000000000000000000000000..aed72af5e72669fdca61ee0f2565cceb0f44d30b --- /dev/null +++ b/torch_compile/run_esm.py @@ -0,0 +1,67 @@ +#!/usr/bin/env python3 +# ESM (Evolutionary Scale Modeling) protein-sequence classification on Neuron +import argparse +import logging +import time + +import torch +from transformers import EsmTokenizer, EsmForSequenceClassification +import torch_neuronx # ensures Neuron backend + +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger(__name__) + + +def main(): + parser = argparse.ArgumentParser(description="Run ESM on Neuron") + parser.add_argument( + "--model", + type=str, + default="facebook/esm2_t33_650M_UR50D", + help="ESM model name on Hugging Face Hub", + ) + parser.add_argument("--batch-size", type=int, default=1, help="Batch size") + args = parser.parse_args() + + torch.set_default_dtype(torch.float32) + torch.manual_seed(42) + + # load tokenizer & model + tokenizer = EsmTokenizer.from_pretrained(args.model) + model = EsmForSequenceClassification.from_pretrained( + args.model, 
#!/usr/bin/env python3
# FlauBERT text-classification on Neuron
import argparse
import logging
import time

import torch
from transformers import FlaubertTokenizer, FlaubertForSequenceClassification
import torch_neuronx  # ensures Neuron backend

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


def main():
    """Run one FlauBERT sequence-classification pass through torch.compile on Neuron."""
    parser = argparse.ArgumentParser(description="Run FlauBERT on Neuron")
    parser.add_argument(
        "--model",
        type=str,
        default="flaubert/flaubert_base_cased",
        help="FlauBERT model name on Hugging Face Hub",
    )
    parser.add_argument("--batch-size", type=int, default=1, help="Batch size")
    args = parser.parse_args()

    torch.set_default_dtype(torch.float32)
    torch.manual_seed(42)

    # FlauBERT's forward asserts `lengths.max().item() <= slen`; Dynamo refuses to
    # trace Tensor.item() unless scalar outputs are captured (see the recorded
    # Unsupported traceback below, which hints exactly this setting).
    torch._dynamo.config.capture_scalar_outputs = True

    # load tokenizer & model (fp32 + eager attention for Neuron compatibility)
    tokenizer = FlaubertTokenizer.from_pretrained(args.model)
    model = FlaubertForSequenceClassification.from_pretrained(
        args.model, torch_dtype=torch.float32, attn_implementation="eager"
    ).eval()

    # tokenize sample
    text = "FlauBERT est un modèle de langue français performant."
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True)

    # pre-run to lock shapes
    with torch.no_grad():
        _ = model(**inputs).logits

    # compile
    model.forward = torch.compile(model.forward, backend="neuron", fullgraph=True)

    # warmup (first compiled call includes compilation time)
    warmup_start = time.time()
    with torch.no_grad():
        _ = model(**inputs)
    warmup_time = time.time() - warmup_start

    # benchmark run
    run_start = time.time()
    with torch.no_grad():
        logits = model(**inputs).logits
    run_time = time.time() - run_start

    # top-1 label. argmax over the class dimension only (not a flat argmax),
    # so this stays correct for batch sizes > 1; same result for one sample.
    predicted_class_id = logits.argmax(-1).item()
    predicted_label = model.config.id2label[predicted_class_id]

    logger.info("Warmup: %.2f s, Run: %.4f s", warmup_time, run_time)
    logger.info("Predicted label: %s", predicted_label)


if __name__ == "__main__":
    main()


"""
torch._dynamo.exc.Unsupported: Unsupported Tensor.item() call with capture_scalar_outputs=False
  Explanation: Dynamo does not support tracing `Tensor.item()` with config.capture_scalar_outputs=False.
  Hint: Set `torch._dynamo.config.capture_scalar_outputs = True` or `export TORCHDYNAMO_CAPTURE_SCALAR_OUTPUTS=1` to include these operations in the captured graph.

from user code:
  File ".../transformers/models/flaubert/modeling_flaubert.py", line 873, in forward
    assert lengths.max().item() <= slen
"""
For even more developer context, set TORCH_LOGS="+dynamo" +""" \ No newline at end of file diff --git a/torch_compile/run_hubert.py b/torch_compile/run_hubert.py new file mode 100644 index 0000000000000000000000000000000000000000..0497093ae66361ef36bfd034033231f2cfbe79ad --- /dev/null +++ b/torch_compile/run_hubert.py @@ -0,0 +1,85 @@ +#!/usr/bin/env python3 +# HuBERT-CTC speech-recognition on Neuron +import argparse +import logging +import time + +import torch +from transformers import AutoProcessor, HubertForCTC +from datasets import load_dataset +import torch_neuronx # ensures Neuron backend +from torch.nn.utils import remove_weight_norm + +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger(__name__) + + +def main(): + parser = argparse.ArgumentParser(description="Run HuBERT-CTC on Neuron") + parser.add_argument( + "--model", + type=str, + default="hf-internal-testing/tiny-random-HubertModel", + help="HuBERT-CTC model name on Hugging Face Hub", + ) + args = parser.parse_args() + + torch.set_default_dtype(torch.float32) + torch.manual_seed(42) + + # load small speech snippet + dataset = load_dataset("hf-internal-testing/librispeech_asr_demo", "clean", split="validation") + sample = dataset[0]["audio"]["array"] # 16 kHz numpy array + + # processor + HuBERT-CTC model + processor = AutoProcessor.from_pretrained(args.model) + model = HubertForCTC.from_pretrained( + args.model, torch_dtype=torch.float32, attn_implementation="eager" + ).eval() + for m in model.modules(): + if hasattr(m, "weight_g") and hasattr(m, "weight_v"): + remove_weight_norm(m) + + # preprocess + inputs = processor(sample, sampling_rate=16_000, return_tensors="pt", padding=True) + + # pre-run to lock shapes + with torch.no_grad(): + _ = model(**inputs).logits + + # compile + model.forward = torch.compile(model.forward, backend="neuron", fullgraph=True) + + # warmup + warmup_start = time.time() + with torch.no_grad(): + _ = model(**inputs) + warmup_time = time.time() - warmup_start 
#!/usr/bin/env python3
# LeViT vision-classification on Neuron
import argparse
import logging
import time

import torch
from transformers import AutoImageProcessor, LevitForImageClassification
from datasets import load_dataset
import torch_neuronx  # ensures Neuron backend

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


def main():
    """Benchmark a single LeViT image-classification pass compiled for Neuron."""
    parser = argparse.ArgumentParser(description="Run LeViT on Neuron")
    parser.add_argument(
        "--model",
        type=str,
        default="facebook/levit-128S",
        help="LeViT model name on Hugging Face Hub",
    )
    args = parser.parse_args()

    torch.set_default_dtype(torch.float32)
    torch.manual_seed(42)

    # one sample image from the cats-image dataset
    image = load_dataset("huggingface/cats-image")["test"]["image"][0]

    # processor + model (fp32, eager attention)
    processor = AutoImageProcessor.from_pretrained(args.model)
    model = LevitForImageClassification.from_pretrained(
        args.model, torch_dtype=torch.float32, attn_implementation="eager"
    ).eval()

    inputs = processor(images=image, return_tensors="pt")

    # eager pass first so input shapes are locked before compilation
    with torch.no_grad():
        model(**inputs)

    # compile forward (single graph)
    model.forward = torch.compile(model.forward, backend="neuron", fullgraph=True)

    # first compiled call doubles as warmup (includes compilation)
    t0 = time.time()
    with torch.no_grad():
        model(**inputs)
    warmup_time = time.time() - t0

    # timed steady-state pass
    t1 = time.time()
    with torch.no_grad():
        logits = model(**inputs).logits
    run_time = time.time() - t1

    # map the top-1 logit to its ImageNet label
    top1 = logits.argmax(-1).item()
    predicted_label = model.config.id2label[top1]

    logger.info("Warmup: %.2f s, Run: %.4f s", warmup_time, run_time)
    logger.info("Predicted label: %s", predicted_label)


if __name__ == "__main__":
    main()
parser = argparse.ArgumentParser(description="Run MobileBERT on Neuron") + parser.add_argument( + "--model", + type=str, + default="google/mobilebert-uncased", + help="MobileBERT model name on Hugging Face Hub", + ) + parser.add_argument("--batch-size", type=int, default=1, help="Batch size") + args = parser.parse_args() + + torch.set_default_dtype(torch.float32) + torch.manual_seed(42) + + # load tokenizer & model + tokenizer = AutoTokenizer.from_pretrained(args.model) + model = MobileBertForSequenceClassification.from_pretrained( + args.model, torch_dtype=torch.float32, attn_implementation="eager" + ).eval() + + # tokenize sample + text = "MobileBERT is a compact BERT for on-device NLP." + inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True) + + # pre-run to lock shapes + with torch.no_grad(): + _ = model(**inputs).logits + + # compile + model.forward = torch.compile(model.forward, backend="neuron", fullgraph=True) + + # warmup + warmup_start = time.time() + with torch.no_grad(): + _ = model(**inputs) + warmup_time = time.time() - warmup_start + + # benchmark run + run_start = time.time() + with torch.no_grad(): + logits = model(**inputs).logits + run_time = time.time() - run_start + + # top-1 label + predicted_class_id = logits.argmax().item() + predicted_label = model.config.id2label[predicted_class_id] + + logger.info("Warmup: %.2f s, Run: %.4f s", warmup_time, run_time) + logger.info("Predicted label: %s", predicted_label) + + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/torch_compile/run_mobilenetv2.py b/torch_compile/run_mobilenetv2.py new file mode 100644 index 0000000000000000000000000000000000000000..ead903813de9119822a71c2b9cacccc82447853d --- /dev/null +++ b/torch_compile/run_mobilenetv2.py @@ -0,0 +1,71 @@ +#!/usr/bin/env python3 +# MobileNetV2 image-classification on Neuron +import argparse +import logging +import time + +import torch +from torchvision import transforms +from transformers 
import AutoImageProcessor, MobileNetV2ForImageClassification +from datasets import load_dataset +import torch_neuronx # ensures Neuron backend + +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger(__name__) + + +def main(): + parser = argparse.ArgumentParser(description="Run MobileNetV2 on Neuron") + parser.add_argument( + "--model", + type=str, + default="google/mobilenet_v2_1.0_224", + help="MobileNetV2 model name on Hugging Face Hub", + ) + args = parser.parse_args() + + torch.set_default_dtype(torch.float32) + torch.manual_seed(42) + + # load dataset image + dataset = load_dataset("huggingface/cats-image") + image = dataset["test"]["image"][0] + + # load processor & MobileNetV2 model + processor = AutoImageProcessor.from_pretrained(args.model) + model = MobileNetV2ForImageClassification.from_pretrained( + args.model, torch_dtype=torch.float32, attn_implementation="eager" + ).eval() + + # preprocess + inputs = processor(images=image, return_tensors="pt") + + # pre-run to lock shapes + with torch.no_grad(): + _ = model(**inputs).logits + + # compile + model.forward = torch.compile(model.forward, backend="neuron", fullgraph=True) + + # warmup + warmup_start = time.time() + with torch.no_grad(): + _ = model(**inputs) + warmup_time = time.time() - warmup_start + + # benchmark run + run_start = time.time() + with torch.no_grad(): + logits = model(**inputs).logits + run_time = time.time() - run_start + + # top-1 ImageNet class + predicted_class_idx = logits.argmax(-1).item() + predicted_label = model.config.id2label[predicted_class_idx] + + logger.info("Warmup: %.2f s, Run: %.4f s", warmup_time, run_time) + logger.info("Predicted label: %s", predicted_label) + + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/torch_compile/run_mobilevit.py b/torch_compile/run_mobilevit.py new file mode 100644 index 0000000000000000000000000000000000000000..bf99f0045287d8e941a8666dda148f9b7bd107fb --- /dev/null +++ 
# b/torch_compile/run_mobilevit.py @@ -0,0 +1,70 @@   (patch metadata kept as a comment)
#!/usr/bin/env python3
# MobileViT image-classification on Neuron
import argparse
import logging
import time

import torch
from transformers import AutoImageProcessor, MobileViTForImageClassification
from datasets import load_dataset
import torch_neuronx  # ensures Neuron backend

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


def main():
    """Benchmark MobileViT image classification through the Neuron backend."""
    parser = argparse.ArgumentParser(description="Run MobileViT on Neuron")
    parser.add_argument(
        "--model",
        type=str,
        default="apple/mobilevit-small",
        help="MobileViT model name on Hugging Face Hub",
    )
    args = parser.parse_args()

    torch.manual_seed(42)
    torch.set_default_dtype(torch.float32)

    # single sample image from the demo dataset
    image = load_dataset("huggingface/cats-image")["test"]["image"][0]

    processor = AutoImageProcessor.from_pretrained(args.model)
    model = MobileViTForImageClassification.from_pretrained(
        args.model, torch_dtype=torch.float32, attn_implementation="eager"
    ).eval()

    inputs = processor(images=image, return_tensors="pt")

    # eager pre-run locks the input shapes before compilation
    with torch.no_grad():
        _ = model(**inputs).logits

    model.forward = torch.compile(model.forward, backend="neuron", fullgraph=True)

    def timed_call():
        # one no-grad forward pass, returning (outputs, elapsed seconds)
        t0 = time.time()
        with torch.no_grad():
            out = model(**inputs)
        return out, time.time() - t0

    _, warmup_time = timed_call()      # first call triggers Neuron compilation
    outputs, run_time = timed_call()   # steady-state latency
    logits = outputs.logits

    # map the winning logit to its ImageNet label
    best = int(logits.argmax(-1).item())
    label = model.config.id2label[best]

    logger.info("Warmup: %.2f s, Run: %.4f s", warmup_time, run_time)
    logger.info("Predicted label: %s", label)


if __name__ == "__main__":
    main()
# \ No newline at end of file
# diff --git a/torch_compile/run_modernbert.py   (next patch entry continues past this span)
# b/torch_compile/run_modernbert.py new file mode 100644
# index 0000000000000000000000000000000000000000..72f0782510b5123f7a85a41f66e7fa8ec433956d
# --- /dev/null +++ b/torch_compile/run_modernbert.py @@ -0,0 +1,66 @@   (patch metadata kept as a comment)
#!/usr/bin/env python3
# ModernBERT-base text-classification on Neuron
import argparse
import logging
import time

import torch
from transformers import AutoTokenizer, ModernBertForSequenceClassification
import torch_neuronx  # ensures Neuron backend

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


def main():
    """Compile ModernBERT's forward with the Neuron backend and time one inference."""
    cli = argparse.ArgumentParser(description="Run ModernBERT on Neuron")
    cli.add_argument(
        "--model",
        type=str,
        default="answerdotai/ModernBERT-base",
        help="ModernBERT model name on Hugging Face Hub",
    )
    args = cli.parse_args()

    torch.set_default_dtype(torch.float32)
    torch.manual_seed(42)

    tokenizer = AutoTokenizer.from_pretrained(args.model)
    model = ModernBertForSequenceClassification.from_pretrained(
        args.model, torch_dtype=torch.float32, attn_implementation="eager"
    ).eval()

    text = "Hello, my dog is cute"
    encoded = tokenizer(text, return_tensors="pt", padding=True, truncation=True)

    # one eager pass so shapes are frozen before compilation
    with torch.no_grad():
        _ = model(**encoded).logits

    # compile (full graph for single encoder)
    model.forward = torch.compile(model.forward, backend="neuron", fullgraph=True)

    started = time.time()
    with torch.no_grad():
        _ = model(**encoded)
    warmup_elapsed = time.time() - started

    started = time.time()
    with torch.no_grad():
        scores = model(**encoded).logits
    run_elapsed = time.time() - started

    # highest-scoring class index -> human-readable label
    winner = scores.argmax().item()
    logger.info("Warmup: %.2f s, Run: %.4f s", warmup_elapsed, run_elapsed)
    logger.info("Predicted label: %s", model.config.id2label[winner])


if __name__ == "__main__":
    main()
"__main__": + main() \ No newline at end of file diff --git a/torch_compile/run_mpnet.py b/torch_compile/run_mpnet.py new file mode 100644 index 0000000000000000000000000000000000000000..b40447871759eb717f9a8bd1b013953db937f184 --- /dev/null +++ b/torch_compile/run_mpnet.py @@ -0,0 +1,62 @@ +#!/usr/bin/env python3 +# MPNet sentence-embedding on Neuron +import argparse +import logging +import time + +import torch +from transformers import AutoTokenizer, MPNetModel +import torch_neuronx # ensures Neuron backend + +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger(__name__) + + +def main(): + parser = argparse.ArgumentParser(description="Run MPNet encoder on Neuron") + parser.add_argument( + "--model", + type=str, + default="microsoft/mpnet-base", + help="MPNet model name on Hugging Face Hub", + ) + args = parser.parse_args() + + torch.set_default_dtype(torch.float32) + torch.manual_seed(42) + + # load tokenizer & model + tokenizer = AutoTokenizer.from_pretrained(args.model) + model = MPNetModel.from_pretrained( + args.model, torch_dtype=torch.float32, attn_implementation="eager" + ).eval() + + # tokenize sample sentence + text = "MPNet is a variant of BERT with permutation language modeling." 
# NOTE(review): this span of the collapsed patch also carried the tail of
# run_mpnet.py and the head of run_phi.py's main(); only the complete
# definition visible here -- run_phi.py's greedy_generate -- is reconstructed.
@torch.no_grad()
def greedy_generate(model_forward, tokenizer, input_ids, max_new_tokens):
    """Manual greedy loop. Calls the *compiled* forward iteratively.

    Fix over the original: the prompt is right-padded to a fixed seq_len
    (main() tokenizes with padding="max_length"), so reading
    ``logits[:, -1, :]`` decoded from the logit at a PAD position and the
    rolling window then dropped real prompt tokens -- producing garbage
    continuations (see the captured phi3 run log in this patch).  We now
    track the index of the last real (non-pad) token and write each new
    token into the next pad slot, which also keeps the tensor shape static
    for the compiled forward.

    Args:
        model_forward: compiled ``model.forward`` taking (input_ids, position_ids).
        tokenizer: used only for ``pad_token_id``; assumes right padding
            (main() sets pad_token = eos_token).
        input_ids: (B, seq_len) prompt tensor, right-padded to a fixed length.
        max_new_tokens: number of tokens to generate greedily.

    Returns:
        (B, seq_len) tensor with the generated tokens filled in after the
        prompt (or a rolling window once the buffer is full).
    """
    B, seq_len = input_ids.shape
    device = input_ids.device
    position_ids = torch.arange(seq_len, dtype=torch.long, device=device).unsqueeze(0).expand(B, -1)

    pad_id = tokenizer.pad_token_id
    # One past the last real token. Row 0 is representative: the script runs
    # B == 1; for B > 1 this assumes all rows share the same prompt length.
    if pad_id is None:
        cur_len = seq_len
    else:
        cur_len = max(int((input_ids[0] != pad_id).sum().item()), 1)

    input_ids = input_ids.clone()  # do not mutate the caller's tensor
    for _ in range(max_new_tokens):
        logits = model_forward(input_ids, position_ids)[0]  # unpack tuple
        next_id = logits[:, cur_len - 1, :].argmax(dim=-1)
        if cur_len < seq_len:
            input_ids[:, cur_len] = next_id  # fill the next pad slot
            cur_len += 1
        else:
            # buffer full: fall back to the original rolling window
            input_ids = torch.cat([input_ids, next_id.unsqueeze(1)], dim=1)[:, -seq_len:]
    return input_ids
# NOTE(review): this span of the collapsed patch also carried the tail of
# run_phi.py's main() and the head of run_phi3.py's main(); only the complete
# definition visible here -- run_phi3.py's greedy_generate -- is reconstructed.
@torch.no_grad()
def greedy_generate(model_forward, tokenizer, input_ids, max_new_tokens):
    """Greedy decode with a compiled, fixed-shape forward.

    Fix over the original: main() right-pads the prompt to seq_len, so
    ``logits[:, -1, :]`` selected the prediction at a PAD position and the
    rolling window discarded prompt tokens; the captured run log in this
    patch shows the resulting garbage output.  Instead, locate the last
    real token via ``tokenizer.pad_token_id`` and write each new token
    into the next pad slot -- the (B, seq_len) shape stays constant, so
    the compiled graph is reused every step.

    Args:
        model_forward: compiled ``model.forward`` taking (input_ids, position_ids).
        tokenizer: supplies ``pad_token_id``; right padding is assumed.
        input_ids: (B, seq_len) right-padded prompt tensor.
        max_new_tokens: number of greedy steps.

    Returns:
        (B, seq_len) tensor containing prompt plus generated tokens.
    """
    B, seq_len = input_ids.shape
    device = input_ids.device
    position_ids = torch.arange(seq_len, dtype=torch.long, device=device).unsqueeze(0).expand(B, -1)
    # position_ids stays identical (fixed seq_len)

    pad_id = tokenizer.pad_token_id
    # Count of real tokens, taken from row 0 (script uses B == 1; for B > 1
    # this assumes all rows were padded to the same prompt length).
    if pad_id is None:
        cur_len = seq_len
    else:
        cur_len = max(int((input_ids[0] != pad_id).sum().item()), 1)

    input_ids = input_ids.clone()  # keep the caller's tensor intact
    for _ in range(max_new_tokens):
        logits = model_forward(input_ids, position_ids)[0]
        next_id = logits[:, cur_len - 1, :].argmax(dim=-1)
        if cur_len < seq_len:
            input_ids[:, cur_len] = next_id  # write into the next pad slot
            cur_len += 1
        else:
            # window exhausted: original rolling-window behavior
            input_ids = torch.cat([input_ids, next_id.unsqueeze(1)], dim=1)[:, -seq_len:]
    return input_ids
trust_remote_code=True) + model = AutoModelForCausalLM.from_pretrained( + args.model, + torch_dtype=torch.float32, + attn_implementation="eager", + use_cache=False, # static shapes + ).eval() + + # fixed-shape prompt + prompt = "The future of AI is" + inputs = tokenizer(prompt, max_length=args.seq_len, padding="max_length", truncation=True, return_tensors="pt") + input_ids = inputs.input_ids + B, seq_len = input_ids.shape + + # shape lock & compile forward only (full graph) + with torch.no_grad(): + position_ids = torch.arange(seq_len, dtype=torch.long).unsqueeze(0).expand(B, -1) + _ = model(input_ids, position_ids) + model.forward = torch.compile(model.forward, backend="neuron", fullgraph=True) + + # warmup + start = time.time() + with torch.no_grad(): + _ = model(input_ids, position_ids) + logger.info("Warmup (forward): %.3f s", time.time() - start) + + # manual greedy generation + start = time.time() + final_ids = greedy_generate(model.forward, tokenizer, input_ids, args.new_tokens) + logger.info("Generate (manual loop): %.3f s", time.time() - start) + + text = tokenizer.decode(final_ids[0], skip_special_tokens=True) + logger.info("Output: %s", text) + + +if __name__ == "__main__": + main() + +""" +/usr/local/lib/python3.10/site-packages/torch_mlir/dialects/stablehlo/__init__.py:24: UserWarning: Could not import StableHLO C++ extension: libStablehloUnifiedPythonCAPI.so.22.0git: cannot open shared object file: No such file or directory + warnings.warn(f"Could not import StableHLO C++ extension: {e}") +`torch_dtype` is deprecated! Use `dtype` instead! 
+Loading checkpoint shards: 100%|█████████████████████████████████████████████████████████████████████████| 2/2 [00:01<00:00, 1.90it/s] +INFO:__main__:Warmup (forward): 19.975 s +INFO:__main__:Generate (manual loop): 271.678 s +INFO:__main__:Output: The future of AI is +: 1iewer +I'melissa' +""" \ No newline at end of file diff --git a/torch_compile/run_roberta.py b/torch_compile/run_roberta.py new file mode 100644 index 0000000000000000000000000000000000000000..cd9cfc29ca5873193b52d35adc3c776afb8693d4 --- /dev/null +++ b/torch_compile/run_roberta.py @@ -0,0 +1,67 @@ +#!/usr/bin/env python3 +# RoBERTa text-classification on Neuron – full graph compile +import argparse +import logging +import time + +import torch +from transformers import AutoTokenizer, RobertaForSequenceClassification +import torch_neuronx # guarantees Neuron backend + +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger(__name__) + + +def main(): + parser = argparse.ArgumentParser(description="RoBERTa on Neuron (full graph)") + parser.add_argument( + "--model", + type=str, + default="roberta-base", + help="RoBERTa model name on Hugging Face Hub", + ) + parser.add_argument("--batch-size", type=int, default=1, help="Batch size") + args = parser.parse_args() + + torch.set_default_dtype(torch.float32) + torch.manual_seed(42) + + # load tokenizer & model + tokenizer = AutoTokenizer.from_pretrained(args.model) + model = RobertaForSequenceClassification.from_pretrained( + args.model, torch_dtype=torch.float32, attn_implementation="eager" + ).eval() + + # tokenize sample + text = "RoBERTa is a robustly optimized BERT pretraining approach." 
+ inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True) + + # pre-run to lock shapes + with torch.no_grad(): + _ = model(**inputs).logits + + # compile full graph + model.forward = torch.compile(model.forward, backend="neuron", fullgraph=True) + + # warmup + warmup_start = time.time() + with torch.no_grad(): + _ = model(**inputs) + warmup_time = time.time() - warmup_start + + # benchmark run + run_start = time.time() + with torch.no_grad(): + logits = model(**inputs).logits + run_time = time.time() - run_start + + # top-1 label + predicted_class_id = logits.argmax().item() + predicted_label = model.config.id2label[predicted_class_id] + + logger.info("Warmup: %.2f s, Run: %.4f s", warmup_time, run_time) + logger.info("Predicted label: %s", predicted_label) + + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/torch_compile/run_roformer.py b/torch_compile/run_roformer.py new file mode 100644 index 0000000000000000000000000000000000000000..5365215bc97bf4d866aa552e81dcd037cea8cf97 --- /dev/null +++ b/torch_compile/run_roformer.py @@ -0,0 +1,67 @@ +#!/usr/bin/env python3 +# RoFormer (Rotary-position Transformer) text-classification on Neuron – full graph +import argparse +import logging +import time + +import torch +from transformers import AutoTokenizer, RoFormerForSequenceClassification +import torch_neuronx # guarantees Neuron backend + +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger(__name__) + + +def main(): + parser = argparse.ArgumentParser(description="RoFormer on Neuron (full graph)") + parser.add_argument( + "--model", + type=str, + default="junnyu/roformer_chinese_base", + help="RoFormer model name on Hugging Face Hub", + ) + parser.add_argument("--batch-size", type=int, default=1, help="Batch size") + args = parser.parse_args() + + torch.set_default_dtype(torch.float32) + torch.manual_seed(42) + + # load tokenizer & model + tokenizer = AutoTokenizer.from_pretrained(args.model) + model = 
RoFormerForSequenceClassification.from_pretrained( + args.model, torch_dtype=torch.float32, attn_implementation="eager" + ).eval() + + # tokenize sample + text = "RoFormer uses rotary position embeddings." + inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True) + + # pre-run to lock shapes + with torch.no_grad(): + _ = model(**inputs).logits + + # compile full graph + model.forward = torch.compile(model.forward, backend="neuron", fullgraph=True) + + # warmup + warmup_start = time.time() + with torch.no_grad(): + _ = model(**inputs) + warmup_time = time.time() - warmup_start + + # benchmark run + run_start = time.time() + with torch.no_grad(): + logits = model(**inputs).logits + run_time = time.time() - run_start + + # top-1 label + predicted_class_id = logits.argmax().item() + predicted_label = model.config.id2label[predicted_class_id] + + logger.info("Warmup: %.2f s, Run: %.4f s", warmup_time, run_time) + logger.info("Predicted label: %s", predicted_label) + + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/torch_compile/run_sam2.py b/torch_compile/run_sam2.py new file mode 100644 index 0000000000000000000000000000000000000000..b9b257ab36b0f6d72c92eca533a5de2218d5d7f2 --- /dev/null +++ b/torch_compile/run_sam2.py @@ -0,0 +1,56 @@ +#!/usr/bin/env python3 +# SAM encoder on Neuron – constant-shape, no lambda +import argparse +import logging +import time +import torch +from transformers import SamProcessor, SamModel +from PIL import Image +import torch_neuronx # guarantees Neuron backend + +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger(__name__) + + +def main(): + parser = argparse.ArgumentParser(description="SAM encoder on Neuron (full graph)") + parser.add_argument("--model", default="facebook/sam-vit-base") + args = parser.parse_args() + + torch.manual_seed(42) + torch.set_default_dtype(torch.float32) + + # load processor & model + processor = SamProcessor.from_pretrained(args.model) + model 
= SamModel.from_pretrained(args.model, attn_implementation="eager").eval() + + # dummy 224×224 RGB image + dummy_image = Image.new("RGB", (224, 224), color="red") + # constant-shape inputs (no points → encoder only) + inputs = processor(images=dummy_image, return_tensors="pt") + + # pre-run to lock shapes + with torch.no_grad(): + _ = model.get_image_embeddings(**inputs) + + # compile encoder forward (full graph) + model.get_image_embeddings = torch.compile( + model.get_image_embeddings, backend="neuron", fullgraph=True + ) + + # warmup + start = time.time() + with torch.no_grad(): + _ = model.get_image_embeddings(**inputs) + logger.info("Warmup: %.3f s", time.time() - start) + + # benchmark + start = time.time() + with torch.no_grad(): + embeddings = model.get_image_embeddings(**inputs) + logger.info("Run: %.3f s", time.time() - start) + logger.info("Embedding shape: %s", embeddings.shape) # [1, 256, 64, 64] + + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/torch_compile/run_swin.py b/torch_compile/run_swin.py new file mode 100644 index 0000000000000000000000000000000000000000..5f8bea674b50cd0b104f22f44e5760dbd1bb4f3a --- /dev/null +++ b/torch_compile/run_swin.py @@ -0,0 +1,76 @@ +#!/usr/bin/env python3 +# Swin Transformer image-classification on Neuron – full graph +import argparse +import logging +import time + +import torch +from transformers import AutoImageProcessor, SwinForImageClassification +from datasets import load_dataset +import torch_neuronx # guarantees Neuron backend + +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger(__name__) + + +def main(): + parser = argparse.ArgumentParser(description="Swin on Neuron (full graph)") + parser.add_argument("--model", default="microsoft/swin-tiny-patch4-window7-224") + args = parser.parse_args() + + torch.manual_seed(42) + torch.set_default_dtype(torch.float32) + + # load dataset image + dataset = load_dataset("huggingface/cats-image") + image = 
dataset["test"]["image"][0] + + # load processor & model + processor = AutoImageProcessor.from_pretrained(args.model) + model = SwinForImageClassification.from_pretrained( + args.model, torch_dtype=torch.float32, attn_implementation="eager" + ).eval() + + # preprocess + inputs = processor(images=image, return_tensors="pt") + + # pre-run to lock shapes + with torch.no_grad(): + _ = model(**inputs).logits + + # compile full graph + model.forward = torch.compile(model.forward, backend="neuron", fullgraph=True) + + # warmup + warmup_start = time.time() + with torch.no_grad(): + _ = model(**inputs) + logger.info("Warmup: %.3f s", time.time() - warmup_start) + + # benchmark run + run_start = time.time() + with torch.no_grad(): + logits = model(**inputs).logits + run_time = time.time() - run_start + + # top-1 ImageNet class + predicted_class_idx = logits.argmax(-1).item() + predicted_label = model.config.id2label[predicted_class_idx] + + logger.info("Run: %.3f s", run_time) + logger.info("Predicted label: %s", predicted_label) + + +if __name__ == "__main__": + main() + +""" +/usr/local/lib/python3.10/site-packages/transformers/models/swin/modeling_swin.py:611:0: error: failed to legalize operation 'torch.aten.fill.Tensor' +/usr/local/lib/python3.10/site-packages/transformers/models/swin/modeling_swin.py:662:0: note: called from +/usr/local/lib/python3.10/site-packages/transformers/models/swin/modeling_swin.py:736:0: note: called from +/usr/local/lib/python3.10/site-packages/transformers/modeling_layers.py:94:0: note: called from +/usr/local/lib/python3.10/site-packages/transformers/models/swin/modeling_swin.py:806:0: note: called from +/usr/local/lib/python3.10/site-packages/transformers/models/swin/modeling_swin.py:945:0: note: called from +/usr/local/lib/python3.10/site-packages/transformers/models/swin/modeling_swin.py:1139:0: note: called from +/usr/local/lib/python3.10/site-packages/transformers/models/swin/modeling_swin.py:611:0: note: see current operation: %1014 = 
"torch.aten.fill.Tensor"(%1013, %778) : (!torch.vtensor<[1,49,49,1],f32>, !torch.vtensor<[],f32>) -> !torch.vtensor<[1,49,49,1],f32> +""" \ No newline at end of file diff --git a/torch_compile/run_t5_decoder.py b/torch_compile/run_t5_decoder.py new file mode 100644 index 0000000000000000000000000000000000000000..2191794007c0ec525fd6d838a7bed81e65920549 --- /dev/null +++ b/torch_compile/run_t5_decoder.py @@ -0,0 +1,66 @@ +#!/usr/bin/env python3 +# T5 decoder (no cache) on Neuron – constant shapes, full graph, no Apex +import os +os.environ["USE_FUSED_LAYER_NORM"] = "0" # MUST be before any transformers import + +import argparse +import logging +import time +import torch +from transformers import T5Tokenizer, T5Model +import torch_neuronx # guarantees Neuron backend + +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger(__name__) + + +def main(): + parser = argparse.ArgumentParser(description="T5 decoder on Neuron (full graph, no cache)") + parser.add_argument("--model", default="t5-small") + parser.add_argument("--seq-len", type=int, default=128, help="Fixed seq length") + args = parser.parse_args() + + torch.manual_seed(42) + torch.set_default_dtype(torch.float32) + + tokenizer = T5Tokenizer.from_pretrained(args.model) + # disable DynamicCache → no deepcopy of config + model = T5Model.from_pretrained( + args.model, + torch_dtype=torch.float32, + attn_implementation="eager", + use_cache=False, # <-- static shapes, no cache + ).eval() + + # constant-shape inputs + text = "hello" + enc_tok = tokenizer(text, max_length=args.seq_len, padding="max_length", truncation=True, return_tensors="pt") + with torch.no_grad(): + enc_out = model.encoder(input_ids=enc_tok.input_ids).last_hidden_state.detach() + + dec_tok = tokenizer("", max_length=args.seq_len, padding="max_length", return_tensors="pt") + + # pre-run to lock shapes + with torch.no_grad(): + _ = model.decoder(input_ids=dec_tok.input_ids, encoder_hidden_states=enc_out).last_hidden_state + + # compile 
decoder forward only (full graph) + decode_fn = lambda inp, enc: model.decoder(input_ids=inp, encoder_hidden_states=enc).last_hidden_state + decode_fn = torch.compile(decode_fn, backend="neuron", fullgraph=True) + + # warmup + start = time.time() + with torch.no_grad(): + _ = decode_fn(dec_tok.input_ids, enc_out) + logger.info("Warmup: %.3f s", time.time() - start) + + # benchmark + start = time.time() + with torch.no_grad(): + hidden = decode_fn(dec_tok.input_ids, enc_out) + logger.info("Run: %.3f s", time.time() - start) + logger.info("Hidden shape: %s", hidden.shape) # [B, seq_len, d_model] + + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/torch_compile/run_t5_encoder.py b/torch_compile/run_t5_encoder.py new file mode 100644 index 0000000000000000000000000000000000000000..1944d8e2422ccc905bc30261c8043b190790091f --- /dev/null +++ b/torch_compile/run_t5_encoder.py @@ -0,0 +1,59 @@ +#!/usr/bin/env python3 +# T5 encoder on Neuron – no Apex, full graph, constant shapes +import os +os.environ["USE_FUSED_LAYER_NORM"] = "0" # <── disable Apex + +import argparse +import logging +import time +import torch +from transformers import T5Tokenizer, T5Model # use T5Model (no LM head) +from datasets import load_dataset +import torch_neuronx # guarantees Neuron backend + +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger(__name__) + + +def main(): + parser = argparse.ArgumentParser(description="T5 encoder on Neuron (full graph)") + parser.add_argument("--model", default="t5-small") + parser.add_argument("--seq-len", type=int, default=128, help="Fixed seq length") + args = parser.parse_args() + + torch.manual_seed(42) + torch.set_default_dtype(torch.float32) + + tokenizer = T5Tokenizer.from_pretrained(args.model) + model = T5Model.from_pretrained( + args.model, torch_dtype=torch.float32, attn_implementation="eager" + ).eval() + + # fixed-shape input + text = "translate English to French: The cat is on the mat." 
+ inputs = tokenizer(text, max_length=args.seq_len, padding="max_length", truncation=True, return_tensors="pt") + + # pre-run to lock shapes + with torch.no_grad(): + _ = model.encoder(**inputs).last_hidden_state + + # compile encoder forward only (full graph) + encode_fn = lambda **kw: model.encoder(**kw).last_hidden_state + encode_fn = torch.compile(encode_fn, backend="neuron", fullgraph=True) + + # warmup + start = time.time() + with torch.no_grad(): + _ = encode_fn(**inputs) + logger.info("Warmup: %.3f s", time.time() - start) + + # benchmark + start = time.time() + with torch.no_grad(): + hidden = encode_fn(**inputs) + logger.info("Run: %.3f s", time.time() - start) + logger.info("Hidden shape: %s", hidden.shape) # [B, seq_len, d_model] + + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/torch_compile/run_unispeech.py b/torch_compile/run_unispeech.py new file mode 100644 index 0000000000000000000000000000000000000000..632b089f680f3d9ea6c7a32d6540af8de696fec9 --- /dev/null +++ b/torch_compile/run_unispeech.py @@ -0,0 +1,62 @@ +#!/usr/bin/env python3 +# UniSpeech (non-SAT) CTC speech-recognition on Neuron – constant shapes, full graph +import argparse +import logging +import time +import torch +from transformers import AutoProcessor, UniSpeechForCTC +from datasets import load_dataset +import torch_neuronx # guarantees Neuron backend + +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger(__name__) + + +def main(): + parser = argparse.ArgumentParser(description="UniSpeech CTC on Neuron (full graph)") + parser.add_argument("--model", default="microsoft/unispeech-large-1500h-cv") + args = parser.parse_args() + + torch.manual_seed(42) + torch.set_default_dtype(torch.float32) + + # load small speech snippet + dataset = load_dataset("hf-internal-testing/librispeech_asr_demo", "clean", split="validation") + sample = dataset[0]["audio"]["array"] # 16 kHz numpy array + sampling_rate = dataset.features["audio"].sampling_rate + 
+ # processor + CTC model (non-SAT) + processor = AutoProcessor.from_pretrained(args.model) + model = UniSpeechForCTC.from_pretrained( + args.model, torch_dtype=torch.float32, attn_implementation="eager" + ).eval() + + # preprocess – fixed-length audio (4 s) + inputs = processor(sample, sampling_rate=sampling_rate, max_length=4 * 16_000, padding="max_length", return_tensors="pt") + + # pre-run to lock shapes + with torch.no_grad(): + _ = model(**inputs).logits + + # compile forward (full graph) + model.forward = torch.compile(model.forward, backend="neuron", fullgraph=True) + + # warmup + start = time.time() + with torch.no_grad(): + _ = model(**inputs) + logger.info("Warmup: %.3f s", time.time() - start) + + # benchmark + decode + start = time.time() + with torch.no_grad(): + logits = model(**inputs).logits + logger.info("Run: %.3f s", time.time() - start) + + predicted_ids = torch.argmax(logits, dim=-1) + transcription = processor.batch_decode(predicted_ids)[0] + logger.info("Transcription: %s", transcription) + + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/torch_compile/run_unispeech_sat.py b/torch_compile/run_unispeech_sat.py new file mode 100644 index 0000000000000000000000000000000000000000..b41fd98ac3960685edaae05c7a24ace63cb68bbd --- /dev/null +++ b/torch_compile/run_unispeech_sat.py @@ -0,0 +1,67 @@ +#!/usr/bin/env python3 +# UniSpeech-SAT encoder on Neuron – full graph, constant shapes +import argparse +import logging +import time +import torch +from transformers import Wav2Vec2Processor, UniSpeechSatModel +from datasets import load_dataset +import torch_neuronx # guarantees Neuron backend + +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger(__name__) + + +def main(): + parser = argparse.ArgumentParser(description="UniSpeech-SAT encoder on Neuron (full graph)") + parser.add_argument("--model", default="microsoft/unispeech-sat-base-100h-libri-ft") + args = parser.parse_args() + + torch.manual_seed(42) + 
torch.set_default_dtype(torch.float32) + + # load small speech snippet + dataset = load_dataset("hf-internal-testing/librispeech_asr_demo", "clean", split="validation") + sample = dataset[0]["audio"]["array"] # 16 kHz numpy array + + # processor + UniSpeech-SAT encoder (no LM head) + processor = Wav2Vec2Processor.from_pretrained(args.model) + model = UniSpeechSatModel.from_pretrained( + args.model, torch_dtype=torch.float32, attn_implementation="eager" + ).eval() + + # preprocess – fixed-length audio (pad to 4 s) + inputs = processor(sample, sampling_rate=16_000, max_length=4 * 16_000, padding="max_length", return_tensors="pt") + + # pre-run to lock shapes + with torch.no_grad(): + _ = model(**inputs).last_hidden_state + + # compile encoder forward (full graph) + model.forward = torch.compile(model.forward, backend="neuron", fullgraph=True) + + # warmup + start = time.time() + with torch.no_grad(): + _ = model(**inputs) + logger.info("Warmup: %.3f s", time.time() - start) + + # benchmark run + start = time.time() + with torch.no_grad(): + hidden = model(**inputs).last_hidden_state + logger.info("Run: %.3f s", time.time() - start) + logger.info("Output hidden shape: %s", hidden.shape) # [B, T, hidden] + + +if __name__ == "__main__": + main() + +""" +/usr/local/lib/python3.10/site-packages/torch/nn/utils/parametrizations.py:325:0: error: number of output elements (4718592) doesn't match expected number of elements (128) +/usr/local/lib/python3.10/site-packages/torch/nn/utils/parametrize.py:303:0: note: called from +/usr/local/lib/python3.10/site-packages/torch/nn/utils/parametrize.py:407:0: note: called from +/usr/local/lib/python3.10/site-packages/transformers/models/unispeech_sat/modeling_unispeech_sat.py:140:0: note: called from +/usr/local/lib/python3.10/site-packages/transformers/models/unispeech_sat/modeling_unispeech_sat.py:485:0: note: called from +/usr/local/lib/python3.10/site-packages/transformers/models/unispeech_sat/modeling_unispeech_sat.py:1078:0: 
note: called from +""" \ No newline at end of file diff --git a/torch_compile/run_vit.py b/torch_compile/run_vit.py new file mode 100644 index 0000000000000000000000000000000000000000..f1ea63c623c3df7d04a869aaa41b8e4d5279352e --- /dev/null +++ b/torch_compile/run_vit.py @@ -0,0 +1,64 @@ +#!/usr/bin/env python3 +# Vision Transformer (ViT) image-classification on Neuron – full graph, constant shapes +import argparse +import logging +import time +import torch +from transformers import AutoImageProcessor, ViTForImageClassification +from datasets import load_dataset +import torch_neuronx # guarantees Neuron backend + +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger(__name__) + + +def main(): + parser = argparse.ArgumentParser(description="ViT on Neuron (full graph)") + parser.add_argument("--model", default="google/vit-base-patch16-224") + args = parser.parse_args() + + torch.manual_seed(42) + torch.set_default_dtype(torch.float32) + + # load dataset image + dataset = load_dataset("huggingface/cats-image") + image = dataset["test"]["image"][0] + + # load processor & model + processor = AutoImageProcessor.from_pretrained(args.model) + model = ViTForImageClassification.from_pretrained( + args.model, torch_dtype=torch.float32, attn_implementation="eager" + ).eval() + + # preprocess + inputs = processor(images=image, return_tensors="pt") + + # pre-run to lock shapes + with torch.no_grad(): + _ = model(**inputs).logits + + # compile full graph + model.forward = torch.compile(model.forward, backend="neuron", fullgraph=True) + + # warmup + warmup_start = time.time() + with torch.no_grad(): + _ = model(**inputs) + logger.info("Warmup: %.3f s", time.time() - warmup_start) + + # benchmark run + run_start = time.time() + with torch.no_grad(): + logits = model(**inputs).logits + run_time = time.time() - run_start + + # top-1 ImageNet class + predicted_class_idx = logits.argmax(-1).item() + predicted_label = model.config.id2label[predicted_class_idx] + + 
logger.info("Run: %.3f s", run_time) + logger.info("Predicted label: %s", predicted_label) + + +if __name__ == "__main__": + main() diff --git a/torch_compile/run_wav2vec2.py b/torch_compile/run_wav2vec2.py new file mode 100644 index 0000000000000000000000000000000000000000..fd39b4e86c4a7bdf8c407e08658a16cea9dce80e --- /dev/null +++ b/torch_compile/run_wav2vec2.py @@ -0,0 +1,73 @@ +import argparse +import logging +import time + +import torch +from datasets import load_dataset +from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC + +import torch_neuronx + +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger(__name__) + + +def main(): + parser = argparse.ArgumentParser(description="Run Wav2Vec2 on Neuron") + parser.add_argument( + "--model", type=str, default="facebook/wav2vec2-base-960h", help="Wav2Vec2 model name" + ) + parser.add_argument("--batch-size", type=int, default=1, help="Batch size") + args = parser.parse_args() + + torch.set_default_dtype(torch.float32) + torch.manual_seed(42) + + processor = Wav2Vec2Processor.from_pretrained(args.model) + model = Wav2Vec2ForCTC.from_pretrained( + args.model, torch_dtype=torch.float32, attn_implementation="eager" + ) + model.eval() + + dataset = load_dataset("hf-internal-testing/librispeech_asr_demo", "clean", split="validation") + dataset = dataset.sort("id") + sampling_rate = dataset.features["audio"].sampling_rate + inputs = processor(dataset[0]["audio"]["array"], sampling_rate=sampling_rate, return_tensors="pt") + # Run once to establish shapes before compile + with torch.no_grad(): + logits = model(**inputs).logits + + model.forward = torch.compile(model.forward, backend="neuron", fullgraph=False) + + # Warmup + warmup_start = time.time() + with torch.no_grad(): + logits = model(**inputs).logits + warmup_time = time.time() - warmup_start + + # Run + run_start = time.time() + with torch.no_grad(): + logits = model(**inputs).logits + run_time = time.time() - run_start + probabilities = 
torch.sigmoid(logits[0]) + labels = (probabilities > 0.5).long() + + logger.info(f"Warmup: {warmup_time:.2f}s, Run: {run_time:.4f}s") + logger.info(f"Output label: {labels[0].tolist()}") + + +if __name__ == "__main__": + main() + +""" +/usr/local/lib/python3.10/site-packages/torch/nn/utils/parametrizations.py:325:0: error: number of output elements (4718592) doesn't match expected number of elements (128) +/usr/local/lib/python3.10/site-packages/torch/nn/utils/parametrize.py:303:0: note: called from +/usr/local/lib/python3.10/site-packages/torch/nn/utils/parametrize.py:407:0: note: called from +/usr/local/lib/python3.10/site-packages/transformers/models/wav2vec2/modeling_wav2vec2.py:372:0: note: called from +/usr/local/lib/python3.10/site-packages/transformers/models/wav2vec2/modeling_wav2vec2.py:713:0: note: called from +/usr/local/lib/python3.10/site-packages/transformers/models/wav2vec2/modeling_wav2vec2.py:1462:0: note: called from +/usr/local/lib/python3.10/site-packages/transformers/models/wav2vec2/modeling_wav2vec2.py:1862:0: note: called from + +# dynamic shape of intermediate tensors leading to static shape error while runing the traced artifact. 
+""" diff --git a/torch_compile/run_whisper.py b/torch_compile/run_whisper.py new file mode 100644 index 0000000000000000000000000000000000000000..0be4046852560b8250ef43a25ea894d25aa93e24 --- /dev/null +++ b/torch_compile/run_whisper.py @@ -0,0 +1,91 @@ +import argparse +import logging +import time + +import torch +from transformers import AutoTokenizer, WhisperForConditionalGeneration + +import torch_neuronx + +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger(__name__) + + +def main(): + parser = argparse.ArgumentParser(description="Run Whisper on Neuron") + parser.add_argument( + "--model", type=str, default="openai/whisper-tiny", help="Whisper model name" + ) + parser.add_argument("--batch-size", type=int, default=1, help="Batch size") + args = parser.parse_args() + + torch.set_default_dtype(torch.float32) + torch.manual_seed(42) + + model = WhisperForConditionalGeneration.from_pretrained( + args.model, torch_dtype=torch.float32, attn_implementation="eager" + ) + model.eval() + + tokenizer = AutoTokenizer.from_pretrained(args.model) + + num_mel_bins = model.config.num_mel_bins + input_features = torch.randn(args.batch_size, num_mel_bins, 3000, dtype=torch.float32) + gen_kwargs = { + "max_new_tokens": 64, + "do_sample": False, + "cache_implementation": "static", + "eos_token_id": -1, + } + + # Run once to establish shapes before compile + with torch.no_grad(): + _ = model.generate(input_features=input_features, **gen_kwargs) + + model.forward = torch.compile(model.forward, backend="neuron", fullgraph=True) + + # Warmup + warmup_start = time.time() + with torch.no_grad(): + output = model.generate(input_features=input_features, **gen_kwargs) + warmup_time = time.time() - warmup_start + + # Run + run_start = time.time() + with torch.no_grad(): + output = model.generate(input_features=input_features, **gen_kwargs) + run_time = time.time() - run_start + + logger.info(f"Warmup: {warmup_time:.2f}s, Run: {run_time:.4f}s") + logger.info(f"Output: 
{tokenizer.batch_decode(output, skip_special_tokens=True)}") + + +if __name__ == "__main__": + main() + +""" +Traceback (most recent call last): + File "/workspace/torch-neuron-sample/scripts/tests/run_whisper.py", line 64, in + main() + File "/workspace/torch-neuron-sample/scripts/tests/run_whisper.py", line 50, in main + output = model.generate(input_features=input_features, **gen_kwargs) + File "/usr/local/lib/python3.10/site-packages/transformers/models/whisper/generation_whisper.py", line 704, in generate + init_tokens = self._retrieve_init_tokens( + File "/usr/local/lib/python3.10/site-packages/transformers/models/whisper/generation_whisper.py", line 1572, in _retrieve_init_tokens + lang_ids = self.detect_language( + File "/usr/local/lib/python3.10/site-packages/transformers/models/whisper/generation_whisper.py", line 1683, in detect_language + lang_ids = logits.argmax(-1) + File "/torch-neuronx/torch_neuronx/python_ops/auto_registration.py", line 306, in wrapper + result = operation(*args, **kwargs) + File "/torch-neuronx/torch_neuronx/python_ops/base.py", line 712, in __call__ + result = impl.execute(*args, **kwargs) + File "/torch-neuronx/torch_neuronx/python_ops/base.py", line 109, in execute + result = self._execute_impl(*args2, **kwargs2) + File "/torch-neuronx/torch_neuronx/python_ops/to_copy.py", line 102, in _execute_impl + cpu_dst = copy_neuron_to_cpu( + File "/torch-neuronx/torch_neuronx/python_ops/cast_policy.py", line 102, in copy_neuron_to_cpu + _C._nrt_copy_neuron_to_cpu_tensor(neuron_src, cpu_tmp, non_blocking=non_blocking) +RuntimeError: Compilation error occurred on Neuron for operation=aten::_index_put_impl_; +error message="COMPILATION FAILED: Error: 2026-01-16T11:49:13Z 2026-01-16 11:49:13.062190: E hilo/hlo_passes/NeuronHloVerifier.cc:647] [ERROR] [NCC_EVRF024] Output tensor size of 10,759,912,900 bytes with shape of f32[51865,51865] exceeds 4GB limit for individual tensor size. 
TIP: Consider applying model parallelism or tensor parallelism per https://awsdocs-neuron.readthedocs-hosted.com/en/latest/libraries/nxd-training/app_notes/nxd-training-tp-appnote.html." +python stack trace= +""" \ No newline at end of file diff --git a/torch_compile/run_xlm.py b/torch_compile/run_xlm.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/torch_compile/run_xlm_roberta.py b/torch_compile/run_xlm_roberta.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/torch_compile/run_yolos.py b/torch_compile/run_yolos.py new file mode 100644 index 0000000000000000000000000000000000000000..a5be09edfdef97150033803304a82af6e826ce85 --- /dev/null +++ b/torch_compile/run_yolos.py @@ -0,0 +1,83 @@ +import argparse +import logging +import time +import os + +import torch +from transformers import AutoImageProcessor, YolosForObjectDetection +from datasets import load_dataset +import torch_neuronx # ensure Neuron backend is available + +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger(__name__) + + +def main(): + # Allow CPU fallback + # ERROR:torch_neuronx.neuron_dynamo_backend.backend:Execution failed: Compilation error occurred on Neuron for operation=torch_compile; + # error message="COMPILATION FAILED: Error: 2026-01-20T12:06:37Z tensor_op_name: _gather.577 | hlo_id: 577 | [ERROR] [NCC_EXTP003] Instructions generated by compiler 290400 exceeds the typical limit of 150000. Input computation graph is too big due to large operators - Consider using smaller batches or sequence length, or applying tensor parellelism. 
For further troubleshooting visit https://awsdocs-neuron.readthedocs-hosted.com/en/latest/libraries/nxd-training/app_notes/nxd-training-tp-appnote.html" + # python stack trace= + os.environ["TORCH_NEURONX_FALLBACK_ONLY_FOR_UNIMPLEMENTED_OPS"] = "0" + + parser = argparse.ArgumentParser(description="Run YOLOS object detection on Neuron") + parser.add_argument( + "--model", + type=str, + default="hustvl/yolos-base", + help="YOLOS model name on Hugging Face Hub", + ) + parser.add_argument("--batch-size", type=int, default=1, help="Batch size") + args = parser.parse_args() + + torch.set_default_dtype(torch.float32) + torch.manual_seed(42) + + # Load dataset and pick an image + dataset = load_dataset("huggingface/cats-image") + image = dataset["test"]["image"][0] + + # Load processor and model + image_processor = AutoImageProcessor.from_pretrained(args.model) + model = YolosForObjectDetection.from_pretrained( + args.model, torch_dtype=torch.float32, attn_implementation="eager" + ) + model.eval() + + # Preprocess image + inputs = image_processor(images=image, return_tensors="pt") + + # Pre-run once to fix shapes before compilation + with torch.no_grad(): + outputs = model(**inputs) + + # Compile forward pass + model.forward = torch.compile(model.forward, backend="neuron", fullgraph=True) + + # Warmup + warmup_start = time.time() + with torch.no_grad(): + _ = model(**inputs) + warmup_time = time.time() - warmup_start + + # Actual run + run_start = time.time() + with torch.no_grad(): + outputs = model(**inputs) + run_time = time.time() - run_start + + # Post-process: keep only top detection + logits = outputs.logits # [B, num_queries, num_classes + 1] + probs = logits.softmax(dim=-1)[0, :, :-1] # drop "no-object" + scores, labels = probs.max(dim=-1) # CPU fallback allowed + best_idx = scores.argmax().item() + + logger.info("Warmup: %.2f s, Run: %.4f s", warmup_time, run_time) + logger.info("Top detection: class=%d, score=%.3f", labels[best_idx].item(), 
scores[best_idx].item()) + + +if __name__ == "__main__": + main() + +""" +Need to fall back to CPU. +""" \ No newline at end of file diff --git a/torch_compile/torch_neuronx_dump/0123150520_241/offloaded_ops.txt b/torch_compile/torch_neuronx_dump/0123150520_241/offloaded_ops.txt new file mode 100644 index 0000000000000000000000000000000000000000..9f263970c22c7c100f2036fd2218428a55566ed0 --- /dev/null +++ b/torch_compile/torch_neuronx_dump/0123150520_241/offloaded_ops.txt @@ -0,0 +1 @@ +Operator ' aten::argmax.out ' fell back to CPU diff --git a/torch_compile/torch_neuronx_dump/0123150520_241/used_ops.txt b/torch_compile/torch_neuronx_dump/0123150520_241/used_ops.txt new file mode 100644 index 0000000000000000000000000000000000000000..e3a02ea757641bd3e8f82088ade97986b6d5ab3a --- /dev/null +++ b/torch_compile/torch_neuronx_dump/0123150520_241/used_ops.txt @@ -0,0 +1,8 @@ +Operator 'torch_compile' executed on Neuron +Operator 'neuron::memory::alloc' executed on Neuron +Operator 'neuron::copy::cpu_to_neuron' executed on Neuron +Operator '_to_copy' executed on Neuron +Operator 'model_default' executed on Neuron +Operator 'neuron::memory::dealloc' executed on Neuron +Operator 'neuron::copy::neuron_to_cpu' executed on Neuron +Operator 'copy_' executed on Neuron diff --git a/torch_compile/torch_neuronx_dump/0123154351_1091/offloaded_ops.txt b/torch_compile/torch_neuronx_dump/0123154351_1091/offloaded_ops.txt new file mode 100644 index 0000000000000000000000000000000000000000..9f263970c22c7c100f2036fd2218428a55566ed0 --- /dev/null +++ b/torch_compile/torch_neuronx_dump/0123154351_1091/offloaded_ops.txt @@ -0,0 +1 @@ +Operator ' aten::argmax.out ' fell back to CPU diff --git a/torch_compile/torch_neuronx_dump/0123154351_1091/used_ops.txt b/torch_compile/torch_neuronx_dump/0123154351_1091/used_ops.txt new file mode 100644 index 0000000000000000000000000000000000000000..e3a02ea757641bd3e8f82088ade97986b6d5ab3a --- /dev/null +++ 
b/torch_compile/torch_neuronx_dump/0123154351_1091/used_ops.txt @@ -0,0 +1,8 @@ +Operator 'torch_compile' executed on Neuron +Operator 'neuron::memory::alloc' executed on Neuron +Operator 'neuron::copy::cpu_to_neuron' executed on Neuron +Operator '_to_copy' executed on Neuron +Operator 'model_default' executed on Neuron +Operator 'neuron::memory::dealloc' executed on Neuron +Operator 'neuron::copy::neuron_to_cpu' executed on Neuron +Operator 'copy_' executed on Neuron diff --git a/torch_compile/torch_neuronx_dump/0123155008_1504/offloaded_ops.txt b/torch_compile/torch_neuronx_dump/0123155008_1504/offloaded_ops.txt new file mode 100644 index 0000000000000000000000000000000000000000..9f263970c22c7c100f2036fd2218428a55566ed0 --- /dev/null +++ b/torch_compile/torch_neuronx_dump/0123155008_1504/offloaded_ops.txt @@ -0,0 +1 @@ +Operator ' aten::argmax.out ' fell back to CPU diff --git a/torch_compile/torch_neuronx_dump/0123155008_1504/used_ops.txt b/torch_compile/torch_neuronx_dump/0123155008_1504/used_ops.txt new file mode 100644 index 0000000000000000000000000000000000000000..e3a02ea757641bd3e8f82088ade97986b6d5ab3a --- /dev/null +++ b/torch_compile/torch_neuronx_dump/0123155008_1504/used_ops.txt @@ -0,0 +1,8 @@ +Operator 'torch_compile' executed on Neuron +Operator 'neuron::memory::alloc' executed on Neuron +Operator 'neuron::copy::cpu_to_neuron' executed on Neuron +Operator '_to_copy' executed on Neuron +Operator 'model_default' executed on Neuron +Operator 'neuron::memory::dealloc' executed on Neuron +Operator 'neuron::copy::neuron_to_cpu' executed on Neuron +Operator 'copy_' executed on Neuron diff --git a/torch_compile/torch_neuronx_dump/0123170234_2151/offloaded_ops.txt b/torch_compile/torch_neuronx_dump/0123170234_2151/offloaded_ops.txt new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/torch_compile/torch_neuronx_dump/0123170234_2151/used_ops.txt 
b/torch_compile/torch_neuronx_dump/0123170234_2151/used_ops.txt new file mode 100644 index 0000000000000000000000000000000000000000..7e9c5cd1499119b6a246fc693204a424208b972f --- /dev/null +++ b/torch_compile/torch_neuronx_dump/0123170234_2151/used_ops.txt @@ -0,0 +1,3 @@ +Operator 'neuron::memory::alloc' executed on Neuron +Operator 'neuron::copy::cpu_to_neuron' executed on Neuron +Operator 'neuron::memory::dealloc' executed on Neuron diff --git a/torch_compile/torch_neuronx_dump/0124034514_67/offloaded_ops.txt b/torch_compile/torch_neuronx_dump/0124034514_67/offloaded_ops.txt new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/torch_compile/torch_neuronx_dump/0124034514_67/used_ops.txt b/torch_compile/torch_neuronx_dump/0124034514_67/used_ops.txt new file mode 100644 index 0000000000000000000000000000000000000000..92ef6b149322b1226d7b1e8a951e19c763a39aea --- /dev/null +++ b/torch_compile/torch_neuronx_dump/0124034514_67/used_ops.txt @@ -0,0 +1,4 @@ +Operator 'neuron::memory::alloc' executed on Neuron +Operator 'neuron::copy::cpu_to_neuron' executed on Neuron +Operator 'neuron::memory::dealloc' executed on Neuron +Operator '_to_copy' executed on Neuron diff --git a/torch_compile/torch_neuronx_dump/0126040417_153/offloaded_ops.txt b/torch_compile/torch_neuronx_dump/0126040417_153/offloaded_ops.txt new file mode 100644 index 0000000000000000000000000000000000000000..9f263970c22c7c100f2036fd2218428a55566ed0 --- /dev/null +++ b/torch_compile/torch_neuronx_dump/0126040417_153/offloaded_ops.txt @@ -0,0 +1 @@ +Operator ' aten::argmax.out ' fell back to CPU diff --git a/torch_compile/torch_neuronx_dump/0126040417_153/used_ops.txt b/torch_compile/torch_neuronx_dump/0126040417_153/used_ops.txt new file mode 100644 index 0000000000000000000000000000000000000000..e3a02ea757641bd3e8f82088ade97986b6d5ab3a --- /dev/null +++ b/torch_compile/torch_neuronx_dump/0126040417_153/used_ops.txt @@ -0,0 +1,8 @@ 
+Operator 'torch_compile' executed on Neuron +Operator 'neuron::memory::alloc' executed on Neuron +Operator 'neuron::copy::cpu_to_neuron' executed on Neuron +Operator '_to_copy' executed on Neuron +Operator 'model_default' executed on Neuron +Operator 'neuron::memory::dealloc' executed on Neuron +Operator 'neuron::copy::neuron_to_cpu' executed on Neuron +Operator 'copy_' executed on Neuron diff --git a/torch_compile/torch_neuronx_dump/0126043123_948/offloaded_ops.txt b/torch_compile/torch_neuronx_dump/0126043123_948/offloaded_ops.txt new file mode 100644 index 0000000000000000000000000000000000000000..9f263970c22c7c100f2036fd2218428a55566ed0 --- /dev/null +++ b/torch_compile/torch_neuronx_dump/0126043123_948/offloaded_ops.txt @@ -0,0 +1 @@ +Operator ' aten::argmax.out ' fell back to CPU diff --git a/torch_compile/torch_neuronx_dump/0126043123_948/used_ops.txt b/torch_compile/torch_neuronx_dump/0126043123_948/used_ops.txt new file mode 100644 index 0000000000000000000000000000000000000000..e3a02ea757641bd3e8f82088ade97986b6d5ab3a --- /dev/null +++ b/torch_compile/torch_neuronx_dump/0126043123_948/used_ops.txt @@ -0,0 +1,8 @@ +Operator 'torch_compile' executed on Neuron +Operator 'neuron::memory::alloc' executed on Neuron +Operator 'neuron::copy::cpu_to_neuron' executed on Neuron +Operator '_to_copy' executed on Neuron +Operator 'model_default' executed on Neuron +Operator 'neuron::memory::dealloc' executed on Neuron +Operator 'neuron::copy::neuron_to_cpu' executed on Neuron +Operator 'copy_' executed on Neuron diff --git a/torch_compile/torch_neuronx_dump/0126043438_1441/offloaded_ops.txt b/torch_compile/torch_neuronx_dump/0126043438_1441/offloaded_ops.txt new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/torch_compile/torch_neuronx_dump/0126043438_1441/used_ops.txt b/torch_compile/torch_neuronx_dump/0126043438_1441/used_ops.txt new file mode 100644 index 
0000000000000000000000000000000000000000..eea5c9e4a1b2c941598192b2ce1b97d4ce5e9eb1 --- /dev/null +++ b/torch_compile/torch_neuronx_dump/0126043438_1441/used_ops.txt @@ -0,0 +1,6 @@ +Operator 'torch_compile' executed on Neuron +Operator 'neuron::memory::alloc' executed on Neuron +Operator 'neuron::copy::cpu_to_neuron' executed on Neuron +Operator '_to_copy' executed on Neuron +Operator 'model_default' executed on Neuron +Operator 'neuron::memory::dealloc' executed on Neuron diff --git a/torch_compile/torch_neuronx_dump/0126050910_2387/offloaded_ops.txt b/torch_compile/torch_neuronx_dump/0126050910_2387/offloaded_ops.txt new file mode 100644 index 0000000000000000000000000000000000000000..9f263970c22c7c100f2036fd2218428a55566ed0 --- /dev/null +++ b/torch_compile/torch_neuronx_dump/0126050910_2387/offloaded_ops.txt @@ -0,0 +1 @@ +Operator ' aten::argmax.out ' fell back to CPU diff --git a/torch_compile/torch_neuronx_dump/0126050910_2387/used_ops.txt b/torch_compile/torch_neuronx_dump/0126050910_2387/used_ops.txt new file mode 100644 index 0000000000000000000000000000000000000000..e3a02ea757641bd3e8f82088ade97986b6d5ab3a --- /dev/null +++ b/torch_compile/torch_neuronx_dump/0126050910_2387/used_ops.txt @@ -0,0 +1,8 @@ +Operator 'torch_compile' executed on Neuron +Operator 'neuron::memory::alloc' executed on Neuron +Operator 'neuron::copy::cpu_to_neuron' executed on Neuron +Operator '_to_copy' executed on Neuron +Operator 'model_default' executed on Neuron +Operator 'neuron::memory::dealloc' executed on Neuron +Operator 'neuron::copy::neuron_to_cpu' executed on Neuron +Operator 'copy_' executed on Neuron diff --git a/torch_compile/torch_neuronx_dump/0126050954_2552/offloaded_ops.txt b/torch_compile/torch_neuronx_dump/0126050954_2552/offloaded_ops.txt new file mode 100644 index 0000000000000000000000000000000000000000..9f263970c22c7c100f2036fd2218428a55566ed0 --- /dev/null +++ b/torch_compile/torch_neuronx_dump/0126050954_2552/offloaded_ops.txt @@ -0,0 +1 @@ +Operator ' 
aten::argmax.out ' fell back to CPU diff --git a/torch_compile/torch_neuronx_dump/0126050954_2552/used_ops.txt b/torch_compile/torch_neuronx_dump/0126050954_2552/used_ops.txt new file mode 100644 index 0000000000000000000000000000000000000000..e3a02ea757641bd3e8f82088ade97986b6d5ab3a --- /dev/null +++ b/torch_compile/torch_neuronx_dump/0126050954_2552/used_ops.txt @@ -0,0 +1,8 @@ +Operator 'torch_compile' executed on Neuron +Operator 'neuron::memory::alloc' executed on Neuron +Operator 'neuron::copy::cpu_to_neuron' executed on Neuron +Operator '_to_copy' executed on Neuron +Operator 'model_default' executed on Neuron +Operator 'neuron::memory::dealloc' executed on Neuron +Operator 'neuron::copy::neuron_to_cpu' executed on Neuron +Operator 'copy_' executed on Neuron diff --git a/torch_compile/torch_neuronx_dump/0126102633_1851/offloaded_ops.txt b/torch_compile/torch_neuronx_dump/0126102633_1851/offloaded_ops.txt new file mode 100644 index 0000000000000000000000000000000000000000..9f263970c22c7c100f2036fd2218428a55566ed0 --- /dev/null +++ b/torch_compile/torch_neuronx_dump/0126102633_1851/offloaded_ops.txt @@ -0,0 +1 @@ +Operator ' aten::argmax.out ' fell back to CPU diff --git a/torch_compile/torch_neuronx_dump/0126102633_1851/used_ops.txt b/torch_compile/torch_neuronx_dump/0126102633_1851/used_ops.txt new file mode 100644 index 0000000000000000000000000000000000000000..e3a02ea757641bd3e8f82088ade97986b6d5ab3a --- /dev/null +++ b/torch_compile/torch_neuronx_dump/0126102633_1851/used_ops.txt @@ -0,0 +1,8 @@ +Operator 'torch_compile' executed on Neuron +Operator 'neuron::memory::alloc' executed on Neuron +Operator 'neuron::copy::cpu_to_neuron' executed on Neuron +Operator '_to_copy' executed on Neuron +Operator 'model_default' executed on Neuron +Operator 'neuron::memory::dealloc' executed on Neuron +Operator 'neuron::copy::neuron_to_cpu' executed on Neuron +Operator 'copy_' executed on Neuron diff --git 
a/torch_compile/torch_neuronx_dump/0126102915_2174/offloaded_ops.txt b/torch_compile/torch_neuronx_dump/0126102915_2174/offloaded_ops.txt new file mode 100644 index 0000000000000000000000000000000000000000..9f263970c22c7c100f2036fd2218428a55566ed0 --- /dev/null +++ b/torch_compile/torch_neuronx_dump/0126102915_2174/offloaded_ops.txt @@ -0,0 +1 @@ +Operator ' aten::argmax.out ' fell back to CPU diff --git a/torch_compile/torch_neuronx_dump/0126102915_2174/used_ops.txt b/torch_compile/torch_neuronx_dump/0126102915_2174/used_ops.txt new file mode 100644 index 0000000000000000000000000000000000000000..e3a02ea757641bd3e8f82088ade97986b6d5ab3a --- /dev/null +++ b/torch_compile/torch_neuronx_dump/0126102915_2174/used_ops.txt @@ -0,0 +1,8 @@ +Operator 'torch_compile' executed on Neuron +Operator 'neuron::memory::alloc' executed on Neuron +Operator 'neuron::copy::cpu_to_neuron' executed on Neuron +Operator '_to_copy' executed on Neuron +Operator 'model_default' executed on Neuron +Operator 'neuron::memory::dealloc' executed on Neuron +Operator 'neuron::copy::neuron_to_cpu' executed on Neuron +Operator 'copy_' executed on Neuron diff --git a/torch_compile/torch_neuronx_dump/0126103539_2715/offloaded_ops.txt b/torch_compile/torch_neuronx_dump/0126103539_2715/offloaded_ops.txt new file mode 100644 index 0000000000000000000000000000000000000000..9f263970c22c7c100f2036fd2218428a55566ed0 --- /dev/null +++ b/torch_compile/torch_neuronx_dump/0126103539_2715/offloaded_ops.txt @@ -0,0 +1 @@ +Operator ' aten::argmax.out ' fell back to CPU diff --git a/torch_compile/torch_neuronx_dump/0126103539_2715/used_ops.txt b/torch_compile/torch_neuronx_dump/0126103539_2715/used_ops.txt new file mode 100644 index 0000000000000000000000000000000000000000..e3a02ea757641bd3e8f82088ade97986b6d5ab3a --- /dev/null +++ b/torch_compile/torch_neuronx_dump/0126103539_2715/used_ops.txt @@ -0,0 +1,8 @@ +Operator 'torch_compile' executed on Neuron +Operator 'neuron::memory::alloc' executed on Neuron +Operator 
'neuron::copy::cpu_to_neuron' executed on Neuron +Operator '_to_copy' executed on Neuron +Operator 'model_default' executed on Neuron +Operator 'neuron::memory::dealloc' executed on Neuron +Operator 'neuron::copy::neuron_to_cpu' executed on Neuron +Operator 'copy_' executed on Neuron diff --git a/torch_compile/torch_neuronx_dump/0126115018_4120/offloaded_ops.txt b/torch_compile/torch_neuronx_dump/0126115018_4120/offloaded_ops.txt new file mode 100644 index 0000000000000000000000000000000000000000..9f263970c22c7c100f2036fd2218428a55566ed0 --- /dev/null +++ b/torch_compile/torch_neuronx_dump/0126115018_4120/offloaded_ops.txt @@ -0,0 +1 @@ +Operator ' aten::argmax.out ' fell back to CPU diff --git a/torch_compile/torch_neuronx_dump/0126115018_4120/used_ops.txt b/torch_compile/torch_neuronx_dump/0126115018_4120/used_ops.txt new file mode 100644 index 0000000000000000000000000000000000000000..e3a02ea757641bd3e8f82088ade97986b6d5ab3a --- /dev/null +++ b/torch_compile/torch_neuronx_dump/0126115018_4120/used_ops.txt @@ -0,0 +1,8 @@ +Operator 'torch_compile' executed on Neuron +Operator 'neuron::memory::alloc' executed on Neuron +Operator 'neuron::copy::cpu_to_neuron' executed on Neuron +Operator '_to_copy' executed on Neuron +Operator 'model_default' executed on Neuron +Operator 'neuron::memory::dealloc' executed on Neuron +Operator 'neuron::copy::neuron_to_cpu' executed on Neuron +Operator 'copy_' executed on Neuron diff --git a/torch_compile/torch_neuronx_dump/0126115240_4496/offloaded_ops.txt b/torch_compile/torch_neuronx_dump/0126115240_4496/offloaded_ops.txt new file mode 100644 index 0000000000000000000000000000000000000000..9f263970c22c7c100f2036fd2218428a55566ed0 --- /dev/null +++ b/torch_compile/torch_neuronx_dump/0126115240_4496/offloaded_ops.txt @@ -0,0 +1 @@ +Operator ' aten::argmax.out ' fell back to CPU diff --git a/torch_compile/torch_neuronx_dump/0126115240_4496/used_ops.txt b/torch_compile/torch_neuronx_dump/0126115240_4496/used_ops.txt new file mode 
100644 index 0000000000000000000000000000000000000000..e3a02ea757641bd3e8f82088ade97986b6d5ab3a --- /dev/null +++ b/torch_compile/torch_neuronx_dump/0126115240_4496/used_ops.txt @@ -0,0 +1,8 @@ +Operator 'torch_compile' executed on Neuron +Operator 'neuron::memory::alloc' executed on Neuron +Operator 'neuron::copy::cpu_to_neuron' executed on Neuron +Operator '_to_copy' executed on Neuron +Operator 'model_default' executed on Neuron +Operator 'neuron::memory::dealloc' executed on Neuron +Operator 'neuron::copy::neuron_to_cpu' executed on Neuron +Operator 'copy_' executed on Neuron diff --git a/torch_compile/torch_neuronx_dump/0126115534_4877/offloaded_ops.txt b/torch_compile/torch_neuronx_dump/0126115534_4877/offloaded_ops.txt new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/torch_compile/torch_neuronx_dump/0126115534_4877/used_ops.txt b/torch_compile/torch_neuronx_dump/0126115534_4877/used_ops.txt new file mode 100644 index 0000000000000000000000000000000000000000..eea5c9e4a1b2c941598192b2ce1b97d4ce5e9eb1 --- /dev/null +++ b/torch_compile/torch_neuronx_dump/0126115534_4877/used_ops.txt @@ -0,0 +1,6 @@ +Operator 'torch_compile' executed on Neuron +Operator 'neuron::memory::alloc' executed on Neuron +Operator 'neuron::copy::cpu_to_neuron' executed on Neuron +Operator '_to_copy' executed on Neuron +Operator 'model_default' executed on Neuron +Operator 'neuron::memory::dealloc' executed on Neuron diff --git a/torch_compile/torch_neuronx_dump/0126123818_6558/offloaded_ops.txt b/torch_compile/torch_neuronx_dump/0126123818_6558/offloaded_ops.txt new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/torch_compile/torch_neuronx_dump/0126123818_6558/used_ops.txt b/torch_compile/torch_neuronx_dump/0126123818_6558/used_ops.txt new file mode 100644 index 0000000000000000000000000000000000000000..eea5c9e4a1b2c941598192b2ce1b97d4ce5e9eb1 
--- /dev/null +++ b/torch_compile/torch_neuronx_dump/0126123818_6558/used_ops.txt @@ -0,0 +1,6 @@ +Operator 'torch_compile' executed on Neuron +Operator 'neuron::memory::alloc' executed on Neuron +Operator 'neuron::copy::cpu_to_neuron' executed on Neuron +Operator '_to_copy' executed on Neuron +Operator 'model_default' executed on Neuron +Operator 'neuron::memory::dealloc' executed on Neuron diff --git a/torch_compile/torch_neuronx_dump/0126124037_6963/offloaded_ops.txt b/torch_compile/torch_neuronx_dump/0126124037_6963/offloaded_ops.txt new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/torch_compile/torch_neuronx_dump/0126124037_6963/used_ops.txt b/torch_compile/torch_neuronx_dump/0126124037_6963/used_ops.txt new file mode 100644 index 0000000000000000000000000000000000000000..075a17cbed34d5cb3ab1a17645ce7271aafdedcf --- /dev/null +++ b/torch_compile/torch_neuronx_dump/0126124037_6963/used_ops.txt @@ -0,0 +1,14 @@ +Operator 'torch_compile' executed on Neuron +Operator 'neuron::memory::alloc' executed on Neuron +Operator 'neuron::copy::cpu_to_neuron' executed on Neuron +Operator '_to_copy' executed on Neuron +Operator 'model_default' executed on Neuron +Operator 'neuron::memory::dealloc' executed on Neuron +Operator 'neuron::memory::slice' executed on Neuron +Operator 'concat' executed on Neuron +Operator 'aten::stack' executed on Neuron +Operator 'aten::isfinite' executed on Neuron +Operator 'neuron::copy::neuron_to_cpu' executed on Neuron +Operator 'ne_scalar' executed on Neuron +Operator 'aten::bitwise_and.Tensor_out' executed on Neuron +Operator 'neuron::copy::neuron_to_neuron' executed on Neuron diff --git a/torch_compile/torch_neuronx_dump/0126124243_8375/offloaded_ops.txt b/torch_compile/torch_neuronx_dump/0126124243_8375/offloaded_ops.txt new file mode 100644 index 0000000000000000000000000000000000000000..b8cb9ac08b3a3ded32acea48a9b4e325ff488f51 --- /dev/null +++ 
b/torch_compile/torch_neuronx_dump/0126124243_8375/offloaded_ops.txt @@ -0,0 +1,2 @@ +Operator ' aten::argmax.out ' fell back to CPU +Operator 'aten::cat.out' fell back to CPU diff --git a/torch_compile/torch_neuronx_dump/0126124243_8375/used_ops.txt b/torch_compile/torch_neuronx_dump/0126124243_8375/used_ops.txt new file mode 100644 index 0000000000000000000000000000000000000000..e3a02ea757641bd3e8f82088ade97986b6d5ab3a --- /dev/null +++ b/torch_compile/torch_neuronx_dump/0126124243_8375/used_ops.txt @@ -0,0 +1,8 @@ +Operator 'torch_compile' executed on Neuron +Operator 'neuron::memory::alloc' executed on Neuron +Operator 'neuron::copy::cpu_to_neuron' executed on Neuron +Operator '_to_copy' executed on Neuron +Operator 'model_default' executed on Neuron +Operator 'neuron::memory::dealloc' executed on Neuron +Operator 'neuron::copy::neuron_to_cpu' executed on Neuron +Operator 'copy_' executed on Neuron diff --git a/torch_compile/torch_neuronx_dump/0126125716_9173/offloaded_ops.txt b/torch_compile/torch_neuronx_dump/0126125716_9173/offloaded_ops.txt new file mode 100644 index 0000000000000000000000000000000000000000..b8cb9ac08b3a3ded32acea48a9b4e325ff488f51 --- /dev/null +++ b/torch_compile/torch_neuronx_dump/0126125716_9173/offloaded_ops.txt @@ -0,0 +1,2 @@ +Operator ' aten::argmax.out ' fell back to CPU +Operator 'aten::cat.out' fell back to CPU diff --git a/torch_compile/torch_neuronx_dump/0126125716_9173/used_ops.txt b/torch_compile/torch_neuronx_dump/0126125716_9173/used_ops.txt new file mode 100644 index 0000000000000000000000000000000000000000..e3a02ea757641bd3e8f82088ade97986b6d5ab3a --- /dev/null +++ b/torch_compile/torch_neuronx_dump/0126125716_9173/used_ops.txt @@ -0,0 +1,8 @@ +Operator 'torch_compile' executed on Neuron +Operator 'neuron::memory::alloc' executed on Neuron +Operator 'neuron::copy::cpu_to_neuron' executed on Neuron +Operator '_to_copy' executed on Neuron +Operator 'model_default' executed on Neuron +Operator 'neuron::memory::dealloc' 
executed on Neuron +Operator 'neuron::copy::neuron_to_cpu' executed on Neuron +Operator 'copy_' executed on Neuron diff --git a/torch_compile/torch_neuronx_dump/0126130939_9928/offloaded_ops.txt b/torch_compile/torch_neuronx_dump/0126130939_9928/offloaded_ops.txt new file mode 100644 index 0000000000000000000000000000000000000000..9f263970c22c7c100f2036fd2218428a55566ed0 --- /dev/null +++ b/torch_compile/torch_neuronx_dump/0126130939_9928/offloaded_ops.txt @@ -0,0 +1 @@ +Operator ' aten::argmax.out ' fell back to CPU diff --git a/torch_compile/torch_neuronx_dump/0126130939_9928/used_ops.txt b/torch_compile/torch_neuronx_dump/0126130939_9928/used_ops.txt new file mode 100644 index 0000000000000000000000000000000000000000..e3a02ea757641bd3e8f82088ade97986b6d5ab3a --- /dev/null +++ b/torch_compile/torch_neuronx_dump/0126130939_9928/used_ops.txt @@ -0,0 +1,8 @@ +Operator 'torch_compile' executed on Neuron +Operator 'neuron::memory::alloc' executed on Neuron +Operator 'neuron::copy::cpu_to_neuron' executed on Neuron +Operator '_to_copy' executed on Neuron +Operator 'model_default' executed on Neuron +Operator 'neuron::memory::dealloc' executed on Neuron +Operator 'neuron::copy::neuron_to_cpu' executed on Neuron +Operator 'copy_' executed on Neuron diff --git a/torch_compile/torch_neuronx_dump/0126132406_10606/offloaded_ops.txt b/torch_compile/torch_neuronx_dump/0126132406_10606/offloaded_ops.txt new file mode 100644 index 0000000000000000000000000000000000000000..9f263970c22c7c100f2036fd2218428a55566ed0 --- /dev/null +++ b/torch_compile/torch_neuronx_dump/0126132406_10606/offloaded_ops.txt @@ -0,0 +1 @@ +Operator ' aten::argmax.out ' fell back to CPU diff --git a/torch_compile/torch_neuronx_dump/0126132406_10606/used_ops.txt b/torch_compile/torch_neuronx_dump/0126132406_10606/used_ops.txt new file mode 100644 index 0000000000000000000000000000000000000000..e3a02ea757641bd3e8f82088ade97986b6d5ab3a --- /dev/null +++ 
b/torch_compile/torch_neuronx_dump/0126132406_10606/used_ops.txt @@ -0,0 +1,8 @@ +Operator 'torch_compile' executed on Neuron +Operator 'neuron::memory::alloc' executed on Neuron +Operator 'neuron::copy::cpu_to_neuron' executed on Neuron +Operator '_to_copy' executed on Neuron +Operator 'model_default' executed on Neuron +Operator 'neuron::memory::dealloc' executed on Neuron +Operator 'neuron::copy::neuron_to_cpu' executed on Neuron +Operator 'copy_' executed on Neuron diff --git a/torch_compile/torch_neuronx_dump/0126134844_11381/offloaded_ops.txt b/torch_compile/torch_neuronx_dump/0126134844_11381/offloaded_ops.txt new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/torch_compile/torch_neuronx_dump/0126134844_11381/used_ops.txt b/torch_compile/torch_neuronx_dump/0126134844_11381/used_ops.txt new file mode 100644 index 0000000000000000000000000000000000000000..eea5c9e4a1b2c941598192b2ce1b97d4ce5e9eb1 --- /dev/null +++ b/torch_compile/torch_neuronx_dump/0126134844_11381/used_ops.txt @@ -0,0 +1,6 @@ +Operator 'torch_compile' executed on Neuron +Operator 'neuron::memory::alloc' executed on Neuron +Operator 'neuron::copy::cpu_to_neuron' executed on Neuron +Operator '_to_copy' executed on Neuron +Operator 'model_default' executed on Neuron +Operator 'neuron::memory::dealloc' executed on Neuron diff --git a/torch_compile/torch_neuronx_dump/0126142037_653/offloaded_ops.txt b/torch_compile/torch_neuronx_dump/0126142037_653/offloaded_ops.txt new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/torch_compile/torch_neuronx_dump/0126142037_653/used_ops.txt b/torch_compile/torch_neuronx_dump/0126142037_653/used_ops.txt new file mode 100644 index 0000000000000000000000000000000000000000..eea5c9e4a1b2c941598192b2ce1b97d4ce5e9eb1 --- /dev/null +++ b/torch_compile/torch_neuronx_dump/0126142037_653/used_ops.txt @@ -0,0 +1,6 @@ +Operator 
'torch_compile' executed on Neuron +Operator 'neuron::memory::alloc' executed on Neuron +Operator 'neuron::copy::cpu_to_neuron' executed on Neuron +Operator '_to_copy' executed on Neuron +Operator 'model_default' executed on Neuron +Operator 'neuron::memory::dealloc' executed on Neuron diff --git a/torch_compile/torch_neuronx_dump/0126143035_1196/offloaded_ops.txt b/torch_compile/torch_neuronx_dump/0126143035_1196/offloaded_ops.txt new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/torch_compile/torch_neuronx_dump/0126143035_1196/used_ops.txt b/torch_compile/torch_neuronx_dump/0126143035_1196/used_ops.txt new file mode 100644 index 0000000000000000000000000000000000000000..eea5c9e4a1b2c941598192b2ce1b97d4ce5e9eb1 --- /dev/null +++ b/torch_compile/torch_neuronx_dump/0126143035_1196/used_ops.txt @@ -0,0 +1,6 @@ +Operator 'torch_compile' executed on Neuron +Operator 'neuron::memory::alloc' executed on Neuron +Operator 'neuron::copy::cpu_to_neuron' executed on Neuron +Operator '_to_copy' executed on Neuron +Operator 'model_default' executed on Neuron +Operator 'neuron::memory::dealloc' executed on Neuron diff --git a/torch_compile/torch_neuronx_dump/0126153503_2961/offloaded_ops.txt b/torch_compile/torch_neuronx_dump/0126153503_2961/offloaded_ops.txt new file mode 100644 index 0000000000000000000000000000000000000000..9f263970c22c7c100f2036fd2218428a55566ed0 --- /dev/null +++ b/torch_compile/torch_neuronx_dump/0126153503_2961/offloaded_ops.txt @@ -0,0 +1 @@ +Operator ' aten::argmax.out ' fell back to CPU diff --git a/torch_compile/torch_neuronx_dump/0126153503_2961/used_ops.txt b/torch_compile/torch_neuronx_dump/0126153503_2961/used_ops.txt new file mode 100644 index 0000000000000000000000000000000000000000..e3a02ea757641bd3e8f82088ade97986b6d5ab3a --- /dev/null +++ b/torch_compile/torch_neuronx_dump/0126153503_2961/used_ops.txt @@ -0,0 +1,8 @@ +Operator 'torch_compile' executed on Neuron +Operator 
'neuron::memory::alloc' executed on Neuron +Operator 'neuron::copy::cpu_to_neuron' executed on Neuron +Operator '_to_copy' executed on Neuron +Operator 'model_default' executed on Neuron +Operator 'neuron::memory::dealloc' executed on Neuron +Operator 'neuron::copy::neuron_to_cpu' executed on Neuron +Operator 'copy_' executed on Neuron