diff --git a/.gitattributes b/.gitattributes index a6344aac8c09253b3b630fb776ae94478aa0275b..ba73310ad46c2088cd2409cb1378ad69d7eeaf87 100644 --- a/.gitattributes +++ b/.gitattributes @@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text *.zip filter=lfs diff=lfs merge=lfs -text *.zst filter=lfs diff=lfs merge=lfs -text *tfevents* filter=lfs diff=lfs merge=lfs -text +wandb/run-20251218_082501-lbhu7589/run-lbhu7589.wandb filter=lfs diff=lfs merge=lfs -text diff --git a/checkpoints/003000/pretrained_model/config.json b/checkpoints/003000/pretrained_model/config.json new file mode 100644 index 0000000000000000000000000000000000000000..7d9ef4ac072452ff8debbca6be6175a654f28cbe --- /dev/null +++ b/checkpoints/003000/pretrained_model/config.json @@ -0,0 +1,92 @@ +{ + "type": "pi05", + "n_obs_steps": 1, + "input_features": { + "observation.state": { + "type": "STATE", + "shape": [ + 14 + ] + }, + "observation.images.context": { + "type": "VISUAL", + "shape": [ + 3, + 360, + 640 + ] + }, + "observation.images.right_wrist": { + "type": "VISUAL", + "shape": [ + 3, + 360, + 640 + ] + }, + "observation.images.left_wrist": { + "type": "VISUAL", + "shape": [ + 3, + 360, + 640 + ] + } + }, + "output_features": { + "action": { + "type": "ACTION", + "shape": [ + 14 + ] + } + }, + "device": "cuda", + "use_amp": false, + "push_to_hub": true, + "repo_id": "Zasha01/pi_lego_cube_final_final", + "private": null, + "tags": null, + "license": null, + "pretrained_path": "lerobot/pi05_base", + "paligemma_variant": "gemma_2b", + "action_expert_variant": "gemma_300m", + "dtype": "bfloat16", + "chunk_size": 50, + "n_action_steps": 50, + "max_state_dim": 32, + "max_action_dim": 32, + "num_inference_steps": 10, + "time_sampling_beta_alpha": 1.5, + "time_sampling_beta_beta": 1.0, + "time_sampling_scale": 0.999, + "time_sampling_offset": 0.001, + "min_period": 0.004, + "max_period": 4.0, + "rtc_config": null, + "image_resolution": [ + 224, + 224 + ], + "empty_cameras": 0, + "tokenizer_max_length": 200, + "normalization_mapping": { + "VISUAL": "IDENTITY", + "STATE": "QUANTILES", + "ACTION": "QUANTILES" + }, + "gradient_checkpointing": true, + "compile_model": true, + "compile_mode": "max-autotune", + "optimizer_lr": 2.5e-05, + "optimizer_betas": [ + 0.9, + 0.95 + ], + "optimizer_eps": 1e-08, + "optimizer_weight_decay": 0.01, + "optimizer_grad_clip_norm": 1.0, + "scheduler_warmup_steps": 1000, + "scheduler_decay_steps": 30000, + "scheduler_decay_lr": 2.5e-06 +} \ No newline at end of file diff --git a/checkpoints/003000/pretrained_model/model.safetensors b/checkpoints/003000/pretrained_model/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..400873293ffb9ebc5fd7ba69046ca364254b86dd --- /dev/null +++ b/checkpoints/003000/pretrained_model/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cabb52699c0b8dd502a62185d39c484559f5a0a20d5af555d8a6c1aff70ce422 +size 7473096344 diff --git a/checkpoints/003000/pretrained_model/policy_postprocessor.json b/checkpoints/003000/pretrained_model/policy_postprocessor.json new file mode 100644 index 0000000000000000000000000000000000000000..3171b86625d5ec895c36b4cc1824cecfd7b37e2e --- /dev/null +++ b/checkpoints/003000/pretrained_model/policy_postprocessor.json @@ -0,0 +1,32 @@ +{ + "name": "policy_postprocessor", + "steps": [ + { + "registry_name": "unnormalizer_processor", + "config": { + "eps": 1e-08, + "features": { + "action": { + "type": "ACTION", + "shape": [ + 14 + ] + } + }, + "norm_map": { + "VISUAL": "IDENTITY", + "STATE": "QUANTILES", + "ACTION": "QUANTILES" + } + }, + "state_file": "policy_postprocessor_step_0_unnormalizer_processor.safetensors" + }, + { + "registry_name": "device_processor", + "config": { + "device": "cpu", + "float_dtype": null + } + } + ] +} \ No newline at end of file diff --git a/checkpoints/003000/pretrained_model/policy_postprocessor_step_0_unnormalizer_processor.safetensors b/checkpoints/003000/pretrained_model/policy_postprocessor_step_0_unnormalizer_processor.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..3b736d299e020843d180bc1ef9a2446226abe0c9 --- /dev/null +++ b/checkpoints/003000/pretrained_model/policy_postprocessor_step_0_unnormalizer_processor.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:51ab0f660e6cbe8404932b42c7fff5aed80b13de7f0f5e9f43c56b5bd52afe0d +size 9328 diff --git a/checkpoints/003000/pretrained_model/policy_preprocessor.json b/checkpoints/003000/pretrained_model/policy_preprocessor.json new file mode 100644 index 0000000000000000000000000000000000000000..d12396bdc2de6672958b31ffcd2a82aab560645f --- /dev/null +++ b/checkpoints/003000/pretrained_model/policy_preprocessor.json @@ -0,0 +1,87 @@ +{ + "name": "policy_preprocessor", + "steps": [ + { + "registry_name": "rename_observations_processor", + "config": { + "rename_map": {} + } + }, + { + "registry_name": "to_batch_processor", + "config": {} + }, + { + "registry_name": "normalizer_processor", + "config": { + "eps": 1e-08, + "features": { + "observation.state": { + "type": "STATE", + "shape": [ + 14 + ] + }, + "observation.images.context": { + "type": "VISUAL", + "shape": [ + 3, + 360, + 640 + ] + }, + "observation.images.right_wrist": { + "type": "VISUAL", + "shape": [ + 3, + 360, + 640 + ] + }, + "observation.images.left_wrist": { + "type": "VISUAL", + "shape": [ + 3, + 360, + 640 + ] + }, + "action": { + "type": "ACTION", + "shape": [ + 14 + ] + } + }, + "norm_map": { + "VISUAL": "IDENTITY", + "STATE": "QUANTILES", + "ACTION": "QUANTILES" + } + }, + "state_file": "policy_preprocessor_step_2_normalizer_processor.safetensors" + }, + { + "registry_name": "pi05_prepare_state_tokenizer_processor_step", + "config": {} + }, + { + "registry_name": "tokenizer_processor", + "config": { + "max_length": 200, + "task_key": "task", + "padding_side": "right", + "padding": "max_length", + "truncation": true, + "tokenizer_name": "google/paligemma-3b-pt-224" + } + }, + { + "registry_name": "device_processor", + "config": { + "device": "cuda", + "float_dtype": null + } + } + ] +} \ No newline at end of file diff --git a/checkpoints/003000/pretrained_model/policy_preprocessor_step_2_normalizer_processor.safetensors b/checkpoints/003000/pretrained_model/policy_preprocessor_step_2_normalizer_processor.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..3b736d299e020843d180bc1ef9a2446226abe0c9 --- /dev/null +++ b/checkpoints/003000/pretrained_model/policy_preprocessor_step_2_normalizer_processor.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:51ab0f660e6cbe8404932b42c7fff5aed80b13de7f0f5e9f43c56b5bd52afe0d +size 9328 diff --git a/checkpoints/003000/pretrained_model/train_config.json b/checkpoints/003000/pretrained_model/train_config.json new file mode 100644 index 0000000000000000000000000000000000000000..11ed480db655730638958aa3e3ad1a23c27953bc --- /dev/null +++ b/checkpoints/003000/pretrained_model/train_config.json @@ -0,0 +1,222 @@ +{ + "dataset": { + "repo_id": "Zasha01/lego_cube_final", + "root": null, + "episodes": null, + "image_transforms": { + "enable": false, + "max_num_transforms": 3, + "random_order": false, + "tfs": { + "brightness": { + "weight": 1.0, + "type": "ColorJitter", + "kwargs": { + "brightness": [ + 0.8, + 1.2 + ] + } + }, + "contrast": { + "weight": 1.0, + "type": "ColorJitter", + "kwargs": { + "contrast": [ + 0.8, + 1.2 + ] + } + }, + "saturation": { + "weight": 1.0, + "type": "ColorJitter", + "kwargs": { + "saturation": [ + 0.5, + 1.5 + ] + } + }, + "hue": { + "weight": 1.0, + "type": "ColorJitter", + "kwargs": { + "hue": [ + -0.05, + 0.05 + ] + } + }, + "sharpness": { + "weight": 1.0, + "type": "SharpnessJitter", + "kwargs": { + "sharpness": [ + 0.5, + 1.5 + ] + } + }, + "affine": { + "weight": 1.0, + "type": "RandomAffine", + "kwargs": { + "degrees": [ + -5.0, + 5.0 + ], + "translate": [ + 0.05, + 0.05 + ] + } + } + } + }, + "revision": null, + "use_imagenet_stats": true, + "video_backend": "torchcodec", + "streaming": false + }, + "env": null, + "policy": { + "type": "pi05", + "n_obs_steps": 1, + "input_features": { + "observation.state": { + "type": "STATE", + "shape": [ + 14 + ] + }, + "observation.images.context": { + "type": "VISUAL", + "shape": [ + 3, + 360, + 640 + ] + }, + "observation.images.right_wrist": { + "type": "VISUAL", + "shape": [ + 3, + 360, + 640 + ] + }, + "observation.images.left_wrist": { + "type": "VISUAL", + "shape": [ + 3, + 360, + 640 + ] + } + }, + "output_features": { + "action": { + "type": "ACTION", + "shape": [ + 14 + ] + } + }, + "device": "cuda", + "use_amp": false, + "push_to_hub": true, + "repo_id": "Zasha01/pi_lego_cube_final_final", + "private": null, + "tags": null, + "license": null, + "pretrained_path": "lerobot/pi05_base", + "paligemma_variant": "gemma_2b", + "action_expert_variant": "gemma_300m", + "dtype": "bfloat16", + "chunk_size": 50, + "n_action_steps": 50, + "max_state_dim": 32, + "max_action_dim": 32, + "num_inference_steps": 10, + "time_sampling_beta_alpha": 1.5, + "time_sampling_beta_beta": 1.0, + "time_sampling_scale": 0.999, + "time_sampling_offset": 0.001, + "min_period": 0.004, + "max_period": 4.0, + "rtc_config": null, + "image_resolution": [ + 224, + 224 + ], + "empty_cameras": 0, + "tokenizer_max_length": 200, + "normalization_mapping": { + "VISUAL": "IDENTITY", + "STATE": "QUANTILES", + "ACTION": "QUANTILES" + }, + "gradient_checkpointing": true, + "compile_model": true, + "compile_mode": "max-autotune", + "optimizer_lr": 2.5e-05, + "optimizer_betas": [ + 0.9, + 0.95 + ], + "optimizer_eps": 1e-08, + "optimizer_weight_decay": 0.01, + "optimizer_grad_clip_norm": 1.0, + "scheduler_warmup_steps": 1000, + "scheduler_decay_steps": 30000, + "scheduler_decay_lr": 2.5e-06 + }, + "output_dir": "/checkpoints/pi05/pi_lego_cube_final_final_20251218_082454", + "job_name": "pi_lego_cube_final_final", + "resume": false, + "seed": 1000, + "num_workers": 4, + "batch_size": 16, + "steps": 20000, + "eval_freq": 20000, + "log_freq": 200, + "tolerance_s": 0.0001, + "save_checkpoint": true, + "save_freq": 3000, + "use_policy_training_preset": true, + "optimizer": { + "type": "adamw", + "lr": 2.5e-05, + "weight_decay": 0.01, + "grad_clip_norm": 1.0, + "betas": [ + 0.9, + 0.95 + ], + "eps": 1e-08 + }, + "scheduler": { + "type": "cosine_decay_with_warmup", + "num_warmup_steps": 1000, + "num_decay_steps": 30000, + "peak_lr": 2.5e-05, + "decay_lr": 2.5e-06 + }, + "eval": { + "n_episodes": 50, + "batch_size": 50, + "use_async_envs": false + }, + "wandb": { + "enable": true, + "disable_artifact": false, + "project": "lerobot", + "entity": null, + "notes": null, + "run_id": "lbhu7589", + "mode": null + }, + "checkpoint_path": null, + "rename_map": {} +} \ No newline at end of file diff --git a/checkpoints/003000/training_state/optimizer_param_groups.json b/checkpoints/003000/training_state/optimizer_param_groups.json new file mode 100644 index 0000000000000000000000000000000000000000..7523b143e8d347f09c246adf1337948cd5de2cff --- /dev/null +++ b/checkpoints/003000/training_state/optimizer_param_groups.json @@ -0,0 +1,833 @@ +[ + { + "lr": 2.3773823397119138e-05, + "betas": [ + 0.9, + 0.95 + ], + "eps": 1e-08, + "weight_decay": 0.01, + "amsgrad": false, + "maximize": false, + "foreach": null, + "capturable": false, + "differentiable": false, + "fused": null, + "decoupled_weight_decay": true, + "initial_lr": 2.5e-05, + "params": [ + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10, + 11, + 12, + 13, + 14, + 15, + 16, + 17, + 18, + 19, + 20, + 21, + 22, + 23, + 24, + 25, + 26, + 27, + 28, + 29, + 30, + 31, + 32, + 33, + 34, + 35, + 36, + 37, + 38, + 39, + 40, + 41, + 42, + 43, + 44, + 45, + 46, + 47, + 48, + 49, + 50, + 51, + 52, + 53, + 54, + 55, + 56, + 57, + 58, + 59, + 60, + 61, + 62, + 63, + 64, + 65, + 66, + 67, + 68, + 69, + 70, + 71, + 72, + 73, + 74, + 75, + 76, + 77, + 78, + 79, + 80, + 81, + 82, + 83, + 84, + 85, + 86, + 87, + 88, + 89, + 90, + 91, + 92, + 93, + 94, + 95, + 96, + 97, + 98, + 99, + 100, + 101, + 102, + 103, + 104, + 105, + 106, + 107, + 108, + 109, + 110, + 111, + 112, + 113, + 114, + 115, + 116, + 117, + 118, + 119, + 120, + 121, + 122, + 123, + 124, + 125, + 126, + 127, + 128, + 129, + 130, + 131, + 132, + 133, + 134, + 135, + 136, + 137, + 138, + 139, + 140, + 141, + 142, + 143, + 144, + 145, + 146, + 147, + 148, + 149, + 150, + 151, + 152, + 153, + 154, + 155, + 156, + 157, + 158, + 159, + 160, + 161, + 162, + 163, + 164, + 165, + 166, + 167, + 168, + 169, + 170, + 171, + 172, + 173, + 174, + 175, + 176, + 177, + 178, + 179, + 180, + 181, + 182, + 183, + 184, + 185, + 186, + 187, + 188, + 189, + 190, + 191, + 192, + 193, + 194, + 195, + 196, + 197, + 198, + 199, + 200, + 201, + 202, + 203, + 204, + 205, + 206, + 207, + 208, + 209, + 210, + 211, + 212, + 213, + 214, + 215, + 216, + 217, + 218, + 219, + 220, + 221, + 222, + 223, + 224, + 225, + 226, + 227, + 228, + 229, + 230, + 231, + 232, + 233, + 234, + 235, + 236, + 237, + 238, + 239, + 240, + 241, + 242, + 243, + 244, + 245, + 246, + 247, + 248, + 249, + 250, + 251, + 252, + 253, + 254, + 255, + 256, + 257, + 258, + 259, + 260, + 261, + 262, + 263, + 264, + 265, + 266, + 267, + 268, + 269, + 270, + 271, + 272, + 273, + 274, + 275, + 276, + 277, + 278, + 279, + 280, + 281, + 282, + 283, + 284, + 285, + 286, + 287, + 288, + 289, + 290, + 291, + 292, + 293, + 294, + 295, + 296, + 297, + 298, + 299, + 300, + 301, + 302, + 303, + 304, + 305, + 306, + 307, + 308, + 309, + 310, + 311, + 312, + 313, + 314, + 315, + 316, + 317, + 318, + 319, + 320, + 321, + 322, + 323, + 324, + 325, + 326, + 327, + 328, + 329, + 330, + 331, + 332, + 333, + 334, + 335, + 336, + 337, + 338, + 339, + 340, + 341, + 342, + 343, + 344, + 345, + 346, + 347, + 348, + 349, + 350, + 351, + 352, + 353, + 354, + 355, + 356, + 357, + 358, + 359, + 360, + 361, + 362, + 363, + 364, + 365, + 366, + 367, + 368, + 369, + 370, + 371, + 372, + 373, + 374, + 375, + 376, + 377, + 378, + 379, + 380, + 381, + 382, + 383, + 384, + 385, + 386, + 387, + 388, + 389, + 390, + 391, + 392, + 393, + 394, + 395, + 396, + 397, + 398, + 399, + 400, + 401, + 402, + 403, + 404, + 405, + 406, + 407, + 408, + 409, + 410, + 411, + 412, + 413, + 414, + 415, + 416, + 417, + 418, + 419, + 420, + 421, + 422, + 423, + 424, + 425, + 426, + 427, + 428, + 429, + 430, + 431, + 432, + 433, + 434, + 435, + 436, + 437, + 438, + 439, + 440, + 441, + 442, + 443, + 444, + 445, + 446, + 447, + 448, + 449, + 450, + 451, + 452, + 453, + 454, + 455, + 456, + 457, + 458, + 459, + 460, + 461, + 462, + 463, + 464, + 465, + 466, + 467, + 468, + 469, + 470, + 471, + 472, + 473, + 474, + 475, + 476, + 477, + 478, + 479, + 480, + 481, + 482, + 483, + 484, + 485, + 486, + 487, + 488, + 489, + 490, + 491, + 492, + 493, + 494, + 495, + 496, + 497, + 498, + 499, + 500, + 501, + 502, + 503, + 504, + 505, + 506, + 507, + 508, + 509, + 510, + 511, + 512, + 513, + 514, + 515, + 516, + 517, + 518, + 519, + 520, + 521, + 522, + 523, + 524, + 525, + 526, + 527, + 528, + 529, + 530, + 531, + 532, + 533, + 534, + 535, + 536, + 537, + 538, + 539, + 540, + 541, + 542, + 543, + 544, + 545, + 546, + 547, + 548, + 549, + 550, + 551, + 552, + 553, + 554, + 555, + 556, + 557, + 558, + 559, + 560, + 561, + 562, + 563, + 564, + 565, + 566, + 567, + 568, + 569, + 570, + 571, + 572, + 573, + 574, + 575, + 576, + 577, + 578, + 579, + 580, + 581, + 582, + 583, + 584, + 585, + 586, + 587, + 588, + 589, + 590, + 591, + 592, + 593, + 594, + 595, + 596, + 597, + 598, + 599, + 600, + 601, + 602, + 603, + 604, + 605, + 606, + 607, + 608, + 609, + 610, + 611, + 612, + 613, + 614, + 615, + 616, + 617, + 618, + 619, + 620, + 621, + 622, + 623, + 624, + 625, + 626, + 627, + 628, + 629, + 630, + 631, + 632, + 633, + 634, + 635, + 636, + 637, + 638, + 639, + 640, + 641, + 642, + 643, + 644, + 645, + 646, + 647, + 648, + 649, + 650, + 651, + 652, + 653, + 654, + 655, + 656, + 657, + 658, + 659, + 660, + 661, + 662, + 663, + 664, + 665, + 666, + 667, + 668, + 669, + 670, + 671, + 672, + 673, + 674, + 675, + 676, + 677, + 678, + 679, + 680, + 681, + 682, + 683, + 684, + 685, + 686, + 687, + 688, + 689, + 690, + 691, + 692, + 693, + 694, + 695, + 696, + 697, + 698, + 699, + 700, + 701, + 702, + 703, + 704, + 705, + 706, + 707, + 708, + 709, + 710, + 711, + 712, + 713, + 714, + 715, + 716, + 717, + 718, + 719, + 720, + 721, + 722, + 723, + 724, + 725, + 726, + 727, + 728, + 729, + 730, + 731, + 732, + 733, + 734, + 735, + 736, + 737, + 738, + 739, + 740, + 741, + 742, + 743, + 744, + 745, + 746, + 747, + 748, + 749, + 750, + 751, + 752, + 753, + 754, + 755, + 756, + 757, + 758, + 759, + 760, + 761, + 762, + 763, + 764, + 765, + 766, + 767, + 768, + 769, + 770, + 771, + 772, + 773, + 774, + 775, + 776, + 777, + 778, + 779, + 780, + 781, + 782, + 783, + 784, + 785, + 786, + 787, + 788, + 789, + 790, + 791, + 792, + 793, + 794, + 795, + 796, + 797, + 798, + 799, + 800, + 801, + 802, + 803, + 804, + 805, + 806, + 807, + 808, + 809, + 810, + 811 + ] + } +] \ No newline at end of file diff --git a/checkpoints/003000/training_state/optimizer_state.safetensors b/checkpoints/003000/training_state/optimizer_state.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..cf8475fcf06eb041afcb09ed9f1d5095ef5df481 --- /dev/null +++ b/checkpoints/003000/training_state/optimizer_state.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:eec0dc92337fcf38a25f5c85b23b8ef64c6dc1e8b495d665f32807c6fbf3f5f3 +size 13473373724 diff --git a/checkpoints/003000/training_state/rng_state.safetensors b/checkpoints/003000/training_state/rng_state.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..c020f299bcdb1b1b5d52142d0b9bc26df9a44f9b --- /dev/null +++ b/checkpoints/003000/training_state/rng_state.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:81715cce67a25ad52e738f9a3b85a3c011889a2fe163b78cb213054d8951e5d0 +size 15708 diff --git a/checkpoints/003000/training_state/scheduler_state.json b/checkpoints/003000/training_state/scheduler_state.json new file mode 100644 index 0000000000000000000000000000000000000000..105b634b8c699ed16b9a22da9dfb8aebd1998e78 --- /dev/null +++ b/checkpoints/003000/training_state/scheduler_state.json @@ -0,0 +1,14 @@ +{ + "base_lrs": [ + 2.5e-05 + ], + "last_epoch": 3000, + "_step_count": 3001, + "_get_lr_called_within_step": false, + "_last_lr": [ + 2.3773823397119138e-05 + ], + "lr_lambdas": [ + null + ] +} \ No newline at end of file diff --git a/checkpoints/003000/training_state/training_step.json b/checkpoints/003000/training_state/training_step.json new file mode 100644 index 0000000000000000000000000000000000000000..c1a44127b7dfea653fd776d529fa83c55d32081c --- /dev/null +++ b/checkpoints/003000/training_state/training_step.json @@ -0,0 +1,3 @@ +{ + "step": 3000 +} \ No newline at end of file diff --git a/checkpoints/006000/pretrained_model/config.json b/checkpoints/006000/pretrained_model/config.json new file mode 100644 index 0000000000000000000000000000000000000000..7d9ef4ac072452ff8debbca6be6175a654f28cbe --- /dev/null +++ b/checkpoints/006000/pretrained_model/config.json @@ -0,0 +1,92 @@ +{ + "type": "pi05", + "n_obs_steps": 1, + "input_features": { + "observation.state": { + "type": "STATE", + "shape": [ + 14 + ] + }, + "observation.images.context": { + "type": "VISUAL", + "shape": [ + 3, + 360, + 640 + ] + }, + "observation.images.right_wrist": { + "type": "VISUAL", + "shape": [ + 3, + 360, + 640 + ] + }, + "observation.images.left_wrist": { + "type": "VISUAL", + "shape": [ + 3, + 360, + 640 + ] + } + }, + "output_features": { + "action": { + "type": "ACTION", + "shape": [ + 14 + ] + } + }, + "device": "cuda", + "use_amp": false, + "push_to_hub": true, + "repo_id": "Zasha01/pi_lego_cube_final_final", + "private": null, + "tags": null, + "license": null, + "pretrained_path": "lerobot/pi05_base", + "paligemma_variant": "gemma_2b", + "action_expert_variant": "gemma_300m", + "dtype": "bfloat16", + "chunk_size": 50, + "n_action_steps": 50, + "max_state_dim": 32, + "max_action_dim": 32, + "num_inference_steps": 10, + "time_sampling_beta_alpha": 1.5, + "time_sampling_beta_beta": 1.0, + "time_sampling_scale": 0.999, + "time_sampling_offset": 0.001, + "min_period": 0.004, + "max_period": 4.0, + "rtc_config": null, + "image_resolution": [ + 224, + 224 + ], + "empty_cameras": 0, + "tokenizer_max_length": 200, + "normalization_mapping": { + "VISUAL": "IDENTITY", + "STATE": "QUANTILES", + "ACTION": "QUANTILES" + }, + "gradient_checkpointing": true, + "compile_model": true, + "compile_mode": "max-autotune", + "optimizer_lr": 2.5e-05, + "optimizer_betas": [ + 0.9, + 0.95 + ], + "optimizer_eps": 1e-08, + "optimizer_weight_decay": 0.01, + "optimizer_grad_clip_norm": 1.0, + "scheduler_warmup_steps": 1000, + "scheduler_decay_steps": 30000, + "scheduler_decay_lr": 2.5e-06 +} \ No newline at end of file diff --git a/checkpoints/006000/pretrained_model/model.safetensors b/checkpoints/006000/pretrained_model/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..9f447ba678560dccbba276e679ed465af933e25e --- /dev/null +++ b/checkpoints/006000/pretrained_model/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7f46e53c1f9d0a9b56aeb90f6a603994657b373c2ea68fc87893848ade02a4a4 +size 7473096344 diff --git a/checkpoints/006000/pretrained_model/policy_postprocessor.json b/checkpoints/006000/pretrained_model/policy_postprocessor.json new file mode 100644 index 0000000000000000000000000000000000000000..3171b86625d5ec895c36b4cc1824cecfd7b37e2e --- /dev/null +++ b/checkpoints/006000/pretrained_model/policy_postprocessor.json @@ -0,0 +1,32 @@ +{ + "name": "policy_postprocessor", + "steps": [ + { + "registry_name": "unnormalizer_processor", + "config": { + "eps": 1e-08, + "features": { + "action": { + "type": "ACTION", + "shape": [ + 14 + ] + } + }, + "norm_map": { + "VISUAL": "IDENTITY", + "STATE": "QUANTILES", + "ACTION": "QUANTILES" + } + }, + "state_file": "policy_postprocessor_step_0_unnormalizer_processor.safetensors" + }, + { + "registry_name": "device_processor", + "config": { + "device": "cpu", + "float_dtype": null + } + } + ] +} \ No newline at end of file diff --git a/checkpoints/006000/pretrained_model/policy_postprocessor_step_0_unnormalizer_processor.safetensors b/checkpoints/006000/pretrained_model/policy_postprocessor_step_0_unnormalizer_processor.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..3b736d299e020843d180bc1ef9a2446226abe0c9 --- /dev/null +++ b/checkpoints/006000/pretrained_model/policy_postprocessor_step_0_unnormalizer_processor.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:51ab0f660e6cbe8404932b42c7fff5aed80b13de7f0f5e9f43c56b5bd52afe0d +size 9328 diff --git a/checkpoints/006000/pretrained_model/policy_preprocessor.json b/checkpoints/006000/pretrained_model/policy_preprocessor.json new file mode 100644 index 0000000000000000000000000000000000000000..d12396bdc2de6672958b31ffcd2a82aab560645f --- /dev/null +++ b/checkpoints/006000/pretrained_model/policy_preprocessor.json @@ -0,0 +1,87 @@ +{ + "name": "policy_preprocessor", + "steps": [ + { + "registry_name": "rename_observations_processor", + "config": { + "rename_map": {} + } + }, + { + "registry_name": "to_batch_processor", + "config": {} + }, + { + "registry_name": "normalizer_processor", + "config": { + "eps": 1e-08, + "features": { + "observation.state": { + "type": "STATE", + "shape": [ + 14 + ] + }, + "observation.images.context": { + "type": "VISUAL", + "shape": [ + 3, + 360, + 640 + ] + }, + "observation.images.right_wrist": { + "type": "VISUAL", + "shape": [ + 3, + 360, + 640 + ] + }, + "observation.images.left_wrist": { + "type": "VISUAL", + "shape": [ + 3, + 360, + 640 + ] + }, + "action": { + "type": "ACTION", + "shape": [ + 14 + ] + } + }, + "norm_map": { + "VISUAL": "IDENTITY", + "STATE": "QUANTILES", + "ACTION": "QUANTILES" + } + }, + "state_file": "policy_preprocessor_step_2_normalizer_processor.safetensors" + }, + { + "registry_name": "pi05_prepare_state_tokenizer_processor_step", + "config": {} + }, + { + "registry_name": "tokenizer_processor", + "config": { + "max_length": 200, + "task_key": "task", + "padding_side": "right", + "padding": "max_length", + "truncation": true, + "tokenizer_name": "google/paligemma-3b-pt-224" + } + }, + { + "registry_name": "device_processor", + "config": { + "device": "cuda", + "float_dtype": null + } + } + ] +} \ No newline at end of file diff --git a/checkpoints/006000/pretrained_model/policy_preprocessor_step_2_normalizer_processor.safetensors b/checkpoints/006000/pretrained_model/policy_preprocessor_step_2_normalizer_processor.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..3b736d299e020843d180bc1ef9a2446226abe0c9 --- /dev/null +++ b/checkpoints/006000/pretrained_model/policy_preprocessor_step_2_normalizer_processor.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:51ab0f660e6cbe8404932b42c7fff5aed80b13de7f0f5e9f43c56b5bd52afe0d +size 9328 diff --git a/checkpoints/006000/pretrained_model/train_config.json b/checkpoints/006000/pretrained_model/train_config.json new file mode 100644 index 0000000000000000000000000000000000000000..11ed480db655730638958aa3e3ad1a23c27953bc --- /dev/null +++ b/checkpoints/006000/pretrained_model/train_config.json @@ -0,0 +1,222 @@ +{ + "dataset": { + "repo_id": "Zasha01/lego_cube_final", + "root": null, + "episodes": null, + "image_transforms": { + "enable": false, + "max_num_transforms": 3, + "random_order": false, + "tfs": { + "brightness": { + "weight": 1.0, + "type": "ColorJitter", + "kwargs": { + "brightness": [ + 0.8, + 1.2 + ] + } + }, + "contrast": { + "weight": 1.0, + "type": "ColorJitter", + "kwargs": { + "contrast": [ + 0.8, + 1.2 + ] + } + }, + "saturation": { + "weight": 1.0, + "type": "ColorJitter", + "kwargs": { + "saturation": [ + 0.5, + 1.5 + ] + } + }, + "hue": { + "weight": 1.0, + "type": "ColorJitter", + "kwargs": { + "hue": [ + -0.05, + 0.05 + ] + } + }, + "sharpness": { + "weight": 1.0, + "type": "SharpnessJitter", + "kwargs": { + "sharpness": [ + 0.5, + 1.5 + ] + } + }, + "affine": { + "weight": 1.0, + "type": "RandomAffine", + "kwargs": { + "degrees": [ + -5.0, + 5.0 + ], + "translate": [ + 0.05, + 0.05 + ] + } + } + } + }, + "revision": null, + "use_imagenet_stats": true, + "video_backend": "torchcodec", + "streaming": false + }, + "env": null, + "policy": { + "type": "pi05", + "n_obs_steps": 1, + "input_features": { + "observation.state": { + "type": "STATE", + "shape": [ + 14 + ] + }, + "observation.images.context": { + "type": "VISUAL", + "shape": [ + 3, + 360, + 640 + ] + }, + "observation.images.right_wrist": { + "type": "VISUAL", + "shape": [ + 3, + 360, + 640 + ] + }, + "observation.images.left_wrist": { + "type": "VISUAL", + "shape": [ + 3, + 360, + 640 + ] + } + }, + "output_features": { + "action": { + "type": "ACTION", + "shape": [ + 14 + ] + } + }, + "device": "cuda", + "use_amp": false, + "push_to_hub": true, + "repo_id": "Zasha01/pi_lego_cube_final_final", + "private": null, + "tags": null, + "license": null, + "pretrained_path": "lerobot/pi05_base", + "paligemma_variant": "gemma_2b", + "action_expert_variant": "gemma_300m", + "dtype": "bfloat16", + "chunk_size": 50, + "n_action_steps": 50, + "max_state_dim": 32, + "max_action_dim": 32, + "num_inference_steps": 10, + "time_sampling_beta_alpha": 1.5, + "time_sampling_beta_beta": 1.0, + "time_sampling_scale": 0.999, + "time_sampling_offset": 0.001, + "min_period": 0.004, + "max_period": 4.0, + "rtc_config": null, + "image_resolution": [ + 224, + 224 + ], + "empty_cameras": 0, + "tokenizer_max_length": 200, + "normalization_mapping": { + "VISUAL": "IDENTITY", + "STATE": "QUANTILES", + "ACTION": "QUANTILES" + }, + "gradient_checkpointing": true, + "compile_model": true, + "compile_mode": "max-autotune", + "optimizer_lr": 2.5e-05, + "optimizer_betas": [ + 0.9, + 0.95 + ], + "optimizer_eps": 1e-08, + "optimizer_weight_decay": 0.01, + "optimizer_grad_clip_norm": 1.0, + "scheduler_warmup_steps": 1000, + "scheduler_decay_steps": 30000, + "scheduler_decay_lr": 2.5e-06 + }, + "output_dir": "/checkpoints/pi05/pi_lego_cube_final_final_20251218_082454", + "job_name": "pi_lego_cube_final_final", + "resume": false, + "seed": 1000, + "num_workers": 4, + "batch_size": 16, + "steps": 20000, + "eval_freq": 20000, + "log_freq": 200, + "tolerance_s": 0.0001, + "save_checkpoint": true, + "save_freq": 3000, + "use_policy_training_preset": true, + "optimizer": { + "type": "adamw", + "lr": 2.5e-05, + "weight_decay": 0.01, + "grad_clip_norm": 1.0, + "betas": [ + 0.9, + 0.95 + ], + "eps": 1e-08 + }, + "scheduler": { + "type": "cosine_decay_with_warmup", + "num_warmup_steps": 1000, + "num_decay_steps": 30000, + "peak_lr": 2.5e-05, + "decay_lr": 2.5e-06 + }, + "eval": { + "n_episodes": 50, + "batch_size": 50, + "use_async_envs": false + }, + "wandb": { + "enable": true, + "disable_artifact": false, + "project": "lerobot", + "entity": null, + "notes": null, + "run_id": "lbhu7589", + "mode": null + }, + "checkpoint_path": null, + "rename_map": {} +} \ No newline at end of file diff --git a/checkpoints/006000/training_state/optimizer_param_groups.json b/checkpoints/006000/training_state/optimizer_param_groups.json new file mode 100644 index 0000000000000000000000000000000000000000..c4aebc09d62298a631a9c7a568bf48c7c91f69de --- /dev/null +++ b/checkpoints/006000/training_state/optimizer_param_groups.json @@ -0,0 +1,833 @@ +[ + { + "lr": 2.0362584088290323e-05, + "betas": [ + 0.9, + 0.95 + ], + "eps": 1e-08, + "weight_decay": 0.01, + "amsgrad": false, + "maximize": false, + "foreach": null, + "capturable": false, + "differentiable": false, + "fused": null, + "decoupled_weight_decay": true, + "initial_lr": 2.5e-05, + "params": [ + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10, + 11, + 12, + 13, + 14, + 15, + 16, + 17, + 18, + 19, + 20, + 21, + 22, + 23, + 24, + 25, + 26, + 27, + 28, + 29, + 30, + 31, + 32, + 33, + 34, + 35, + 36, + 37, + 38, + 39, + 40, + 41, + 42, + 43, + 44, + 45, + 46, + 47, + 48, + 49, + 50, + 51, + 52, + 53, + 54, + 55, + 56, + 57, + 58, + 59, + 60, + 61, + 62, + 63, + 64, + 65, + 66, + 67, + 68, + 69, + 70, + 71, + 72, + 73, + 74, + 75, + 76, + 77, + 78, + 79, + 80, + 81, + 82, + 83, + 84, + 85, + 86, + 87, + 88, + 89, + 90, + 91, + 92, + 93, + 94, + 95, + 96, + 97, + 98, + 99, + 100, + 101, + 102, + 103, + 104, + 105, + 106, + 107, + 108, + 109, + 110, + 111, + 112, + 113, + 114, + 115, + 116, + 117, + 118, + 119, + 120, + 121, + 122, + 123, + 124, + 125, + 126, + 127, + 128, + 129, + 130, + 131, + 132, + 133, + 134, + 135, + 136, + 137, + 138, + 139, + 140, + 141, + 142, + 143, + 144, + 145, + 146, + 147, + 148, + 149, + 150, + 151, + 152, + 153, + 154, + 155, + 156, + 157, + 158, + 159, + 160, + 161, + 162, + 163, + 164, + 165, + 166, + 167, + 168, + 169, + 170, + 171, + 172, + 173, + 174, + 175, + 176, + 177, + 178, + 179, + 180, + 181, + 182, + 183, + 184, + 185, + 186, + 187, + 188, + 189, + 190, + 191, + 192, + 193, + 194, + 195, + 196, + 197, + 198, + 199, + 200, + 201, + 202, + 203, + 204, + 205, + 206, + 207, + 208, + 209, + 210, + 211, + 212, + 213, + 214, + 215, + 216, + 217, + 218, + 219, + 220, + 221, + 222, + 223, + 224, + 225, + 226, + 227, + 228, + 229, + 230, + 231, + 232, + 233, + 234, + 235, + 236, + 237, + 238, + 239, + 240, + 241, + 242, + 243, + 244, + 245, + 246, + 247, + 248, + 249, + 250, + 251, + 252, + 253, + 254, + 255, + 256, + 257, + 258, + 259, + 260, + 261, + 262, + 263, + 264, + 265, + 266, + 267, + 268, + 269, + 270, + 271, + 272, + 273, + 274, + 275, + 276, + 277, + 278, + 279, + 280, + 281, + 282, + 283, + 284, + 285, + 286, + 287, + 288, + 289, + 290, + 291, + 292, + 293, + 294, + 295, + 296, + 297, + 298, + 299, + 300, + 301, + 302, + 303, + 304, + 305, + 306, + 307, + 308, + 309, + 310, + 311, + 312, + 313, + 314, + 315, + 316, + 317, + 318, + 319, + 320, + 321, + 322, + 323, + 324, + 325, + 326, + 327, + 328, + 329, + 330, + 331, + 332, + 333, + 334, + 335, + 336, + 337, + 338, + 339, + 340, + 341, + 342, + 343, + 344, + 345, + 346, + 347, + 348, + 349, + 350, + 351, + 352, + 353, + 354, + 355, + 356, + 357, + 358, + 359, + 360, + 361, + 362, + 363, + 364, + 365, + 366, + 367, + 368, + 369, + 370, + 371, + 372, + 373, + 374, + 375, + 376, + 377, + 378, + 379, + 380, + 381, + 382, + 383, + 384, + 385, + 386, + 387, + 388, + 389, + 390, + 391, + 392, + 393, + 394, + 395, + 396, + 397, + 398, + 399, + 400, + 401, + 402, + 403, + 404, + 405, + 406, + 407, + 408, + 409, + 410, + 411, + 412, + 413, + 414, + 415, + 416, + 417, + 418, + 419, + 420, + 421, + 422, + 423, + 424, + 425, + 426, + 427, + 428, + 429, + 430, + 431, + 432, + 433, + 434, + 435, + 436, + 437, + 438, + 439, + 440, + 441, + 442, + 443, + 444, + 445, + 446, + 447, + 448, + 449, + 450, + 451, + 452, + 453, + 454, + 455, + 456, + 457, + 458, + 459, + 460, + 461, + 462, + 463, + 464, + 465, + 466, + 467, + 468, + 469, + 470, + 471, + 472, + 473, + 474, + 475, + 476, + 477, + 478, + 479, + 480, + 481, + 482, + 483, + 484, + 485, + 486, + 487, + 488, + 489, + 490, + 491, + 492, + 493, + 494, + 495, + 496, + 497, + 498, + 499, + 500, + 501, + 502, + 503, + 504, + 505, + 506, + 507, + 508, + 509, + 510, + 511, + 512, + 513, + 514, + 515, + 516, + 517, + 518, + 519, + 520, + 521, + 522, + 523, + 524, + 525, + 526, + 527, + 528, + 529, + 530, + 531, + 532, + 533, + 534, + 535, + 536, + 537, + 538, + 539, + 540, + 541, + 542, + 543, + 544, + 545, + 546, + 547, + 548, + 549, + 550, + 551, + 552, + 553, + 554, + 555, + 556, + 557, + 558, + 559, + 560, + 561, + 562, + 563, + 564, + 565, + 566, + 567, + 568, + 569, + 570, + 571, + 572, + 573, + 574, + 575, + 576, + 577, + 578, + 579, + 580, + 581, + 582, + 583, + 584, + 585, + 586, + 587, + 588, + 589, + 590, + 591, + 592, + 593, + 594, + 595, + 596, + 597, + 598, + 599, + 600, + 601, + 602, + 603, + 604, + 605, + 606, + 607, + 608, + 609, + 610, + 611, + 612, + 613, + 614, + 615, + 616, + 617, + 618, + 619, + 620, + 621, + 622, + 623, + 624, + 625, + 626, + 627, + 628, + 629, + 630, + 631, + 632, + 633, + 634, + 635, + 636, + 637, + 638, + 639, + 640, + 641, + 642, + 643, + 644, + 645, + 646, + 647, + 648, + 649, + 650, + 651, + 652, + 653, + 654, + 655, + 656, + 657, + 658, + 659, + 660, + 661, + 662, + 663, + 664, + 665, + 666, + 667, + 668, + 669, + 670, + 671, + 672, + 673, + 674, + 675, + 676, + 677, + 678, + 679, + 680, + 681, + 682, + 683, + 684, + 685, + 686, + 687, + 688, + 689, + 690, + 691, + 692, + 693, + 694, + 695, + 696, + 697, + 698, + 699, + 700, + 701, + 702, + 703, + 704, + 705, + 706, + 707, + 708, + 709, + 710, + 711, + 712, + 713, + 714, + 715, + 716, + 717, + 718, + 719, + 720, + 721, + 722, + 723, + 724, + 725, + 726, + 727, + 728, + 729, + 730, + 731, + 732, + 733, + 734, + 735, + 736, + 737, + 738, + 739, + 740, + 741, + 742, + 743, + 744, + 745, + 746, + 747, + 748, + 749, + 750, + 751, + 752, + 753, + 754, + 755, + 756, + 757, + 758, + 759, + 760, + 761, + 762, + 763, + 764, + 765, + 766, + 767, + 768, + 769, + 770, + 771, + 772, + 773, + 774, + 775, + 776, + 777, + 778, + 779, + 780, + 781, + 782, + 783, + 784, + 785, + 786, + 787, + 788, + 789, + 790, + 791, + 792, + 793, + 794, + 795, + 796, + 797, + 798, + 799, + 800, + 801, + 802, + 803, + 804, + 805, + 806, + 807, + 808, + 809, + 810, + 811 + ] + } +] \ No newline at end of file diff --git a/checkpoints/006000/training_state/optimizer_state.safetensors b/checkpoints/006000/training_state/optimizer_state.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..53442352a389efd893608507a35f791507891002 --- /dev/null +++ b/checkpoints/006000/training_state/optimizer_state.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:60d9e38b483ea87533f5d5fa49fa94fb40a1f61774e5242fb223edd2adcf8070 +size 13473373724 diff --git a/checkpoints/006000/training_state/rng_state.safetensors b/checkpoints/006000/training_state/rng_state.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..c840687a0c181a6dcb301b6ce16777a079b685aa --- /dev/null +++ b/checkpoints/006000/training_state/rng_state.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:95abe3c61e6c73249b9fbe931b932e9fb7b2fca6b4ecef9fa7e1c07f6e205d17 +size 15708 diff --git a/checkpoints/006000/training_state/scheduler_state.json b/checkpoints/006000/training_state/scheduler_state.json new file mode 100644 index 0000000000000000000000000000000000000000..68325d69a8a27443a36200650435af4d253cfd8a --- /dev/null +++ b/checkpoints/006000/training_state/scheduler_state.json @@ -0,0 +1,14 @@ +{ + "base_lrs": [ + 2.5e-05 + ], + "last_epoch": 6000, + "_step_count": 6001, + "_get_lr_called_within_step": false, + "_last_lr": [ + 2.0362584088290323e-05 + ], + "lr_lambdas": [ + null + ] +} \ No newline at end of file diff --git a/checkpoints/006000/training_state/training_step.json b/checkpoints/006000/training_state/training_step.json new file mode 100644 index 0000000000000000000000000000000000000000..e267ac589be64705f8674638b9f5099c886778da --- /dev/null +++ b/checkpoints/006000/training_state/training_step.json @@ -0,0 +1,3 @@ +{ + "step": 6000 +} \ No newline at end of file diff --git a/checkpoints/009000/pretrained_model/config.json b/checkpoints/009000/pretrained_model/config.json new file mode 100644 index 0000000000000000000000000000000000000000..7d9ef4ac072452ff8debbca6be6175a654f28cbe --- /dev/null +++ b/checkpoints/009000/pretrained_model/config.json @@ -0,0 +1,92 @@ +{ + "type": "pi05", + "n_obs_steps": 1, + "input_features": { + "observation.state": { + "type": "STATE", + "shape": [ + 14 + ] + }, + "observation.images.context": { + "type": "VISUAL", + "shape": [ + 3, + 360, + 640 + ] + }, + "observation.images.right_wrist": { + "type": "VISUAL", + "shape": [ + 3, + 360, + 640 + ] + }, + "observation.images.left_wrist": { + "type": "VISUAL", + "shape": [ + 3, + 360, + 640 + ] + } + }, + "output_features": { + "action": { + "type": "ACTION", + "shape": [ + 14 + ] + } + }, + "device": "cuda", + "use_amp": false, + "push_to_hub": true, + "repo_id": "Zasha01/pi_lego_cube_final_final", + "private": null, + "tags": null, + "license": null, + "pretrained_path": "lerobot/pi05_base", + "paligemma_variant": "gemma_2b", + "action_expert_variant": "gemma_300m", + "dtype": "bfloat16", + "chunk_size": 50, + "n_action_steps": 50, + "max_state_dim": 32, + "max_action_dim": 32, + "num_inference_steps": 10, + "time_sampling_beta_alpha": 1.5, + "time_sampling_beta_beta": 1.0, + "time_sampling_scale": 0.999, + "time_sampling_offset": 0.001, + "min_period": 0.004, + "max_period": 4.0, + "rtc_config": null, + "image_resolution": [ + 224, + 224 + ], + "empty_cameras": 0, + "tokenizer_max_length": 200, + "normalization_mapping": { + "VISUAL": "IDENTITY", + "STATE": "QUANTILES", + "ACTION": "QUANTILES" + }, + "gradient_checkpointing": true, + "compile_model": true, + "compile_mode": "max-autotune", + "optimizer_lr": 2.5e-05, + "optimizer_betas": [ + 0.9, + 0.95 + ], + "optimizer_eps": 1e-08, + "optimizer_weight_decay": 0.01, + "optimizer_grad_clip_norm": 1.0, + "scheduler_warmup_steps": 1000, + "scheduler_decay_steps": 30000, + "scheduler_decay_lr": 2.5e-06 +} \ No newline at end of file diff --git a/checkpoints/009000/pretrained_model/model.safetensors b/checkpoints/009000/pretrained_model/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..8ca6ae796a95f9a07604180692f2f83cd89509a5 --- /dev/null +++ b/checkpoints/009000/pretrained_model/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f34310305ade6540d0e311f905a495de229086550c6a7093a58d484960d3d041 +size 7473096344 diff --git a/checkpoints/009000/pretrained_model/policy_postprocessor.json b/checkpoints/009000/pretrained_model/policy_postprocessor.json new file mode 100644 index 0000000000000000000000000000000000000000..3171b86625d5ec895c36b4cc1824cecfd7b37e2e --- /dev/null +++ b/checkpoints/009000/pretrained_model/policy_postprocessor.json @@ -0,0 +1,32 @@ +{ + "name": "policy_postprocessor", + "steps": [ + { + "registry_name": "unnormalizer_processor", + "config": { + "eps": 1e-08, + "features": { + "action": { + "type": "ACTION", + "shape": [ + 14 + ] + } + }, + "norm_map": { + "VISUAL": "IDENTITY", + "STATE": "QUANTILES", + "ACTION": "QUANTILES" + } + }, + "state_file": "policy_postprocessor_step_0_unnormalizer_processor.safetensors" + }, + { + "registry_name": "device_processor", + "config": { + "device": "cpu", + "float_dtype": null + } + } + ] +} \ No newline at end of file diff --git a/checkpoints/009000/pretrained_model/policy_postprocessor_step_0_unnormalizer_processor.safetensors b/checkpoints/009000/pretrained_model/policy_postprocessor_step_0_unnormalizer_processor.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..3b736d299e020843d180bc1ef9a2446226abe0c9 --- /dev/null +++ b/checkpoints/009000/pretrained_model/policy_postprocessor_step_0_unnormalizer_processor.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:51ab0f660e6cbe8404932b42c7fff5aed80b13de7f0f5e9f43c56b5bd52afe0d +size 9328 diff --git a/checkpoints/009000/pretrained_model/policy_preprocessor.json b/checkpoints/009000/pretrained_model/policy_preprocessor.json new file mode 100644 index 0000000000000000000000000000000000000000..d12396bdc2de6672958b31ffcd2a82aab560645f --- /dev/null +++ b/checkpoints/009000/pretrained_model/policy_preprocessor.json @@ -0,0 +1,87 @@ +{ + "name": "policy_preprocessor", + "steps": [ + { + "registry_name": "rename_observations_processor", + "config": { + "rename_map": {} + } + }, + { + "registry_name": "to_batch_processor", + "config": {} + }, + { + "registry_name": "normalizer_processor", + "config": { + "eps": 1e-08, + "features": { + "observation.state": { + "type": "STATE", + "shape": [ + 14 + ] + }, + "observation.images.context": { + "type": "VISUAL", + "shape": [ + 3, + 360, + 640 + ] + }, + "observation.images.right_wrist": { + "type": "VISUAL", + "shape": [ + 3, + 360, + 640 + ] + }, + "observation.images.left_wrist": { + "type": "VISUAL", + "shape": [ + 3, + 360, + 640 + ] + }, + "action": { + "type": "ACTION", + "shape": [ + 14 + ] + } + }, + "norm_map": { + "VISUAL": "IDENTITY", + "STATE": "QUANTILES", + "ACTION": "QUANTILES" + } + }, + "state_file": "policy_preprocessor_step_2_normalizer_processor.safetensors" + }, + { + "registry_name": "pi05_prepare_state_tokenizer_processor_step", + "config": {} + }, + { + "registry_name": "tokenizer_processor", + "config": { + "max_length": 200, + "task_key": "task", + "padding_side": "right", + "padding": "max_length", + "truncation": true, + "tokenizer_name": "google/paligemma-3b-pt-224" + } + }, + { + "registry_name": "device_processor", + "config": { + "device": "cuda", + "float_dtype": null + } + } + ] +} \ No newline at end of file diff --git a/checkpoints/009000/pretrained_model/policy_preprocessor_step_2_normalizer_processor.safetensors b/checkpoints/009000/pretrained_model/policy_preprocessor_step_2_normalizer_processor.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..3b736d299e020843d180bc1ef9a2446226abe0c9 --- /dev/null +++ b/checkpoints/009000/pretrained_model/policy_preprocessor_step_2_normalizer_processor.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:51ab0f660e6cbe8404932b42c7fff5aed80b13de7f0f5e9f43c56b5bd52afe0d +size 9328 diff --git a/checkpoints/009000/pretrained_model/train_config.json b/checkpoints/009000/pretrained_model/train_config.json new file mode 100644 index 0000000000000000000000000000000000000000..11ed480db655730638958aa3e3ad1a23c27953bc --- /dev/null +++ b/checkpoints/009000/pretrained_model/train_config.json @@ -0,0 +1,222 @@ +{ + "dataset": { + "repo_id": "Zasha01/lego_cube_final", + "root": null, + "episodes": null, + "image_transforms": { + "enable": false, + "max_num_transforms": 3, + "random_order": false, + "tfs": { + "brightness": { + "weight": 1.0, + "type": "ColorJitter", + "kwargs": { + "brightness": [ + 0.8, + 1.2 + ] + } + }, + "contrast": { + "weight": 1.0, + "type": "ColorJitter", + "kwargs": { + "contrast": [ + 0.8, + 1.2 + ] + } + }, + "saturation": { + "weight": 1.0, + "type": "ColorJitter", + "kwargs": { + "saturation": [ + 0.5, + 1.5 + ] + } + }, + "hue": { + "weight": 1.0, + "type": "ColorJitter", + "kwargs": { + "hue": [ + -0.05, + 0.05 + ] + } + }, + "sharpness": { + "weight": 1.0, + "type": "SharpnessJitter", + "kwargs": { + "sharpness": [ + 0.5, + 1.5 + ] + } + }, + "affine": { + "weight": 1.0, + "type": "RandomAffine", + "kwargs": { + "degrees": [ + -5.0, + 5.0 + ], + "translate": [ + 0.05, + 0.05 + ] + } + } + } + }, + "revision": null, + "use_imagenet_stats": true, + "video_backend": "torchcodec", + "streaming": false + }, + "env": null, + "policy": { + "type": "pi05", + "n_obs_steps": 1, + "input_features": { + "observation.state": { + "type": "STATE", + "shape": [ + 14 + ] + }, + "observation.images.context": { + "type": "VISUAL", + "shape": [ + 3, + 360, + 640 + ] + }, + "observation.images.right_wrist": { + "type": "VISUAL", + "shape": [ + 3, + 360, + 640 + ] + }, + "observation.images.left_wrist": { + "type": "VISUAL", + "shape": [ + 3, + 360, + 640 + ] + } + }, + "output_features": { + "action": { + "type": "ACTION", + "shape": [ + 14 + ] + } + }, + "device": "cuda", + "use_amp": false, + "push_to_hub": true, + "repo_id": "Zasha01/pi_lego_cube_final_final", + "private": null, + "tags": null, + "license": null, + "pretrained_path": "lerobot/pi05_base", + "paligemma_variant": "gemma_2b", + "action_expert_variant": "gemma_300m", + "dtype": "bfloat16", + "chunk_size": 50, + "n_action_steps": 50, + "max_state_dim": 32, + "max_action_dim": 32, + "num_inference_steps": 10, + "time_sampling_beta_alpha": 1.5, + "time_sampling_beta_beta": 1.0, + "time_sampling_scale": 0.999, + "time_sampling_offset": 0.001, + "min_period": 0.004, + "max_period": 4.0, + "rtc_config": null, + "image_resolution": [ + 224, + 224 + ], + "empty_cameras": 0, + "tokenizer_max_length": 200, + "normalization_mapping": { + "VISUAL": "IDENTITY", + "STATE": "QUANTILES", + "ACTION": "QUANTILES" + }, + "gradient_checkpointing": true, + "compile_model": true, + "compile_mode": "max-autotune", + "optimizer_lr": 2.5e-05, + "optimizer_betas": [ + 0.9, + 0.95 + ], + "optimizer_eps": 1e-08, + "optimizer_weight_decay": 0.01, + "optimizer_grad_clip_norm": 1.0, + "scheduler_warmup_steps": 1000, + "scheduler_decay_steps": 30000, + "scheduler_decay_lr": 2.5e-06 + }, + "output_dir": "/checkpoints/pi05/pi_lego_cube_final_final_20251218_082454", + "job_name": "pi_lego_cube_final_final", + "resume": false, + "seed": 1000, + "num_workers": 4, + "batch_size": 16, + "steps": 20000, + "eval_freq": 20000, + "log_freq": 200, + "tolerance_s": 0.0001, + "save_checkpoint": true, + "save_freq": 3000, + "use_policy_training_preset": true, + "optimizer": { + "type": "adamw", + "lr": 2.5e-05, + "weight_decay": 0.01, + "grad_clip_norm": 1.0, + "betas": [ + 0.9, + 0.95 + ], + "eps": 1e-08 + }, + "scheduler": { + "type": "cosine_decay_with_warmup", + "num_warmup_steps": 1000, + "num_decay_steps": 30000, + "peak_lr": 2.5e-05, + "decay_lr": 2.5e-06 + }, + "eval": { + "n_episodes": 50, + "batch_size": 50, + "use_async_envs": false + }, + "wandb": { + "enable": true, + "disable_artifact": false, + "project": "lerobot", + "entity": null, + "notes": null, + "run_id": "lbhu7589", + "mode": null + }, + "checkpoint_path": null, + "rename_map": {} +} \ No newline at end of file diff --git a/checkpoints/009000/training_state/optimizer_param_groups.json b/checkpoints/009000/training_state/optimizer_param_groups.json new file mode 100644 index 0000000000000000000000000000000000000000..23997e0cc700833bd23604c71cb5a7cce8463eaf --- /dev/null +++ b/checkpoints/009000/training_state/optimizer_param_groups.json @@ -0,0 +1,833 @@ +[ + { + "lr": 1.5509887731702598e-05, + "betas": [ + 0.9, + 0.95 + ], + "eps": 1e-08, + "weight_decay": 0.01, + "amsgrad": false, + "maximize": false, + "foreach": null, + "capturable": false, + "differentiable": false, + "fused": null, + "decoupled_weight_decay": true, + "initial_lr": 2.5e-05, + "params": [ + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10, + 11, + 12, + 13, + 14, + 15, + 16, + 17, + 18, + 19, + 20, + 21, + 22, + 23, + 24, + 25, + 26, + 27, + 28, + 29, + 30, + 31, + 32, + 33, + 34, + 35, + 36, + 37, + 38, + 39, + 40, + 41, + 42, + 43, + 44, + 45, + 46, + 47, + 48, + 49, + 50, + 51, + 52, + 53, + 54, + 55, + 56, + 57, + 58, + 59, + 60, + 61, + 62, + 63, + 64, + 65, + 66, + 67, + 68, + 69, + 70, + 71, + 72, + 73, + 74, + 75, + 76, + 77, + 78, + 79, + 80, + 81, + 82, + 83, + 84, + 85, + 86, + 87, + 88, + 89, + 90, + 91, + 92, + 93, + 94, + 95, + 96, + 97, + 98, + 99, + 100, + 101, + 102, + 103, + 104, + 105, + 106, + 107, + 108, + 109, + 110, + 111, + 112, + 113, + 114, + 115, + 116, + 117, + 118, + 119, + 120, + 121, + 122, + 123, + 124, + 125, + 126, + 127, + 128, + 129, + 130, + 131, + 132, + 133, + 134, + 135, + 136, + 137, + 138, + 139, + 140, + 141, + 142, + 143, + 144, + 145, + 146, + 147, + 148, + 149, + 150, + 151, + 152, + 153, + 154, + 155, + 156, + 157, + 158, + 159, + 160, + 161, + 162, + 163, + 164, + 165, + 166, + 167, + 168, + 169, + 170, + 171, + 172, + 173, + 174, + 175, + 176, + 177, + 178, + 179, + 180, + 181, + 182, + 183, + 184, + 185, + 186, + 187, + 188, + 189, + 190, + 191, + 192, + 193, + 194, + 195, + 196, + 197, + 198, + 199, + 200, + 201, + 202, + 203, + 204, + 205, + 206, + 207, + 208, + 209, + 210, + 211, + 212, + 213, + 214, + 215, + 216, + 217, + 218, + 219, + 220, + 221, + 222, + 223, + 224, + 225, + 226, + 227, + 228, + 229, + 230, + 231, + 232, + 233, + 234, + 235, + 236, + 237, + 238, + 239, + 240, + 241, + 242, + 243, + 244, + 245, + 246, + 247, + 248, + 249, + 250, + 251, + 252, + 253, + 254, + 255, + 256, + 257, + 258, + 259, + 260, + 261, + 262, + 263, + 264, + 265, + 266, + 267, + 268, + 269, + 270, + 271, + 272, + 273, + 274, + 275, + 276, + 277, + 278, + 279, + 280, + 281, + 282, + 283, + 284, + 285, + 286, + 287, + 288, + 289, + 290, + 291, + 292, + 293, + 294, + 295, + 296, + 297, + 298, + 299, + 300, + 301, + 302, + 303, + 304, + 305, + 306, + 307, + 308, + 309, + 310, + 311, + 312, + 313, + 314, + 315, + 316, + 317, + 318, + 319, + 320, + 321, + 322, + 323, + 324, + 325, + 326, + 327, + 328, + 329, + 330, + 331, + 332, + 333, + 334, + 335, + 336, + 337, + 338, + 339, + 340, + 341, + 342, + 343, + 344, + 345, + 346, + 347, + 348, + 349, + 350, + 351, + 352, + 353, + 354, + 355, + 356, + 357, + 358, + 359, + 360, + 361, + 362, + 363, + 364, + 365, + 366, + 367, + 368, + 369, + 370, + 371, + 372, + 373, + 374, + 375, + 376, + 377, + 378, + 379, + 380, + 381, + 382, + 383, + 384, + 385, + 386, + 387, + 388, + 389, + 390, + 391, + 392, + 393, + 394, + 395, + 396, + 397, + 398, + 399, + 400, + 401, + 402, + 403, + 404, + 405, + 406, + 407, + 408, + 409, + 410, + 411, + 412, + 413, + 414, + 415, + 416, + 417, + 418, + 419, + 420, + 421, + 422, + 423, + 424, + 425, + 426, + 427, + 428, + 429, + 430, + 431, + 432, + 433, + 434, + 435, + 436, + 437, + 438, + 439, + 440, + 441, + 442, + 443, + 444, + 445, + 446, + 447, + 448, + 449, + 450, + 451, + 452, + 453, + 454, + 455, + 456, + 457, + 458, + 459, + 460, + 461, + 462, + 463, + 464, + 465, + 466, + 467, + 468, + 469, + 470, + 471, + 472, + 473, + 474, + 475, + 476, + 477, + 478, + 479, + 480, + 481, + 482, + 483, + 484, + 485, + 486, + 487, + 488, + 489, + 490, + 491, + 492, + 493, + 494, + 495, + 496, + 497, + 498, + 499, + 500, + 501, + 502, + 503, + 504, + 505, + 506, + 507, + 508, + 509, + 510, + 511, + 512, + 513, + 514, + 515, + 516, + 517, + 518, + 519, + 520, + 521, + 522, + 523, + 524, + 525, + 526, + 527, + 528, + 529, + 530, + 531, + 532, + 533, + 534, + 535, + 536, + 537, + 538, + 539, + 540, + 541, + 542, + 543, + 544, + 545, + 546, + 547, + 548, + 549, + 550, + 551, + 552, + 553, + 554, + 555, + 556, + 557, + 558, + 559, + 560, + 561, + 562, + 563, + 564, + 565, + 566, + 567, + 568, + 569, + 570, + 571, + 572, + 573, + 574, + 575, + 576, + 577, + 578, + 579, + 580, + 581, + 582, + 583, + 584, + 585, + 586, + 587, + 588, + 589, + 590, + 591, + 592, + 593, + 594, + 595, + 596, + 597, + 598, + 599, + 600, + 601, + 602, + 603, + 604, + 605, + 606, + 607, + 608, + 609, + 610, + 611, + 612, + 613, + 614, + 615, + 616, + 617, + 618, + 619, + 620, + 621, + 622, + 623, + 624, + 625, + 626, + 627, + 628, + 629, + 630, + 631, + 632, + 633, + 634, + 635, + 636, + 637, + 638, + 639, + 640, + 641, + 642, + 643, + 644, + 645, + 646, + 647, + 648, + 649, + 650, + 651, + 652, + 653, + 654, + 655, + 656, + 657, + 658, + 659, + 660, + 661, + 662, + 663, + 664, + 665, + 666, + 667, + 668, + 669, + 670, + 671, + 672, + 673, + 674, + 675, + 676, + 677, + 678, + 679, + 680, + 681, + 682, + 683, + 684, + 685, + 686, + 687, + 688, + 689, + 690, + 691, + 692, + 693, + 694, + 695, + 696, + 697, + 698, + 699, + 700, + 701, + 702, + 703, + 704, + 705, + 706, + 707, + 708, + 709, + 710, + 711, + 712, + 713, + 714, + 715, + 716, + 717, + 718, + 719, + 720, + 721, + 722, + 723, + 724, + 725, + 726, + 727, + 728, + 729, + 730, + 731, + 732, + 733, + 734, + 735, + 736, + 737, + 738, + 739, + 740, + 741, + 742, + 743, + 744, + 745, + 746, + 747, + 748, + 749, + 750, + 751, + 752, + 753, + 754, + 755, + 756, + 757, + 758, + 759, + 760, + 761, + 762, + 763, + 764, + 765, + 766, + 767, + 768, + 769, + 770, + 771, + 772, + 773, + 774, + 775, + 776, + 777, + 778, + 779, + 780, + 781, + 782, + 783, + 784, + 785, + 786, + 787, + 788, + 789, + 790, + 791, + 792, + 793, + 794, + 795, + 796, + 797, + 798, + 799, + 800, + 801, + 802, + 803, + 804, + 805, + 806, + 807, + 808, + 809, + 810, + 811 + ] + } +] \ No newline at end of file diff --git a/checkpoints/009000/training_state/optimizer_state.safetensors b/checkpoints/009000/training_state/optimizer_state.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..38fe9812a55667e16b9b7487bfcb2f222bb29ad2 --- /dev/null +++ b/checkpoints/009000/training_state/optimizer_state.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5367b0b2cc44a319d162f48af734aea137e00a0cc5830cd1fcaacee24acc2200 +size 13473373724 diff --git a/checkpoints/009000/training_state/rng_state.safetensors b/checkpoints/009000/training_state/rng_state.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..4333b1f722424e34e7d325ef145800224fd68443 --- /dev/null +++ b/checkpoints/009000/training_state/rng_state.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:adae7bdb117b54dfa7139dc5796af049e6314dff2ae862e0be246b04ea12d948 +size 15708 diff --git a/checkpoints/009000/training_state/scheduler_state.json b/checkpoints/009000/training_state/scheduler_state.json new file mode 100644 index 0000000000000000000000000000000000000000..a69c2609e24a182b45cc70e032eaa0316f5a2aba --- /dev/null +++ b/checkpoints/009000/training_state/scheduler_state.json @@ -0,0 +1,14 @@ +{ + "base_lrs": [ + 2.5e-05 + ], + "last_epoch": 9000, + "_step_count": 9001, + "_get_lr_called_within_step": false, + "_last_lr": [ + 1.5509887731702598e-05 + ], + "lr_lambdas": [ + null + ] +} \ No newline at end of file diff --git a/checkpoints/009000/training_state/training_step.json b/checkpoints/009000/training_state/training_step.json new file mode 100644 index 0000000000000000000000000000000000000000..7247596175281e9d3f295d2b6936944a71b64d34 --- /dev/null +++ b/checkpoints/009000/training_state/training_step.json @@ -0,0 +1,3 @@ +{ + "step": 9000 +} \ No newline at end of file diff --git a/checkpoints/012000/pretrained_model/config.json b/checkpoints/012000/pretrained_model/config.json new file mode 100644 index 0000000000000000000000000000000000000000..7d9ef4ac072452ff8debbca6be6175a654f28cbe --- /dev/null +++ b/checkpoints/012000/pretrained_model/config.json @@ -0,0 +1,92 @@ +{ + "type": "pi05", + "n_obs_steps": 1, + "input_features": { + "observation.state": { + "type": "STATE", + "shape": [ + 14 + ] + }, + "observation.images.context": { + "type": "VISUAL", + "shape": [ + 3, + 360, + 640 + ] + }, + "observation.images.right_wrist": { + "type": "VISUAL", + "shape": [ + 3, + 360, + 640 + ] + }, + "observation.images.left_wrist": { + "type": "VISUAL", + "shape": [ + 3, + 360, + 640 + ] + } + }, + "output_features": { + "action": { + "type": "ACTION", + "shape": [ + 14 + ] + } + }, + "device": "cuda", + "use_amp": false, + "push_to_hub": true, + "repo_id": "Zasha01/pi_lego_cube_final_final", + "private": null, + "tags": null, + "license": null, + "pretrained_path": "lerobot/pi05_base", + "paligemma_variant": "gemma_2b", + "action_expert_variant": "gemma_300m", + "dtype": "bfloat16", + "chunk_size": 50, + "n_action_steps": 50, + "max_state_dim": 32, + "max_action_dim": 32, + "num_inference_steps": 10, + "time_sampling_beta_alpha": 1.5, + "time_sampling_beta_beta": 1.0, + "time_sampling_scale": 0.999, + "time_sampling_offset": 0.001, + "min_period": 0.004, + "max_period": 4.0, + "rtc_config": null, + "image_resolution": [ + 224, + 224 + ], + "empty_cameras": 0, + "tokenizer_max_length": 200, + "normalization_mapping": { + "VISUAL": "IDENTITY", + "STATE": "QUANTILES", + "ACTION": "QUANTILES" + }, + "gradient_checkpointing": true, + "compile_model": true, + "compile_mode": "max-autotune", + "optimizer_lr": 2.5e-05, + "optimizer_betas": [ + 0.9, + 0.95 + ], + "optimizer_eps": 1e-08, + "optimizer_weight_decay": 0.01, + "optimizer_grad_clip_norm": 1.0, + "scheduler_warmup_steps": 1000, + "scheduler_decay_steps": 30000, + "scheduler_decay_lr": 2.5e-06 +} \ No newline at end of file diff --git a/checkpoints/012000/pretrained_model/model.safetensors b/checkpoints/012000/pretrained_model/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..457c11615288951766998cabf4a82642c4805baf --- /dev/null +++ b/checkpoints/012000/pretrained_model/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8757f2d1b64716ba3138c2e29f3fd9e02ffb0088212e79752dfcbb19b6a234a2 +size 7473096344 diff --git a/checkpoints/012000/pretrained_model/policy_postprocessor.json b/checkpoints/012000/pretrained_model/policy_postprocessor.json new file mode 100644 index 0000000000000000000000000000000000000000..3171b86625d5ec895c36b4cc1824cecfd7b37e2e --- /dev/null +++ b/checkpoints/012000/pretrained_model/policy_postprocessor.json @@ -0,0 +1,32 @@ +{ + "name": "policy_postprocessor", + "steps": [ + { + "registry_name": "unnormalizer_processor", + "config": { + "eps": 1e-08, + "features": { + "action": { + "type": "ACTION", + "shape": [ + 14 + ] + } + }, + "norm_map": { + "VISUAL": "IDENTITY", + "STATE": "QUANTILES", + "ACTION": "QUANTILES" + } + }, + "state_file": "policy_postprocessor_step_0_unnormalizer_processor.safetensors" + }, + { + "registry_name": "device_processor", + "config": { + "device": "cpu", + "float_dtype": null + } + } + ] +} \ No newline at end of file diff --git a/checkpoints/012000/pretrained_model/policy_postprocessor_step_0_unnormalizer_processor.safetensors b/checkpoints/012000/pretrained_model/policy_postprocessor_step_0_unnormalizer_processor.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..3b736d299e020843d180bc1ef9a2446226abe0c9 --- /dev/null +++ b/checkpoints/012000/pretrained_model/policy_postprocessor_step_0_unnormalizer_processor.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:51ab0f660e6cbe8404932b42c7fff5aed80b13de7f0f5e9f43c56b5bd52afe0d +size 9328 diff --git a/checkpoints/012000/pretrained_model/policy_preprocessor.json b/checkpoints/012000/pretrained_model/policy_preprocessor.json new file mode 100644 index 0000000000000000000000000000000000000000..d12396bdc2de6672958b31ffcd2a82aab560645f --- /dev/null +++ b/checkpoints/012000/pretrained_model/policy_preprocessor.json @@ -0,0 +1,87 @@ +{ + "name": "policy_preprocessor", + "steps": [ + { + "registry_name": "rename_observations_processor", + "config": { + "rename_map": {} + } + }, + { + "registry_name": "to_batch_processor", + "config": {} + }, + { + "registry_name": "normalizer_processor", + "config": { + "eps": 1e-08, + "features": { + "observation.state": { + "type": "STATE", + "shape": [ + 14 + ] + }, + "observation.images.context": { + "type": "VISUAL", + "shape": [ + 3, + 360, + 640 + ] + }, + "observation.images.right_wrist": { + "type": "VISUAL", + "shape": [ + 3, + 360, + 640 + ] + }, + "observation.images.left_wrist": { + "type": "VISUAL", + "shape": [ + 3, + 360, + 640 + ] + }, + "action": { + "type": "ACTION", + "shape": [ + 14 + ] + } + }, + "norm_map": { + "VISUAL": "IDENTITY", + "STATE": "QUANTILES", + "ACTION": "QUANTILES" + } + }, + "state_file": "policy_preprocessor_step_2_normalizer_processor.safetensors" + }, + { + "registry_name": "pi05_prepare_state_tokenizer_processor_step", + "config": {} + }, + { + "registry_name": "tokenizer_processor", + "config": { + "max_length": 200, + "task_key": "task", + "padding_side": "right", + "padding": "max_length", + "truncation": true, + "tokenizer_name": "google/paligemma-3b-pt-224" + } + }, + { + "registry_name": "device_processor", + "config": { + "device": "cuda", + "float_dtype": null + } + } + ] +} \ No newline at end of file diff --git a/checkpoints/012000/pretrained_model/policy_preprocessor_step_2_normalizer_processor.safetensors b/checkpoints/012000/pretrained_model/policy_preprocessor_step_2_normalizer_processor.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..3b736d299e020843d180bc1ef9a2446226abe0c9 --- /dev/null +++ b/checkpoints/012000/pretrained_model/policy_preprocessor_step_2_normalizer_processor.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:51ab0f660e6cbe8404932b42c7fff5aed80b13de7f0f5e9f43c56b5bd52afe0d +size 9328 diff --git a/checkpoints/012000/pretrained_model/train_config.json b/checkpoints/012000/pretrained_model/train_config.json new file mode 100644 index 0000000000000000000000000000000000000000..11ed480db655730638958aa3e3ad1a23c27953bc --- /dev/null +++ b/checkpoints/012000/pretrained_model/train_config.json @@ -0,0 +1,222 @@ +{ + "dataset": { + "repo_id": "Zasha01/lego_cube_final", + "root": null, + "episodes": null, + "image_transforms": { + "enable": false, + "max_num_transforms": 3, + "random_order": false, + "tfs": { + "brightness": { + "weight": 1.0, + "type": "ColorJitter", + "kwargs": { + "brightness": [ + 0.8, + 1.2 + ] + } + }, + "contrast": { + "weight": 1.0, + "type": "ColorJitter", + "kwargs": { + "contrast": [ + 0.8, + 1.2 + ] + } + }, + "saturation": { + "weight": 1.0, + "type": "ColorJitter", + "kwargs": { + "saturation": [ + 0.5, + 1.5 + ] + } + }, + "hue": { + "weight": 1.0, + "type": "ColorJitter", + "kwargs": { + "hue": [ + -0.05, + 0.05 + ] + } + }, + "sharpness": { + "weight": 1.0, + "type": "SharpnessJitter", + "kwargs": { + "sharpness": [ + 0.5, + 1.5 + ] + } + }, + "affine": { + "weight": 1.0, + "type": "RandomAffine", + "kwargs": { + "degrees": [ + -5.0, + 5.0 + ], + "translate": [ + 0.05, + 0.05 + ] + } + } + } + }, + "revision": null, + "use_imagenet_stats": true, + "video_backend": "torchcodec", + "streaming": false + }, + "env": null, + "policy": { + "type": "pi05", + "n_obs_steps": 1, + "input_features": { + "observation.state": { + "type": "STATE", + "shape": [ + 14 + ] + }, + "observation.images.context": { + "type": "VISUAL", + "shape": [ + 3, + 360, + 640 + ] + }, + "observation.images.right_wrist": { + "type": "VISUAL", + "shape": [ + 3, + 360, + 640 + ] + }, + "observation.images.left_wrist": { + "type": "VISUAL", + "shape": [ + 3, + 360, + 640 + ] + } + }, + "output_features": { + "action": { + "type": "ACTION", + "shape": [ + 14 + ] + } + }, + "device": "cuda", + "use_amp": false, + "push_to_hub": true, + "repo_id": "Zasha01/pi_lego_cube_final_final", + "private": null, + "tags": null, + "license": null, + "pretrained_path": "lerobot/pi05_base", + "paligemma_variant": "gemma_2b", + "action_expert_variant": "gemma_300m", + "dtype": "bfloat16", + "chunk_size": 50, + "n_action_steps": 50, + "max_state_dim": 32, + "max_action_dim": 32, + "num_inference_steps": 10, + "time_sampling_beta_alpha": 1.5, + "time_sampling_beta_beta": 1.0, + "time_sampling_scale": 0.999, + "time_sampling_offset": 0.001, + "min_period": 0.004, + "max_period": 4.0, + "rtc_config": null, + "image_resolution": [ + 224, + 224 + ], + "empty_cameras": 0, + "tokenizer_max_length": 200, + "normalization_mapping": { + "VISUAL": "IDENTITY", + "STATE": "QUANTILES", + "ACTION": "QUANTILES" + }, + "gradient_checkpointing": true, + "compile_model": true, + "compile_mode": "max-autotune", + "optimizer_lr": 2.5e-05, + "optimizer_betas": [ + 0.9, + 0.95 + ], + "optimizer_eps": 1e-08, + "optimizer_weight_decay": 0.01, + "optimizer_grad_clip_norm": 1.0, + "scheduler_warmup_steps": 1000, + "scheduler_decay_steps": 30000, + "scheduler_decay_lr": 2.5e-06 + }, + "output_dir": "/checkpoints/pi05/pi_lego_cube_final_final_20251218_082454", + "job_name": "pi_lego_cube_final_final", + "resume": false, + "seed": 1000, + "num_workers": 4, + "batch_size": 16, + "steps": 20000, + "eval_freq": 20000, + "log_freq": 200, + "tolerance_s": 0.0001, + "save_checkpoint": true, + "save_freq": 3000, + "use_policy_training_preset": true, + "optimizer": { + "type": "adamw", + "lr": 2.5e-05, + "weight_decay": 0.01, + "grad_clip_norm": 1.0, + "betas": [ + 0.9, + 0.95 + ], + "eps": 1e-08 + }, + "scheduler": { + "type": "cosine_decay_with_warmup", + "num_warmup_steps": 1000, + "num_decay_steps": 30000, + "peak_lr": 2.5e-05, + "decay_lr": 2.5e-06 + }, + "eval": { + "n_episodes": 50, + "batch_size": 50, + "use_async_envs": false + }, + "wandb": { + "enable": true, + "disable_artifact": false, + "project": "lerobot", + "entity": null, + "notes": null, + "run_id": "lbhu7589", + "mode": null + }, + "checkpoint_path": null, + "rename_map": {} +} \ No newline at end of file diff --git a/checkpoints/012000/training_state/optimizer_param_groups.json b/checkpoints/012000/training_state/optimizer_param_groups.json new file mode 100644 index 0000000000000000000000000000000000000000..df55289b4239bd395b70164afbf7c00aa4676353 --- /dev/null +++ b/checkpoints/012000/training_state/optimizer_param_groups.json @@ -0,0 +1,833 @@ +[ + { + "lr": 1.0273558813281845e-05, + "betas": [ + 0.9, + 0.95 + ], + "eps": 1e-08, + "weight_decay": 0.01, + "amsgrad": false, + "maximize": false, + "foreach": null, + "capturable": false, + "differentiable": false, + "fused": null, + "decoupled_weight_decay": true, + "initial_lr": 2.5e-05, + "params": [ + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10, + 11, + 12, + 13, + 14, + 15, + 16, + 17, + 18, + 19, + 20, + 21, + 22, + 23, + 24, + 25, + 26, + 27, + 28, + 29, + 30, + 31, + 32, + 33, + 34, + 35, + 36, + 37, + 38, + 39, + 40, + 41, + 42, + 43, + 44, + 45, + 46, + 47, + 48, + 49, + 50, + 51, + 52, + 53, + 54, + 55, + 56, + 57, + 58, + 59, + 60, + 61, + 62, + 63, + 64, + 65, + 66, + 67, + 68, + 69, + 70, + 71, + 72, + 73, + 74, + 75, + 76, + 77, + 78, + 79, + 80, + 81, + 82, + 83, + 84, + 85, + 86, + 87, + 88, + 89, + 90, + 91, + 92, + 93, + 94, + 95, + 96, + 97, + 98, + 99, + 100, + 101, + 102, + 103, + 104, + 105, + 106, + 107, + 108, + 109, + 110, + 111, + 112, + 113, + 114, + 115, + 116, + 117, + 118, + 119, + 120, + 121, + 122, + 123, + 124, + 125, + 126, + 127, + 128, + 129, + 130, + 131, + 132, + 133, + 134, + 135, + 136, + 137, + 138, + 139, + 140, + 141, + 142, + 143, + 144, + 145, + 146, + 147, + 148, + 149, + 150, + 151, + 152, + 153, + 154, + 155, + 156, + 157, + 158, + 159, + 160, + 161, + 162, + 163, + 164, + 165, + 166, + 167, + 168, + 169, + 170, + 171, + 172, + 173, + 174, + 175, + 176, + 177, + 178, + 179, + 180, + 181, + 182, + 183, + 184, + 185, + 186, + 187, + 188, + 189, + 190, + 191, + 192, + 193, + 194, + 195, + 196, + 197, + 198, + 199, + 200, + 201, + 202, + 203, + 204, + 205, + 206, + 207, + 208, + 209, + 210, + 211, + 212, + 213, + 214, + 215, + 216, + 217, + 218, + 219, + 220, + 221, + 222, + 223, + 224, + 225, + 226, + 227, + 228, + 229, + 230, + 231, + 232, + 233, + 234, + 235, + 236, + 237, + 238, + 239, + 240, + 241, + 242, + 243, + 244, + 245, + 246, + 247, + 248, + 249, + 250, + 251, + 252, + 253, + 254, + 255, + 256, + 257, + 258, + 259, + 260, + 261, + 262, + 263, + 264, + 265, + 266, + 267, + 268, + 269, + 270, + 271, + 272, + 273, + 274, + 275, + 276, + 277, + 278, + 279, + 280, + 281, + 282, + 283, + 284, + 285, + 286, + 287, + 288, + 289, + 290, + 291, + 292, + 293, + 294, + 295, + 296, + 297, + 298, + 299, + 300, + 301, + 302, + 303, + 304, + 305, + 306, + 307, + 308, + 309, + 310, + 311, + 312, + 313, + 314, + 315, + 316, + 317, + 318, + 319, + 320, + 321, + 322, + 323, + 324, + 325, + 326, + 327, + 328, + 329, + 330, + 331, + 332, + 333, + 334, + 335, + 336, + 337, + 338, + 339, + 340, + 341, + 342, + 343, + 344, + 345, + 346, + 347, + 348, + 349, + 350, + 351, + 352, + 353, + 354, + 355, + 356, + 357, + 358, + 359, + 360, + 361, + 362, + 363, + 364, + 365, + 366, + 367, + 368, + 369, + 370, + 371, + 372, + 373, + 374, + 375, + 376, + 377, + 378, + 379, + 380, + 381, + 382, + 383, + 384, + 385, + 386, + 387, + 388, + 389, + 390, + 391, + 392, + 393, + 394, + 395, + 396, + 397, + 398, + 399, + 400, + 401, + 402, + 403, + 404, + 405, + 406, + 407, + 408, + 409, + 410, + 411, + 412, + 413, + 414, + 415, + 416, + 417, + 418, + 419, + 420, + 421, + 422, + 423, + 424, + 425, + 426, + 427, + 428, + 429, + 430, + 431, + 432, + 433, + 434, + 435, + 436, + 437, + 438, + 439, + 440, + 441, + 442, + 443, + 444, + 445, + 446, + 447, + 448, + 449, + 450, + 451, + 452, + 453, + 454, + 455, + 456, + 457, + 458, + 459, + 460, + 461, + 462, + 463, + 464, + 465, + 466, + 467, + 468, + 469, + 470, + 471, + 472, + 473, + 474, + 475, + 476, + 477, + 478, + 479, + 480, + 481, + 482, + 483, + 484, + 485, + 486, + 487, + 488, + 489, + 490, + 491, + 492, + 493, + 494, + 495, + 496, + 497, + 498, + 499, + 500, + 501, + 502, + 503, + 504, + 505, + 506, + 507, + 508, + 509, + 510, + 511, + 512, + 513, + 514, + 515, + 516, + 517, + 518, + 519, + 520, + 521, + 522, + 523, + 524, + 525, + 526, + 527, + 528, + 529, + 530, + 531, + 532, + 533, + 534, + 535, + 536, + 537, + 538, + 539, + 540, + 541, + 542, + 543, + 544, + 545, + 546, + 547, + 548, + 549, + 550, + 551, + 552, + 553, + 554, + 555, + 556, + 557, + 558, + 559, + 560, + 561, + 562, + 563, + 564, + 565, + 566, + 567, + 568, + 569, + 570, + 571, + 572, + 573, + 574, + 575, + 576, + 577, + 578, + 579, + 580, + 581, + 582, + 583, + 584, + 585, + 586, + 587, + 588, + 589, + 590, + 591, + 592, + 593, + 594, + 595, + 596, + 597, + 598, + 599, + 600, + 601, + 602, + 603, + 604, + 605, + 606, + 607, + 608, + 609, + 610, + 611, + 612, + 613, + 614, + 615, + 616, + 617, + 618, + 619, + 620, + 621, + 622, + 623, + 624, + 625, + 626, + 627, + 628, + 629, + 630, + 631, + 632, + 633, + 634, + 635, + 636, + 637, + 638, + 639, + 640, + 641, + 642, + 643, + 644, + 645, + 646, + 647, + 648, + 649, + 650, + 651, + 652, + 653, + 654, + 655, + 656, + 657, + 658, + 659, + 660, + 661, + 662, + 663, + 664, + 665, + 666, + 667, + 668, + 669, + 670, + 671, + 672, + 673, + 674, + 675, + 676, + 677, + 678, + 679, + 680, + 681, + 682, + 683, + 684, + 685, + 686, + 687, + 688, + 689, + 690, + 691, + 692, + 693, + 694, + 695, + 696, + 697, + 698, + 699, + 700, + 701, + 702, + 703, + 704, + 705, + 706, + 707, + 708, + 709, + 710, + 711, + 712, + 713, + 714, + 715, + 716, + 717, + 718, + 719, + 720, + 721, + 722, + 723, + 724, + 725, + 726, + 727, + 728, + 729, + 730, + 731, + 732, + 733, + 734, + 735, + 736, + 737, + 738, + 739, + 740, + 741, + 742, + 743, + 744, + 745, + 746, + 747, + 748, + 749, + 750, + 751, + 752, + 753, + 754, + 755, + 756, + 757, + 758, + 759, + 760, + 761, + 762, + 763, + 764, + 765, + 766, + 767, + 768, + 769, + 770, + 771, + 772, + 773, + 774, + 775, + 776, + 777, + 778, + 779, + 780, + 781, + 782, + 783, + 784, + 785, + 786, + 787, + 788, + 789, + 790, + 791, + 792, + 793, + 794, + 795, + 796, + 797, + 798, + 799, + 800, + 801, + 802, + 803, + 804, + 805, + 806, + 807, + 808, + 809, + 810, + 811 + ] + } +] \ No newline at end of file diff --git a/checkpoints/012000/training_state/optimizer_state.safetensors b/checkpoints/012000/training_state/optimizer_state.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..cd23c6e81a1abfa78329d981a3a585f315564af3 --- /dev/null +++ b/checkpoints/012000/training_state/optimizer_state.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5b8bfc8740873eb803e20b30b5bc43d95e8a4c6087c87d3e0a7cf43f235f6b4f +size 13473373724 diff --git a/checkpoints/012000/training_state/rng_state.safetensors b/checkpoints/012000/training_state/rng_state.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..6b95c36978f176ccca22383c3591f10b897f008b --- /dev/null +++ b/checkpoints/012000/training_state/rng_state.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b2e4425f49878fff60b185e2993c4a1d74faf9c71f298eaecb28dd519b6a4b2d +size 15708 diff --git a/checkpoints/012000/training_state/scheduler_state.json b/checkpoints/012000/training_state/scheduler_state.json new file mode 100644 index 0000000000000000000000000000000000000000..ecc9c901b5845dedbc453a41fed217352f49ac86 --- /dev/null +++ b/checkpoints/012000/training_state/scheduler_state.json @@ -0,0 +1,14 @@ +{ + "base_lrs": [ + 2.5e-05 + ], + "last_epoch": 12000, + "_step_count": 12001, + "_get_lr_called_within_step": false, + "_last_lr": [ + 1.0273558813281845e-05 + ], + "lr_lambdas": [ + null + ] +} \ No newline at end of file diff --git a/checkpoints/012000/training_state/training_step.json b/checkpoints/012000/training_state/training_step.json new file mode 100644 index 0000000000000000000000000000000000000000..c4fb27ac819b81943e6545c7c18510bdfb8eae1b --- /dev/null +++ b/checkpoints/012000/training_state/training_step.json @@ -0,0 +1,3 @@ +{ + "step": 12000 +} \ No newline at end of file diff --git a/checkpoints/015000/pretrained_model/config.json b/checkpoints/015000/pretrained_model/config.json new file mode 100644 index 0000000000000000000000000000000000000000..7d9ef4ac072452ff8debbca6be6175a654f28cbe --- /dev/null +++ b/checkpoints/015000/pretrained_model/config.json @@ -0,0 +1,92 @@ +{ + "type": "pi05", + "n_obs_steps": 1, + "input_features": { + "observation.state": { + "type": "STATE", + "shape": [ + 14 + ] + }, + "observation.images.context": { + "type": "VISUAL", + "shape": [ + 3, + 360, + 640 + ] + }, + "observation.images.right_wrist": { + "type": "VISUAL", + "shape": [ + 3, + 360, + 640 + ] + }, + "observation.images.left_wrist": { + "type": "VISUAL", + "shape": [ + 3, + 360, + 640 + ] + } + }, + "output_features": { + "action": { + "type": "ACTION", + "shape": [ + 14 + ] + } + }, + "device": "cuda", + "use_amp": false, + "push_to_hub": true, + "repo_id": "Zasha01/pi_lego_cube_final_final", + "private": null, + "tags": null, + "license": null, + "pretrained_path": "lerobot/pi05_base", + "paligemma_variant": "gemma_2b", + "action_expert_variant": "gemma_300m", + "dtype": "bfloat16", + "chunk_size": 50, + "n_action_steps": 50, + "max_state_dim": 32, + "max_action_dim": 32, + "num_inference_steps": 10, + "time_sampling_beta_alpha": 1.5, + "time_sampling_beta_beta": 1.0, + "time_sampling_scale": 0.999, + "time_sampling_offset": 0.001, + "min_period": 0.004, + "max_period": 4.0, + "rtc_config": null, + "image_resolution": [ + 224, + 224 + ], + "empty_cameras": 0, + "tokenizer_max_length": 200, + "normalization_mapping": { + "VISUAL": "IDENTITY", + "STATE": "QUANTILES", + "ACTION": "QUANTILES" + }, + "gradient_checkpointing": true, + "compile_model": true, + "compile_mode": "max-autotune", + "optimizer_lr": 2.5e-05, + "optimizer_betas": [ + 0.9, + 0.95 + ], + "optimizer_eps": 1e-08, + "optimizer_weight_decay": 0.01, + "optimizer_grad_clip_norm": 1.0, + "scheduler_warmup_steps": 1000, + "scheduler_decay_steps": 30000, + "scheduler_decay_lr": 2.5e-06 +} \ No newline at end of file diff --git a/checkpoints/015000/pretrained_model/model.safetensors b/checkpoints/015000/pretrained_model/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..07d33fc2808fb2983a54165d57d93ce911717a8b --- /dev/null +++ b/checkpoints/015000/pretrained_model/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ce9959329fa753f1f9d55668296df18cfa58447b30b29fe0f28b5f93bf2bc66e +size 7473096344 diff --git a/checkpoints/015000/pretrained_model/policy_postprocessor.json b/checkpoints/015000/pretrained_model/policy_postprocessor.json new file mode 100644 index 0000000000000000000000000000000000000000..3171b86625d5ec895c36b4cc1824cecfd7b37e2e --- /dev/null +++ b/checkpoints/015000/pretrained_model/policy_postprocessor.json @@ -0,0 +1,32 @@ +{ + "name": "policy_postprocessor", + "steps": [ + { + "registry_name": "unnormalizer_processor", + "config": { + "eps": 1e-08, + "features": { + "action": { + "type": "ACTION", + "shape": [ + 14 + ] + } + }, + "norm_map": { + "VISUAL": "IDENTITY", + "STATE": "QUANTILES", + "ACTION": "QUANTILES" + } + }, + "state_file": "policy_postprocessor_step_0_unnormalizer_processor.safetensors" + }, + { + "registry_name": "device_processor", + "config": { + "device": "cpu", + "float_dtype": null + } + } + ] +} \ No newline at end of file diff --git a/checkpoints/015000/pretrained_model/policy_postprocessor_step_0_unnormalizer_processor.safetensors b/checkpoints/015000/pretrained_model/policy_postprocessor_step_0_unnormalizer_processor.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..3b736d299e020843d180bc1ef9a2446226abe0c9 --- /dev/null +++ b/checkpoints/015000/pretrained_model/policy_postprocessor_step_0_unnormalizer_processor.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:51ab0f660e6cbe8404932b42c7fff5aed80b13de7f0f5e9f43c56b5bd52afe0d +size 9328 diff --git a/checkpoints/015000/pretrained_model/policy_preprocessor.json b/checkpoints/015000/pretrained_model/policy_preprocessor.json new file mode 100644 index 0000000000000000000000000000000000000000..d12396bdc2de6672958b31ffcd2a82aab560645f --- /dev/null +++ b/checkpoints/015000/pretrained_model/policy_preprocessor.json @@ -0,0 +1,87 @@ +{ + "name": "policy_preprocessor", + "steps": [ + { + "registry_name": "rename_observations_processor", + "config": { + "rename_map": {} + } + }, + { + "registry_name": "to_batch_processor", + "config": {} + }, + { + "registry_name": "normalizer_processor", + "config": { + "eps": 1e-08, + "features": { + "observation.state": { + "type": "STATE", + "shape": [ + 14 + ] + }, + "observation.images.context": { + "type": "VISUAL", + "shape": [ + 3, + 360, + 640 + ] + }, + "observation.images.right_wrist": { + "type": "VISUAL", + "shape": [ + 3, + 360, + 640 + ] + }, + "observation.images.left_wrist": { + "type": "VISUAL", + "shape": [ + 3, + 360, + 640 + ] + }, + "action": { + "type": "ACTION", + "shape": [ + 14 + ] + } + }, + "norm_map": { + "VISUAL": "IDENTITY", + "STATE": "QUANTILES", + "ACTION": "QUANTILES" + } + }, + "state_file": "policy_preprocessor_step_2_normalizer_processor.safetensors" + }, + { + "registry_name": "pi05_prepare_state_tokenizer_processor_step", + "config": {} + }, + { + "registry_name": "tokenizer_processor", + "config": { + "max_length": 200, + "task_key": "task", + "padding_side": "right", + "padding": "max_length", + "truncation": true, + "tokenizer_name": "google/paligemma-3b-pt-224" + } + }, + { + "registry_name": "device_processor", + "config": { + "device": "cuda", + "float_dtype": null + } + } + ] +} \ No newline at end of file diff --git a/checkpoints/015000/pretrained_model/policy_preprocessor_step_2_normalizer_processor.safetensors b/checkpoints/015000/pretrained_model/policy_preprocessor_step_2_normalizer_processor.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..3b736d299e020843d180bc1ef9a2446226abe0c9 --- /dev/null +++ b/checkpoints/015000/pretrained_model/policy_preprocessor_step_2_normalizer_processor.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:51ab0f660e6cbe8404932b42c7fff5aed80b13de7f0f5e9f43c56b5bd52afe0d +size 9328 diff --git a/checkpoints/015000/pretrained_model/train_config.json b/checkpoints/015000/pretrained_model/train_config.json new file mode 100644 index 0000000000000000000000000000000000000000..11ed480db655730638958aa3e3ad1a23c27953bc --- /dev/null +++ b/checkpoints/015000/pretrained_model/train_config.json @@ -0,0 +1,222 @@ +{ + "dataset": { + "repo_id": "Zasha01/lego_cube_final", + "root": null, + "episodes": null, + "image_transforms": { + "enable": false, + "max_num_transforms": 3, + "random_order": false, + "tfs": { + "brightness": { + "weight": 1.0, + "type": "ColorJitter", + "kwargs": { + "brightness": [ + 0.8, + 1.2 + ] + } + }, + "contrast": { + "weight": 1.0, + "type": "ColorJitter", + "kwargs": { + "contrast": [ + 0.8, + 1.2 + ] + } + }, + "saturation": { + "weight": 1.0, + "type": "ColorJitter", + "kwargs": { + "saturation": [ + 0.5, + 1.5 + ] + } + }, + "hue": { + "weight": 1.0, + "type": "ColorJitter", + "kwargs": { + "hue": [ + -0.05, + 0.05 + ] + } + }, + "sharpness": { + "weight": 1.0, + "type": "SharpnessJitter", + "kwargs": { + "sharpness": [ + 0.5, + 1.5 + ] + } + }, + "affine": { + "weight": 1.0, + "type": "RandomAffine", + "kwargs": { + "degrees": [ + -5.0, + 5.0 + ], + "translate": [ + 0.05, + 0.05 + ] + } + } + } + }, + "revision": null, + "use_imagenet_stats": true, + "video_backend": "torchcodec", + "streaming": false + }, + "env": null, + "policy": { + "type": "pi05", + "n_obs_steps": 1, + "input_features": { + "observation.state": { + "type": "STATE", + "shape": [ + 14 + ] + }, + "observation.images.context": { + "type": "VISUAL", + "shape": [ + 3, + 360, + 640 + ] + }, + "observation.images.right_wrist": { + "type": "VISUAL", + "shape": [ + 3, + 360, + 640 + ] + }, + "observation.images.left_wrist": { + "type": "VISUAL", + "shape": [ + 3, + 360, + 640 + ] + } + }, + "output_features": { + "action": { + "type": "ACTION", + "shape": [ + 14 + ] + } + }, + "device": "cuda", + "use_amp": false, + "push_to_hub": true, + "repo_id": "Zasha01/pi_lego_cube_final_final", + "private": null, + "tags": null, + "license": null, + "pretrained_path": "lerobot/pi05_base", + "paligemma_variant": "gemma_2b", + "action_expert_variant": "gemma_300m", + "dtype": "bfloat16", + "chunk_size": 50, + "n_action_steps": 50, + "max_state_dim": 32, + "max_action_dim": 32, + "num_inference_steps": 10, + "time_sampling_beta_alpha": 1.5, + "time_sampling_beta_beta": 1.0, + "time_sampling_scale": 0.999, + "time_sampling_offset": 0.001, + "min_period": 0.004, + "max_period": 4.0, + "rtc_config": null, + "image_resolution": [ + 224, + 224 + ], + "empty_cameras": 0, + "tokenizer_max_length": 200, + "normalization_mapping": { + "VISUAL": "IDENTITY", + "STATE": "QUANTILES", + "ACTION": "QUANTILES" + }, + "gradient_checkpointing": true, + "compile_model": true, + "compile_mode": "max-autotune", + "optimizer_lr": 2.5e-05, + "optimizer_betas": [ + 0.9, + 0.95 + ], + "optimizer_eps": 1e-08, + "optimizer_weight_decay": 0.01, + "optimizer_grad_clip_norm": 1.0, + "scheduler_warmup_steps": 1000, + "scheduler_decay_steps": 30000, + "scheduler_decay_lr": 2.5e-06 + }, + "output_dir": "/checkpoints/pi05/pi_lego_cube_final_final_20251218_082454", + "job_name": "pi_lego_cube_final_final", + "resume": false, + "seed": 1000, + "num_workers": 4, + "batch_size": 16, + "steps": 20000, + "eval_freq": 20000, + "log_freq": 200, + "tolerance_s": 0.0001, + "save_checkpoint": true, + "save_freq": 3000, + "use_policy_training_preset": true, + "optimizer": { + "type": "adamw", + "lr": 2.5e-05, + "weight_decay": 0.01, + "grad_clip_norm": 1.0, + "betas": [ + 0.9, + 0.95 + ], + "eps": 1e-08 + }, + "scheduler": { + "type": "cosine_decay_with_warmup", + "num_warmup_steps": 1000, + "num_decay_steps": 30000, + "peak_lr": 2.5e-05, + "decay_lr": 2.5e-06 + }, + "eval": { + "n_episodes": 50, + "batch_size": 50, + "use_async_envs": false + }, + "wandb": { + "enable": true, + "disable_artifact": false, + "project": "lerobot", + "entity": null, + "notes": null, + "run_id": "lbhu7589", + "mode": null + }, + "checkpoint_path": null, + "rename_map": {} +} \ No newline at end of file diff --git a/checkpoints/015000/training_state/optimizer_param_groups.json b/checkpoints/015000/training_state/optimizer_param_groups.json new file mode 100644 index 0000000000000000000000000000000000000000..eb57fc5b3ad82ab979db27f977481447bc059d25 --- /dev/null +++ b/checkpoints/015000/training_state/optimizer_param_groups.json @@ -0,0 +1,833 @@ +[ + { + "lr": 5.795048711651342e-06, + "betas": [ + 0.9, + 0.95 + ], + "eps": 1e-08, + "weight_decay": 0.01, + "amsgrad": false, + "maximize": false, + "foreach": null, + "capturable": false, + "differentiable": false, + "fused": null, + "decoupled_weight_decay": true, + "initial_lr": 2.5e-05, + "params": [ + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10, + 11, + 12, + 13, + 14, + 15, + 16, + 17, + 18, + 19, + 20, + 21, + 22, + 23, + 24, + 25, + 26, + 27, + 28, + 29, + 30, + 31, + 32, + 33, + 34, + 35, + 36, + 37, + 38, + 39, + 40, + 41, + 42, + 43, + 44, + 45, + 46, + 47, + 48, + 49, + 50, + 51, + 52, + 53, + 54, + 55, + 56, + 57, + 58, + 59, + 60, + 61, + 62, + 63, + 64, + 65, + 66, + 67, + 68, + 69, + 70, + 71, + 72, + 73, + 74, + 75, + 76, + 77, + 78, + 79, + 80, + 81, + 82, + 83, + 84, + 85, + 86, + 87, + 88, + 89, + 90, + 91, + 92, + 93, + 94, + 95, + 96, + 97, + 98, + 99, + 100, + 101, + 102, + 103, + 104, + 105, + 106, + 107, + 108, + 109, + 110, + 111, + 112, + 113, + 114, + 115, + 116, + 117, + 118, + 119, + 120, + 121, + 122, + 123, + 124, + 125, + 126, + 127, + 128, + 129, + 130, + 131, + 132, + 133, + 134, + 135, + 136, + 137, + 138, + 139, + 140, + 141, + 142, + 143, + 144, + 145, + 146, + 147, + 148, + 149, + 150, + 151, + 152, + 153, + 154, + 155, + 156, + 157, + 158, + 159, + 160, + 161, + 162, + 163, + 164, + 165, + 166, + 167, + 168, + 169, + 170, + 171, + 172, + 173, + 174, + 175, + 176, + 177, + 178, + 179, + 180, + 181, + 182, + 183, + 184, + 185, + 186, + 187, + 188, + 189, + 190, + 191, + 192, + 193, + 194, + 195, + 196, + 197, + 198, + 199, + 200, + 201, + 202, + 203, + 204, + 205, + 206, + 207, + 208, + 209, + 210, + 211, + 212, + 213, + 214, + 215, + 216, + 217, + 218, + 219, + 220, + 221, + 222, + 223, + 224, + 225, + 226, + 227, + 228, + 229, + 230, + 231, + 232, + 233, + 234, + 235, + 236, + 237, + 238, + 239, + 240, + 241, + 242, + 243, + 244, + 245, + 246, + 247, + 248, + 249, + 250, + 251, + 252, + 253, + 254, + 255, + 256, + 257, + 258, + 259, + 260, + 261, + 262, + 263, + 264, + 265, + 266, + 267, + 268, + 269, + 270, + 271, + 272, + 273, + 274, + 275, + 276, + 277, + 278, + 279, + 280, + 281, + 282, + 283, + 284, + 285, + 286, + 287, + 288, + 289, + 290, + 291, + 292, + 293, + 294, + 295, + 296, + 297, + 298, + 299, + 300, + 301, + 302, + 303, + 304, + 305, + 306, + 307, + 308, + 309, + 310, + 311, + 312, + 313, + 314, + 315, + 316, + 317, + 318, + 319, + 320, + 321, + 322, + 323, + 324, + 325, + 326, + 327, + 328, + 329, + 330, + 331, + 332, + 333, + 334, + 335, + 336, + 337, + 338, + 339, + 340, + 341, + 342, + 343, + 344, + 345, + 346, + 347, + 348, + 349, + 350, + 351, + 352, + 353, + 354, + 355, + 356, + 357, + 358, + 359, + 360, + 361, + 362, + 363, + 364, + 365, + 366, + 367, + 368, + 369, + 370, + 371, + 372, + 373, + 374, + 375, + 376, + 377, + 378, + 379, + 380, + 381, + 382, + 383, + 384, + 385, + 386, + 387, + 388, + 389, + 390, + 391, + 392, + 393, + 394, + 395, + 396, + 397, + 398, + 399, + 400, + 401, + 402, + 403, + 404, + 405, + 406, + 407, + 408, + 409, + 410, + 411, + 412, + 413, + 414, + 415, + 416, + 417, + 418, + 419, + 420, + 421, + 422, + 423, + 424, + 425, + 426, + 427, + 428, + 429, + 430, + 431, + 432, + 433, + 434, + 435, + 436, + 437, + 438, + 439, + 440, + 441, + 442, + 443, + 444, + 445, + 446, + 447, + 448, + 449, + 450, + 451, + 452, + 453, + 454, + 455, + 456, + 457, + 458, + 459, + 460, + 461, + 462, + 463, + 464, + 465, + 466, + 467, + 468, + 469, + 470, + 471, + 472, + 473, + 474, + 475, + 476, + 477, + 478, + 479, + 480, + 481, + 482, + 483, + 484, + 485, + 486, + 487, + 488, + 489, + 490, + 491, + 492, + 493, + 494, + 495, + 496, + 497, + 498, + 499, + 500, + 501, + 502, + 503, + 504, + 505, + 506, + 507, + 508, + 509, + 510, + 511, + 512, + 513, + 514, + 515, + 516, + 517, + 518, + 519, + 520, + 521, + 522, + 523, + 524, + 525, + 526, + 527, + 528, + 529, + 530, + 531, + 532, + 533, + 534, + 535, + 536, + 537, + 538, + 539, + 540, + 541, + 542, + 543, + 544, + 545, + 546, + 547, + 548, + 549, + 550, + 551, + 552, + 553, + 554, + 555, + 556, + 557, + 558, + 559, + 560, + 561, + 562, + 563, + 564, + 565, + 566, + 567, + 568, + 569, + 570, + 571, + 572, + 573, + 574, + 575, + 576, + 577, + 578, + 579, + 580, + 581, + 582, + 583, + 584, + 585, + 586, + 587, + 588, + 589, + 590, + 591, + 592, + 593, + 594, + 595, + 596, + 597, + 598, + 599, + 600, + 601, + 602, + 603, + 604, + 605, + 606, + 607, + 608, + 609, + 610, + 611, + 612, + 613, + 614, + 615, + 616, + 617, + 618, + 619, + 620, + 621, + 622, + 623, + 624, + 625, + 626, + 627, + 628, + 629, + 630, + 631, + 632, + 633, + 634, + 635, + 636, + 637, + 638, + 639, + 640, + 641, + 642, + 643, + 644, + 645, + 646, + 647, + 648, + 649, + 650, + 651, + 652, + 653, + 654, + 655, + 656, + 657, + 658, + 659, + 660, + 661, + 662, + 663, + 664, + 665, + 666, + 667, + 668, + 669, + 670, + 671, + 672, + 673, + 674, + 675, + 676, + 677, + 678, + 679, + 680, + 681, + 682, + 683, + 684, + 685, + 686, + 687, + 688, + 689, + 690, + 691, + 692, + 693, + 694, + 695, + 696, + 697, + 698, + 699, + 700, + 701, + 702, + 703, + 704, + 705, + 706, + 707, + 708, + 709, + 710, + 711, + 712, + 713, + 714, + 715, + 716, + 717, + 718, + 719, + 720, + 721, + 722, + 723, + 724, + 725, + 726, + 727, + 728, + 729, + 730, + 731, + 732, + 733, + 734, + 735, + 736, + 737, + 738, + 739, + 740, + 741, + 742, + 743, + 744, + 745, + 746, + 747, + 748, + 749, + 750, + 751, + 752, + 753, + 754, + 755, + 756, + 757, + 758, + 759, + 760, + 761, + 762, + 763, + 764, + 765, + 766, + 767, + 768, + 769, + 770, + 771, + 772, + 773, + 774, + 775, + 776, + 777, + 778, + 779, + 780, + 781, + 782, + 783, + 784, + 785, + 786, + 787, + 788, + 789, + 790, + 791, + 792, + 793, + 794, + 795, + 796, + 797, + 798, + 799, + 800, + 801, + 802, + 803, + 804, + 805, + 806, + 807, + 808, + 809, + 810, + 811 + ] + } +] \ No newline at end of file diff --git a/checkpoints/015000/training_state/optimizer_state.safetensors b/checkpoints/015000/training_state/optimizer_state.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..e935f325d8453c60dedb50e0774f8184b9e091d8 --- /dev/null +++ b/checkpoints/015000/training_state/optimizer_state.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9abb30b2c6f5051d3db1edadd9f38d2bdb1ec99e801c0af1c6eaf0c2c838fc8b +size 13473373724 diff --git a/checkpoints/015000/training_state/rng_state.safetensors b/checkpoints/015000/training_state/rng_state.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..914895b89ddb53707d53acaf07cd49f6089a7fd9 --- /dev/null +++ b/checkpoints/015000/training_state/rng_state.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1250848eb6dbf81e71934ba9278fa7d9b9fbce48b50961fd0a372d6bac5a69bc +size 15708 diff --git a/checkpoints/015000/training_state/scheduler_state.json b/checkpoints/015000/training_state/scheduler_state.json new file mode 100644 index 0000000000000000000000000000000000000000..2f7c98cf4f5cc42e248f71dd2864c32cdeb4bb61 --- /dev/null +++ b/checkpoints/015000/training_state/scheduler_state.json @@ -0,0 +1,14 @@ +{ + "base_lrs": [ + 2.5e-05 + ], + "last_epoch": 15000, + "_step_count": 15001, + "_get_lr_called_within_step": false, + "_last_lr": [ + 5.795048711651342e-06 + ], + "lr_lambdas": [ + null + ] +} \ No newline at end of file diff --git a/checkpoints/015000/training_state/training_step.json b/checkpoints/015000/training_state/training_step.json new file mode 100644 index 0000000000000000000000000000000000000000..5cec056c8ba7f3c2e865a0f73ae59975a0503067 --- /dev/null +++ b/checkpoints/015000/training_state/training_step.json @@ -0,0 +1,3 @@ +{ + "step": 15000 +} \ No newline at end of file diff --git a/checkpoints/018000/pretrained_model/config.json b/checkpoints/018000/pretrained_model/config.json new file mode 100644 index 0000000000000000000000000000000000000000..7d9ef4ac072452ff8debbca6be6175a654f28cbe --- /dev/null +++ b/checkpoints/018000/pretrained_model/config.json @@ -0,0 +1,92 @@ +{ + "type": "pi05", + "n_obs_steps": 1, + "input_features": { + "observation.state": { + "type": "STATE", + "shape": [ + 14 + ] + }, + "observation.images.context": { + "type": "VISUAL", + "shape": [ + 3, + 360, + 640 + ] + }, + "observation.images.right_wrist": { + "type": "VISUAL", + "shape": [ + 3, + 360, + 640 + ] + }, + "observation.images.left_wrist": { + "type": "VISUAL", + "shape": [ + 3, + 360, + 640 + ] + } + }, + "output_features": { + "action": { + "type": "ACTION", + "shape": [ + 14 + ] + } + }, + "device": "cuda", + "use_amp": false, + "push_to_hub": true, + "repo_id": "Zasha01/pi_lego_cube_final_final", + "private": null, + "tags": null, + "license": null, + "pretrained_path": "lerobot/pi05_base", + "paligemma_variant": "gemma_2b", + "action_expert_variant": "gemma_300m", + "dtype": "bfloat16", + "chunk_size": 50, + "n_action_steps": 50, + "max_state_dim": 32, + "max_action_dim": 32, + "num_inference_steps": 10, + "time_sampling_beta_alpha": 1.5, + "time_sampling_beta_beta": 1.0, + "time_sampling_scale": 0.999, + "time_sampling_offset": 0.001, + "min_period": 0.004, + "max_period": 4.0, + "rtc_config": null, + "image_resolution": [ + 224, + 224 + ], + "empty_cameras": 0, + "tokenizer_max_length": 200, + "normalization_mapping": { + "VISUAL": "IDENTITY", + "STATE": "QUANTILES", + "ACTION": "QUANTILES" + }, + "gradient_checkpointing": true, + "compile_model": true, + "compile_mode": "max-autotune", + "optimizer_lr": 2.5e-05, + "optimizer_betas": [ + 0.9, + 0.95 + ], + "optimizer_eps": 1e-08, + "optimizer_weight_decay": 0.01, + "optimizer_grad_clip_norm": 1.0, + "scheduler_warmup_steps": 1000, + "scheduler_decay_steps": 30000, + "scheduler_decay_lr": 2.5e-06 +} \ No newline at end of file diff --git a/checkpoints/018000/pretrained_model/model.safetensors b/checkpoints/018000/pretrained_model/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..2eaa1ed3bc1c0967b65df641a0233fc180086ddb --- /dev/null +++ b/checkpoints/018000/pretrained_model/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:22d7d2eda26a2b8e28e6597134e24504394198843ae86a7d8519cc59dcfda2cd +size 7473096344 diff --git a/checkpoints/018000/pretrained_model/policy_postprocessor.json b/checkpoints/018000/pretrained_model/policy_postprocessor.json new file mode 100644 index 0000000000000000000000000000000000000000..3171b86625d5ec895c36b4cc1824cecfd7b37e2e --- /dev/null +++ b/checkpoints/018000/pretrained_model/policy_postprocessor.json @@ -0,0 +1,32 @@ +{ + "name": "policy_postprocessor", + "steps": [ + { + "registry_name": "unnormalizer_processor", + "config": { + "eps": 1e-08, + "features": { + "action": { + "type": "ACTION", + "shape": [ + 14 + ] + } + }, + "norm_map": { + "VISUAL": "IDENTITY", + "STATE": "QUANTILES", + "ACTION": "QUANTILES" + } + }, + "state_file": "policy_postprocessor_step_0_unnormalizer_processor.safetensors" + }, + { + "registry_name": "device_processor", + "config": { + "device": "cpu", + "float_dtype": null + } + } + ] +} \ No newline at end of file diff --git a/checkpoints/018000/pretrained_model/policy_postprocessor_step_0_unnormalizer_processor.safetensors b/checkpoints/018000/pretrained_model/policy_postprocessor_step_0_unnormalizer_processor.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..3b736d299e020843d180bc1ef9a2446226abe0c9 --- /dev/null +++ b/checkpoints/018000/pretrained_model/policy_postprocessor_step_0_unnormalizer_processor.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:51ab0f660e6cbe8404932b42c7fff5aed80b13de7f0f5e9f43c56b5bd52afe0d +size 9328 diff --git a/checkpoints/018000/pretrained_model/policy_preprocessor.json b/checkpoints/018000/pretrained_model/policy_preprocessor.json new file mode 100644 index 0000000000000000000000000000000000000000..d12396bdc2de6672958b31ffcd2a82aab560645f --- /dev/null +++ b/checkpoints/018000/pretrained_model/policy_preprocessor.json @@ -0,0 +1,87 @@ +{ + "name": "policy_preprocessor", + "steps": [ + { + "registry_name": "rename_observations_processor", + "config": { + "rename_map": {} + } + }, + { + "registry_name": "to_batch_processor", + "config": {} + }, + { + "registry_name": "normalizer_processor", + "config": { + "eps": 1e-08, + "features": { + "observation.state": { + "type": "STATE", + "shape": [ + 14 + ] + }, + "observation.images.context": { + "type": "VISUAL", + "shape": [ + 3, + 360, + 640 + ] + }, + "observation.images.right_wrist": { + "type": "VISUAL", + "shape": [ + 3, + 360, + 640 + ] + }, + "observation.images.left_wrist": { + "type": "VISUAL", + "shape": [ + 3, + 360, + 640 + ] + }, + "action": { + "type": "ACTION", + "shape": [ + 14 + ] + } + }, + "norm_map": { + "VISUAL": "IDENTITY", + "STATE": "QUANTILES", + "ACTION": "QUANTILES" + } + }, + "state_file": "policy_preprocessor_step_2_normalizer_processor.safetensors" + }, + { + "registry_name": "pi05_prepare_state_tokenizer_processor_step", + "config": {} + }, + { + "registry_name": "tokenizer_processor", + "config": { + "max_length": 200, + "task_key": "task", + "padding_side": "right", + "padding": "max_length", + "truncation": true, + "tokenizer_name": "google/paligemma-3b-pt-224" + } + }, + { + "registry_name": "device_processor", + "config": { + "device": "cuda", + "float_dtype": null + } + } + ] +} \ No newline at end of file diff --git a/checkpoints/018000/pretrained_model/policy_preprocessor_step_2_normalizer_processor.safetensors b/checkpoints/018000/pretrained_model/policy_preprocessor_step_2_normalizer_processor.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..3b736d299e020843d180bc1ef9a2446226abe0c9 --- /dev/null +++ b/checkpoints/018000/pretrained_model/policy_preprocessor_step_2_normalizer_processor.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:51ab0f660e6cbe8404932b42c7fff5aed80b13de7f0f5e9f43c56b5bd52afe0d +size 9328 diff --git a/checkpoints/018000/pretrained_model/train_config.json b/checkpoints/018000/pretrained_model/train_config.json new file mode 100644 index 0000000000000000000000000000000000000000..11ed480db655730638958aa3e3ad1a23c27953bc --- /dev/null +++ b/checkpoints/018000/pretrained_model/train_config.json @@ -0,0 +1,222 @@ +{ + "dataset": { + "repo_id": "Zasha01/lego_cube_final", + "root": null, + "episodes": null, + "image_transforms": { + "enable": false, + "max_num_transforms": 3, + "random_order": false, + "tfs": { + "brightness": { + "weight": 1.0, + "type": "ColorJitter", + "kwargs": { + "brightness": [ + 0.8, + 1.2 + ] + } + }, + "contrast": { + "weight": 1.0, + "type": "ColorJitter", + "kwargs": { + "contrast": [ + 0.8, + 1.2 + ] + } + }, + "saturation": { + "weight": 1.0, + "type": "ColorJitter", + "kwargs": { + "saturation": [ + 0.5, + 1.5 + ] + } + }, + "hue": { + "weight": 1.0, + "type": "ColorJitter", + "kwargs": { + "hue": [ + -0.05, + 0.05 + ] + } + }, + "sharpness": { + "weight": 1.0, + "type": "SharpnessJitter", + "kwargs": { + "sharpness": [ + 0.5, + 1.5 + ] + } + }, + "affine": { + "weight": 1.0, + "type": "RandomAffine", + "kwargs": { + "degrees": [ + -5.0, + 5.0 + ], + "translate": [ + 0.05, + 0.05 + ] + } + } + } + }, + "revision": null, + "use_imagenet_stats": true, + "video_backend": "torchcodec", + "streaming": false + }, + "env": null, + "policy": { + "type": "pi05", + "n_obs_steps": 1, + "input_features": { + "observation.state": { + "type": "STATE", + "shape": [ + 14 + ] + }, + "observation.images.context": { + "type": "VISUAL", + "shape": [ + 3, + 360, + 640 + ] + }, + "observation.images.right_wrist": { + "type": "VISUAL", + "shape": [ + 3, + 360, + 640 + ] + }, + "observation.images.left_wrist": { + "type": "VISUAL", + "shape": [ + 3, + 360, + 640 + ] + } + }, + "output_features": { + "action": { + "type": "ACTION", + "shape": [ + 14 + ] + } + }, + "device": "cuda", + "use_amp": false, + "push_to_hub": true, + "repo_id": "Zasha01/pi_lego_cube_final_final", + "private": null, + "tags": null, + "license": null, + "pretrained_path": "lerobot/pi05_base", + "paligemma_variant": "gemma_2b", + "action_expert_variant": "gemma_300m", + "dtype": "bfloat16", + "chunk_size": 50, + "n_action_steps": 50, + "max_state_dim": 32, + "max_action_dim": 32, + "num_inference_steps": 10, + "time_sampling_beta_alpha": 1.5, + "time_sampling_beta_beta": 1.0, + "time_sampling_scale": 0.999, + "time_sampling_offset": 0.001, + "min_period": 0.004, + "max_period": 4.0, + "rtc_config": null, + "image_resolution": [ + 224, + 224 + ], + "empty_cameras": 0, + "tokenizer_max_length": 200, + "normalization_mapping": { + "VISUAL": "IDENTITY", + "STATE": "QUANTILES", + "ACTION": "QUANTILES" + }, + "gradient_checkpointing": true, + "compile_model": true, + "compile_mode": "max-autotune", + "optimizer_lr": 2.5e-05, + "optimizer_betas": [ + 0.9, + 0.95 + ], + "optimizer_eps": 1e-08, + "optimizer_weight_decay": 0.01, + "optimizer_grad_clip_norm": 1.0, + "scheduler_warmup_steps": 1000, + "scheduler_decay_steps": 30000, + "scheduler_decay_lr": 2.5e-06 + }, + "output_dir": "/checkpoints/pi05/pi_lego_cube_final_final_20251218_082454", + "job_name": "pi_lego_cube_final_final", + "resume": false, + "seed": 1000, + "num_workers": 4, + "batch_size": 16, + "steps": 20000, + "eval_freq": 20000, + "log_freq": 200, + "tolerance_s": 0.0001, + "save_checkpoint": true, + "save_freq": 3000, + "use_policy_training_preset": true, + "optimizer": { + "type": "adamw", + "lr": 2.5e-05, + "weight_decay": 0.01, + "grad_clip_norm": 1.0, + "betas": [ + 0.9, + 0.95 + ], + "eps": 1e-08 + }, + "scheduler": { + "type": "cosine_decay_with_warmup", + "num_warmup_steps": 1000, + "num_decay_steps": 30000, + "peak_lr": 2.5e-05, + "decay_lr": 2.5e-06 + }, + "eval": { + "n_episodes": 50, + "batch_size": 50, + "use_async_envs": false + }, + "wandb": { + "enable": true, + "disable_artifact": false, + "project": "lerobot", + "entity": null, + "notes": null, + "run_id": "lbhu7589", + "mode": null + }, + "checkpoint_path": null, + "rename_map": {} +} \ No newline at end of file diff --git a/checkpoints/018000/training_state/optimizer_param_groups.json b/checkpoints/018000/training_state/optimizer_param_groups.json new file mode 100644 index 0000000000000000000000000000000000000000..de61c0a738c16b725e53ec8a1e1e9fdaa5212f1d --- /dev/null +++ b/checkpoints/018000/training_state/optimizer_param_groups.json @@ -0,0 +1,833 @@ +[ + { + "lr": 3.0506141916795233e-06, + "betas": [ + 0.9, + 0.95 + ], + "eps": 1e-08, + "weight_decay": 0.01, + "amsgrad": false, + "maximize": false, + "foreach": null, + "capturable": false, + "differentiable": false, + "fused": null, + "decoupled_weight_decay": true, + "initial_lr": 2.5e-05, + "params": [ + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10, + 11, + 12, + 13, + 14, + 15, + 16, + 17, + 18, + 19, + 20, + 21, + 22, + 23, + 24, + 25, + 26, + 27, + 28, + 29, + 30, + 31, + 32, + 33, + 34, + 35, + 36, + 37, + 38, + 39, + 40, + 41, + 42, + 43, + 44, + 45, + 46, + 47, + 48, + 49, + 50, + 51, + 52, + 53, + 54, + 55, + 56, + 57, + 58, + 59, + 60, + 61, + 62, + 63, + 64, + 65, + 66, + 67, + 68, + 69, + 70, + 71, + 72, + 73, + 74, + 75, + 76, + 77, + 78, + 79, + 80, + 81, + 82, + 83, + 84, + 85, + 86, + 87, + 88, + 89, + 90, + 91, + 92, + 93, + 94, + 95, + 96, + 97, + 98, + 99, + 100, + 101, + 102, + 103, + 104, + 105, + 106, + 107, + 108, + 109, + 110, + 111, + 112, + 113, + 114, + 115, + 116, + 117, + 118, + 119, + 120, + 121, + 122, + 123, + 124, + 125, + 126, + 127, + 128, + 129, + 130, + 131, + 132, + 133, + 134, + 135, + 136, + 137, + 138, + 139, + 140, + 141, + 142, + 143, + 144, + 145, + 146, + 147, + 148, + 149, + 150, + 151, + 152, + 153, + 154, + 155, + 156, + 157, + 158, + 159, + 160, + 161, + 162, + 163, + 164, + 165, + 166, + 167, + 168, + 169, + 170, + 171, + 172, + 173, + 174, + 175, + 176, + 177, + 178, + 179, + 180, + 181, + 182, + 183, + 184, + 185, + 186, + 187, + 188, + 189, + 190, + 191, + 192, + 193, + 194, + 195, + 196, + 197, + 198, + 199, + 200, + 201, + 202, + 203, + 204, + 205, + 206, + 207, + 208, + 209, + 210, + 211, + 212, + 213, + 214, + 215, + 216, + 217, + 218, + 219, + 220, + 221, + 222, + 223, + 224, + 225, + 226, + 227, + 228, + 229, + 230, + 231, + 232, + 233, + 234, + 235, + 236, + 237, + 238, + 239, + 240, + 241, + 242, + 243, + 244, + 245, + 246, + 247, + 248, + 249, + 250, + 251, + 252, + 253, + 254, + 255, + 256, + 257, + 258, + 259, + 260, + 261, + 262, + 263, + 264, + 265, + 266, + 267, + 268, + 269, + 270, + 271, + 272, + 273, + 274, + 275, + 276, + 277, + 278, + 279, + 280, + 281, + 282, + 283, + 284, + 285, + 286, + 287, + 288, + 289, + 290, + 291, + 292, + 293, + 294, + 295, + 296, + 297, + 298, + 299, + 300, + 301, + 302, + 303, + 304, + 305, + 306, + 307, + 308, + 309, + 310, + 311, + 312, + 313, + 314, + 315, + 316, + 317, + 318, + 319, + 320, + 321, + 322, + 323, + 324, + 325, + 326, + 327, + 328, + 329, + 330, + 331, + 332, + 333, + 334, + 335, + 336, + 337, + 338, + 339, + 340, + 341, + 342, + 343, + 344, + 345, + 346, + 347, + 348, + 349, + 350, + 351, + 352, + 353, + 354, + 355, + 356, + 357, + 358, + 359, + 360, + 361, + 362, + 363, + 364, + 365, + 366, + 367, + 368, + 369, + 370, + 371, + 372, + 373, + 374, + 375, + 376, + 377, + 378, + 379, + 380, + 381, + 382, + 383, + 384, + 385, + 386, + 387, + 388, + 389, + 390, + 391, + 392, + 393, + 394, + 395, + 396, + 397, + 398, + 399, + 400, + 401, + 402, + 403, + 404, + 405, + 406, + 407, + 408, + 409, + 410, + 411, + 412, + 413, + 414, + 415, + 416, + 417, + 418, + 419, + 420, + 421, + 422, + 423, + 424, + 425, + 426, + 427, + 428, + 429, + 430, + 431, + 432, + 433, + 434, + 435, + 436, + 437, + 438, + 439, + 440, + 441, + 442, + 443, + 444, + 445, + 446, + 447, + 448, + 449, + 450, + 451, + 452, + 453, + 454, + 455, + 456, + 457, + 458, + 459, + 460, + 461, + 462, + 463, + 464, + 465, + 466, + 467, + 468, + 469, + 470, + 471, + 472, + 473, + 474, + 475, + 476, + 477, + 478, + 479, + 480, + 481, + 482, + 483, + 484, + 485, + 486, + 487, + 488, + 489, + 490, + 491, + 492, + 493, + 494, + 495, + 496, + 497, + 498, + 499, + 500, + 501, + 502, + 503, + 504, + 505, + 506, + 507, + 508, + 509, + 510, + 511, + 512, + 513, + 514, + 515, + 516, + 517, + 518, + 519, + 520, + 521, + 522, + 523, + 524, + 525, + 526, + 527, + 528, + 529, + 530, + 531, + 532, + 533, + 534, + 535, + 536, + 537, + 538, + 539, + 540, + 541, + 542, + 543, + 544, + 545, + 546, + 547, + 548, + 549, + 550, + 551, + 552, + 553, + 554, + 555, + 556, + 557, + 558, + 559, + 560, + 561, + 562, + 563, + 564, + 565, + 566, + 567, + 568, + 569, + 570, + 571, + 572, + 573, + 574, + 575, + 576, + 577, + 578, + 579, + 580, + 581, + 582, + 583, + 584, + 585, + 586, + 587, + 588, + 589, + 590, + 591, + 592, + 593, + 594, + 595, + 596, + 597, + 598, + 599, + 600, + 601, + 602, + 603, + 604, + 605, + 606, + 607, + 608, + 609, + 610, + 611, + 612, + 613, + 614, + 615, + 616, + 617, + 618, + 619, + 620, + 621, + 622, + 623, + 624, + 625, + 626, + 627, + 628, + 629, + 630, + 631, + 632, + 633, + 634, + 635, + 636, + 637, + 638, + 639, + 640, + 641, + 642, + 643, + 644, + 645, + 646, + 647, + 648, + 649, + 650, + 651, + 652, + 653, + 654, + 655, + 656, + 657, + 658, + 659, + 660, + 661, + 662, + 663, + 664, + 665, + 666, + 667, + 668, + 669, + 670, + 671, + 672, + 673, + 674, + 675, + 676, + 677, + 678, + 679, + 680, + 681, + 682, + 683, + 684, + 685, + 686, + 687, + 688, + 689, + 690, + 691, + 692, + 693, + 694, + 695, + 696, + 697, + 698, + 699, + 700, + 701, + 702, + 703, + 704, + 705, + 706, + 707, + 708, + 709, + 710, + 711, + 712, + 713, + 714, + 715, + 716, + 717, + 718, + 719, + 720, + 721, + 722, + 723, + 724, + 725, + 726, + 727, + 728, + 729, + 730, + 731, + 732, + 733, + 734, + 735, + 736, + 737, + 738, + 739, + 740, + 741, + 742, + 743, + 744, + 745, + 746, + 747, + 748, + 749, + 750, + 751, + 752, + 753, + 754, + 755, + 756, + 757, + 758, + 759, + 760, + 761, + 762, + 763, + 764, + 765, + 766, + 767, + 768, + 769, + 770, + 771, + 772, + 773, + 774, + 775, + 776, + 777, + 778, + 779, + 780, + 781, + 782, + 783, + 784, + 785, + 786, + 787, + 788, + 789, + 790, + 791, + 792, + 793, + 794, + 795, + 796, + 797, + 798, + 799, + 800, + 801, + 802, + 803, + 804, + 805, + 806, + 807, + 808, + 809, + 810, + 811 + ] + } +] \ No newline at end of file diff --git a/checkpoints/018000/training_state/optimizer_state.safetensors b/checkpoints/018000/training_state/optimizer_state.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..a81beddf4422c4350fda706a55aacacb69dedd56 --- /dev/null +++ b/checkpoints/018000/training_state/optimizer_state.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9fb73ac4f58ab2892a8590597b7929fa27dc79dc18bf750bd2719edc8ff4c079 +size 13473373724 diff --git a/checkpoints/018000/training_state/rng_state.safetensors b/checkpoints/018000/training_state/rng_state.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..85500850bd48f1c00d41181d086da0037a0958b7 --- /dev/null +++ b/checkpoints/018000/training_state/rng_state.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:264cebccc0924f3d4de1a30f245730dc34fc891e8b2307bd57096099fbf18f01 +size 15708 diff --git a/checkpoints/018000/training_state/scheduler_state.json b/checkpoints/018000/training_state/scheduler_state.json new file mode 100644 index 0000000000000000000000000000000000000000..06936aab4ebdf3e60323b0dba8792afda5bcd561 --- /dev/null +++ b/checkpoints/018000/training_state/scheduler_state.json @@ -0,0 +1,14 @@ +{ + "base_lrs": [ + 2.5e-05 + ], + "last_epoch": 18000, + "_step_count": 18001, + "_get_lr_called_within_step": false, + "_last_lr": [ + 3.0506141916795233e-06 + ], + "lr_lambdas": [ + null + ] +} \ No newline at end of file diff --git a/checkpoints/018000/training_state/training_step.json b/checkpoints/018000/training_state/training_step.json new file mode 100644 index 0000000000000000000000000000000000000000..332a3c17a399c305da7a515c33ffb382d0aff339 --- /dev/null +++ b/checkpoints/018000/training_state/training_step.json @@ -0,0 +1,3 @@ +{ + "step": 18000 +} \ No newline at end of file diff --git a/checkpoints/020000/pretrained_model/config.json b/checkpoints/020000/pretrained_model/config.json new file mode 100644 index 0000000000000000000000000000000000000000..7d9ef4ac072452ff8debbca6be6175a654f28cbe --- /dev/null +++ b/checkpoints/020000/pretrained_model/config.json @@ -0,0 +1,92 @@ +{ + "type": "pi05", + "n_obs_steps": 1, + "input_features": { + "observation.state": { + "type": "STATE", + "shape": [ + 14 + ] + }, + "observation.images.context": { + "type": "VISUAL", + "shape": [ + 3, + 360, + 640 + ] + }, + "observation.images.right_wrist": { + "type": "VISUAL", + "shape": [ + 3, + 360, + 640 + ] + }, + "observation.images.left_wrist": { + "type": "VISUAL", + "shape": [ + 3, + 360, + 640 + ] + } + }, + "output_features": { + "action": { + "type": "ACTION", + "shape": [ + 14 + ] + } + }, + "device": "cuda", + "use_amp": false, + "push_to_hub": true, + "repo_id": "Zasha01/pi_lego_cube_final_final", + "private": null, + "tags": null, + "license": null, + "pretrained_path": "lerobot/pi05_base", + "paligemma_variant": "gemma_2b", + "action_expert_variant": "gemma_300m", + "dtype": "bfloat16", + "chunk_size": 50, + "n_action_steps": 50, + "max_state_dim": 32, + "max_action_dim": 32, + "num_inference_steps": 10, + "time_sampling_beta_alpha": 1.5, + "time_sampling_beta_beta": 1.0, + "time_sampling_scale": 0.999, + "time_sampling_offset": 0.001, + "min_period": 0.004, + "max_period": 4.0, + "rtc_config": null, + "image_resolution": [ + 224, + 224 + ], + "empty_cameras": 0, + "tokenizer_max_length": 200, + "normalization_mapping": { + "VISUAL": "IDENTITY", + "STATE": "QUANTILES", + "ACTION": "QUANTILES" + }, + "gradient_checkpointing": true, + "compile_model": true, + "compile_mode": "max-autotune", + "optimizer_lr": 2.5e-05, + "optimizer_betas": [ + 0.9, + 0.95 + ], + "optimizer_eps": 1e-08, + "optimizer_weight_decay": 0.01, + "optimizer_grad_clip_norm": 1.0, + "scheduler_warmup_steps": 1000, + "scheduler_decay_steps": 30000, + "scheduler_decay_lr": 2.5e-06 +} \ No newline at end of file diff --git a/checkpoints/020000/pretrained_model/model.safetensors b/checkpoints/020000/pretrained_model/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..f01a86fe095291cbf8b1c457940144ffc298a00e --- /dev/null +++ b/checkpoints/020000/pretrained_model/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4ecc6b24852b2a2a4b665ddd726c9562191eb350650aad991f4d883fd948cd42 +size 7473096344 diff --git a/checkpoints/020000/pretrained_model/policy_postprocessor.json b/checkpoints/020000/pretrained_model/policy_postprocessor.json new file mode 100644 index 0000000000000000000000000000000000000000..3171b86625d5ec895c36b4cc1824cecfd7b37e2e --- /dev/null +++ b/checkpoints/020000/pretrained_model/policy_postprocessor.json @@ -0,0 +1,32 @@ +{ + "name": "policy_postprocessor", + "steps": [ + { + "registry_name": "unnormalizer_processor", + "config": { + "eps": 1e-08, + "features": { + "action": { + "type": "ACTION", + "shape": [ + 14 + ] + } + }, + "norm_map": { + "VISUAL": "IDENTITY", + "STATE": "QUANTILES", + "ACTION": "QUANTILES" + } + }, + "state_file": "policy_postprocessor_step_0_unnormalizer_processor.safetensors" + }, + { + "registry_name": "device_processor", + "config": { + "device": "cpu", + "float_dtype": null + } + } + ] +} \ No newline at end of file diff --git a/checkpoints/020000/pretrained_model/policy_postprocessor_step_0_unnormalizer_processor.safetensors b/checkpoints/020000/pretrained_model/policy_postprocessor_step_0_unnormalizer_processor.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..3b736d299e020843d180bc1ef9a2446226abe0c9 --- /dev/null +++ b/checkpoints/020000/pretrained_model/policy_postprocessor_step_0_unnormalizer_processor.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:51ab0f660e6cbe8404932b42c7fff5aed80b13de7f0f5e9f43c56b5bd52afe0d +size 9328 diff --git a/checkpoints/020000/pretrained_model/policy_preprocessor.json b/checkpoints/020000/pretrained_model/policy_preprocessor.json new file mode 100644 index 0000000000000000000000000000000000000000..d12396bdc2de6672958b31ffcd2a82aab560645f --- /dev/null +++ b/checkpoints/020000/pretrained_model/policy_preprocessor.json @@ -0,0 +1,87 @@ +{ + "name": "policy_preprocessor", + "steps": [ + { + "registry_name": "rename_observations_processor", + "config": { + "rename_map": {} + } + }, + { + "registry_name": "to_batch_processor", + "config": {} + }, + { + "registry_name": "normalizer_processor", + "config": { + "eps": 1e-08, + "features": { + "observation.state": { + "type": "STATE", + "shape": [ + 14 + ] + }, + "observation.images.context": { + "type": "VISUAL", + "shape": [ + 3, + 360, + 640 + ] + }, + "observation.images.right_wrist": { + "type": "VISUAL", + "shape": [ + 3, + 360, + 640 + ] + }, + "observation.images.left_wrist": { + "type": "VISUAL", + "shape": [ + 3, + 360, + 640 + ] + }, + "action": { + "type": "ACTION", + "shape": [ + 14 + ] + } + }, + "norm_map": { + "VISUAL": "IDENTITY", + "STATE": "QUANTILES", + "ACTION": "QUANTILES" + } + }, + "state_file": "policy_preprocessor_step_2_normalizer_processor.safetensors" + }, + { + "registry_name": "pi05_prepare_state_tokenizer_processor_step", + "config": {} + }, + { + "registry_name": "tokenizer_processor", + "config": { + "max_length": 200, + "task_key": "task", + "padding_side": "right", + "padding": "max_length", + "truncation": true, + "tokenizer_name": "google/paligemma-3b-pt-224" + } + }, + { + "registry_name": "device_processor", + "config": { + "device": "cuda", + "float_dtype": null + } + } + ] +} \ No newline at end of file diff --git a/checkpoints/020000/pretrained_model/policy_preprocessor_step_2_normalizer_processor.safetensors b/checkpoints/020000/pretrained_model/policy_preprocessor_step_2_normalizer_processor.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..3b736d299e020843d180bc1ef9a2446226abe0c9 --- /dev/null +++ b/checkpoints/020000/pretrained_model/policy_preprocessor_step_2_normalizer_processor.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:51ab0f660e6cbe8404932b42c7fff5aed80b13de7f0f5e9f43c56b5bd52afe0d +size 9328 diff --git a/checkpoints/020000/pretrained_model/train_config.json b/checkpoints/020000/pretrained_model/train_config.json new file mode 100644 index 0000000000000000000000000000000000000000..11ed480db655730638958aa3e3ad1a23c27953bc --- /dev/null +++ b/checkpoints/020000/pretrained_model/train_config.json @@ -0,0 +1,222 @@ +{ + "dataset": { + "repo_id": "Zasha01/lego_cube_final", + "root": null, + "episodes": null, + "image_transforms": { + "enable": false, + "max_num_transforms": 3, + "random_order": false, + "tfs": { + "brightness": { + "weight": 1.0, + "type": "ColorJitter", + "kwargs": { + "brightness": [ + 0.8, + 1.2 + ] + } + }, + "contrast": { + "weight": 1.0, + "type": "ColorJitter", + "kwargs": { + "contrast": [ + 0.8, + 1.2 + ] + } + }, + "saturation": { + "weight": 1.0, + "type": "ColorJitter", + "kwargs": { + "saturation": [ + 0.5, + 1.5 + ] + } + }, + "hue": { + "weight": 1.0, + "type": "ColorJitter", + "kwargs": { + "hue": [ + -0.05, + 0.05 + ] + } + }, + "sharpness": { + "weight": 1.0, + "type": "SharpnessJitter", + "kwargs": { + "sharpness": [ + 0.5, + 1.5 + ] + } + }, + "affine": { + "weight": 1.0, + "type": "RandomAffine", + "kwargs": { + "degrees": [ + -5.0, + 5.0 + ], + "translate": [ + 0.05, + 0.05 + ] + } + } + } + }, + "revision": null, + "use_imagenet_stats": true, + "video_backend": "torchcodec", + "streaming": false + }, + "env": null, + "policy": { + "type": "pi05", + "n_obs_steps": 1, + "input_features": { + "observation.state": { + "type": "STATE", + "shape": [ + 14 + ] + }, + "observation.images.context": { + "type": "VISUAL", + "shape": [ + 3, + 360, + 640 + ] + }, + "observation.images.right_wrist": { + "type": "VISUAL", + "shape": [ + 3, + 360, + 640 + ] + }, + "observation.images.left_wrist": { + "type": "VISUAL", + "shape": [ + 3, + 360, + 640 + ] + } + }, + "output_features": { + "action": { + "type": "ACTION", + "shape": [ + 14 + ] + } + }, + "device": "cuda", + "use_amp": false, + "push_to_hub": true, + "repo_id": "Zasha01/pi_lego_cube_final_final", + "private": null, + "tags": null, + "license": null, + "pretrained_path": "lerobot/pi05_base", + "paligemma_variant": "gemma_2b", + "action_expert_variant": "gemma_300m", + "dtype": "bfloat16", + "chunk_size": 50, + "n_action_steps": 50, + "max_state_dim": 32, + "max_action_dim": 32, + "num_inference_steps": 10, + "time_sampling_beta_alpha": 1.5, + "time_sampling_beta_beta": 1.0, + "time_sampling_scale": 0.999, + "time_sampling_offset": 0.001, + "min_period": 0.004, + "max_period": 4.0, + "rtc_config": null, + "image_resolution": [ + 224, + 224 + ], + "empty_cameras": 0, + "tokenizer_max_length": 200, + "normalization_mapping": { + "VISUAL": "IDENTITY", + "STATE": "QUANTILES", + "ACTION": "QUANTILES" + }, + "gradient_checkpointing": true, + "compile_model": true, + "compile_mode": "max-autotune", + "optimizer_lr": 2.5e-05, + "optimizer_betas": [ + 0.9, + 0.95 + ], + "optimizer_eps": 1e-08, + "optimizer_weight_decay": 0.01, + "optimizer_grad_clip_norm": 1.0, + "scheduler_warmup_steps": 1000, + "scheduler_decay_steps": 30000, + "scheduler_decay_lr": 2.5e-06 + }, + "output_dir": "/checkpoints/pi05/pi_lego_cube_final_final_20251218_082454", + "job_name": "pi_lego_cube_final_final", + "resume": false, + "seed": 1000, + "num_workers": 4, + "batch_size": 16, + "steps": 20000, + "eval_freq": 20000, + "log_freq": 200, + "tolerance_s": 0.0001, + "save_checkpoint": true, + "save_freq": 3000, + "use_policy_training_preset": true, + "optimizer": { + "type": "adamw", + "lr": 2.5e-05, + "weight_decay": 0.01, + "grad_clip_norm": 1.0, + "betas": [ + 0.9, + 0.95 + ], + "eps": 1e-08 + }, + "scheduler": { + "type": "cosine_decay_with_warmup", + "num_warmup_steps": 1000, + "num_decay_steps": 30000, + "peak_lr": 2.5e-05, + "decay_lr": 2.5e-06 + }, + "eval": { + "n_episodes": 50, + "batch_size": 50, + "use_async_envs": false + }, + "wandb": { + "enable": true, + "disable_artifact": false, + "project": "lerobot", + "entity": null, + "notes": null, + "run_id": "lbhu7589", + "mode": null + }, + "checkpoint_path": null, + "rename_map": {} +} \ No newline at end of file diff --git a/checkpoints/020000/training_state/optimizer_param_groups.json b/checkpoints/020000/training_state/optimizer_param_groups.json new file mode 100644 index 0000000000000000000000000000000000000000..6675215978183c324ea77b3a2cc0c61d8509d2f2 --- /dev/null +++ b/checkpoints/020000/training_state/optimizer_param_groups.json @@ -0,0 +1,833 @@ +[ + { + "lr": 2.5e-06, + "betas": [ + 0.9, + 0.95 + ], + "eps": 1e-08, + "weight_decay": 0.01, + "amsgrad": false, + "maximize": false, + "foreach": null, + "capturable": false, + "differentiable": false, + "fused": null, + "decoupled_weight_decay": true, + "initial_lr": 2.5e-05, + "params": [ + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10, + 11, + 12, + 13, + 14, + 15, + 16, + 17, + 18, + 19, + 20, + 21, + 22, + 23, + 24, + 25, + 26, + 27, + 28, + 29, + 30, + 31, + 32, + 33, + 34, + 35, + 36, + 37, + 38, + 39, + 40, + 41, + 42, + 43, + 44, + 45, + 46, + 47, + 48, + 49, + 50, + 51, + 52, + 53, + 54, + 55, + 56, + 57, + 58, + 59, + 60, + 61, + 62, + 63, + 64, + 65, + 66, + 67, + 68, + 69, + 70, + 71, + 72, + 73, + 74, + 75, + 76, + 77, + 78, + 79, + 80, + 81, + 82, + 83, + 84, + 85, + 86, + 87, + 88, + 89, + 90, + 91, + 92, + 93, + 94, + 95, + 96, + 97, + 98, + 99, + 100, + 101, + 102, + 103, + 104, + 105, + 106, + 107, + 108, + 109, + 110, + 111, + 112, + 113, + 114, + 115, + 116, + 117, + 118, + 119, + 120, + 121, + 122, + 123, + 124, + 125, + 126, + 127, + 128, + 129, + 130, + 131, + 132, + 133, + 134, + 135, + 136, + 137, + 138, + 139, + 140, + 141, + 142, + 143, + 144, + 145, + 146, + 147, + 148, + 149, + 150, + 151, + 152, + 153, + 154, + 155, + 156, + 157, + 158, + 159, + 160, + 161, + 162, + 163, + 164, + 165, + 166, + 167, + 168, + 169, + 170, + 171, + 172, + 173, + 174, + 175, + 176, + 177, + 178, + 179, + 180, + 181, + 182, + 183, + 184, + 185, + 186, + 187, + 188, + 189, + 190, + 191, + 192, + 193, + 194, + 195, + 196, + 197, + 198, + 199, + 200, + 201, + 202, + 203, + 204, + 205, + 206, + 207, + 208, + 209, + 210, + 211, + 212, + 213, + 214, + 215, + 216, + 217, + 218, + 219, + 220, + 221, + 222, + 223, + 224, + 225, + 226, + 227, + 228, + 229, + 230, + 231, + 232, + 233, + 234, + 235, + 236, + 237, + 238, + 239, + 240, + 241, + 242, + 243, + 244, + 245, + 246, + 247, + 248, + 249, + 250, + 251, + 252, + 253, + 254, + 255, + 256, + 257, + 258, + 259, + 260, + 261, + 262, + 263, + 264, + 265, + 266, + 267, + 268, + 269, + 270, + 271, + 272, + 273, + 274, + 275, + 276, + 277, + 278, + 279, + 280, + 281, + 282, + 283, + 284, + 285, + 286, + 287, + 288, + 289, + 290, + 291, + 292, + 293, + 294, + 295, + 296, + 297, + 298, + 299, + 300, + 301, + 302, + 303, + 304, + 305, + 306, + 307, + 308, + 309, + 310, + 311, + 312, + 313, + 314, + 315, + 316, + 317, + 318, + 319, + 320, + 321, + 322, + 323, + 324, + 325, + 326, + 327, + 328, + 329, + 330, + 331, + 332, + 333, + 334, + 335, + 336, + 337, + 338, + 339, + 340, + 341, + 342, + 343, + 344, + 345, + 346, + 347, + 348, + 349, + 350, + 351, + 352, + 353, + 354, + 355, + 356, + 357, + 358, + 359, + 360, + 361, + 362, + 363, + 364, + 365, + 366, + 367, + 368, + 369, + 370, + 371, + 372, + 373, + 374, + 375, + 376, + 377, + 378, + 379, + 380, + 381, + 382, + 383, + 384, + 385, + 386, + 387, + 388, + 389, + 390, + 391, + 392, + 393, + 394, + 395, + 396, + 397, + 398, + 399, + 400, + 401, + 402, + 403, + 404, + 405, + 406, + 407, + 408, + 409, + 410, + 411, + 412, + 413, + 414, + 415, + 416, + 417, + 418, + 419, + 420, + 421, + 422, + 423, + 424, + 425, + 426, + 427, + 428, + 429, + 430, + 431, + 432, + 433, + 434, + 435, + 436, + 437, + 438, + 439, + 440, + 441, + 442, + 443, + 444, + 445, + 446, + 447, + 448, + 449, + 450, + 451, + 452, + 453, + 454, + 455, + 456, + 457, + 458, + 459, + 460, + 461, + 462, + 463, + 464, + 465, + 466, + 467, + 468, + 469, + 470, + 471, + 472, + 473, + 474, + 475, + 476, + 477, + 478, + 479, + 480, + 481, + 482, + 483, + 484, + 485, + 486, + 487, + 488, + 489, + 490, + 491, + 492, + 493, + 494, + 495, + 496, + 497, + 498, + 499, + 500, + 501, + 502, + 503, + 504, + 505, + 506, + 507, + 508, + 509, + 510, + 511, + 512, + 513, + 514, + 515, + 516, + 517, + 518, + 519, + 520, + 521, + 522, + 523, + 524, + 525, + 526, + 527, + 528, + 529, + 530, + 531, + 532, + 533, + 534, + 535, + 536, + 537, + 538, + 539, + 540, + 541, + 542, + 543, + 544, + 545, + 546, + 547, + 548, + 549, + 550, + 551, + 552, + 553, + 554, + 555, + 556, + 557, + 558, + 559, + 560, + 561, + 562, + 563, + 564, + 565, + 566, + 567, + 568, + 569, + 570, + 571, + 572, + 573, + 574, + 575, + 576, + 577, + 578, + 579, + 580, + 581, + 582, + 583, + 584, + 585, + 586, + 587, + 588, + 589, + 590, + 591, + 592, + 593, + 594, + 595, + 596, + 597, + 598, + 599, + 600, + 601, + 602, + 603, + 604, + 605, + 606, + 607, + 608, + 609, + 610, + 611, + 612, + 613, + 614, + 615, + 616, + 617, + 618, + 619, + 620, + 621, + 622, + 623, + 624, + 625, + 626, + 627, + 628, + 629, + 630, + 631, + 632, + 633, + 634, + 635, + 636, + 637, + 638, + 639, + 640, + 641, + 642, + 643, + 644, + 645, + 646, + 647, + 648, + 649, + 650, + 651, + 652, + 653, + 654, + 655, + 656, + 657, + 658, + 659, + 660, + 661, + 662, + 663, + 664, + 665, + 666, + 667, + 668, + 669, + 670, + 671, + 672, + 673, + 674, + 675, + 676, + 677, + 678, + 679, + 680, + 681, + 682, + 683, + 684, + 685, + 686, + 687, + 688, + 689, + 690, + 691, + 692, + 693, + 694, + 695, + 696, + 697, + 698, + 699, + 700, + 701, + 702, + 703, + 704, + 705, + 706, + 707, + 708, + 709, + 710, + 711, + 712, + 713, + 714, + 715, + 716, + 717, + 718, + 719, + 720, + 721, + 722, + 723, + 724, + 725, + 726, + 727, + 728, + 729, + 730, + 731, + 732, + 733, + 734, + 735, + 736, + 737, + 738, + 739, + 740, + 741, + 742, + 743, + 744, + 745, + 746, + 747, + 748, + 749, + 750, + 751, + 752, + 753, + 754, + 755, + 756, + 757, + 758, + 759, + 760, + 761, + 762, + 763, + 764, + 765, + 766, + 767, + 768, + 769, + 770, + 771, + 772, + 773, + 774, + 775, + 776, + 777, + 778, + 779, + 780, + 781, + 782, + 783, + 784, + 785, + 786, + 787, + 788, + 789, + 790, + 791, + 792, + 793, + 794, + 795, + 796, + 797, + 798, + 799, + 800, + 801, + 802, + 803, + 804, + 805, + 806, + 807, + 808, + 809, + 810, + 811 + ] + } +] \ No newline at end of file diff --git a/checkpoints/020000/training_state/optimizer_state.safetensors b/checkpoints/020000/training_state/optimizer_state.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..3af7f8ad68d19448b2021b4f7ac0d0c8cdc1f60e --- /dev/null +++ b/checkpoints/020000/training_state/optimizer_state.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fc4b7b46cd360b90c6e4ba1b656bb5fdd89cc50790d49e8d4d2cf27e0939945f +size 13473373724 diff --git a/checkpoints/020000/training_state/rng_state.safetensors b/checkpoints/020000/training_state/rng_state.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..ea1c5f2f85150f086feed6891e0cc3670bee6124 --- /dev/null +++ b/checkpoints/020000/training_state/rng_state.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a52430410dc58c4b72f0018ac6b065984e696f9c24c99056c49f5befc1650146 +size 15708 diff --git a/checkpoints/020000/training_state/scheduler_state.json b/checkpoints/020000/training_state/scheduler_state.json new file mode 100644 index 0000000000000000000000000000000000000000..363c7f78eca29a35d7ba1aa2ce407ab5913db105 --- /dev/null +++ b/checkpoints/020000/training_state/scheduler_state.json @@ -0,0 +1,14 @@ +{ + "base_lrs": [ + 2.5e-05 + ], + "last_epoch": 20000, + "_step_count": 20001, + "_get_lr_called_within_step": false, + "_last_lr": [ + 2.5e-06 + ], + "lr_lambdas": [ + null + ] +} \ No newline at end of file diff --git a/checkpoints/020000/training_state/training_step.json b/checkpoints/020000/training_state/training_step.json new file mode 100644 index 0000000000000000000000000000000000000000..dc9bb47026c5d5237ca6fc5dbff6020dd122ea05 --- /dev/null +++ b/checkpoints/020000/training_state/training_step.json @@ -0,0 +1,3 @@ +{ + "step": 20000 +} \ No newline at end of file diff --git a/wandb/debug-internal.log b/wandb/debug-internal.log new file mode 100644 index 0000000000000000000000000000000000000000..ed0c9899904a04afc45239544936cf018981a6d4 --- /dev/null +++ b/wandb/debug-internal.log @@ -0,0 +1,15 @@ +{"time":"2025-12-18T08:25:01.693088813Z","level":"INFO","msg":"stream: starting","core version":"0.21.4"} +{"time":"2025-12-18T08:25:01.91641179Z","level":"INFO","msg":"stream: created new stream","id":"lbhu7589"} +{"time":"2025-12-18T08:25:01.916449066Z","level":"INFO","msg":"stream: started","id":"lbhu7589"} +{"time":"2025-12-18T08:25:01.91647249Z","level":"INFO","msg":"writer: started","stream_id":"lbhu7589"} +{"time":"2025-12-18T08:25:01.916562906Z","level":"INFO","msg":"handler: started","stream_id":"lbhu7589"} +{"time":"2025-12-18T08:25:01.916631089Z","level":"INFO","msg":"sender: started","stream_id":"lbhu7589"} +{"time":"2025-12-18T12:52:04.148929989Z","level":"ERROR","msg":"error adding file to cache","err":"write /root/.cache/wandb/artifacts/tmp/1144795626: no space left on device"} +{"time":"2025-12-18T13:43:42.980176868Z","level":"ERROR","msg":"error adding file to cache","err":"write /root/.cache/wandb/artifacts/tmp/3041263947: no space left on device"} +{"time":"2025-12-18T14:33:55.212227787Z","level":"ERROR","msg":"error adding file to cache","err":"write /root/.cache/wandb/artifacts/tmp/3394625431: no space left on device"} +{"time":"2025-12-18T15:08:48.221381228Z","level":"ERROR","msg":"error adding file to cache","err":"write /root/.cache/wandb/artifacts/tmp/3733320415: no space left on device"} +{"time":"2025-12-18T15:08:54.968312016Z","level":"INFO","msg":"stream: closing","id":"lbhu7589"} +{"time":"2025-12-18T15:12:00.986662316Z","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"} +{"time":"2025-12-18T15:12:01.128825565Z","level":"INFO","msg":"handler: closed","stream_id":"lbhu7589"} +{"time":"2025-12-18T15:12:01.128903532Z","level":"INFO","msg":"sender: closed","stream_id":"lbhu7589"} +{"time":"2025-12-18T15:12:01.128914298Z","level":"INFO","msg":"stream: closed","id":"lbhu7589"} diff --git a/wandb/debug.log b/wandb/debug.log new file mode 100644 index 0000000000000000000000000000000000000000..36077de555c88e3a37b9ea5f3dfee261c62752b6 --- /dev/null +++ b/wandb/debug.log @@ -0,0 +1,23 @@ +2025-12-18 08:25:01,480 INFO MainThread:204 [wandb_setup.py:_flush():81] Current SDK version is 0.21.4 +2025-12-18 08:25:01,480 INFO MainThread:204 [wandb_setup.py:_flush():81] Configure stats pid to 204 +2025-12-18 08:25:01,480 INFO MainThread:204 [wandb_setup.py:_flush():81] Loading settings from /root/.config/wandb/settings +2025-12-18 08:25:01,480 INFO MainThread:204 [wandb_setup.py:_flush():81] Loading settings from /workspace/wandb/settings +2025-12-18 08:25:01,480 INFO MainThread:204 [wandb_setup.py:_flush():81] Loading settings from environment variables +2025-12-18 08:25:01,480 INFO MainThread:204 [wandb_init.py:setup_run_log_directory():686] Logging user logs to /checkpoints/pi05/pi_lego_cube_final_final_20251218_082454/wandb/run-20251218_082501-lbhu7589/logs/debug.log +2025-12-18 08:25:01,480 INFO MainThread:204 [wandb_init.py:setup_run_log_directory():687] Logging internal logs to /checkpoints/pi05/pi_lego_cube_final_final_20251218_082454/wandb/run-20251218_082501-lbhu7589/logs/debug-internal.log +2025-12-18 08:25:01,480 INFO MainThread:204 [wandb_init.py:init():813] calling init triggers +2025-12-18 08:25:01,480 INFO MainThread:204 [wandb_init.py:init():818] wandb.init called with sweep_config: {} +config: {'dataset': {'repo_id': 'Zasha01/lego_cube_final', 'root': None, 'episodes': None, 'image_transforms': {'enable': False, 'max_num_transforms': 3, 'random_order': False, 'tfs': {'brightness': {'weight': 1.0, 'type': 'ColorJitter', 'kwargs': {'brightness': [0.8, 1.2]}}, 'contrast': {'weight': 1.0, 'type': 'ColorJitter', 'kwargs': {'contrast': [0.8, 1.2]}}, 'saturation': {'weight': 1.0, 'type': 'ColorJitter', 'kwargs': {'saturation': [0.5, 1.5]}}, 'hue': {'weight': 1.0, 'type': 'ColorJitter', 'kwargs': {'hue': [-0.05, 0.05]}}, 'sharpness': {'weight': 1.0, 'type': 'SharpnessJitter', 'kwargs': {'sharpness': [0.5, 1.5]}}, 'affine': {'weight': 1.0, 'type': 'RandomAffine', 'kwargs': {'degrees': [-5.0, 5.0], 'translate': [0.05, 0.05]}}}}, 'revision': None, 'use_imagenet_stats': True, 'video_backend': 'torchcodec', 'streaming': False}, 'env': None, 'policy': {'type': 'pi05', 'n_obs_steps': 1, 'input_features': {}, 'output_features': {}, 'device': 'cuda', 'use_amp': False, 'push_to_hub': True, 'repo_id': 'Zasha01/pi_lego_cube_final_final', 'private': None, 'tags': None, 'license': None, 'pretrained_path': 'lerobot/pi05_base', 'paligemma_variant': 'gemma_2b', 'action_expert_variant': 'gemma_300m', 'dtype': 'bfloat16', 'chunk_size': 50, 'n_action_steps': 50, 'max_state_dim': 32, 'max_action_dim': 32, 'num_inference_steps': 10, 'time_sampling_beta_alpha': 1.5, 'time_sampling_beta_beta': 1.0, 'time_sampling_scale': 0.999, 'time_sampling_offset': 0.001, 'min_period': 0.004, 'max_period': 4.0, 'rtc_config': None, 'image_resolution': [224, 224], 'empty_cameras': 0, 'tokenizer_max_length': 200, 'normalization_mapping': {'VISUAL': , 'STATE': , 'ACTION': }, 'gradient_checkpointing': True, 'compile_model': True, 'compile_mode': 'max-autotune', 'optimizer_lr': 2.5e-05, 'optimizer_betas': [0.9, 0.95], 'optimizer_eps': 1e-08, 'optimizer_weight_decay': 0.01, 'optimizer_grad_clip_norm': 1.0, 'scheduler_warmup_steps': 1000, 'scheduler_decay_steps': 30000, 'scheduler_decay_lr': 2.5e-06}, 'output_dir': '/checkpoints/pi05/pi_lego_cube_final_final_20251218_082454', 'job_name': 'pi_lego_cube_final_final', 'resume': False, 'seed': 1000, 'num_workers': 4, 'batch_size': 16, 'steps': 20000, 'eval_freq': 20000, 'log_freq': 200, 'tolerance_s': 0.0001, 'save_checkpoint': True, 'save_freq': 3000, 'use_policy_training_preset': True, 'optimizer': {'type': 'adamw', 'lr': 2.5e-05, 'weight_decay': 0.01, 'grad_clip_norm': 1.0, 'betas': [0.9, 0.95], 'eps': 1e-08}, 'scheduler': {'type': 'cosine_decay_with_warmup', 'num_warmup_steps': 1000, 'num_decay_steps': 30000, 'peak_lr': 2.5e-05, 'decay_lr': 2.5e-06}, 'eval': {'n_episodes': 50, 'batch_size': 50, 'use_async_envs': False}, 'wandb': {'enable': True, 'disable_artifact': False, 'project': 'lerobot', 'entity': None, 'notes': None, 'run_id': None, 'mode': None}, 'checkpoint_path': None, 'rename_map': {}, '_wandb': {}} +2025-12-18 08:25:01,480 INFO MainThread:204 [wandb_init.py:init():854] starting backend +2025-12-18 08:25:01,685 INFO MainThread:204 [wandb_init.py:init():857] sending inform_init request +2025-12-18 08:25:01,689 INFO MainThread:204 [wandb_init.py:init():865] backend started and connected +2025-12-18 08:25:01,691 INFO MainThread:204 [wandb_init.py:init():936] updated telemetry +2025-12-18 08:25:01,691 INFO MainThread:204 [wandb_init.py:init():960] communicating run to backend with 90.0 second timeout +2025-12-18 08:25:02,100 INFO MainThread:204 [wandb_init.py:init():1011] starting run threads in backend +2025-12-18 08:25:02,166 INFO MainThread:204 [wandb_run.py:_console_start():2506] atexit reg +2025-12-18 08:25:02,166 INFO MainThread:204 [wandb_run.py:_redirect():2354] redirect: wrap_raw +2025-12-18 08:25:02,166 INFO MainThread:204 [wandb_run.py:_redirect():2423] Wrapping output streams. +2025-12-18 08:25:02,166 INFO MainThread:204 [wandb_run.py:_redirect():2446] Redirects installed. +2025-12-18 08:25:02,169 INFO MainThread:204 [wandb_init.py:init():1049] run started, returning control to user process +2025-12-18 15:08:54,967 INFO wandb-AsyncioManager-main:204 [service_client.py:_forward_responses():84] Reached EOF. +2025-12-18 15:08:54,968 INFO wandb-AsyncioManager-main:204 [mailbox.py:close():137] Closing mailbox, abandoning 2 handles. diff --git a/wandb/run-20251218_082501-lbhu7589/files/config.yaml b/wandb/run-20251218_082501-lbhu7589/files/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..71c316ee2661774ccad127c6f54af004aa51cb64 --- /dev/null +++ b/wandb/run-20251218_082501-lbhu7589/files/config.yaml @@ -0,0 +1,259 @@ +_wandb: + value: + cli_version: 0.21.4 + e: + 0xueh9yuf1xlvkfx6jbcmvxu1bi9nb84: + args: + - --dataset.repo_id + - Zasha01/lego_cube_final + - --policy.type + - pi05 + - --output_dir + - /checkpoints/pi05/pi_lego_cube_final_final_20251218_082454 + - --job_name + - pi_lego_cube_final_final + - --steps + - "20000" + - --batch_size + - "16" + - --save_checkpoint + - "true" + - --save_freq + - "3000" + - --policy.dtype + - bfloat16 + - --policy.device + - cuda + - --wandb.enable=true + - --policy.repo_id + - Zasha01/pi_lego_cube_final_final + - --policy.push_to_hub=true + - --policy.pretrained_path + - lerobot/pi05_base + - --policy.compile_model=true + - --policy.gradient_checkpointing=true + cpu_count: 28 + cpu_count_logical: 28 + cudaVersion: "12.8" + disk: + /: + total: "103865303040" + used: "43216367616" + executable: /opt/conda/envs/lerobot/bin/python3 + gpu: NVIDIA H100 PCIe + gpu_count: 1 + gpu_nvidia: + - architecture: Hopper + cudaCores: 14592 + memoryTotal: "85520809984" + name: NVIDIA H100 PCIe + uuid: GPU-d0e23bc7-e8b5-2dff-a2e7-119c21f8a7ac + host: brev-uxf0tw16k + memory: + total: "190128013312" + os: Linux-6.8.0-64-generic-x86_64-with-glibc2.35 + program: -m lerobot.scripts.lerobot_train + python: CPython 3.10.19 + root: /checkpoints/pi05/pi_lego_cube_final_final_20251218_082454 + startedAt: "2025-12-18T08:25:01.479404Z" + writerId: 0xueh9yuf1xlvkfx6jbcmvxu1bi9nb84 + m: [] + python_version: 3.10.19 + t: + "1": + - 1 + - 11 + - 41 + - 49 + - 51 + - 71 + - 83 + "2": + - 1 + - 11 + - 41 + - 49 + - 51 + - 71 + - 83 + "3": + - 13 + - 15 + - 16 + - 61 + "4": 3.10.19 + "5": 0.21.4 + "6": 4.53.3 + "10": + - 21 + "12": 0.21.4 + "13": linux-x86_64 +batch_size: + value: 16 +checkpoint_path: + value: null +dataset: + value: + episodes: null + image_transforms: + enable: false + max_num_transforms: 3 + random_order: false + tfs: + affine: + kwargs: + degrees: + - -5 + - 5 + translate: + - 0.05 + - 0.05 + type: RandomAffine + weight: 1 + brightness: + kwargs: + brightness: + - 0.8 + - 1.2 + type: ColorJitter + weight: 1 + contrast: + kwargs: + contrast: + - 0.8 + - 1.2 + type: ColorJitter + weight: 1 + hue: + kwargs: + hue: + - -0.05 + - 0.05 + type: ColorJitter + weight: 1 + saturation: + kwargs: + saturation: + - 0.5 + - 1.5 + type: ColorJitter + weight: 1 + sharpness: + kwargs: + sharpness: + - 0.5 + - 1.5 + type: SharpnessJitter + weight: 1 + repo_id: Zasha01/lego_cube_final + revision: null + root: null + streaming: false + use_imagenet_stats: true + video_backend: torchcodec +env: + value: null +eval: + value: + batch_size: 50 + n_episodes: 50 + use_async_envs: false +eval_freq: + value: 20000 +job_name: + value: pi_lego_cube_final_final +log_freq: + value: 200 +num_workers: + value: 4 +optimizer: + value: + betas: + - 0.9 + - 0.95 + eps: 1e-08 + grad_clip_norm: 1 + lr: 2.5e-05 + type: adamw + weight_decay: 0.01 +output_dir: + value: /checkpoints/pi05/pi_lego_cube_final_final_20251218_082454 +policy: + value: + action_expert_variant: gemma_300m + chunk_size: 50 + compile_mode: max-autotune + compile_model: true + device: cuda + dtype: bfloat16 + empty_cameras: 0 + gradient_checkpointing: true + image_resolution: + - 224 + - 224 + license: null + max_action_dim: 32 + max_period: 4 + max_state_dim: 32 + min_period: 0.004 + n_action_steps: 50 + n_obs_steps: 1 + normalization_mapping: + ACTION: QUANTILES + STATE: QUANTILES + VISUAL: IDENTITY + num_inference_steps: 10 + optimizer_betas: + - 0.9 + - 0.95 + optimizer_eps: 1e-08 + optimizer_grad_clip_norm: 1 + optimizer_lr: 2.5e-05 + optimizer_weight_decay: 0.01 + paligemma_variant: gemma_2b + pretrained_path: lerobot/pi05_base + private: null + push_to_hub: true + repo_id: Zasha01/pi_lego_cube_final_final + rtc_config: null + scheduler_decay_lr: 2.5e-06 + scheduler_decay_steps: 30000 + scheduler_warmup_steps: 1000 + tags: null + time_sampling_beta_alpha: 1.5 + time_sampling_beta_beta: 1 + time_sampling_offset: 0.001 + time_sampling_scale: 0.999 + tokenizer_max_length: 200 + type: pi05 + use_amp: false +resume: + value: false +save_checkpoint: + value: true +save_freq: + value: 3000 +scheduler: + value: + decay_lr: 2.5e-06 + num_decay_steps: 30000 + num_warmup_steps: 1000 + peak_lr: 2.5e-05 + type: cosine_decay_with_warmup +seed: + value: 1000 +steps: + value: 20000 +tolerance_s: + value: 0.0001 +use_policy_training_preset: + value: true +wandb: + value: + disable_artifact: false + enable: true + entity: null + mode: null + notes: null + project: lerobot + run_id: null diff --git a/wandb/run-20251218_082501-lbhu7589/files/output.log b/wandb/run-20251218_082501-lbhu7589/files/output.log new file mode 100644 index 0000000000000000000000000000000000000000..82b88880792af7d0c2d3f91a97d58ca7801c3230 --- /dev/null +++ b/wandb/run-20251218_082501-lbhu7589/files/output.log @@ -0,0 +1,2637 @@ +INFO 2025-12-18 08:25:02 db_utils.py:102 Logs will be synced with wandb. +INFO 2025-12-18 08:25:02 db_utils.py:103 Track this run --> https://wandb.ai/z-sharfeddine-technical-university-of-munich/lerobot/runs/lbhu7589 +INFO 2025-12-18 08:25:02 ot_train.py:184 Creating dataset +meta/episodes/chunk-000/file-000.parquet: 100%|█| 63.1k/63.1 +meta/episodes/chunk-000/file-001.parquet: 100%|█| 186k/186k +meta/episodes/chunk-000/file-002.parquet: 100%|█| 188k/188k +meta/episodes/chunk-000/file-003.parquet: 100%|█| 310k/310k +meta/episodes/chunk-000/file-004.parquet: 100%|█| 124k/124k +meta/episodes/chunk-000/file-005.parquet: 100%|█| 414k/414k +meta/episodes/chunk-000/file-006.parquet: 100%|█| 247k/247k +meta/episodes/chunk-000/file-007.parquet: 100%|█| 124k/124k +meta/episodes/chunk-000/file-008.parquet: 100%|█| 255k/255k +info.json: 4.49kB [00:00, 8.52MB/s] +stats.json: 21.0kB [00:00, 29.9MB/s] +meta/tasks.parquet: 100%|█| 2.30k/2.30k [00:00<00:00, 5.22kB +.gitattributes: 2.46kB [00:00, 14.0MB/s] +README.md: 4.98kB [00:00, 9.93MB/s] +data/chunk-000/file-000.parquet: 100%|█| 60.4k/60.4k [00:00< +data/chunk-000/file-001.parquet: 100%|█| 1.15M/1.15M [00:00< +data/chunk-000/file-002.parquet: 100%|█| 1.13M/1.13M [00:00< +data/chunk-000/file-003.parquet: 100%|█| 1.86M/1.86M [00:00< +data/chunk-000/file-004.parquet: 100%|█| 490k/490k [00:00<00 +data/chunk-000/file-005.parquet: 100%|█| 2.37M/2.37M [00:00< +data/chunk-000/file-006.parquet: 100%|█| 1.39M/1.39M [00:00< +data/chunk-000/file-007.parquet: 100%|█| 451k/451k [00:00<00 +data/chunk-000/file-008.parquet: 100%|█| 1.44M/1.44M [00:00< +videos/observation.images.context/chunk-(…): 100%|█| 17.2M/1 +videos/observation.images.context/chunk-(…): 100%|█| 207M/20 +videos/observation.images.context/chunk-(…): 100%|█| 129M/12 +videos/observation.images.context/chunk-(…): 100%|█| 201M/20 +videos/observation.images.context/chunk-(…): 100%|█| 107M/10 +videos/observation.images.context/chunk-(…): 100%|█| 205M/20 +videos/observation.images.context/chunk-(…): 100%|█| 208M/20 +videos/observation.images.context/chunk-(…): 100%|█| 83.2M/8 +videos/observation.images.context/chunk-(…): 100%|█| 129M/12 +videos/observation.images.context/chunk-(…): 100%|█| 210M/21 +videos/observation.images.context/chunk-(…): 100%|█| 208M/20 +videos/observation.images.context/chunk-(…): 100%|█| 190M/19 +videos/observation.images.context/chunk-(…): 100%|█| 209M/20 +videos/observation.images.context/chunk-(…): 100%|█| 151M/15 +videos/observation.images.context/chunk-(…): 100%|█| 115M/11 +videos/observation.images.context/chunk-(…): 100%|█| 210M/21 +videos/observation.images.context/chunk-(…): 100%|█| 164M/16 +videos/observation.images.left_wrist/chu(…): 100%|█| 14.7M/1 +videos/observation.images.left_wrist/chu(…): 100%|█| 199M/19 +videos/observation.images.left_wrist/chu(…): 100%|█| 102M/10 +videos/observation.images.left_wrist/chu(…): 100%|█| 198M/19 +videos/observation.images.left_wrist/chu(…): 100%|█| 75.2M/7 +videos/observation.images.left_wrist/chu(…): 100%|█| 202M/20 +videos/observation.images.left_wrist/chu(…): 100%|█| 203M/20 +videos/observation.images.left_wrist/chu(…): 100%|█| 10.0M/1 +videos/observation.images.left_wrist/chu(…): 100%|█| 105M/10 +videos/observation.images.left_wrist/chu(…): 100%|█| 204M/20 +videos/observation.images.left_wrist/chu(…): 100%|█| 209M/20 +videos/observation.images.left_wrist/chu(…): 100%|█| 92.8M/9 +videos/observation.images.left_wrist/chu(…): 100%|█| 202M/20 +videos/observation.images.left_wrist/chu(…): 100%|█| 96.9M/9 +videos/observation.images.left_wrist/chu(…): 100%|█| 98.4M/9 +videos/observation.images.left_wrist/chu(…): 100%|█| 204M/20 +videos/observation.images.left_wrist/chu(…): 100%|█| 117M/11 +videos/observation.images.right_wrist/ch(…): 100%|█| 12.9M/1 +videos/observation.images.right_wrist/ch(…): 100%|█| 208M/20 +videos/observation.images.right_wrist/ch(…): 100%|█| 56.0M/5 +videos/observation.images.right_wrist/ch(…): 100%|█| 208M/20 +videos/observation.images.right_wrist/ch(…): 100%|█| 29.4M/2 +videos/observation.images.right_wrist/ch(…): 100%|█| 209M/20 +videos/observation.images.right_wrist/ch(…): 100%|█| 177M/17 +videos/observation.images.right_wrist/ch(…): 100%|█| 102M/10 +videos/observation.images.right_wrist/ch(…): 100%|█| 209M/20 +videos/observation.images.right_wrist/ch(…): 100%|█| 208M/20 +videos/observation.images.right_wrist/ch(…): 100%|█| 58.2M/5 +videos/observation.images.right_wrist/ch(…): 100%|█| 202M/20 +videos/observation.images.right_wrist/ch(…): 100%|█| 69.6M/6 +videos/observation.images.right_wrist/ch(…): 100%|█| 86.2M/8 +videos/observation.images.right_wrist/ch(…): 100%|█| 206M/20 +videos/observation.images.right_wrist/ch(…): 100%|█| 78.3M/7 +INFO 2025-12-18 08:26:50 ot_train.py:203 Creating policy +The PI05 model is a direct port of the OpenPI implementation. +This implementation follows the original OpenPI structure for compatibility. +Original implementation: https://github.com/Physical-Intelligence/openpi +INFO 2025-12-18 08:27:35 ing_pi05.py:568 Enabled gradient checkpointing for PI05Pytorch model +Loading model from: lerobot/pi05_base +model.safetensors: 100%|█| 14.5G/14.5G [00:12<00:00, 1.19GB/ +✓ Loaded state dict from model.safetensors +WARNING 2025-12-18 08:27:48 ng_pi05.py:1074 Vision embedding key might need handling: paligemma_with_expert.paligemma.model.vision_tower.vision_model.embeddings.patch_embedding.bias +WARNING 2025-12-18 08:27:48 ng_pi05.py:1074 Vision embedding key might need handling: paligemma_with_expert.paligemma.model.vision_tower.vision_model.embeddings.patch_embedding.weight +Remapped: action_in_proj.bias -> model.action_in_proj.bias +Remapped: action_in_proj.weight -> model.action_in_proj.weight +Remapped: action_out_proj.bias -> model.action_out_proj.bias +Remapped: action_out_proj.weight -> model.action_out_proj.weight +Remapped: paligemma_with_expert.gemma_expert.lm_head.weight -> model.paligemma_with_expert.gemma_expert.lm_head.weight +Remapped: paligemma_with_expert.gemma_expert.model.layers.0.input_layernorm.dense.bias -> model.paligemma_with_expert.gemma_expert.model.layers.0.input_layernorm.dense.bias +Remapped: paligemma_with_expert.gemma_expert.model.layers.0.input_layernorm.dense.weight -> model.paligemma_with_expert.gemma_expert.model.layers.0.input_layernorm.dense.weight +Remapped: paligemma_with_expert.gemma_expert.model.layers.0.mlp.down_proj.weight -> model.paligemma_with_expert.gemma_expert.model.layers.0.mlp.down_proj.weight +Remapped: paligemma_with_expert.gemma_expert.model.layers.0.mlp.gate_proj.weight -> model.paligemma_with_expert.gemma_expert.model.layers.0.mlp.gate_proj.weight +Remapped: paligemma_with_expert.gemma_expert.model.layers.0.mlp.up_proj.weight -> model.paligemma_with_expert.gemma_expert.model.layers.0.mlp.up_proj.weight +Remapped 812 state dict keys +Warning: Could not remap state dict keys: Error(s) in loading state_dict for PI05Policy: + Missing key(s) in state_dict: "model.paligemma_with_expert.paligemma.model.language_model.embed_tokens.weight". +policy_preprocessor.json: 1.03kB [00:00, 1.98MB/s] +tokenizer_config.json: 100%|█| 40.0k/40.0k [00:00<00:00, 17. +tokenizer.model: 100%|█| 4.26M/4.26M [00:00<00:00, 18.2MB/s] +tokenizer.json: 100%|███| 17.5M/17.5M [00:00<00:00, 134MB/s] +added_tokens.json: 100%|██| 24.0/24.0 [00:00<00:00, 184kB/s] +special_tokens_map.json: 100%|█| 607/607 [00:00<00:00, 4.55M +policy_postprocessor.json: 100%|█| 450/450 [00:00<00:00, 2.9 +INFO 2025-12-18 08:27:52 ot_train.py:248 Creating optimizer and scheduler +INFO 2025-12-18 08:27:52 hedulers.py:105 Auto-scaling LR scheduler: num_training_steps (20000) < num_decay_steps (30000). Scaling warmup: 1000 → 666, decay: 30000 → 20000 (scale factor: 0.667) +INFO 2025-12-18 08:27:52 ot_train.py:260 Output dir: /checkpoints/pi05/pi_lego_cube_final_final_20251218_082454 +INFO 2025-12-18 08:27:52 ot_train.py:267 cfg.steps=20000 (20K) +INFO 2025-12-18 08:27:52 ot_train.py:268 dataset.num_frames=100660 (101K) +INFO 2025-12-18 08:27:52 ot_train.py:269 dataset.num_episodes=251 +INFO 2025-12-18 08:27:52 ot_train.py:272 Effective batch size: 16 x 1 = 16 +INFO 2025-12-18 08:27:52 ot_train.py:273 num_learnable_params=3616757520 (4B) +INFO 2025-12-18 08:27:52 ot_train.py:274 num_total_params=3616757520 (4B) +INFO 2025-12-18 08:27:52 ot_train.py:330 Start offline training on a fixed dataset +AUTOTUNE mm(15488x2048, 2048x16384) + mm 1.9583 ms 100.0% + triton_mm_9603 2.3996 ms 81.6% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_9602 2.5408 ms 77.1% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_9604 2.6307 ms 74.4% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=8 + triton_mm_9598 3.2644 ms 60.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=128, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=4 + triton_mm_9595 3.5547 ms 55.1% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_9597 3.7869 ms 51.7% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_9600 3.9684 ms 49.3% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=8 + triton_mm_9596 4.0225 ms 48.7% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=8 + triton_mm_9599 4.0616 ms 48.2% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 +SingleProcess AUTOTUNE benchmarking takes 1.5500 seconds and 71.0004 seconds precompiling for 20 choices +AUTOTUNE mm(4096x1152, 1152x4304) + mm 0.0852 ms 100.0% + triton_mm_100 0.0950 ms 89.7% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_101 0.1065 ms 80.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=8 + triton_mm_99 0.1112 ms 76.7% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_94 0.1176 ms 72.5% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_95 0.1334 ms 63.9% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=128, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=4 + triton_mm_92 0.1410 ms 60.5% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_93 0.1431 ms 59.6% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=8 + triton_mm_96 0.1454 ms 58.6% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_97 0.1460 ms 58.4% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=8 +SingleProcess AUTOTUNE benchmarking takes 0.6533 seconds and 0.0002 seconds precompiling for 20 choices +AUTOTUNE mm(4096x1152, 1152x1152) + mm 0.0263 ms 100.0% + triton_mm_23 0.0308 ms 85.5% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_24 0.0311 ms 84.8% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_18 0.0332 ms 79.2% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_25 0.0334 ms 78.9% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=8 + triton_mm_19 0.0406 ms 64.9% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=128, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=4 + triton_mm_16 0.0422 ms 62.4% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_17 0.0434 ms 60.7% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=8 + triton_mm_21 0.0435 ms 60.5% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=8 + triton_mm_20 0.0436 ms 60.4% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 +SingleProcess AUTOTUNE benchmarking takes 0.4289 seconds and 0.0002 seconds precompiling for 20 choices +AUTOTUNE mm(800x1024, 1024x4096) + mm 0.0191 ms 100.0% + triton_mm_9696 0.0200 ms 95.8% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_9697 0.0231 ms 82.7% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=8 + triton_mm_9690 0.0242 ms 79.2% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_9695 0.0250 ms 76.5% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_9688 0.0254 ms 75.2% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_9692 0.0284 ms 67.3% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_9689 0.0285 ms 67.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=8 + triton_mm_9691 0.0293 ms 65.3% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=128, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=4 + triton_mm_9686 0.0300 ms 63.9% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=64, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=8 +SingleProcess AUTOTUNE benchmarking takes 0.3866 seconds and 0.0002 seconds precompiling for 20 choices +E1218 08:33:31.613000 204 site-packages/torch/_inductor/select_algorithm.py:2100] [0/0] Runtime error during autotuning: +E1218 08:33:31.613000 204 site-packages/torch/_inductor/select_algorithm.py:2100] [0/0] No valid triton configs. OutOfResources: out of resource: shared memory, Required: 245760, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.. +E1218 08:33:31.613000 204 site-packages/torch/_inductor/select_algorithm.py:2100] [0/0] Ignoring this choice. +AUTOTUNE mm(800x1024, 1024x32) + triton_mm_15415 0.0106 ms 100.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=128, BLOCK_M=64, BLOCK_N=32, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=4 + triton_mm_15406 0.0121 ms 87.8% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=32, BLOCK_N=32, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=4 + triton_mm_15407 0.0124 ms 85.3% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=32, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=8 + triton_mm_15420 0.0128 ms 82.8% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=128, BLOCK_N=32, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=8 + mm 0.0129 ms 82.6% + triton_mm_15414 0.0131 ms 81.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=64, BLOCK_N=32, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_15411 0.0136 ms 78.3% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=64, BLOCK_N=32, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=8 + triton_mm_15413 0.0141 ms 75.3% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=32, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=8 + triton_mm_15417 0.0149 ms 71.1% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=32, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=8 + triton_mm_15405 0.0152 ms 70.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=128, BLOCK_M=32, BLOCK_N=32, EVEN_K=True, GROUP_M=8, num_stages=2, num_warps=4 +SingleProcess AUTOTUNE benchmarking takes 0.2921 seconds and 0.0002 seconds precompiling for 18 choices +AUTOTUNE mm(16x1024, 1024x1024) + triton_mm_9333 0.0099 ms 100.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=128, BLOCK_M=16, BLOCK_N=32, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=2 + triton_mm_9337 0.0111 ms 89.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=128, BLOCK_M=16, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=4 + triton_mm_9332 0.0119 ms 82.6% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=16, BLOCK_N=32, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=2 + triton_mm_9331 0.0126 ms 78.4% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=16, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=4 + triton_mm_9336 0.0134 ms 73.7% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=16, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_9341 0.0135 ms 72.8% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=128, BLOCK_M=16, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=4 + mm 0.0140 ms 70.2% + triton_mm_9343 0.0145 ms 68.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=16, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=4 + triton_mm_9330 0.0147 ms 67.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=128, BLOCK_M=16, BLOCK_N=32, EVEN_K=True, GROUP_M=8, num_stages=2, num_warps=2 + triton_mm_9345 0.0148 ms 66.7% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=16, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=8 +SingleProcess AUTOTUNE benchmarking takes 0.2895 seconds and 0.0002 seconds precompiling for 18 choices +AUTOTUNE mm(15488x2048, 2048x2048) + mm 0.2540 ms 100.0% + triton_mm_9565 0.2918 ms 87.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_9566 0.2977 ms 85.3% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=8 + triton_mm_9564 0.3206 ms 79.2% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_9560 0.3603 ms 70.5% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=128, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=4 + triton_mm_9559 0.3712 ms 68.4% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_9557 0.4233 ms 60.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_9558 0.4421 ms 57.4% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=8 + triton_mm_9562 0.4690 ms 54.2% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=8 + triton_mm_9561 0.4866 ms 52.2% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 +SingleProcess AUTOTUNE benchmarking takes 0.8247 seconds and 0.0002 seconds precompiling for 20 choices +AUTOTUNE mm(800x2048, 2048x1024) + mm 0.0154 ms 100.0% + triton_mm_9636 0.0154 ms 100.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=128, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=4 + triton_mm_9642 0.0180 ms 85.3% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=8 + triton_mm_9632 0.0195 ms 79.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=128, BLOCK_M=64, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=4 + triton_mm_9631 0.0210 ms 73.2% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=64, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=8 + triton_mm_9635 0.0212 ms 72.7% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_9641 0.0232 ms 66.3% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_9634 0.0250 ms 61.5% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=8 + triton_mm_9638 0.0250 ms 61.5% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=8 + triton_mm_9628 0.0277 ms 55.5% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=128, BLOCK_M=64, BLOCK_N=32, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=4 +SingleProcess AUTOTUNE benchmarking takes 0.3601 seconds and 0.0002 seconds precompiling for 20 choices +AUTOTUNE convolution(16x3x224x224, 1152x3x14x14) + convolution 0.2652 ms 100.0% + triton_convolution2d_3 0.4559 ms 58.2% ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=128, BLOCK_N=128, GROUPS=1, KERNEL_H=14, KERNEL_W=14, PADDING_H=0, PADDING_W=0, STRIDE_H=14, STRIDE_W=14, UNROLL=False, num_stages=2, num_warps=8 + triton_convolution2d_6 0.5171 ms 51.3% ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=256, BLOCK_N=64, GROUPS=1, KERNEL_H=14, KERNEL_W=14, PADDING_H=0, PADDING_W=0, STRIDE_H=14, STRIDE_W=14, UNROLL=False, num_stages=2, num_warps=8 + triton_convolution2d_1 0.5485 ms 48.3% ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=256, BLOCK_N=64, GROUPS=1, KERNEL_H=14, KERNEL_W=14, PADDING_H=0, PADDING_W=0, STRIDE_H=14, STRIDE_W=14, UNROLL=False, num_stages=2, num_warps=4 + triton_convolution2d_0 0.6501 ms 40.8% ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=64, BLOCK_N=256, GROUPS=1, KERNEL_H=14, KERNEL_W=14, PADDING_H=0, PADDING_W=0, STRIDE_H=14, STRIDE_W=14, UNROLL=False, num_stages=2, num_warps=4 + triton_convolution2d_4 0.6596 ms 40.2% ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=64, BLOCK_N=64, GROUPS=1, KERNEL_H=14, KERNEL_W=14, PADDING_H=0, PADDING_W=0, STRIDE_H=14, STRIDE_W=14, UNROLL=False, num_stages=2, num_warps=4 + triton_convolution2d_5 0.8541 ms 31.0% ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=64, BLOCK_N=256, GROUPS=1, KERNEL_H=14, KERNEL_W=14, PADDING_H=0, PADDING_W=0, STRIDE_H=14, STRIDE_W=14, UNROLL=False, num_stages=2, num_warps=8 + triton_convolution2d_2 5.3309 ms 5.0% ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=1024, BLOCK_N=16, GROUPS=1, KERNEL_H=14, KERNEL_W=14, PADDING_H=0, PADDING_W=0, STRIDE_H=14, STRIDE_W=14, UNROLL=False, num_stages=1, num_warps=8 +SingleProcess AUTOTUNE benchmarking takes 0.5129 seconds and 0.0002 seconds precompiling for 8 choices +AUTOTUNE mm(4096x4304, 4304x1152) + mm 0.0841 ms 100.0% + triton_mm_119 0.1139 ms 73.8% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=128, BLOCK_N=128, EVEN_K=False, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_120 0.1180 ms 71.3% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=128, BLOCK_N=128, EVEN_K=False, GROUP_M=8, num_stages=5, num_warps=8 + triton_mm_118 0.1194 ms 70.4% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=128, EVEN_K=False, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_113 0.1483 ms 56.7% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=64, BLOCK_N=128, EVEN_K=False, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_114 0.1519 ms 55.4% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=128, BLOCK_M=64, BLOCK_N=128, EVEN_K=False, GROUP_M=8, num_stages=4, num_warps=4 + triton_mm_116 0.1696 ms 49.6% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=64, EVEN_K=False, GROUP_M=8, num_stages=4, num_warps=8 + triton_mm_112 0.1721 ms 48.9% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=128, EVEN_K=False, GROUP_M=8, num_stages=4, num_warps=8 + triton_mm_115 0.1811 ms 46.5% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=64, EVEN_K=False, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_111 0.1813 ms 46.4% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=128, EVEN_K=False, GROUP_M=8, num_stages=3, num_warps=4 +SingleProcess AUTOTUNE benchmarking takes 0.7495 seconds and 0.0003 seconds precompiling for 20 choices +AUTOTUNE addmm(4096x2048, 4096x1152, 1152x2048) + bias_addmm 0.0430 ms 100.0% + triton_mm_3102 0.0519 ms 82.9% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_3096 0.0579 ms 74.3% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_3103 0.0585 ms 73.6% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=8 + addmm 0.0635 ms 67.8% + triton_mm_3101 0.0641 ms 67.1% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_3097 0.0704 ms 61.1% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=128, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=4 + triton_mm_3095 0.0730 ms 59.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=8 + triton_mm_3099 0.0742 ms 58.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=8 + triton_mm_3098 0.0794 ms 54.2% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 +SingleProcess AUTOTUNE benchmarking takes 0.5443 seconds and 0.0003 seconds precompiling for 21 choices +AUTOTUNE mm(800x32, 32x1024) + triton_mm_9320 0.0064 ms 100.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=4 + triton_mm_9321 0.0064 ms 100.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_9315 0.0064 ms 99.5% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=32, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=8 + triton_mm_9318 0.0064 ms 99.5% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=2, num_warps=4 + triton_mm_9319 0.0064 ms 99.5% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=8 + triton_mm_9323 0.0064 ms 99.5% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=4 + triton_mm_9316 0.0064 ms 99.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=32, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=4 + triton_mm_9322 0.0066 ms 96.1% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=8 + triton_mm_9324 0.0067 ms 95.7% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_9314 0.0067 ms 94.8% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=32, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=8 +SingleProcess AUTOTUNE benchmarking takes 0.2626 seconds and 0.0003 seconds precompiling for 18 choices +AUTOTUNE mm(15488x2048, 2048x256) + mm 0.0553 ms 100.0% + triton_mm_9393 0.0639 ms 86.5% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_9400 0.0683 ms 80.9% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=8 + triton_mm_9399 0.0693 ms 79.8% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_9398 0.0700 ms 79.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_9394 0.0707 ms 78.3% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=128, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=4 + triton_mm_9389 0.0809 ms 68.3% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=64, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=8 + triton_mm_9390 0.0812 ms 68.1% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=128, BLOCK_M=64, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=4 + triton_mm_9391 0.0820 ms 67.4% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_9392 0.0829 ms 66.7% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=8 +SingleProcess AUTOTUNE benchmarking takes 0.5153 seconds and 0.0003 seconds precompiling for 20 choices +AUTOTUNE mm(16x1024, 1024x3072) + triton_mm_9428 0.0149 ms 100.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=128, BLOCK_M=16, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=4 + triton_mm_9424 0.0150 ms 99.8% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=128, BLOCK_M=16, BLOCK_N=32, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=2 + triton_mm_9432 0.0160 ms 93.6% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=128, BLOCK_M=16, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=4 + triton_mm_9436 0.0164 ms 91.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=16, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=8 + triton_mm_9422 0.0166 ms 90.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=16, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=4 + triton_mm_9423 0.0167 ms 89.6% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=16, BLOCK_N=32, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=2 + triton_mm_9427 0.0172 ms 87.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=16, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_9434 0.0180 ms 82.9% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=16, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=4 + triton_mm_9421 0.0180 ms 82.8% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=128, BLOCK_M=16, BLOCK_N=32, EVEN_K=True, GROUP_M=8, num_stages=2, num_warps=2 + triton_mm_9431 0.0183 ms 81.8% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=16, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 +SingleProcess AUTOTUNE benchmarking takes 0.2947 seconds and 0.0002 seconds precompiling for 18 choices +AUTOTUNE mm(800x1024, 1024x2048) + mm 0.0129 ms 100.0% + triton_mm_9455 0.0134 ms 96.2% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=8 + triton_mm_9448 0.0147 ms 87.4% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_9454 0.0149 ms 86.5% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_9444 0.0167 ms 77.2% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=64, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=8 + triton_mm_9449 0.0170 ms 75.8% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=128, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=4 + triton_mm_9447 0.0173 ms 74.4% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=8 + triton_mm_9451 0.0173 ms 74.3% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=8 + triton_mm_9446 0.0204 ms 63.2% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_9450 0.0204 ms 63.2% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 +SingleProcess AUTOTUNE benchmarking takes 0.3436 seconds and 0.0003 seconds precompiling for 20 choices +AUTOTUNE mm(800x1024, 1024x256) + triton_mm_9460 0.0084 ms 100.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=128, BLOCK_M=64, BLOCK_N=32, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=4 + mm 0.0085 ms 98.5% + triton_mm_9464 0.0088 ms 96.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=128, BLOCK_M=64, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=4 + triton_mm_9468 0.0097 ms 86.8% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=128, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=4 + triton_mm_9459 0.0115 ms 73.3% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=32, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=8 + triton_mm_9463 0.0116 ms 72.3% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=64, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=8 + triton_mm_9458 0.0118 ms 71.1% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=32, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=8 + triton_mm_9474 0.0119 ms 70.5% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=8 + triton_mm_9467 0.0120 ms 70.3% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_9457 0.0126 ms 66.9% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=128, BLOCK_M=32, BLOCK_N=32, EVEN_K=True, GROUP_M=8, num_stages=2, num_warps=4 +SingleProcess AUTOTUNE benchmarking takes 0.3159 seconds and 0.0003 seconds precompiling for 20 choices +AUTOTUNE bmm(16x128x1, 16x1x1018) + triton_bmm_9496 0.0089 ms 100.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=32, BLOCK_N=64, EVEN_K=False, GROUP_M=8, num_stages=5, num_warps=8 + triton_bmm_9494 0.0092 ms 96.9% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=32, BLOCK_N=32, EVEN_K=False, GROUP_M=8, num_stages=1, num_warps=2 + triton_bmm_9495 0.0094 ms 94.6% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=32, BLOCK_N=32, EVEN_K=False, GROUP_M=8, num_stages=2, num_warps=4 + triton_bmm_9500 0.0098 ms 90.6% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=64, BLOCK_N=64, EVEN_K=False, GROUP_M=8, num_stages=3, num_warps=8 + triton_bmm_9497 0.0101 ms 87.7% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=64, BLOCK_N=32, EVEN_K=False, GROUP_M=8, num_stages=5, num_warps=8 + triton_bmm_9498 0.0101 ms 87.7% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=64, BLOCK_N=32, EVEN_K=False, GROUP_M=8, num_stages=5, num_warps=4 + triton_bmm_9503 0.0104 ms 85.5% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=64, BLOCK_N=128, EVEN_K=False, GROUP_M=8, num_stages=4, num_warps=8 + triton_bmm_9502 0.0105 ms 84.8% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=64, BLOCK_N=128, EVEN_K=False, GROUP_M=8, num_stages=3, num_warps=4 + triton_bmm_9501 0.0106 ms 84.2% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=64, BLOCK_N=64, EVEN_K=False, GROUP_M=8, num_stages=5, num_warps=4 + triton_bmm_9499 0.0106 ms 83.7% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=64, BLOCK_N=64, EVEN_K=False, GROUP_M=8, num_stages=2, num_warps=4 +SingleProcess AUTOTUNE benchmarking takes 0.2570 seconds and 0.0002 seconds precompiling for 17 choices +AUTOTUNE bmm(128x1024x256, 128x256x1024) + bmm 0.2363 ms 100.0% + triton_bmm_9526 0.2839 ms 83.2% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_bmm_9527 0.3044 ms 77.6% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_bmm_9519 0.3078 ms 76.8% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_bmm_9523 0.3116 ms 75.8% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_bmm_9520 0.3177 ms 74.4% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=8 + triton_bmm_9521 0.3229 ms 73.2% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_bmm_9524 0.3259 ms 72.5% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=8 + triton_bmm_9528 0.3559 ms 66.4% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=8 + triton_bmm_9516 0.3680 ms 64.2% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=2, num_warps=4 +SingleProcess AUTOTUNE benchmarking takes 0.8243 seconds and 0.0003 seconds precompiling for 20 choices +AUTOTUNE bmm(128x1024x1024, 128x1024x256) + bmm 0.2307 ms 100.0% + triton_bmm_9547 0.2420 ms 95.3% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=8 + triton_bmm_9546 0.2448 ms 94.2% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_bmm_9540 0.2452 ms 94.1% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_bmm_9541 0.2771 ms 83.2% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=128, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=4 + triton_bmm_9538 0.2792 ms 82.6% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_bmm_9539 0.2877 ms 80.2% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=8 + triton_bmm_9543 0.3005 ms 76.8% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=8 + triton_bmm_9536 0.3024 ms 76.3% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=64, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=8 + triton_bmm_9545 0.3174 ms 72.7% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 +SingleProcess AUTOTUNE benchmarking takes 0.8063 seconds and 0.0003 seconds precompiling for 20 choices +AUTOTUNE mm(15488x16384, 16384x2048) + mm 1.9698 ms 100.0% + triton_mm_9623 2.2660 ms 86.9% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=8 + triton_mm_9622 2.2858 ms 86.2% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_9621 2.4590 ms 80.1% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_9617 2.9764 ms 66.2% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=128, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=4 + triton_mm_9614 3.4176 ms 57.6% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_9616 3.7735 ms 52.2% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_9615 4.0802 ms 48.3% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=8 + triton_mm_9613 4.1278 ms 47.7% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=128, BLOCK_M=64, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=4 + triton_mm_9620 4.1928 ms 47.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=2, num_warps=8 +SingleProcess AUTOTUNE benchmarking takes 1.4342 seconds and 0.0004 seconds precompiling for 20 choices +AUTOTUNE mm(800x4096, 4096x1024) + triton_mm_9710 0.0257 ms 100.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=128, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=4 + mm 0.0290 ms 88.7% + triton_mm_9716 0.0312 ms 82.3% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=8 + triton_mm_9706 0.0316 ms 81.3% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=128, BLOCK_M=64, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=4 + triton_mm_9709 0.0382 ms 67.2% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_9705 0.0390 ms 65.9% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=64, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=8 + triton_mm_9715 0.0411 ms 62.5% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_9712 0.0475 ms 54.1% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=8 + triton_mm_9708 0.0477 ms 53.9% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=8 + triton_mm_9702 0.0486 ms 52.9% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=128, BLOCK_M=64, BLOCK_N=32, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=4 +SingleProcess AUTOTUNE benchmarking takes 0.4248 seconds and 0.0003 seconds precompiling for 20 choices +AUTOTUNE mm(15488x2048, 2048x16384) + mm 1.9785 ms 100.0% + triton_mm_16820 2.4310 ms 81.4% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_16822 2.5890 ms 76.4% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=8 + triton_mm_16821 2.6041 ms 76.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_16816 3.4506 ms 57.3% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=128, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=4 + triton_mm_16813 3.6484 ms 54.2% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_16819 3.9376 ms 50.2% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=2, num_warps=8 + triton_mm_16818 3.9510 ms 50.1% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=8 + triton_mm_16815 3.9605 ms 50.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_16814 4.0792 ms 48.5% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=8 +SingleProcess AUTOTUNE benchmarking takes 1.3974 seconds and 0.0003 seconds precompiling for 20 choices +AUTOTUNE mm(4096x1152, 1152x4304) + mm 0.0857 ms 100.0% + triton_mm_36401 0.1143 ms 75.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=8 + triton_mm_36400 0.1176 ms 72.9% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_36399 0.1206 ms 71.1% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_36394 0.1411 ms 60.8% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_36392 0.1440 ms 59.5% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_36397 0.1445 ms 59.3% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=8 + triton_mm_36396 0.1458 ms 58.8% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_36393 0.1476 ms 58.1% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=8 + triton_mm_36398 0.1607 ms 53.3% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=2, num_warps=8 +SingleProcess AUTOTUNE benchmarking takes 0.6991 seconds and 0.0003 seconds precompiling for 20 choices +AUTOTUNE mm(800x1024, 1024x4096) + mm 0.0190 ms 100.0% + triton_mm_15802 0.0222 ms 85.4% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_15794 0.0224 ms 84.8% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_15803 0.0225 ms 84.2% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=8 + triton_mm_15795 0.0244 ms 77.7% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=8 + triton_mm_15796 0.0252 ms 75.2% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_15798 0.0257 ms 73.8% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_15801 0.0259 ms 73.4% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_15799 0.0282 ms 67.2% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=8 + triton_mm_15792 0.0299 ms 63.6% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=64, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=8 +SingleProcess AUTOTUNE benchmarking takes 0.3716 seconds and 0.0003 seconds precompiling for 20 choices +AUTOTUNE mm(16x1024, 1024x1024) + triton_mm_33217 0.0106 ms 100.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=128, BLOCK_M=16, BLOCK_N=32, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=2 + triton_mm_33221 0.0119 ms 89.5% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=128, BLOCK_M=16, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=4 + triton_mm_33216 0.0130 ms 81.6% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=16, BLOCK_N=32, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=2 + triton_mm_33215 0.0136 ms 77.9% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=16, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=4 + triton_mm_33220 0.0138 ms 77.2% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=16, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + mm 0.0145 ms 73.5% + triton_mm_33229 0.0148 ms 71.7% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=16, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=8 + triton_mm_33214 0.0150 ms 70.9% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=128, BLOCK_M=16, BLOCK_N=32, EVEN_K=True, GROUP_M=8, num_stages=2, num_warps=2 + triton_mm_33227 0.0153 ms 69.3% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=16, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=4 + triton_mm_33224 0.0160 ms 66.5% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=16, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 +SingleProcess AUTOTUNE benchmarking takes 0.2959 seconds and 0.0003 seconds precompiling for 18 choices +AUTOTUNE mm(800x32, 32x1024) + triton_mm_15423 0.0067 ms 100.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=32, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=8 + triton_mm_15424 0.0068 ms 97.2% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=32, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=8 + triton_mm_15422 0.0071 ms 94.1% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=32, BLOCK_N=32, EVEN_K=True, GROUP_M=8, num_stages=2, num_warps=4 + triton_mm_15421 0.0071 ms 93.7% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=32, BLOCK_N=32, EVEN_K=True, GROUP_M=8, num_stages=1, num_warps=2 + triton_mm_15427 0.0071 ms 93.7% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=2, num_warps=4 + triton_mm_15428 0.0072 ms 92.9% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=8 + triton_mm_15434 0.0073 ms 91.6% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=8 + triton_mm_15429 0.0074 ms 90.4% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=4 + triton_mm_15425 0.0074 ms 90.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=32, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=4 + triton_mm_15433 0.0074 ms 89.7% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 +SingleProcess AUTOTUNE benchmarking takes 0.2644 seconds and 0.0004 seconds precompiling for 18 choices +E1218 08:47:16.781000 204 site-packages/torch/_inductor/select_algorithm.py:2100] [0/0] Runtime error during autotuning: +E1218 08:47:16.781000 204 site-packages/torch/_inductor/select_algorithm.py:2100] [0/0] No valid triton configs. OutOfResources: out of resource: shared memory, Required: 245760, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.. +E1218 08:47:16.781000 204 site-packages/torch/_inductor/select_algorithm.py:2100] [0/0] Ignoring this choice. +AUTOTUNE mm(32x800, 800x1024) + triton_mm_15441 0.0120 ms 100.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=32, BLOCK_N=32, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=4 + triton_mm_15446 0.0124 ms 96.6% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=128, BLOCK_M=32, BLOCK_N=64, EVEN_K=False, GROUP_M=8, num_stages=5, num_warps=4 + mm 0.0132 ms 90.8% + triton_mm_15442 0.0135 ms 88.9% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=128, BLOCK_M=32, BLOCK_N=32, EVEN_K=False, GROUP_M=8, num_stages=5, num_warps=4 + triton_mm_15445 0.0140 ms 86.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=32, BLOCK_N=64, EVEN_K=False, GROUP_M=8, num_stages=3, num_warps=8 + triton_mm_15452 0.0147 ms 81.5% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=32, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=8 + triton_mm_15440 0.0148 ms 81.2% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=32, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=8 + triton_mm_15449 0.0150 ms 80.1% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=32, BLOCK_N=128, EVEN_K=False, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_15451 0.0162 ms 74.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=32, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_15448 0.0165 ms 72.8% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=32, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=8 +SingleProcess AUTOTUNE benchmarking takes 0.2892 seconds and 0.0002 seconds precompiling for 18 choices +AUTOTUNE mm(3072x16, 16x1024) + triton_mm_15473 0.0105 ms 100.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=32, BLOCK_N=32, EVEN_K=True, GROUP_M=8, num_stages=2, num_warps=4 + triton_mm_15474 0.0107 ms 98.8% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=32, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=8 + triton_mm_15472 0.0111 ms 95.1% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=32, BLOCK_N=32, EVEN_K=True, GROUP_M=8, num_stages=1, num_warps=2 + triton_mm_15476 0.0114 ms 92.7% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=64, BLOCK_N=32, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=4 + triton_mm_15478 0.0118 ms 88.9% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=64, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=8 + mm 0.0124 ms 84.6% + triton_mm_15479 0.0125 ms 84.4% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=64, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=4 + triton_mm_15477 0.0126 ms 83.7% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=64, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=2, num_warps=4 + triton_mm_15475 0.0127 ms 82.7% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=64, BLOCK_N=32, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=8 + triton_mm_15487 0.0132 ms 79.5% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=8 +SingleProcess AUTOTUNE benchmarking takes 0.2633 seconds and 0.0002 seconds precompiling for 17 choices +AUTOTUNE mm(800x4096, 4096x1024) + mm 0.0236 ms 100.0% + triton_mm_15835 0.0243 ms 97.1% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=128, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=4 + triton_mm_15841 0.0302 ms 78.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=8 + triton_mm_15831 0.0315 ms 74.9% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=128, BLOCK_M=64, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=4 + triton_mm_15834 0.0363 ms 65.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_15830 0.0379 ms 62.3% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=64, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=8 + triton_mm_15840 0.0395 ms 59.7% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_15833 0.0433 ms 54.5% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=8 + triton_mm_15837 0.0441 ms 53.5% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=8 + triton_mm_15827 0.0485 ms 48.6% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=128, BLOCK_M=64, BLOCK_N=32, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=4 +SingleProcess AUTOTUNE benchmarking takes 0.4187 seconds and 0.0003 seconds precompiling for 20 choices +AUTOTUNE mm(1024x800, 800x4096) + mm 0.0202 ms 100.0% + triton_mm_15775 0.0203 ms 99.8% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_15779 0.0223 ms 90.7% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_15782 0.0230 ms 88.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_15776 0.0237 ms 85.2% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=8 + triton_mm_15777 0.0253 ms 80.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=64, BLOCK_N=128, EVEN_K=False, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_15780 0.0257 ms 78.7% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=8 + triton_mm_15783 0.0269 ms 75.2% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=128, BLOCK_N=128, EVEN_K=False, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_15784 0.0296 ms 68.3% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=128, BLOCK_N=128, EVEN_K=False, GROUP_M=8, num_stages=5, num_warps=8 + triton_mm_15773 0.0326 ms 62.1% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=64, BLOCK_N=64, EVEN_K=False, GROUP_M=8, num_stages=3, num_warps=8 +SingleProcess AUTOTUNE benchmarking takes 0.3818 seconds and 0.0003 seconds precompiling for 20 choices +AUTOTUNE mm(4096x800, 800x1024) + mm 0.0197 ms 100.0% + triton_mm_15813 0.0201 ms 98.1% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_15817 0.0224 ms 87.7% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_15820 0.0230 ms 85.4% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_15814 0.0238 ms 82.8% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=8 + triton_mm_15818 0.0245 ms 80.3% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=8 + triton_mm_15815 0.0247 ms 79.6% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=64, BLOCK_N=128, EVEN_K=False, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_15821 0.0250 ms 78.6% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=128, BLOCK_N=128, EVEN_K=False, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_15822 0.0289 ms 68.1% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=128, BLOCK_N=128, EVEN_K=False, GROUP_M=8, num_stages=5, num_warps=8 + triton_mm_15811 0.0322 ms 61.1% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=64, BLOCK_N=64, EVEN_K=False, GROUP_M=8, num_stages=3, num_warps=8 +SingleProcess AUTOTUNE benchmarking takes 0.3783 seconds and 0.0003 seconds precompiling for 20 choices +AUTOTUNE mm_plus_mm(16x3072, 3072x1024, 16x3072, 3072x1024) + _mm_plus_mm 0.0407 ms 100.0% + triton_mm_plus_mm_15900 0.0685 ms 59.4% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=32, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=8 + triton_mm_plus_mm_15899 0.0816 ms 49.9% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=32, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=8 + triton_mm_plus_mm_15903 0.0847 ms 48.1% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=128, BLOCK_M=32, BLOCK_N=32, EVEN_K=True, GROUP_M=8, num_stages=1, num_warps=8 + triton_mm_plus_mm_15902 0.1699 ms 24.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=64, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=1, num_warps=8 + triton_mm_plus_mm_15897 0.2009 ms 20.3% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=8 + triton_mm_plus_mm_15898 0.2394 ms 17.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=16 + triton_mm_plus_mm_15905 0.2539 ms 16.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=32, BLOCK_N=32, EVEN_K=True, GROUP_M=8, num_stages=1, num_warps=2 + triton_mm_plus_mm_15896 0.2666 ms 15.3% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=2, num_warps=4 + triton_mm_plus_mm_15904 0.3812 ms 10.7% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=64, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=2, num_warps=4 +SingleProcess AUTOTUNE benchmarking takes 0.3616 seconds and 0.0002 seconds precompiling for 11 choices +AUTOTUNE mm(1024x800, 800x2048) + mm 0.0131 ms 100.0% + triton_mm_15920 0.0150 ms 87.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=8 + triton_mm_15916 0.0156 ms 84.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=8 + triton_mm_15917 0.0162 ms 80.8% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=64, BLOCK_N=128, EVEN_K=False, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_15923 0.0165 ms 79.2% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=128, BLOCK_N=128, EVEN_K=False, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_15915 0.0169 ms 77.4% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_15913 0.0189 ms 69.2% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=64, BLOCK_N=64, EVEN_K=False, GROUP_M=8, num_stages=3, num_warps=8 + triton_mm_15919 0.0189 ms 68.9% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_15922 0.0203 ms 64.5% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_15924 0.0204 ms 63.8% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=128, BLOCK_N=128, EVEN_K=False, GROUP_M=8, num_stages=5, num_warps=8 +SingleProcess AUTOTUNE benchmarking takes 0.3446 seconds and 0.0003 seconds precompiling for 20 choices +AUTOTUNE mm(800x1024, 1024x2048) + mm 0.0130 ms 100.0% + triton_mm_15943 0.0134 ms 96.7% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=8 + triton_mm_15936 0.0148 ms 88.1% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_15935 0.0163 ms 79.9% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=8 + triton_mm_15932 0.0164 ms 79.3% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=64, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=8 + triton_mm_15939 0.0164 ms 79.3% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=8 + triton_mm_15937 0.0184 ms 70.7% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=128, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=4 + triton_mm_15934 0.0190 ms 68.2% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_15942 0.0194 ms 67.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_15938 0.0222 ms 58.4% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 +SingleProcess AUTOTUNE benchmarking takes 0.3417 seconds and 0.0003 seconds precompiling for 20 choices +AUTOTUNE bmm(128x1018x256, 128x256x1018) + triton_bmm_15979 0.3150 ms 100.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_bmm_15972 0.3332 ms 94.5% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_bmm_15980 0.3402 ms 92.6% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_bmm_15976 0.3486 ms 90.4% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_bmm_15974 0.3530 ms 89.2% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_bmm_15981 0.3946 ms 79.8% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=8 + triton_bmm_15973 0.3948 ms 79.8% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=8 + triton_bmm_15977 0.3964 ms 79.5% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=8 + triton_bmm_15969 0.4081 ms 77.2% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=2, num_warps=4 + triton_bmm_15978 0.4353 ms 72.4% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=2, num_warps=8 +SingleProcess AUTOTUNE benchmarking takes 0.8517 seconds and 0.0003 seconds precompiling for 20 choices +AUTOTUNE bmm(128x1018x1018, 128x1018x256) + triton_bmm_15960 0.2943 ms 100.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=128, EVEN_K=False, GROUP_M=8, num_stages=3, num_warps=4 + triton_bmm_15953 0.3104 ms 94.8% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=128, EVEN_K=False, GROUP_M=8, num_stages=3, num_warps=4 + triton_bmm_15962 0.3377 ms 87.1% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=128, BLOCK_N=128, EVEN_K=False, GROUP_M=8, num_stages=5, num_warps=8 + triton_bmm_15955 0.3417 ms 86.1% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=64, BLOCK_N=128, EVEN_K=False, GROUP_M=8, num_stages=3, num_warps=4 + triton_bmm_15954 0.3576 ms 82.3% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=128, EVEN_K=False, GROUP_M=8, num_stages=4, num_warps=8 + bmm 0.3697 ms 79.6% + triton_bmm_15957 0.3724 ms 79.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=64, EVEN_K=False, GROUP_M=8, num_stages=3, num_warps=4 + triton_bmm_15959 0.3840 ms 76.6% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=128, EVEN_K=False, GROUP_M=8, num_stages=2, num_warps=8 + triton_bmm_15958 0.4029 ms 73.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=64, EVEN_K=False, GROUP_M=8, num_stages=4, num_warps=8 + triton_bmm_15961 0.4452 ms 66.1% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=128, BLOCK_N=128, EVEN_K=False, GROUP_M=8, num_stages=3, num_warps=4 +SingleProcess AUTOTUNE benchmarking takes 0.8175 seconds and 0.0003 seconds precompiling for 20 choices +AUTOTUNE bmm(128x256x1018, 128x1018x1018) + triton_bmm_15998 0.3033 ms 100.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=128, EVEN_K=False, GROUP_M=8, num_stages=3, num_warps=4 + triton_bmm_15995 0.3285 ms 92.3% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=64, EVEN_K=False, GROUP_M=8, num_stages=3, num_warps=4 + triton_bmm_15996 0.3516 ms 86.2% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=64, EVEN_K=False, GROUP_M=8, num_stages=4, num_warps=8 + triton_bmm_16000 0.3608 ms 84.1% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=128, BLOCK_N=128, EVEN_K=False, GROUP_M=8, num_stages=5, num_warps=8 + triton_bmm_15991 0.3620 ms 83.8% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=128, EVEN_K=False, GROUP_M=8, num_stages=3, num_warps=4 + bmm 0.3880 ms 78.2% + triton_bmm_15997 0.3900 ms 77.8% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=128, EVEN_K=False, GROUP_M=8, num_stages=2, num_warps=8 + triton_bmm_15993 0.4359 ms 69.6% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=64, BLOCK_N=128, EVEN_K=False, GROUP_M=8, num_stages=3, num_warps=4 + triton_bmm_15992 0.4381 ms 69.2% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=128, EVEN_K=False, GROUP_M=8, num_stages=4, num_warps=8 + triton_bmm_15999 0.4714 ms 64.3% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=128, BLOCK_N=128, EVEN_K=False, GROUP_M=8, num_stages=3, num_warps=4 +SingleProcess AUTOTUNE benchmarking takes 0.8337 seconds and 0.0003 seconds precompiling for 20 choices +AUTOTUNE bmm(128x1018x1018, 128x1018x256) + bmm 0.4218 ms 100.0% + triton_bmm_16017 0.4461 ms 94.5% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=128, EVEN_K=False, GROUP_M=8, num_stages=3, num_warps=4 + triton_bmm_16010 0.4762 ms 88.6% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=128, EVEN_K=False, GROUP_M=8, num_stages=3, num_warps=4 + triton_bmm_16014 0.5030 ms 83.9% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=64, EVEN_K=False, GROUP_M=8, num_stages=3, num_warps=4 + triton_bmm_16011 0.5188 ms 81.3% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=128, EVEN_K=False, GROUP_M=8, num_stages=4, num_warps=8 + triton_bmm_16019 0.5203 ms 81.1% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=128, BLOCK_N=128, EVEN_K=False, GROUP_M=8, num_stages=5, num_warps=8 + triton_bmm_16012 0.5317 ms 79.3% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=64, BLOCK_N=128, EVEN_K=False, GROUP_M=8, num_stages=3, num_warps=4 + triton_bmm_16015 0.5541 ms 76.1% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=64, EVEN_K=False, GROUP_M=8, num_stages=4, num_warps=8 + triton_bmm_16008 0.6125 ms 68.9% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=64, BLOCK_N=64, EVEN_K=False, GROUP_M=8, num_stages=3, num_warps=8 + triton_bmm_16007 0.6288 ms 67.1% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=64, EVEN_K=False, GROUP_M=8, num_stages=2, num_warps=4 +SingleProcess AUTOTUNE benchmarking takes 0.8792 seconds and 0.0003 seconds precompiling for 20 choices +AUTOTUNE mm(256x800, 800x1024) + mm 0.0081 ms 100.0% + triton_mm_16028 0.0088 ms 91.7% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=128, BLOCK_M=64, BLOCK_N=64, EVEN_K=False, GROUP_M=8, num_stages=5, num_warps=4 + triton_mm_16032 0.0099 ms 81.9% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=128, BLOCK_M=64, BLOCK_N=128, EVEN_K=False, GROUP_M=8, num_stages=4, num_warps=4 + triton_mm_16027 0.0110 ms 73.3% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=64, BLOCK_N=64, EVEN_K=False, GROUP_M=8, num_stages=3, num_warps=8 + triton_mm_16024 0.0116 ms 69.7% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=128, BLOCK_M=64, BLOCK_N=32, EVEN_K=False, GROUP_M=8, num_stages=5, num_warps=4 + triton_mm_16031 0.0118 ms 68.6% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=64, BLOCK_N=128, EVEN_K=False, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_16038 0.0119 ms 68.2% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=128, BLOCK_N=128, EVEN_K=False, GROUP_M=8, num_stages=5, num_warps=8 + triton_mm_16034 0.0121 ms 66.8% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=8 + triton_mm_16030 0.0122 ms 66.6% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=8 + triton_mm_16023 0.0122 ms 66.2% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=32, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=8 +SingleProcess AUTOTUNE benchmarking takes 0.3289 seconds and 0.0003 seconds precompiling for 20 choices +AUTOTUNE mm(800x256, 256x1024) + triton_mm_16050 0.0072 ms 100.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_16051 0.0074 ms 97.4% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=128, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=4 + triton_mm_16049 0.0076 ms 94.9% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=8 + triton_mm_16046 0.0076 ms 94.5% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=64, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=8 + triton_mm_16053 0.0076 ms 94.5% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=8 + mm 0.0078 ms 92.6% + triton_mm_16048 0.0082 ms 87.9% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_16056 0.0082 ms 87.9% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_16057 0.0084 ms 85.2% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=8 + triton_mm_16047 0.0091 ms 79.2% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=128, BLOCK_M=64, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=4 +SingleProcess AUTOTUNE benchmarking takes 0.3003 seconds and 0.0003 seconds precompiling for 20 choices +AUTOTUNE mm(2048x800, 800x1024) + mm 0.0129 ms 100.0% + triton_mm_16106 0.0150 ms 86.1% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=8 + triton_mm_16110 0.0150 ms 85.9% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=8 + triton_mm_16107 0.0158 ms 81.6% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=64, BLOCK_N=128, EVEN_K=False, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_16105 0.0163 ms 79.2% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_16113 0.0164 ms 78.4% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=128, BLOCK_N=128, EVEN_K=False, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_16103 0.0187 ms 69.1% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=64, BLOCK_N=64, EVEN_K=False, GROUP_M=8, num_stages=3, num_warps=8 + triton_mm_16109 0.0189 ms 68.2% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_16112 0.0201 ms 64.2% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_16114 0.0201 ms 64.2% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=128, BLOCK_N=128, EVEN_K=False, GROUP_M=8, num_stages=5, num_warps=8 +SingleProcess AUTOTUNE benchmarking takes 0.3402 seconds and 0.0003 seconds precompiling for 20 choices +AUTOTUNE mm(800x2048, 2048x1024) + triton_mm_16127 0.0150 ms 100.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=128, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=4 + mm 0.0150 ms 99.6% + triton_mm_16133 0.0178 ms 84.3% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=8 + triton_mm_16123 0.0191 ms 78.4% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=128, BLOCK_M=64, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=4 + triton_mm_16122 0.0202 ms 74.3% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=64, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=8 + triton_mm_16126 0.0205 ms 72.9% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_16132 0.0225 ms 66.7% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_16125 0.0229 ms 65.5% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=8 + triton_mm_16129 0.0236 ms 63.5% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=8 + triton_mm_16119 0.0294 ms 50.9% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=128, BLOCK_M=64, BLOCK_N=32, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=4 +SingleProcess AUTOTUNE benchmarking takes 0.4267 seconds and 0.0003 seconds precompiling for 20 choices +AUTOTUNE mm(16x3072, 3072x1024) + triton_mm_16138 0.0189 ms 100.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=128, BLOCK_M=16, BLOCK_N=32, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=2 + mm 0.0212 ms 88.9% + triton_mm_16142 0.0226 ms 83.7% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=128, BLOCK_M=16, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=4 + triton_mm_16137 0.0289 ms 65.4% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=16, BLOCK_N=32, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=2 + triton_mm_16136 0.0302 ms 62.4% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=16, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=4 + triton_mm_16141 0.0302 ms 62.4% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=16, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_16150 0.0321 ms 58.9% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=16, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=8 + triton_mm_16148 0.0356 ms 53.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=16, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=4 + triton_mm_16145 0.0369 ms 51.2% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=16, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_16146 0.0369 ms 51.1% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=128, BLOCK_M=16, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=4 +SingleProcess AUTOTUNE benchmarking takes 0.3481 seconds and 0.0003 seconds precompiling for 18 choices +AUTOTUNE mm(256x15488, 15488x2048) + mm 0.0650 ms 100.0% + triton_mm_16179 0.0717 ms 90.6% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=128, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=4 + triton_mm_16185 0.0952 ms 68.3% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=8 + triton_mm_16175 0.1034 ms 62.9% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=128, BLOCK_M=64, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=4 + triton_mm_16178 0.1220 ms 53.3% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_16174 0.1265 ms 51.4% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=64, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=8 + triton_mm_16171 0.1271 ms 51.2% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=128, BLOCK_M=64, BLOCK_N=32, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=4 + triton_mm_16184 0.1332 ms 48.8% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_16177 0.1400 ms 46.4% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=8 + triton_mm_16181 0.1413 ms 46.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=8 +SingleProcess AUTOTUNE benchmarking takes 0.6935 seconds and 0.0003 seconds precompiling for 20 choices +AUTOTUNE mm(15488x256, 256x2048) + mm 0.0495 ms 100.0% + triton_mm_16203 0.0628 ms 78.8% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_16202 0.0664 ms 74.5% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_16195 0.0678 ms 72.9% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_16199 0.0724 ms 68.3% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_16197 0.0735 ms 67.3% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_16200 0.0773 ms 64.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=8 + triton_mm_16196 0.0776 ms 63.8% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=8 + triton_mm_16204 0.0787 ms 62.9% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=8 + triton_mm_16201 0.0835 ms 59.2% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=2, num_warps=8 +SingleProcess AUTOTUNE benchmarking takes 0.5451 seconds and 0.0003 seconds precompiling for 20 choices +AUTOTUNE mm(2048x15488, 15488x2048) + mm 0.2436 ms 100.0% + triton_mm_16259 0.3118 ms 78.1% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_16252 0.3427 ms 71.1% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_16261 0.3513 ms 69.3% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=8 + triton_mm_16255 0.3606 ms 67.5% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=128, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=4 + triton_mm_16254 0.3924 ms 62.1% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_16260 0.4060 ms 60.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_16256 0.4138 ms 58.9% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_16253 0.4256 ms 57.2% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=8 + triton_mm_16251 0.4395 ms 55.4% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=128, BLOCK_M=64, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=4 +SingleProcess AUTOTUNE benchmarking takes 0.8254 seconds and 0.0003 seconds precompiling for 20 choices +AUTOTUNE mm(15488x2048, 2048x2048) + mm 0.2684 ms 100.0% + triton_mm_16279 0.3006 ms 89.3% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_16280 0.3055 ms 87.9% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=8 + triton_mm_16278 0.3108 ms 86.4% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_16273 0.3802 ms 70.6% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_16271 0.3954 ms 67.9% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_16274 0.4213 ms 63.7% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=128, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=4 + triton_mm_16272 0.4403 ms 61.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=8 + triton_mm_16275 0.4412 ms 60.8% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_16276 0.4514 ms 59.5% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=8 +SingleProcess AUTOTUNE benchmarking takes 0.8256 seconds and 0.0006 seconds precompiling for 20 choices +AUTOTUNE mm(2048x15488, 15488x16384) + mm 1.8614 ms 100.0% + triton_mm_16801 2.0917 ms 89.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_16803 2.1596 ms 86.2% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=8 + triton_mm_16802 2.5610 ms 72.7% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_16797 2.7185 ms 68.5% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=128, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=4 + triton_mm_16794 3.1415 ms 59.3% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_16795 3.4247 ms 54.4% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=8 + triton_mm_16796 3.5682 ms 52.2% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_16800 3.5957 ms 51.8% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=2, num_warps=8 + triton_mm_16799 3.7836 ms 49.2% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=8 +SingleProcess AUTOTUNE benchmarking takes 1.3588 seconds and 0.0003 seconds precompiling for 20 choices +AUTOTUNE mm(16384x15488, 15488x2048) + mm 1.9554 ms 100.0% + triton_mm_16840 2.1004 ms 93.1% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_16839 2.1393 ms 91.4% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_16841 2.2009 ms 88.8% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=8 + triton_mm_16835 2.7853 ms 70.2% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=128, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=4 + triton_mm_16832 3.2950 ms 59.3% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_16838 3.3614 ms 58.2% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=2, num_warps=8 + triton_mm_16837 3.4287 ms 57.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=8 + triton_mm_16834 3.7128 ms 52.7% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_16829 3.7722 ms 51.8% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=2, num_warps=4 +SingleProcess AUTOTUNE benchmarking takes 1.3626 seconds and 0.0003 seconds precompiling for 20 choices +AUTOTUNE mm(15488x16384, 16384x2048) + mm 2.0051 ms 100.0% + triton_mm_16860 2.2257 ms 90.1% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=8 + triton_mm_16858 2.3506 ms 85.3% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_16859 2.4622 ms 81.4% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_16854 2.8353 ms 70.7% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=128, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=4 + triton_mm_16851 3.4583 ms 58.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_16856 3.6333 ms 55.2% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=8 + triton_mm_16852 3.6772 ms 54.5% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=8 + triton_mm_16853 3.7557 ms 53.4% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_16857 3.8517 ms 52.1% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=2, num_warps=8 +SingleProcess AUTOTUNE benchmarking takes 1.3833 seconds and 0.0003 seconds precompiling for 20 choices +AUTOTUNE mm(1024x16, 16x1024) + triton_mm_33232 0.0065 ms 100.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=32, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=8 + triton_mm_33231 0.0065 ms 99.5% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=32, BLOCK_N=32, EVEN_K=True, GROUP_M=8, num_stages=2, num_warps=4 + triton_mm_33230 0.0068 ms 95.3% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=32, BLOCK_N=32, EVEN_K=True, GROUP_M=8, num_stages=1, num_warps=2 + triton_mm_33233 0.0071 ms 91.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=64, BLOCK_N=32, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=8 + triton_mm_33234 0.0071 ms 91.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=64, BLOCK_N=32, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=4 + triton_mm_33236 0.0073 ms 89.4% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=64, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=8 + mm 0.0074 ms 88.3% + triton_mm_33235 0.0074 ms 88.3% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=64, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=2, num_warps=4 + triton_mm_33237 0.0074 ms 88.3% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=64, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=4 + triton_mm_33240 0.0082 ms 79.6% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=4 +SingleProcess AUTOTUNE benchmarking takes 0.2516 seconds and 0.0002 seconds precompiling for 17 choices +E1218 08:47:34.818000 204 site-packages/torch/_inductor/select_algorithm.py:2100] [0/0] Runtime error during autotuning: +E1218 08:47:34.818000 204 site-packages/torch/_inductor/select_algorithm.py:2100] [0/0] No valid triton configs. OutOfResources: out of resource: shared memory, Required: 245760, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.. +E1218 08:47:34.818000 204 site-packages/torch/_inductor/select_algorithm.py:2100] [0/0] Ignoring this choice. +AUTOTUNE mm(1024x800, 800x32) + triton_mm_33264 0.0120 ms 100.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=32, BLOCK_N=32, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=4 + mm 0.0132 ms 91.2% + triton_mm_33263 0.0176 ms 68.3% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=128, BLOCK_M=32, BLOCK_N=32, EVEN_K=False, GROUP_M=8, num_stages=2, num_warps=4 + triton_mm_33270 0.0302 ms 39.7% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=32, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_33272 0.0306 ms 39.2% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=64, BLOCK_N=32, EVEN_K=False, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_33269 0.0322 ms 37.3% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=64, BLOCK_N=32, EVEN_K=False, GROUP_M=8, num_stages=3, num_warps=8 + triton_mm_33271 0.0338 ms 35.5% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=32, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=8 + triton_mm_33265 0.0347 ms 34.6% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=32, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=8 + triton_mm_33262 0.0407 ms 29.5% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=32, BLOCK_N=32, EVEN_K=True, GROUP_M=8, num_stages=1, num_warps=2 + triton_mm_33268 0.0407 ms 29.5% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=32, EVEN_K=True, GROUP_M=8, num_stages=2, num_warps=4 +SingleProcess AUTOTUNE benchmarking takes 0.3257 seconds and 0.0002 seconds precompiling for 18 choices +AUTOTUNE mm(4096x2048, 2048x1152) + mm 0.0421 ms 100.0% + triton_mm_33295 0.0501 ms 84.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_33297 0.0523 ms 80.4% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=8 + triton_mm_33296 0.0526 ms 80.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_33290 0.0535 ms 78.6% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_33289 0.0636 ms 66.2% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=8 + triton_mm_33288 0.0694 ms 60.6% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_33291 0.0695 ms 60.5% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=128, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=4 + triton_mm_33292 0.0735 ms 57.2% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_33286 0.0769 ms 54.7% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=64, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=8 +SingleProcess AUTOTUNE benchmarking takes 0.5032 seconds and 0.0005 seconds precompiling for 20 choices +AUTOTUNE mm(4096x4304, 4304x1152) + mm 0.0856 ms 100.0% + triton_mm_33417 0.1180 ms 72.5% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=128, BLOCK_N=128, EVEN_K=False, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_33418 0.1263 ms 67.8% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=128, BLOCK_N=128, EVEN_K=False, GROUP_M=8, num_stages=5, num_warps=8 + triton_mm_33416 0.1298 ms 66.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=128, EVEN_K=False, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_33411 0.1522 ms 56.2% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=64, BLOCK_N=128, EVEN_K=False, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_33412 0.1596 ms 53.6% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=128, BLOCK_M=64, BLOCK_N=128, EVEN_K=False, GROUP_M=8, num_stages=4, num_warps=4 + triton_mm_33410 0.1811 ms 47.3% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=128, EVEN_K=False, GROUP_M=8, num_stages=4, num_warps=8 + triton_mm_33414 0.1828 ms 46.8% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=64, EVEN_K=False, GROUP_M=8, num_stages=4, num_warps=8 + triton_mm_33409 0.1857 ms 46.1% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=128, EVEN_K=False, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_33413 0.1870 ms 45.8% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=64, EVEN_K=False, GROUP_M=8, num_stages=3, num_warps=4 +SingleProcess AUTOTUNE benchmarking takes 0.7575 seconds and 0.0003 seconds precompiling for 20 choices +AUTOTUNE mm(4096x4304, 4304x1152) + mm 0.0916 ms 100.0% + triton_mm_36418 0.1053 ms 87.1% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=128, EVEN_K=False, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_36419 0.1126 ms 81.4% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=128, BLOCK_N=128, EVEN_K=False, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_36420 0.1241 ms 73.9% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=128, BLOCK_N=128, EVEN_K=False, GROUP_M=8, num_stages=5, num_warps=8 + triton_mm_36413 0.1245 ms 73.6% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=64, BLOCK_N=128, EVEN_K=False, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_36414 0.1526 ms 60.1% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=128, BLOCK_M=64, BLOCK_N=128, EVEN_K=False, GROUP_M=8, num_stages=4, num_warps=4 + triton_mm_36411 0.1527 ms 60.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=128, EVEN_K=False, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_36412 0.1592 ms 57.6% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=128, EVEN_K=False, GROUP_M=8, num_stages=4, num_warps=8 + triton_mm_36416 0.1629 ms 56.2% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=64, EVEN_K=False, GROUP_M=8, num_stages=4, num_warps=8 + triton_mm_36415 0.1631 ms 56.2% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=64, EVEN_K=False, GROUP_M=8, num_stages=3, num_warps=4 +SingleProcess AUTOTUNE benchmarking takes 0.6987 seconds and 0.0003 seconds precompiling for 20 choices +AUTOTUNE mm(4096x1152, 1152x1152) + mm 0.0287 ms 100.0% + triton_mm_36437 0.0323 ms 88.9% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_36439 0.0333 ms 86.2% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=8 + triton_mm_36432 0.0335 ms 85.7% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_36430 0.0380 ms 75.6% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_36431 0.0387 ms 74.3% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=8 + triton_mm_36435 0.0418 ms 68.8% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=8 + triton_mm_36438 0.0420 ms 68.4% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_36433 0.0452 ms 63.6% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=128, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=4 + triton_mm_36434 0.0467 ms 61.5% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 +SingleProcess AUTOTUNE benchmarking takes 0.4249 seconds and 0.0004 seconds precompiling for 20 choices +AUTOTUNE mm_plus_mm(2048x4096, 4096x1152, 2048x4096, 4096x1152) + _mm_plus_mm 0.0846 ms 100.0% + triton_mm_plus_mm_42565 0.1936 ms 43.7% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=2, num_warps=4 + triton_mm_plus_mm_42566 0.2079 ms 40.7% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=8 + triton_mm_plus_mm_42570 0.2271 ms 37.2% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=1, num_warps=8 + triton_mm_plus_mm_42571 0.2526 ms 33.5% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=64, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=1, num_warps=8 + triton_mm_plus_mm_42573 0.3224 ms 26.2% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=64, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=2, num_warps=4 + triton_mm_plus_mm_42568 0.3620 ms 23.4% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=32, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=8 + triton_mm_plus_mm_42567 0.3932 ms 21.5% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=16 + triton_mm_plus_mm_42569 0.4575 ms 18.5% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=32, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=8 + triton_mm_plus_mm_42572 0.6876 ms 12.3% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=128, BLOCK_M=32, BLOCK_N=32, EVEN_K=True, GROUP_M=8, num_stages=1, num_warps=8 +SingleProcess AUTOTUNE benchmarking takes 0.4385 seconds and 0.0003 seconds precompiling for 11 choices +AUTOTUNE mm_plus_mm(1152x4096, 4096x4304, 1152x4096, 4096x4304) + _mm_plus_mm 0.1707 ms 100.0% + triton_mm_plus_mm_42595 0.3715 ms 45.9% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=8 + triton_mm_plus_mm_42594 0.4118 ms 41.4% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=2, num_warps=4 + triton_mm_plus_mm_42599 0.4844 ms 35.2% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=1, num_warps=8 + triton_mm_plus_mm_42600 0.5408 ms 31.6% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=64, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=1, num_warps=8 + triton_mm_plus_mm_42602 0.6719 ms 25.4% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=64, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=2, num_warps=4 + triton_mm_plus_mm_42597 0.7427 ms 23.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=32, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=8 + triton_mm_plus_mm_42596 0.8079 ms 21.1% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=16 + triton_mm_plus_mm_42598 0.9653 ms 17.7% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=32, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=8 + triton_mm_plus_mm_42603 1.2716 ms 13.4% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=32, BLOCK_N=32, EVEN_K=True, GROUP_M=8, num_stages=1, num_warps=2 +SingleProcess AUTOTUNE benchmarking takes 0.4655 seconds and 0.0003 seconds precompiling for 11 choices +AUTOTUNE mm_plus_mm(4304x4096, 4096x1152, 4304x4096, 4096x1152) + _mm_plus_mm 0.1701 ms 100.0% + triton_mm_plus_mm_42624 0.3544 ms 48.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=8 + triton_mm_plus_mm_42623 0.3925 ms 43.3% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=2, num_warps=4 + triton_mm_plus_mm_42628 0.4566 ms 37.3% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=1, num_warps=8 + triton_mm_plus_mm_42629 0.5391 ms 31.6% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=64, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=1, num_warps=8 + triton_mm_plus_mm_42631 0.6501 ms 26.2% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=64, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=2, num_warps=4 + triton_mm_plus_mm_42626 0.7598 ms 22.4% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=32, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=8 + triton_mm_plus_mm_42625 0.7922 ms 21.5% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=16 + triton_mm_plus_mm_42627 0.9747 ms 17.5% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=32, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=8 + triton_mm_plus_mm_42632 1.2762 ms 13.3% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=32, BLOCK_N=32, EVEN_K=True, GROUP_M=8, num_stages=1, num_warps=2 +SingleProcess AUTOTUNE benchmarking takes 0.4669 seconds and 0.0003 seconds precompiling for 11 choices +AUTOTUNE mm_plus_mm(1152x4096, 4096x1152, 1152x4096, 4096x1152) + _mm_plus_mm 0.0537 ms 100.0% + triton_mm_plus_mm_42653 0.1116 ms 48.1% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=8 + triton_mm_plus_mm_42658 0.1298 ms 41.4% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=64, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=1, num_warps=8 + triton_mm_plus_mm_42652 0.1915 ms 28.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=2, num_warps=4 + triton_mm_plus_mm_42654 0.2048 ms 26.2% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=16 + triton_mm_plus_mm_42657 0.2270 ms 23.6% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=1, num_warps=8 + triton_mm_plus_mm_42655 0.2319 ms 23.1% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=32, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=8 + triton_mm_plus_mm_42656 0.2602 ms 20.6% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=32, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=8 + triton_mm_plus_mm_42660 0.3184 ms 16.9% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=64, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=2, num_warps=4 + triton_mm_plus_mm_42661 0.3250 ms 16.5% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=32, BLOCK_N=32, EVEN_K=True, GROUP_M=8, num_stages=1, num_warps=2 +SingleProcess AUTOTUNE benchmarking takes 0.3993 seconds and 0.0003 seconds precompiling for 11 choices +AUTOTUNE addmm(2048x1152, 2048x4096, 4096x1152) + addmm 0.0501 ms 100.0% + triton_mm_50388 0.0503 ms 99.5% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_50394 0.0538 ms 93.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_50391 0.0592 ms 84.6% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=8 + triton_mm_50387 0.0638 ms 78.5% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=8 + triton_mm_50389 0.0682 ms 73.4% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=128, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=4 + triton_mm_50386 0.0683 ms 73.3% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_50390 0.0691 ms 72.5% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_50393 0.0705 ms 71.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_50395 0.0757 ms 66.1% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=8 +SingleProcess AUTOTUNE benchmarking takes 0.5258 seconds and 0.0003 seconds precompiling for 20 choices +AUTOTUNE addmm(1152x4304, 1152x4096, 4096x4304) + addmm 0.1054 ms 100.0% + triton_mm_50432 0.1155 ms 91.3% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_50433 0.1169 ms 90.2% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=8 + triton_mm_50426 0.1409 ms 74.8% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_50431 0.1484 ms 71.1% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_50429 0.1489 ms 70.8% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=8 + triton_mm_50425 0.1516 ms 69.6% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=8 + triton_mm_50428 0.1573 ms 67.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_50427 0.1580 ms 66.7% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=128, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=4 + triton_mm_50424 0.1587 ms 66.4% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 +SingleProcess AUTOTUNE benchmarking takes 0.7273 seconds and 0.0003 seconds precompiling for 20 choices +AUTOTUNE addmm(4304x1152, 4304x4096, 4096x1152) + addmm 0.1051 ms 100.0% + triton_mm_50470 0.1121 ms 93.8% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_50471 0.1189 ms 88.4% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=8 + triton_mm_50464 0.1233 ms 85.2% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_50462 0.1366 ms 77.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_50469 0.1383 ms 76.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_50465 0.1423 ms 73.9% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=128, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=4 + triton_mm_50463 0.1432 ms 73.4% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=8 + triton_mm_50467 0.1492 ms 70.5% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=8 + triton_mm_50466 0.1595 ms 65.9% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 +SingleProcess AUTOTUNE benchmarking takes 0.6767 seconds and 0.0003 seconds precompiling for 20 choices +AUTOTUNE addmm(1152x1152, 1152x4096, 4096x1152) + triton_mm_50509 0.0347 ms 100.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=8 + addmm 0.0348 ms 99.9% + triton_mm_50502 0.0423 ms 82.1% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_50503 0.0443 ms 78.4% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=128, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=4 + triton_mm_50508 0.0451 ms 77.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_50501 0.0475 ms 73.2% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=8 + triton_mm_50499 0.0475 ms 73.1% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=128, BLOCK_M=64, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=4 + triton_mm_50498 0.0481 ms 72.2% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=64, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=8 + triton_mm_50505 0.0481 ms 72.2% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=8 + triton_mm_50500 0.0629 ms 55.2% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 +SingleProcess AUTOTUNE benchmarking takes 0.4516 seconds and 0.0003 seconds precompiling for 20 choices +INFO 2025-12-18 08:53:43 ot_train.py:357 step:200 smpl:3K ep:8 epch:0.03 loss:0.193 grdn:2.098 lr:3.8e-06 updt_s:7.738 data_s:0.017 +WARNING 2025-12-18 08:53:43 db_utils.py:141 WandB logging of key "loss_per_dim" was ignored as its type "" is not handled by this wrapper. +INFO 2025-12-18 08:57:01 ot_train.py:357 step:400 smpl:6K ep:16 epch:0.06 loss:0.104 grdn:1.525 lr:1.1e-05 updt_s:0.981 data_s:0.007 +WARNING 2025-12-18 08:57:01 db_utils.py:141 WandB logging of key "loss_per_dim" was ignored as its type "" is not handled by this wrapper. +INFO 2025-12-18 09:00:19 ot_train.py:357 step:600 smpl:10K ep:24 epch:0.10 loss:0.076 grdn:1.338 lr:1.9e-05 updt_s:0.981 data_s:0.007 +WARNING 2025-12-18 09:00:19 db_utils.py:141 WandB logging of key "loss_per_dim" was ignored as its type "" is not handled by this wrapper. +INFO 2025-12-18 09:03:36 ot_train.py:357 step:800 smpl:13K ep:32 epch:0.13 loss:0.066 grdn:0.870 lr:2.5e-05 updt_s:0.981 data_s:0.007 +WARNING 2025-12-18 09:03:36 db_utils.py:141 WandB logging of key "loss_per_dim" was ignored as its type "" is not handled by this wrapper. +INFO 2025-12-18 09:06:54 ot_train.py:357 step:1K smpl:16K ep:40 epch:0.16 loss:0.064 grdn:0.706 lr:2.5e-05 updt_s:0.982 data_s:0.007 +WARNING 2025-12-18 09:06:54 db_utils.py:141 WandB logging of key "loss_per_dim" was ignored as its type "" is not handled by this wrapper. +INFO 2025-12-18 09:10:12 ot_train.py:357 step:1K smpl:19K ep:48 epch:0.19 loss:0.057 grdn:0.568 lr:2.5e-05 updt_s:0.981 data_s:0.007 +WARNING 2025-12-18 09:10:12 db_utils.py:141 WandB logging of key "loss_per_dim" was ignored as its type "" is not handled by this wrapper. +INFO 2025-12-18 09:13:29 ot_train.py:357 step:1K smpl:22K ep:56 epch:0.22 loss:0.055 grdn:0.551 lr:2.5e-05 updt_s:0.981 data_s:0.007 +WARNING 2025-12-18 09:13:29 db_utils.py:141 WandB logging of key "loss_per_dim" was ignored as its type "" is not handled by this wrapper. +INFO 2025-12-18 09:16:47 ot_train.py:357 step:2K smpl:26K ep:64 epch:0.25 loss:0.053 grdn:0.525 lr:2.5e-05 updt_s:0.980 data_s:0.007 +WARNING 2025-12-18 09:16:47 db_utils.py:141 WandB logging of key "loss_per_dim" was ignored as its type "" is not handled by this wrapper. +INFO 2025-12-18 09:20:04 ot_train.py:357 step:2K smpl:29K ep:72 epch:0.29 loss:0.051 grdn:0.472 lr:2.5e-05 updt_s:0.981 data_s:0.007 +WARNING 2025-12-18 09:20:04 db_utils.py:141 WandB logging of key "loss_per_dim" was ignored as its type "" is not handled by this wrapper. +INFO 2025-12-18 09:23:22 ot_train.py:357 step:2K smpl:32K ep:80 epch:0.32 loss:0.049 grdn:0.461 lr:2.5e-05 updt_s:0.981 data_s:0.007 +WARNING 2025-12-18 09:23:22 db_utils.py:141 WandB logging of key "loss_per_dim" was ignored as its type "" is not handled by this wrapper. +INFO 2025-12-18 09:26:40 ot_train.py:357 step:2K smpl:35K ep:88 epch:0.35 loss:0.050 grdn:0.469 lr:2.4e-05 updt_s:0.981 data_s:0.007 +WARNING 2025-12-18 09:26:40 db_utils.py:141 WandB logging of key "loss_per_dim" was ignored as its type "" is not handled by this wrapper. +INFO 2025-12-18 09:29:57 ot_train.py:357 step:2K smpl:38K ep:96 epch:0.38 loss:0.049 grdn:0.504 lr:2.4e-05 updt_s:0.981 data_s:0.007 +WARNING 2025-12-18 09:29:57 db_utils.py:141 WandB logging of key "loss_per_dim" was ignored as its type "" is not handled by this wrapper. +INFO 2025-12-18 09:33:15 ot_train.py:357 step:3K smpl:42K ep:104 epch:0.41 loss:0.047 grdn:0.458 lr:2.4e-05 updt_s:0.980 data_s:0.007 +WARNING 2025-12-18 09:33:15 db_utils.py:141 WandB logging of key "loss_per_dim" was ignored as its type "" is not handled by this wrapper. +INFO 2025-12-18 09:36:32 ot_train.py:357 step:3K smpl:45K ep:112 epch:0.45 loss:0.047 grdn:0.463 lr:2.4e-05 updt_s:0.981 data_s:0.007 +WARNING 2025-12-18 09:36:32 db_utils.py:141 WandB logging of key "loss_per_dim" was ignored as its type "" is not handled by this wrapper. +INFO 2025-12-18 09:39:50 ot_train.py:357 step:3K smpl:48K ep:120 epch:0.48 loss:0.046 grdn:0.483 lr:2.4e-05 updt_s:0.980 data_s:0.007 +WARNING 2025-12-18 09:39:50 db_utils.py:141 WandB logging of key "loss_per_dim" was ignored as its type "" is not handled by this wrapper. +INFO 2025-12-18 09:39:50 ot_train.py:367 Checkpoint policy after step 3000 +INFO 2025-12-18 09:43:56 ot_train.py:357 step:3K smpl:51K ep:128 epch:0.51 loss:0.044 grdn:0.442 lr:2.4e-05 updt_s:0.980 data_s:0.008 +WARNING 2025-12-18 09:43:56 db_utils.py:141 WandB logging of key "loss_per_dim" was ignored as its type "" is not handled by this wrapper. +INFO 2025-12-18 09:47:14 ot_train.py:357 step:3K smpl:54K ep:136 epch:0.54 loss:0.043 grdn:0.464 lr:2.4e-05 updt_s:0.983 data_s:0.007 +WARNING 2025-12-18 09:47:14 db_utils.py:141 WandB logging of key "loss_per_dim" was ignored as its type "" is not handled by this wrapper. +INFO 2025-12-18 09:50:31 ot_train.py:357 step:4K smpl:58K ep:144 epch:0.57 loss:0.045 grdn:0.470 lr:2.3e-05 updt_s:0.981 data_s:0.007 +WARNING 2025-12-18 09:50:31 db_utils.py:141 WandB logging of key "loss_per_dim" was ignored as its type "" is not handled by this wrapper. +INFO 2025-12-18 09:53:49 ot_train.py:357 step:4K smpl:61K ep:152 epch:0.60 loss:0.043 grdn:0.494 lr:2.3e-05 updt_s:0.981 data_s:0.007 +WARNING 2025-12-18 09:53:49 db_utils.py:141 WandB logging of key "loss_per_dim" was ignored as its type "" is not handled by this wrapper. +INFO 2025-12-18 09:57:07 ot_train.py:357 step:4K smpl:64K ep:160 epch:0.64 loss:0.042 grdn:0.450 lr:2.3e-05 updt_s:0.980 data_s:0.007 +WARNING 2025-12-18 09:57:07 db_utils.py:141 WandB logging of key "loss_per_dim" was ignored as its type "" is not handled by this wrapper. +INFO 2025-12-18 10:00:24 ot_train.py:357 step:4K smpl:67K ep:168 epch:0.67 loss:0.042 grdn:0.468 lr:2.3e-05 updt_s:0.980 data_s:0.007 +WARNING 2025-12-18 10:00:24 db_utils.py:141 WandB logging of key "loss_per_dim" was ignored as its type "" is not handled by this wrapper. +INFO 2025-12-18 10:03:42 ot_train.py:357 step:4K smpl:70K ep:176 epch:0.70 loss:0.042 grdn:0.475 lr:2.3e-05 updt_s:0.980 data_s:0.007 +WARNING 2025-12-18 10:03:42 db_utils.py:141 WandB logging of key "loss_per_dim" was ignored as its type "" is not handled by this wrapper. +INFO 2025-12-18 10:06:59 ot_train.py:357 step:5K smpl:74K ep:184 epch:0.73 loss:0.040 grdn:0.452 lr:2.2e-05 updt_s:0.980 data_s:0.007 +WARNING 2025-12-18 10:06:59 db_utils.py:141 WandB logging of key "loss_per_dim" was ignored as its type "" is not handled by this wrapper. +INFO 2025-12-18 10:10:16 ot_train.py:357 step:5K smpl:77K ep:192 epch:0.76 loss:0.040 grdn:0.484 lr:2.2e-05 updt_s:0.980 data_s:0.007 +WARNING 2025-12-18 10:10:16 db_utils.py:141 WandB logging of key "loss_per_dim" was ignored as its type "" is not handled by this wrapper. +INFO 2025-12-18 10:13:34 ot_train.py:357 step:5K smpl:80K ep:199 epch:0.79 loss:0.039 grdn:0.496 lr:2.2e-05 updt_s:0.980 data_s:0.007 +WARNING 2025-12-18 10:13:34 db_utils.py:141 WandB logging of key "loss_per_dim" was ignored as its type "" is not handled by this wrapper. +INFO 2025-12-18 10:16:51 ot_train.py:357 step:5K smpl:83K ep:207 epch:0.83 loss:0.039 grdn:0.490 lr:2.2e-05 updt_s:0.980 data_s:0.007 +WARNING 2025-12-18 10:16:51 db_utils.py:141 WandB logging of key "loss_per_dim" was ignored as its type "" is not handled by this wrapper. +INFO 2025-12-18 10:20:09 ot_train.py:357 step:5K smpl:86K ep:215 epch:0.86 loss:0.038 grdn:0.528 lr:2.1e-05 updt_s:0.981 data_s:0.007 +WARNING 2025-12-18 10:20:09 db_utils.py:141 WandB logging of key "loss_per_dim" was ignored as its type "" is not handled by this wrapper. +INFO 2025-12-18 10:23:26 ot_train.py:357 step:6K smpl:90K ep:223 epch:0.89 loss:0.039 grdn:0.493 lr:2.1e-05 updt_s:0.980 data_s:0.007 +WARNING 2025-12-18 10:23:26 db_utils.py:141 WandB logging of key "loss_per_dim" was ignored as its type "" is not handled by this wrapper. +INFO 2025-12-18 10:26:44 ot_train.py:357 step:6K smpl:93K ep:231 epch:0.92 loss:0.038 grdn:0.514 lr:2.1e-05 updt_s:0.980 data_s:0.007 +WARNING 2025-12-18 10:26:44 db_utils.py:141 WandB logging of key "loss_per_dim" was ignored as its type "" is not handled by this wrapper. +INFO 2025-12-18 10:30:01 ot_train.py:357 step:6K smpl:96K ep:239 epch:0.95 loss:0.036 grdn:0.494 lr:2.1e-05 updt_s:0.981 data_s:0.007 +WARNING 2025-12-18 10:30:01 db_utils.py:141 WandB logging of key "loss_per_dim" was ignored as its type "" is not handled by this wrapper. +INFO 2025-12-18 10:30:01 ot_train.py:367 Checkpoint policy after step 6000 +INFO 2025-12-18 10:34:08 ot_train.py:357 step:6K smpl:99K ep:247 epch:0.99 loss:0.037 grdn:0.511 lr:2.0e-05 updt_s:0.980 data_s:0.007 +WARNING 2025-12-18 10:34:08 db_utils.py:141 WandB logging of key "loss_per_dim" was ignored as its type "" is not handled by this wrapper. +AUTOTUNE mm(3872x2048, 2048x16384) + mm 0.4980 ms 100.0% + triton_mm_66155 0.5899 ms 84.4% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_66156 0.6291 ms 79.2% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=8 + triton_mm_66154 0.6477 ms 76.9% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_66150 0.7941 ms 62.7% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=128, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=4 + triton_mm_66149 0.8009 ms 62.2% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_66147 0.9084 ms 54.8% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_66151 0.9436 ms 52.8% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_66152 0.9573 ms 52.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=8 + triton_mm_66148 0.9915 ms 50.2% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=8 +SingleProcess AUTOTUNE benchmarking takes 0.9190 seconds and 61.6122 seconds precompiling for 20 choices +AUTOTUNE mm(1024x1152, 1152x4304) + mm 0.0248 ms 100.0% + triton_mm_56651 0.0310 ms 80.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_56652 0.0329 ms 75.2% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_56646 0.0342 ms 72.5% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_56653 0.0344 ms 72.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=8 + triton_mm_56647 0.0388 ms 63.8% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=128, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=4 + triton_mm_56644 0.0394 ms 62.9% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_56645 0.0399 ms 62.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=8 + triton_mm_56648 0.0416 ms 59.6% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_56649 0.0438 ms 56.5% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=8 +SingleProcess AUTOTUNE benchmarking takes 0.4349 seconds and 0.0003 seconds precompiling for 20 choices +AUTOTUNE mm(1024x1152, 1152x1152) + mm 0.0122 ms 100.0% + triton_mm_56577 0.0137 ms 89.2% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=8 + triton_mm_56570 0.0151 ms 80.9% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_56566 0.0153 ms 79.9% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=64, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=8 + triton_mm_56576 0.0158 ms 77.3% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_56571 0.0168 ms 72.7% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=128, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=4 + triton_mm_56569 0.0175 ms 69.7% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=8 + triton_mm_56573 0.0175 ms 69.7% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=8 + triton_mm_56567 0.0184 ms 66.1% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=128, BLOCK_M=64, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=4 + triton_mm_56568 0.0210 ms 58.2% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 +SingleProcess AUTOTUNE benchmarking takes 0.3438 seconds and 0.0004 seconds precompiling for 20 choices +AUTOTUNE mm(200x1024, 1024x4096) + mm 0.0132 ms 100.0% + triton_mm_66249 0.0146 ms 90.6% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=8 + triton_mm_66238 0.0158 ms 83.6% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=64, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=8 + triton_mm_66242 0.0159 ms 83.3% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_66248 0.0164 ms 80.7% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_66243 0.0172 ms 77.1% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=128, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=4 + triton_mm_66245 0.0185 ms 71.6% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=8 + triton_mm_66241 0.0187 ms 70.8% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=8 + triton_mm_66239 0.0187 ms 70.7% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=128, BLOCK_M=64, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=4 + triton_mm_66240 0.0212 ms 62.5% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 +SingleProcess AUTOTUNE benchmarking takes 0.3429 seconds and 0.0004 seconds precompiling for 20 choices +E1218 10:40:41.012000 204 site-packages/torch/_inductor/select_algorithm.py:2100] [2/0] Runtime error during autotuning: +E1218 10:40:41.012000 204 site-packages/torch/_inductor/select_algorithm.py:2100] [2/0] No valid triton configs. OutOfResources: out of resource: shared memory, Required: 245760, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.. +E1218 10:40:41.012000 204 site-packages/torch/_inductor/select_algorithm.py:2100] [2/0] Ignoring this choice. +AUTOTUNE mm(200x1024, 1024x32) + triton_mm_71967 0.0094 ms 100.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=128, BLOCK_M=64, BLOCK_N=32, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=4 + triton_mm_71958 0.0110 ms 85.4% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=32, BLOCK_N=32, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=4 + mm 0.0111 ms 84.7% + triton_mm_71959 0.0115 ms 81.6% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=32, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=8 + triton_mm_71966 0.0119 ms 78.6% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=64, BLOCK_N=32, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_71972 0.0124 ms 75.9% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=128, BLOCK_N=32, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=8 + triton_mm_71963 0.0124 ms 75.5% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=64, BLOCK_N=32, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=8 + triton_mm_71965 0.0131 ms 71.8% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=32, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=8 + triton_mm_71969 0.0138 ms 68.1% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=32, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=8 + triton_mm_71971 0.0140 ms 67.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=128, BLOCK_N=32, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 +SingleProcess AUTOTUNE benchmarking takes 0.2866 seconds and 0.0003 seconds precompiling for 18 choices +AUTOTUNE mm(4x1024, 1024x1024) + triton_mm_65885 0.0100 ms 100.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=128, BLOCK_M=16, BLOCK_N=32, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=2 + triton_mm_65889 0.0110 ms 91.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=128, BLOCK_M=16, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=4 + triton_mm_65884 0.0121 ms 83.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=16, BLOCK_N=32, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=2 + triton_mm_65883 0.0128 ms 78.4% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=16, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=4 + triton_mm_65888 0.0135 ms 74.2% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=16, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_65893 0.0136 ms 73.5% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=128, BLOCK_M=16, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=4 + mm 0.0141 ms 71.0% + triton_mm_65882 0.0145 ms 69.1% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=128, BLOCK_M=16, BLOCK_N=32, EVEN_K=True, GROUP_M=8, num_stages=2, num_warps=2 + triton_mm_65895 0.0146 ms 68.8% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=16, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=4 + triton_mm_65897 0.0148 ms 67.9% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=16, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=8 +SingleProcess AUTOTUNE benchmarking takes 0.2938 seconds and 0.0003 seconds precompiling for 18 choices +AUTOTUNE mm(3872x2048, 2048x2048) + mm 0.0637 ms 100.0% + triton_mm_66117 0.0835 ms 76.2% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_66118 0.0852 ms 74.7% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=8 + triton_mm_66116 0.0879 ms 72.5% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_66111 0.0912 ms 69.8% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_66112 0.0974 ms 65.4% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=128, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=4 + triton_mm_66110 0.1115 ms 57.1% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=8 + triton_mm_66109 0.1164 ms 54.7% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_66114 0.1164 ms 54.7% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=8 + triton_mm_66113 0.1231 ms 51.7% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 +SingleProcess AUTOTUNE benchmarking takes 0.6095 seconds and 0.0003 seconds precompiling for 20 choices +AUTOTUNE mm(200x2048, 2048x1024) + mm 0.0107 ms 100.0% + triton_mm_66184 0.0115 ms 93.6% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=128, BLOCK_M=64, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=4 + triton_mm_66188 0.0134 ms 79.8% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=128, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=4 + triton_mm_66180 0.0155 ms 69.2% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=128, BLOCK_M=64, BLOCK_N=32, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=4 + triton_mm_66194 0.0174 ms 61.6% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=8 + triton_mm_66183 0.0191 ms 56.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=64, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=8 + triton_mm_66178 0.0200 ms 53.5% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=32, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=8 + triton_mm_66177 0.0203 ms 52.9% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=128, BLOCK_M=32, BLOCK_N=32, EVEN_K=True, GROUP_M=8, num_stages=2, num_warps=4 + triton_mm_66187 0.0203 ms 52.8% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_66193 0.0224 ms 47.9% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 +SingleProcess AUTOTUNE benchmarking takes 0.4255 seconds and 0.0003 seconds precompiling for 20 choices +AUTOTUNE convolution(4x3x224x224, 1152x3x14x14) + convolution 0.0994 ms 100.0% + triton_convolution2d_56555 0.1571 ms 63.3% ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=128, BLOCK_N=128, GROUPS=1, KERNEL_H=14, KERNEL_W=14, PADDING_H=0, PADDING_W=0, STRIDE_H=14, STRIDE_W=14, UNROLL=False, num_stages=2, num_warps=8 + triton_convolution2d_56556 0.1637 ms 60.7% ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=64, BLOCK_N=64, GROUPS=1, KERNEL_H=14, KERNEL_W=14, PADDING_H=0, PADDING_W=0, STRIDE_H=14, STRIDE_W=14, UNROLL=False, num_stages=2, num_warps=4 + triton_convolution2d_56553 0.1932 ms 51.4% ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=256, BLOCK_N=64, GROUPS=1, KERNEL_H=14, KERNEL_W=14, PADDING_H=0, PADDING_W=0, STRIDE_H=14, STRIDE_W=14, UNROLL=False, num_stages=2, num_warps=4 + triton_convolution2d_56558 0.1982 ms 50.1% ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=256, BLOCK_N=64, GROUPS=1, KERNEL_H=14, KERNEL_W=14, PADDING_H=0, PADDING_W=0, STRIDE_H=14, STRIDE_W=14, UNROLL=False, num_stages=2, num_warps=8 + triton_convolution2d_56552 0.2569 ms 38.7% ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=64, BLOCK_N=256, GROUPS=1, KERNEL_H=14, KERNEL_W=14, PADDING_H=0, PADDING_W=0, STRIDE_H=14, STRIDE_W=14, UNROLL=False, num_stages=2, num_warps=4 + triton_convolution2d_56557 0.3087 ms 32.2% ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=64, BLOCK_N=256, GROUPS=1, KERNEL_H=14, KERNEL_W=14, PADDING_H=0, PADDING_W=0, STRIDE_H=14, STRIDE_W=14, UNROLL=False, num_stages=2, num_warps=8 + triton_convolution2d_56554 1.8184 ms 5.5% ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=1024, BLOCK_N=16, GROUPS=1, KERNEL_H=14, KERNEL_W=14, PADDING_H=0, PADDING_W=0, STRIDE_H=14, STRIDE_W=14, UNROLL=False, num_stages=1, num_warps=8 +SingleProcess AUTOTUNE benchmarking takes 0.3265 seconds and 0.0003 seconds precompiling for 8 choices +AUTOTUNE mm(1024x4304, 4304x1152) + mm 0.0308 ms 100.0% + triton_mm_56672 0.0365 ms 84.3% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=128, BLOCK_N=128, EVEN_K=False, GROUP_M=8, num_stages=5, num_warps=8 + triton_mm_56665 0.0466 ms 66.1% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=64, BLOCK_N=128, EVEN_K=False, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_56671 0.0483 ms 63.8% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=128, BLOCK_N=128, EVEN_K=False, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_56661 0.0526 ms 58.5% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=64, BLOCK_N=64, EVEN_K=False, GROUP_M=8, num_stages=3, num_warps=8 + triton_mm_56666 0.0526 ms 58.5% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=128, BLOCK_M=64, BLOCK_N=128, EVEN_K=False, GROUP_M=8, num_stages=4, num_warps=4 + triton_mm_56662 0.0544 ms 56.6% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=128, BLOCK_M=64, BLOCK_N=64, EVEN_K=False, GROUP_M=8, num_stages=5, num_warps=4 + triton_mm_56664 0.0574 ms 53.7% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=128, EVEN_K=False, GROUP_M=8, num_stages=4, num_warps=8 + triton_mm_56668 0.0574 ms 53.7% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=64, EVEN_K=False, GROUP_M=8, num_stages=4, num_warps=8 + triton_mm_56667 0.0706 ms 43.7% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=64, EVEN_K=False, GROUP_M=8, num_stages=3, num_warps=4 +SingleProcess AUTOTUNE benchmarking takes 0.4748 seconds and 0.0003 seconds precompiling for 20 choices +AUTOTUNE addmm(1024x2048, 1024x1152, 1152x2048) + bias_addmm 0.0172 ms 100.0% + triton_mm_59648 0.0184 ms 93.6% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_59654 0.0215 ms 80.2% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_59650 0.0245 ms 70.2% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_59646 0.0246 ms 69.9% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_59644 0.0248 ms 69.3% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=64, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=8 + triton_mm_59655 0.0250 ms 69.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=8 + addmm 0.0250 ms 68.9% + triton_mm_59651 0.0251 ms 68.7% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=8 + triton_mm_59647 0.0252 ms 68.2% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=8 +SingleProcess AUTOTUNE benchmarking takes 0.3884 seconds and 0.0003 seconds precompiling for 21 choices +AUTOTUNE mm(200x32, 32x1024) + triton_mm_65865 0.0052 ms 100.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=32, BLOCK_N=32, EVEN_K=True, GROUP_M=8, num_stages=2, num_warps=4 + triton_mm_65866 0.0053 ms 98.8% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=32, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=8 + triton_mm_65870 0.0056 ms 94.3% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=2, num_warps=4 + triton_mm_65872 0.0056 ms 94.3% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=4 + triton_mm_65867 0.0057 ms 92.1% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=32, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=8 + triton_mm_65871 0.0057 ms 92.1% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=8 + triton_mm_65864 0.0058 ms 91.1% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=32, BLOCK_N=32, EVEN_K=True, GROUP_M=8, num_stages=1, num_warps=2 + triton_mm_65868 0.0059 ms 89.6% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=32, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=4 + triton_mm_65869 0.0062 ms 84.5% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=64, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=2, num_warps=4 + triton_mm_65873 0.0062 ms 84.5% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 +SingleProcess AUTOTUNE benchmarking takes 0.2681 seconds and 0.0002 seconds precompiling for 18 choices +AUTOTUNE mm(3872x2048, 2048x256) + mm 0.0203 ms 100.0% + triton_mm_65952 0.0246 ms 82.2% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=8 + triton_mm_65946 0.0278 ms 72.8% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=128, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=4 + triton_mm_65945 0.0278 ms 72.8% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_65941 0.0280 ms 72.4% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=64, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=8 + triton_mm_65942 0.0292 ms 69.3% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=128, BLOCK_M=64, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=4 + triton_mm_65951 0.0293 ms 69.2% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_65944 0.0327 ms 61.9% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=8 + triton_mm_65948 0.0329 ms 61.6% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=8 + triton_mm_65938 0.0344 ms 58.9% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=128, BLOCK_M=64, BLOCK_N=32, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=4 +SingleProcess AUTOTUNE benchmarking takes 0.3741 seconds and 0.0003 seconds precompiling for 20 choices +AUTOTUNE mm(4x1024, 1024x3072) + triton_mm_65980 0.0150 ms 100.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=128, BLOCK_M=16, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=4 + triton_mm_65976 0.0151 ms 99.8% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=128, BLOCK_M=16, BLOCK_N=32, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=2 + triton_mm_65984 0.0162 ms 92.7% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=128, BLOCK_M=16, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=4 + triton_mm_65975 0.0165 ms 90.9% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=16, BLOCK_N=32, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=2 + triton_mm_65988 0.0169 ms 89.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=16, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=8 + triton_mm_65974 0.0172 ms 87.7% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=16, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=4 + triton_mm_65979 0.0173 ms 86.7% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=16, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_65973 0.0179 ms 84.2% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=128, BLOCK_M=16, BLOCK_N=32, EVEN_K=True, GROUP_M=8, num_stages=2, num_warps=2 + triton_mm_65983 0.0180 ms 83.3% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=16, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_65986 0.0183 ms 82.3% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=16, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=4 +SingleProcess AUTOTUNE benchmarking takes 0.3017 seconds and 0.0002 seconds precompiling for 18 choices +AUTOTUNE mm(200x1024, 1024x2048) + mm 0.0100 ms 100.0% + triton_mm_66001 0.0111 ms 89.9% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=128, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=4 + triton_mm_65996 0.0131 ms 76.3% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=64, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=8 + triton_mm_66007 0.0131 ms 76.3% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=8 + triton_mm_65997 0.0132 ms 75.4% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=128, BLOCK_M=64, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=4 + triton_mm_66000 0.0136 ms 73.2% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_66006 0.0146 ms 68.3% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_65990 0.0150 ms 66.7% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=128, BLOCK_M=32, BLOCK_N=32, EVEN_K=True, GROUP_M=8, num_stages=2, num_warps=4 + triton_mm_65993 0.0155 ms 64.5% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=128, BLOCK_M=64, BLOCK_N=32, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=4 + triton_mm_66003 0.0156 ms 64.2% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=8 +SingleProcess AUTOTUNE benchmarking takes 0.3297 seconds and 0.0003 seconds precompiling for 20 choices +AUTOTUNE mm(200x1024, 1024x256) + triton_mm_66012 0.0076 ms 100.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=128, BLOCK_M=64, BLOCK_N=32, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=4 + mm 0.0078 ms 98.4% + triton_mm_66016 0.0080 ms 95.6% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=128, BLOCK_M=64, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=4 + triton_mm_66020 0.0092 ms 82.7% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=128, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=4 + triton_mm_66011 0.0109 ms 70.1% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=32, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=8 + triton_mm_66015 0.0110 ms 69.7% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=64, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=8 + triton_mm_66010 0.0111 ms 68.7% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=32, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=8 + triton_mm_66019 0.0114 ms 67.3% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_66026 0.0117 ms 65.1% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=8 + triton_mm_66009 0.0122 ms 62.9% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=128, BLOCK_M=32, BLOCK_N=32, EVEN_K=True, GROUP_M=8, num_stages=2, num_warps=4 +SingleProcess AUTOTUNE benchmarking takes 0.3253 seconds and 0.0003 seconds precompiling for 20 choices +AUTOTUNE bmm(4x128x1, 4x1x1018) + triton_bmm_66046 0.0057 ms 100.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=32, BLOCK_N=32, EVEN_K=False, GROUP_M=8, num_stages=1, num_warps=2 + triton_bmm_66047 0.0058 ms 98.9% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=32, BLOCK_N=32, EVEN_K=False, GROUP_M=8, num_stages=2, num_warps=4 + triton_bmm_66048 0.0059 ms 96.8% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=32, BLOCK_N=64, EVEN_K=False, GROUP_M=8, num_stages=5, num_warps=8 + triton_bmm_66054 0.0064 ms 89.5% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=64, BLOCK_N=128, EVEN_K=False, GROUP_M=8, num_stages=3, num_warps=4 + triton_bmm_66049 0.0064 ms 89.1% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=64, BLOCK_N=32, EVEN_K=False, GROUP_M=8, num_stages=5, num_warps=8 + triton_bmm_66050 0.0064 ms 89.1% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=64, BLOCK_N=32, EVEN_K=False, GROUP_M=8, num_stages=5, num_warps=4 + triton_bmm_66051 0.0064 ms 89.1% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=64, BLOCK_N=64, EVEN_K=False, GROUP_M=8, num_stages=2, num_warps=4 + triton_bmm_66056 0.0064 ms 89.1% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=64, BLOCK_N=128, EVEN_K=False, GROUP_M=8, num_stages=4, num_warps=4 + triton_bmm_66053 0.0065 ms 88.2% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=64, BLOCK_N=64, EVEN_K=False, GROUP_M=8, num_stages=5, num_warps=4 + triton_bmm_66055 0.0065 ms 88.2% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=64, BLOCK_N=128, EVEN_K=False, GROUP_M=8, num_stages=4, num_warps=8 +SingleProcess AUTOTUNE benchmarking takes 0.2569 seconds and 0.0002 seconds precompiling for 17 choices +AUTOTUNE bmm(32x1024x256, 32x256x1024) + bmm 0.0625 ms 100.0% + triton_bmm_66078 0.0749 ms 83.4% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_bmm_66079 0.0803 ms 77.9% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_bmm_66071 0.0825 ms 75.8% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_bmm_66075 0.0845 ms 74.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_bmm_66073 0.0860 ms 72.7% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_bmm_66072 0.0862 ms 72.5% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=8 + triton_bmm_66076 0.0883 ms 70.8% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=8 + triton_bmm_66080 0.0942 ms 66.4% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=8 + triton_bmm_66068 0.1001 ms 62.5% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=2, num_warps=4 +SingleProcess AUTOTUNE benchmarking takes 0.5761 seconds and 0.0003 seconds precompiling for 20 choices +AUTOTUNE bmm(32x1024x1024, 32x1024x256) + bmm 0.0692 ms 100.0% + triton_bmm_66092 0.0727 ms 95.2% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_bmm_66099 0.0744 ms 93.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=8 + triton_bmm_66098 0.0810 ms 85.4% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_bmm_66093 0.0814 ms 85.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=128, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=4 + triton_bmm_66091 0.0841 ms 82.3% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=8 + triton_bmm_66090 0.0844 ms 82.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_bmm_66088 0.0892 ms 77.6% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=64, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=8 + triton_bmm_66095 0.0909 ms 76.1% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=8 + triton_bmm_66089 0.0975 ms 71.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=128, BLOCK_M=64, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=4 +SingleProcess AUTOTUNE benchmarking takes 0.5411 seconds and 0.0007 seconds precompiling for 20 choices +AUTOTUNE mm(3872x16384, 16384x2048) + mm 0.4792 ms 100.0% + triton_mm_66175 0.6427 ms 74.6% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=8 + triton_mm_66174 0.6799 ms 70.5% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_66169 0.6800 ms 70.5% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=128, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=4 + triton_mm_66173 0.6806 ms 70.4% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_66168 0.7702 ms 62.2% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_66165 0.9315 ms 51.4% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=128, BLOCK_M=64, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=4 + triton_mm_66166 0.9333 ms 51.3% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_66167 0.9378 ms 51.1% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=8 + triton_mm_66171 1.0017 ms 47.8% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=8 +SingleProcess AUTOTUNE benchmarking takes 0.9131 seconds and 0.0003 seconds precompiling for 20 choices +AUTOTUNE mm(200x4096, 4096x1024) + mm 0.0176 ms 100.0% + triton_mm_66258 0.0178 ms 98.7% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=128, BLOCK_M=64, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=4 + triton_mm_66262 0.0204 ms 86.2% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=128, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=4 + triton_mm_66254 0.0262 ms 67.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=128, BLOCK_M=64, BLOCK_N=32, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=4 + triton_mm_66268 0.0305 ms 57.6% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=8 + triton_mm_66257 0.0353 ms 49.7% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=64, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=8 + triton_mm_66261 0.0356 ms 49.4% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_66252 0.0367 ms 47.8% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=32, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=8 + triton_mm_66267 0.0388 ms 45.3% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_66251 0.0399 ms 44.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=128, BLOCK_M=32, BLOCK_N=32, EVEN_K=True, GROUP_M=8, num_stages=2, num_warps=4 +SingleProcess AUTOTUNE benchmarking takes 0.4102 seconds and 0.0003 seconds precompiling for 20 choices +AUTOTUNE mm(3872x2048, 2048x16384) + mm 0.5086 ms 100.0% + triton_mm_73373 0.5853 ms 86.9% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_73374 0.6160 ms 82.6% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=8 + triton_mm_73372 0.6188 ms 82.2% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_73367 0.8211 ms 61.9% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_73368 0.8219 ms 61.9% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=128, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=4 + triton_mm_73365 0.8268 ms 61.5% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_73370 0.8849 ms 57.5% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=8 + triton_mm_73369 0.9173 ms 55.4% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_73366 0.9243 ms 55.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=8 +SingleProcess AUTOTUNE benchmarking takes 0.8883 seconds and 0.0004 seconds precompiling for 20 choices +AUTOTUNE mm(1024x1152, 1152x4304) + mm 0.0252 ms 100.0% + triton_mm_92951 0.0322 ms 78.2% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_92953 0.0351 ms 71.7% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=8 + triton_mm_92948 0.0388 ms 64.9% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_92944 0.0406 ms 62.1% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_92945 0.0414 ms 60.8% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=8 + triton_mm_92946 0.0414 ms 60.8% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_92949 0.0434 ms 58.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=8 + triton_mm_92952 0.0443 ms 56.9% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_92947 0.0462 ms 54.5% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=128, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=4 +SingleProcess AUTOTUNE benchmarking takes 0.4839 seconds and 0.0004 seconds precompiling for 20 choices +AUTOTUNE mm(200x1024, 1024x4096) + mm 0.0131 ms 100.0% + triton_mm_72355 0.0143 ms 91.7% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=8 + triton_mm_72348 0.0155 ms 84.9% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_72344 0.0155 ms 84.5% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=64, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=8 + triton_mm_72347 0.0165 ms 79.3% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=8 + triton_mm_72351 0.0166 ms 78.8% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=8 + triton_mm_72349 0.0182 ms 72.1% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=128, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=4 + triton_mm_72345 0.0188 ms 69.8% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=128, BLOCK_M=64, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=4 + triton_mm_72346 0.0195 ms 67.2% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_72354 0.0197 ms 66.6% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 +SingleProcess AUTOTUNE benchmarking takes 0.3990 seconds and 0.0004 seconds precompiling for 20 choices +AUTOTUNE mm(4x1024, 1024x1024) + triton_mm_89769 0.0098 ms 100.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=128, BLOCK_M=16, BLOCK_N=32, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=2 + triton_mm_89773 0.0121 ms 81.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=128, BLOCK_M=16, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=4 + triton_mm_89768 0.0131 ms 75.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=16, BLOCK_N=32, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=2 + triton_mm_89767 0.0135 ms 72.5% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=16, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=4 + triton_mm_89772 0.0136 ms 72.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=16, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + mm 0.0142 ms 69.1% + triton_mm_89766 0.0146 ms 67.3% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=128, BLOCK_M=16, BLOCK_N=32, EVEN_K=True, GROUP_M=8, num_stages=2, num_warps=2 + triton_mm_89781 0.0147 ms 66.7% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=16, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=8 + triton_mm_89776 0.0151 ms 64.7% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=16, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_89779 0.0153 ms 64.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=16, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=4 +SingleProcess AUTOTUNE benchmarking takes 0.2898 seconds and 0.0003 seconds precompiling for 18 choices +AUTOTUNE mm(200x32, 32x1024) + triton_mm_71975 0.0053 ms 100.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=32, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=8 + triton_mm_71974 0.0057 ms 92.7% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=32, BLOCK_N=32, EVEN_K=True, GROUP_M=8, num_stages=2, num_warps=4 + triton_mm_71976 0.0057 ms 92.7% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=32, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=8 + triton_mm_71980 0.0057 ms 92.2% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=8 + triton_mm_71979 0.0059 ms 89.7% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=2, num_warps=4 + triton_mm_71981 0.0059 ms 89.2% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=4 + triton_mm_71973 0.0060 ms 88.7% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=32, BLOCK_N=32, EVEN_K=True, GROUP_M=8, num_stages=1, num_warps=2 + triton_mm_71977 0.0062 ms 84.6% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=32, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=4 + mm 0.0063 ms 83.8% + triton_mm_71986 0.0064 ms 82.1% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=8 +SingleProcess AUTOTUNE benchmarking takes 0.2698 seconds and 0.0004 seconds precompiling for 18 choices +E1218 10:52:13.364000 204 site-packages/torch/_inductor/select_algorithm.py:2100] [2/0] Runtime error during autotuning: +E1218 10:52:13.364000 204 site-packages/torch/_inductor/select_algorithm.py:2100] [2/0] No valid triton configs. OutOfResources: out of resource: shared memory, Required: 245760, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.. +E1218 10:52:13.364000 204 site-packages/torch/_inductor/select_algorithm.py:2100] [2/0] Ignoring this choice. +AUTOTUNE mm(32x200, 200x1024) + triton_mm_71993 0.0069 ms 100.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=32, BLOCK_N=32, EVEN_K=False, GROUP_M=8, num_stages=5, num_warps=4 + triton_mm_71992 0.0077 ms 89.7% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=32, BLOCK_N=64, EVEN_K=False, GROUP_M=8, num_stages=5, num_warps=8 + triton_mm_72004 0.0079 ms 88.2% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=32, BLOCK_N=64, EVEN_K=False, GROUP_M=8, num_stages=4, num_warps=8 + triton_mm_71991 0.0079 ms 87.5% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=128, BLOCK_M=32, BLOCK_N=32, EVEN_K=False, GROUP_M=8, num_stages=2, num_warps=4 + triton_mm_71994 0.0080 ms 86.5% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=128, BLOCK_M=32, BLOCK_N=32, EVEN_K=False, GROUP_M=8, num_stages=5, num_warps=4 + triton_mm_71997 0.0080 ms 86.5% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=32, BLOCK_N=64, EVEN_K=False, GROUP_M=8, num_stages=3, num_warps=8 + mm 0.0085 ms 81.6% + triton_mm_72003 0.0087 ms 80.1% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=32, BLOCK_N=64, EVEN_K=False, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_72001 0.0091 ms 76.1% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=32, BLOCK_N=128, EVEN_K=False, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_71999 0.0092 ms 75.6% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=32, BLOCK_N=128, EVEN_K=False, GROUP_M=8, num_stages=3, num_warps=4 +SingleProcess AUTOTUNE benchmarking takes 0.2681 seconds and 0.0003 seconds precompiling for 18 choices +AUTOTUNE mm(3072x4, 4x1024) + triton_mm_72025 0.0107 ms 100.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=32, BLOCK_N=32, EVEN_K=False, GROUP_M=8, num_stages=2, num_warps=4 + triton_mm_72026 0.0107 ms 99.4% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=32, BLOCK_N=64, EVEN_K=False, GROUP_M=8, num_stages=5, num_warps=8 + triton_mm_72024 0.0108 ms 99.1% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=32, BLOCK_N=32, EVEN_K=False, GROUP_M=8, num_stages=1, num_warps=2 + triton_mm_72030 0.0115 ms 92.8% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=64, BLOCK_N=64, EVEN_K=False, GROUP_M=8, num_stages=3, num_warps=8 + triton_mm_72028 0.0116 ms 92.2% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=64, BLOCK_N=32, EVEN_K=False, GROUP_M=8, num_stages=5, num_warps=4 + triton_mm_72027 0.0119 ms 89.8% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=64, BLOCK_N=32, EVEN_K=False, GROUP_M=8, num_stages=5, num_warps=8 + triton_mm_72031 0.0121 ms 88.3% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=64, BLOCK_N=64, EVEN_K=False, GROUP_M=8, num_stages=5, num_warps=4 + triton_mm_72029 0.0122 ms 87.2% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=64, BLOCK_N=64, EVEN_K=False, GROUP_M=8, num_stages=2, num_warps=4 + triton_mm_72037 0.0126 ms 84.7% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=128, BLOCK_N=128, EVEN_K=False, GROUP_M=8, num_stages=2, num_warps=8 + triton_mm_72039 0.0127 ms 84.1% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=128, BLOCK_N=128, EVEN_K=False, GROUP_M=8, num_stages=5, num_warps=8 +SingleProcess AUTOTUNE benchmarking takes 0.2655 seconds and 0.0003 seconds precompiling for 17 choices +AUTOTUNE mm(200x4096, 4096x1024) + mm 0.0163 ms 100.0% + triton_mm_72383 0.0167 ms 97.5% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=128, BLOCK_M=64, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=4 + triton_mm_72387 0.0210 ms 77.7% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=128, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=4 + triton_mm_72379 0.0259 ms 63.1% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=128, BLOCK_M=64, BLOCK_N=32, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=4 + triton_mm_72393 0.0296 ms 55.2% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=8 + triton_mm_72377 0.0327 ms 49.9% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=32, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=8 + triton_mm_72386 0.0341 ms 47.8% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_72382 0.0348 ms 46.9% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=64, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=8 + triton_mm_72392 0.0370 ms 44.1% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_72378 0.0396 ms 41.2% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=32, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=8 +SingleProcess AUTOTUNE benchmarking takes 0.4024 seconds and 0.0003 seconds precompiling for 20 choices +AUTOTUNE mm(1024x200, 200x4096) + triton_mm_72327 0.0121 ms 100.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=128, EVEN_K=False, GROUP_M=8, num_stages=3, num_warps=4 + mm 0.0124 ms 97.7% + triton_mm_72331 0.0126 ms 95.9% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=64, EVEN_K=False, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_72334 0.0126 ms 95.9% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=128, EVEN_K=False, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_72328 0.0140 ms 86.1% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=128, EVEN_K=False, GROUP_M=8, num_stages=4, num_warps=8 + triton_mm_72329 0.0144 ms 84.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=64, BLOCK_N=128, EVEN_K=False, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_72332 0.0146 ms 83.1% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=64, EVEN_K=False, GROUP_M=8, num_stages=4, num_warps=8 + triton_mm_72324 0.0158 ms 76.7% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=64, EVEN_K=False, GROUP_M=8, num_stages=2, num_warps=4 + triton_mm_72335 0.0159 ms 76.2% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=128, BLOCK_N=128, EVEN_K=False, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_72336 0.0180 ms 67.3% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=128, BLOCK_N=128, EVEN_K=False, GROUP_M=8, num_stages=5, num_warps=8 +SingleProcess AUTOTUNE benchmarking takes 0.3310 seconds and 0.0003 seconds precompiling for 20 choices +AUTOTUNE mm(4096x200, 200x1024) + mm 0.0109 ms 100.0% + triton_mm_72365 0.0121 ms 90.2% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=128, EVEN_K=False, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_72372 0.0127 ms 85.9% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=128, EVEN_K=False, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_72369 0.0127 ms 85.6% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=64, EVEN_K=False, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_72370 0.0139 ms 78.3% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=64, EVEN_K=False, GROUP_M=8, num_stages=4, num_warps=8 + triton_mm_72366 0.0140 ms 77.4% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=128, EVEN_K=False, GROUP_M=8, num_stages=4, num_warps=8 + triton_mm_72367 0.0150 ms 72.5% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=64, BLOCK_N=128, EVEN_K=False, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_72373 0.0150 ms 72.3% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=128, BLOCK_N=128, EVEN_K=False, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_72362 0.0158 ms 68.8% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=64, EVEN_K=False, GROUP_M=8, num_stages=2, num_warps=4 + triton_mm_72371 0.0164 ms 66.1% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=128, EVEN_K=False, GROUP_M=8, num_stages=2, num_warps=8 +SingleProcess AUTOTUNE benchmarking takes 0.3316 seconds and 0.0003 seconds precompiling for 20 choices +AUTOTUNE mm_plus_mm(4x3072, 3072x1024, 4x3072, 3072x1024) + _mm_plus_mm 0.0400 ms 100.0% + triton_mm_plus_mm_72452 0.0681 ms 58.7% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=32, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=8 + triton_mm_plus_mm_72451 0.0828 ms 48.2% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=32, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=8 + triton_mm_plus_mm_72455 0.0852 ms 46.9% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=128, BLOCK_M=32, BLOCK_N=32, EVEN_K=True, GROUP_M=8, num_stages=1, num_warps=8 + triton_mm_plus_mm_72454 0.1695 ms 23.6% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=64, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=1, num_warps=8 + triton_mm_plus_mm_72449 0.1977 ms 20.2% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=8 + triton_mm_plus_mm_72450 0.2365 ms 16.9% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=16 + triton_mm_plus_mm_72457 0.2526 ms 15.8% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=32, BLOCK_N=32, EVEN_K=True, GROUP_M=8, num_stages=1, num_warps=2 + triton_mm_plus_mm_72448 0.2670 ms 15.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=2, num_warps=4 + triton_mm_plus_mm_72456 0.3812 ms 10.5% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=64, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=2, num_warps=4 +SingleProcess AUTOTUNE benchmarking takes 0.3637 seconds and 0.0002 seconds precompiling for 11 choices +AUTOTUNE mm(1024x200, 200x2048) + triton_mm_72467 0.0096 ms 100.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=128, EVEN_K=False, GROUP_M=8, num_stages=3, num_warps=4 + mm 0.0098 ms 98.7% + triton_mm_72469 0.0098 ms 98.7% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=64, BLOCK_N=128, EVEN_K=False, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_72472 0.0099 ms 97.7% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=64, EVEN_K=False, GROUP_M=8, num_stages=4, num_warps=8 + triton_mm_72471 0.0101 ms 95.3% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=64, EVEN_K=False, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_72468 0.0102 ms 94.1% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=128, EVEN_K=False, GROUP_M=8, num_stages=4, num_warps=8 + triton_mm_72475 0.0104 ms 92.3% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=128, BLOCK_N=128, EVEN_K=False, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_72474 0.0107 ms 90.4% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=128, EVEN_K=False, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_72464 0.0109 ms 88.3% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=64, EVEN_K=False, GROUP_M=8, num_stages=2, num_warps=4 + triton_mm_72465 0.0116 ms 83.1% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=64, BLOCK_N=64, EVEN_K=False, GROUP_M=8, num_stages=3, num_warps=8 +SingleProcess AUTOTUNE benchmarking takes 0.3165 seconds and 0.0003 seconds precompiling for 20 choices +AUTOTUNE mm(200x1024, 1024x2048) + mm 0.0101 ms 100.0% + triton_mm_72489 0.0114 ms 89.3% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=128, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=4 + triton_mm_72495 0.0129 ms 78.9% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=8 + triton_mm_72484 0.0132 ms 76.9% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=64, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=8 + triton_mm_72488 0.0133 ms 76.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_72485 0.0135 ms 75.3% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=128, BLOCK_M=64, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=4 + triton_mm_72487 0.0146 ms 69.7% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=8 + triton_mm_72491 0.0148 ms 68.8% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=8 + triton_mm_72479 0.0158 ms 64.2% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=32, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=8 + triton_mm_72481 0.0159 ms 63.9% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=128, BLOCK_M=64, BLOCK_N=32, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=4 +SingleProcess AUTOTUNE benchmarking takes 0.3253 seconds and 0.0003 seconds precompiling for 20 choices +AUTOTUNE bmm(32x1018x256, 32x256x1018) + triton_bmm_72531 0.0839 ms 100.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_bmm_72532 0.0889 ms 94.4% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_bmm_72524 0.0905 ms 92.7% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_bmm_72526 0.0929 ms 90.4% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_bmm_72528 0.0946 ms 88.7% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_bmm_72525 0.1007 ms 83.3% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=8 + triton_bmm_72529 0.1017 ms 82.5% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=8 + triton_bmm_72533 0.1044 ms 80.4% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=8 + triton_bmm_72521 0.1094 ms 76.7% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=2, num_warps=4 + triton_bmm_72530 0.1134 ms 74.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=2, num_warps=8 +SingleProcess AUTOTUNE benchmarking takes 0.5996 seconds and 0.0003 seconds precompiling for 20 choices +AUTOTUNE bmm(32x1018x1018, 32x1018x256) + triton_bmm_72512 0.0934 ms 100.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=128, EVEN_K=False, GROUP_M=8, num_stages=3, num_warps=4 + triton_bmm_72505 0.0974 ms 95.9% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=128, EVEN_K=False, GROUP_M=8, num_stages=3, num_warps=4 + triton_bmm_72507 0.0976 ms 95.7% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=64, BLOCK_N=128, EVEN_K=False, GROUP_M=8, num_stages=3, num_warps=4 + triton_bmm_72506 0.0977 ms 95.7% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=128, EVEN_K=False, GROUP_M=8, num_stages=4, num_warps=8 + triton_bmm_72514 0.1032 ms 90.5% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=128, BLOCK_N=128, EVEN_K=False, GROUP_M=8, num_stages=5, num_warps=8 + triton_bmm_72509 0.1052 ms 88.8% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=64, EVEN_K=False, GROUP_M=8, num_stages=3, num_warps=4 + triton_bmm_72510 0.1096 ms 85.3% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=64, EVEN_K=False, GROUP_M=8, num_stages=4, num_warps=8 + triton_bmm_72511 0.1230 ms 75.9% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=128, EVEN_K=False, GROUP_M=8, num_stages=2, num_warps=8 + triton_bmm_72503 0.1287 ms 72.6% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=64, BLOCK_N=64, EVEN_K=False, GROUP_M=8, num_stages=3, num_warps=8 + bmm 0.1344 ms 69.5% +SingleProcess AUTOTUNE benchmarking takes 0.6084 seconds and 0.0004 seconds precompiling for 20 choices +AUTOTUNE bmm(32x256x1018, 32x1018x1018) + triton_bmm_72550 0.0964 ms 100.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=128, EVEN_K=False, GROUP_M=8, num_stages=3, num_warps=4 + triton_bmm_72548 0.0972 ms 99.2% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=64, EVEN_K=False, GROUP_M=8, num_stages=4, num_warps=8 + triton_bmm_72547 0.1033 ms 93.3% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=64, EVEN_K=False, GROUP_M=8, num_stages=3, num_warps=4 + triton_bmm_72543 0.1040 ms 92.7% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=128, EVEN_K=False, GROUP_M=8, num_stages=3, num_warps=4 + triton_bmm_72552 0.1079 ms 89.3% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=128, BLOCK_N=128, EVEN_K=False, GROUP_M=8, num_stages=5, num_warps=8 + triton_bmm_72544 0.1140 ms 84.5% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=128, EVEN_K=False, GROUP_M=8, num_stages=4, num_warps=8 + triton_bmm_72545 0.1169 ms 82.5% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=64, BLOCK_N=128, EVEN_K=False, GROUP_M=8, num_stages=3, num_warps=4 + triton_bmm_72549 0.1237 ms 77.9% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=128, EVEN_K=False, GROUP_M=8, num_stages=2, num_warps=8 + triton_bmm_72541 0.1298 ms 74.3% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=64, BLOCK_N=64, EVEN_K=False, GROUP_M=8, num_stages=3, num_warps=8 + bmm 0.1348 ms 71.5% +SingleProcess AUTOTUNE benchmarking takes 0.6249 seconds and 0.0006 seconds precompiling for 20 choices +AUTOTUNE bmm(32x1018x1018, 32x1018x256) + triton_bmm_72569 0.1335 ms 100.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=128, EVEN_K=False, GROUP_M=8, num_stages=3, num_warps=4 + triton_bmm_72562 0.1353 ms 98.7% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=128, EVEN_K=False, GROUP_M=8, num_stages=3, num_warps=4 + triton_bmm_72566 0.1421 ms 93.9% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=64, EVEN_K=False, GROUP_M=8, num_stages=3, num_warps=4 + triton_bmm_72563 0.1426 ms 93.6% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=128, EVEN_K=False, GROUP_M=8, num_stages=4, num_warps=8 + bmm 0.1468 ms 91.0% + triton_bmm_72567 0.1485 ms 89.9% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=64, EVEN_K=False, GROUP_M=8, num_stages=4, num_warps=8 + triton_bmm_72571 0.1554 ms 85.9% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=128, BLOCK_N=128, EVEN_K=False, GROUP_M=8, num_stages=5, num_warps=8 + triton_bmm_72564 0.1591 ms 83.9% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=64, BLOCK_N=128, EVEN_K=False, GROUP_M=8, num_stages=3, num_warps=4 + triton_bmm_72560 0.1625 ms 82.2% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=64, BLOCK_N=64, EVEN_K=False, GROUP_M=8, num_stages=3, num_warps=8 + triton_bmm_72559 0.1813 ms 73.6% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=64, EVEN_K=False, GROUP_M=8, num_stages=2, num_warps=4 +SingleProcess AUTOTUNE benchmarking takes 0.7181 seconds and 0.0004 seconds precompiling for 20 choices +AUTOTUNE mm(256x200, 200x1024) + triton_mm_72580 0.0067 ms 100.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=128, BLOCK_M=64, BLOCK_N=64, EVEN_K=False, GROUP_M=8, num_stages=5, num_warps=4 + mm 0.0069 ms 96.7% + triton_mm_72584 0.0069 ms 96.3% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=128, BLOCK_M=64, BLOCK_N=128, EVEN_K=False, GROUP_M=8, num_stages=4, num_warps=4 + triton_mm_72579 0.0072 ms 92.4% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=64, BLOCK_N=64, EVEN_K=False, GROUP_M=8, num_stages=3, num_warps=8 + triton_mm_72583 0.0073 ms 91.6% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=64, BLOCK_N=128, EVEN_K=False, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_72575 0.0073 ms 91.2% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=32, EVEN_K=False, GROUP_M=8, num_stages=5, num_warps=8 + triton_mm_72586 0.0074 ms 89.7% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=64, EVEN_K=False, GROUP_M=8, num_stages=4, num_warps=8 + triton_mm_72582 0.0075 ms 89.3% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=128, EVEN_K=False, GROUP_M=8, num_stages=4, num_warps=8 + triton_mm_72573 0.0076 ms 87.8% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=128, BLOCK_M=32, BLOCK_N=32, EVEN_K=False, GROUP_M=8, num_stages=2, num_warps=4 + triton_mm_72574 0.0076 ms 87.4% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=32, BLOCK_N=64, EVEN_K=False, GROUP_M=8, num_stages=5, num_warps=8 +SingleProcess AUTOTUNE benchmarking takes 0.3094 seconds and 0.0003 seconds precompiling for 20 choices +AUTOTUNE mm(200x256, 256x1024) + triton_mm_72599 0.0068 ms 100.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=128, BLOCK_M=64, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=4 + triton_mm_72598 0.0069 ms 97.7% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=64, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=8 + mm 0.0070 ms 96.4% + triton_mm_72602 0.0071 ms 95.9% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_72603 0.0072 ms 94.6% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=128, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=4 + triton_mm_72593 0.0072 ms 94.2% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=32, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=8 + triton_mm_72592 0.0073 ms 93.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=128, BLOCK_M=32, BLOCK_N=32, EVEN_K=True, GROUP_M=8, num_stages=2, num_warps=4 + triton_mm_72605 0.0077 ms 88.3% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=8 + triton_mm_72594 0.0078 ms 87.2% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=32, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=8 + triton_mm_72601 0.0078 ms 86.9% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=8 +SingleProcess AUTOTUNE benchmarking takes 0.3036 seconds and 0.0003 seconds precompiling for 20 choices +AUTOTUNE mm(2048x200, 200x1024) + mm 0.0091 ms 100.0% + triton_mm_72657 0.0096 ms 94.3% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=128, EVEN_K=False, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_72659 0.0098 ms 92.8% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=64, BLOCK_N=128, EVEN_K=False, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_72662 0.0099 ms 91.9% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=64, EVEN_K=False, GROUP_M=8, num_stages=4, num_warps=8 + triton_mm_72658 0.0102 ms 89.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=128, EVEN_K=False, GROUP_M=8, num_stages=4, num_warps=8 + triton_mm_72661 0.0102 ms 89.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=64, EVEN_K=False, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_72665 0.0104 ms 87.1% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=128, BLOCK_N=128, EVEN_K=False, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_72664 0.0107 ms 84.7% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=128, EVEN_K=False, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_72654 0.0110 ms 82.5% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=64, EVEN_K=False, GROUP_M=8, num_stages=2, num_warps=4 + triton_mm_72655 0.0115 ms 79.1% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=64, BLOCK_N=64, EVEN_K=False, GROUP_M=8, num_stages=3, num_warps=8 +SingleProcess AUTOTUNE benchmarking takes 0.3150 seconds and 0.0003 seconds precompiling for 20 choices +AUTOTUNE mm(200x2048, 2048x1024) + mm 0.0111 ms 100.0% + triton_mm_72675 0.0117 ms 94.5% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=128, BLOCK_M=64, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=4 + triton_mm_72679 0.0141 ms 78.6% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=128, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=4 + triton_mm_72671 0.0165 ms 67.2% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=128, BLOCK_M=64, BLOCK_N=32, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=4 + triton_mm_72685 0.0175 ms 63.3% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=8 + triton_mm_72669 0.0188 ms 58.9% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=32, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=8 + triton_mm_72674 0.0193 ms 57.5% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=64, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=8 + triton_mm_72678 0.0201 ms 55.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_72684 0.0218 ms 50.8% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_72677 0.0220 ms 50.4% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=8 +SingleProcess AUTOTUNE benchmarking takes 0.3474 seconds and 0.0004 seconds precompiling for 20 choices +AUTOTUNE mm(4x3072, 3072x1024) + triton_mm_72690 0.0172 ms 100.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=128, BLOCK_M=16, BLOCK_N=32, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=2 + mm 0.0205 ms 84.0% + triton_mm_72694 0.0240 ms 71.9% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=128, BLOCK_M=16, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=4 + triton_mm_72689 0.0289 ms 59.8% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=16, BLOCK_N=32, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=2 + triton_mm_72693 0.0301 ms 57.2% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=16, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_72688 0.0303 ms 56.9% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=16, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=4 + triton_mm_72702 0.0323 ms 53.5% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=16, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=8 + triton_mm_72687 0.0345 ms 50.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=128, BLOCK_M=16, BLOCK_N=32, EVEN_K=True, GROUP_M=8, num_stages=2, num_warps=2 + triton_mm_72697 0.0346 ms 49.8% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=16, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_72700 0.0351 ms 49.1% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=16, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=4 +SingleProcess AUTOTUNE benchmarking takes 0.3521 seconds and 0.0002 seconds precompiling for 18 choices +AUTOTUNE mm(256x3872, 3872x2048) + mm 0.0238 ms 100.0% + triton_mm_72731 0.0243 ms 97.9% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=128, BLOCK_M=64, BLOCK_N=128, EVEN_K=False, GROUP_M=8, num_stages=4, num_warps=4 + triton_mm_72737 0.0314 ms 75.8% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=128, BLOCK_N=128, EVEN_K=False, GROUP_M=8, num_stages=5, num_warps=8 + triton_mm_72727 0.0335 ms 71.1% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=128, BLOCK_M=64, BLOCK_N=64, EVEN_K=False, GROUP_M=8, num_stages=5, num_warps=4 + triton_mm_72730 0.0381 ms 62.5% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=64, BLOCK_N=128, EVEN_K=False, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_72726 0.0387 ms 61.5% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=64, BLOCK_N=64, EVEN_K=False, GROUP_M=8, num_stages=3, num_warps=8 + triton_mm_72723 0.0396 ms 60.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=128, BLOCK_M=64, BLOCK_N=32, EVEN_K=False, GROUP_M=8, num_stages=5, num_warps=4 + triton_mm_72729 0.0413 ms 57.6% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=8 + triton_mm_72733 0.0415 ms 57.4% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=8 + triton_mm_72736 0.0420 ms 56.7% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=128, BLOCK_N=128, EVEN_K=False, GROUP_M=8, num_stages=3, num_warps=4 +SingleProcess AUTOTUNE benchmarking takes 0.4152 seconds and 0.0003 seconds precompiling for 20 choices +AUTOTUNE mm(3872x256, 256x2048) + mm 0.0176 ms 100.0% + triton_mm_72747 0.0203 ms 86.9% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_72751 0.0213 ms 82.5% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_72749 0.0218 ms 80.9% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_72754 0.0219 ms 80.4% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_72755 0.0220 ms 79.9% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_72748 0.0235 ms 75.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=8 + triton_mm_72752 0.0246 ms 71.4% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=8 + triton_mm_72744 0.0260 ms 67.7% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=2, num_warps=4 + triton_mm_72756 0.0266 ms 66.2% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=8 +SingleProcess AUTOTUNE benchmarking takes 0.3684 seconds and 0.0004 seconds precompiling for 20 choices +AUTOTUNE mm(2048x3872, 3872x2048) + mm 0.0635 ms 100.0% + triton_mm_72804 0.0741 ms 85.6% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_72811 0.0772 ms 82.2% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_72808 0.0884 ms 71.8% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_72812 0.0929 ms 68.3% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=128, BLOCK_N=128, EVEN_K=False, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_72806 0.0953 ms 66.6% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=64, BLOCK_N=128, EVEN_K=False, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_72813 0.1001 ms 63.4% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=128, BLOCK_N=128, EVEN_K=False, GROUP_M=8, num_stages=5, num_warps=8 + triton_mm_72809 0.1044 ms 60.8% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=8 + triton_mm_72805 0.1047 ms 60.6% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=8 + triton_mm_72807 0.1065 ms 59.6% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=128, BLOCK_M=64, BLOCK_N=128, EVEN_K=False, GROUP_M=8, num_stages=4, num_warps=4 +SingleProcess AUTOTUNE benchmarking takes 0.6097 seconds and 0.0003 seconds precompiling for 20 choices +AUTOTUNE mm(3872x2048, 2048x2048) + mm 0.0654 ms 100.0% + triton_mm_72831 0.0815 ms 80.2% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_72830 0.0840 ms 77.8% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_72832 0.0841 ms 77.8% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=8 + triton_mm_72825 0.0863 ms 75.8% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_72823 0.0881 ms 74.3% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_72824 0.1004 ms 65.2% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=8 + triton_mm_72826 0.1061 ms 61.7% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=128, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=4 + triton_mm_72827 0.1097 ms 59.6% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_72828 0.1121 ms 58.4% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=8 +SingleProcess AUTOTUNE benchmarking takes 0.5941 seconds and 0.0004 seconds precompiling for 20 choices +AUTOTUNE mm(2048x3872, 3872x16384) + mm 0.4794 ms 100.0% + triton_mm_73353 0.5697 ms 84.1% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_73354 0.5725 ms 83.7% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=128, BLOCK_N=128, EVEN_K=False, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_73355 0.6076 ms 78.9% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=128, BLOCK_N=128, EVEN_K=False, GROUP_M=8, num_stages=5, num_warps=8 + triton_mm_73349 0.7725 ms 62.1% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=128, BLOCK_M=64, BLOCK_N=128, EVEN_K=False, GROUP_M=8, num_stages=4, num_warps=4 + triton_mm_73346 0.7801 ms 61.5% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_73348 0.8330 ms 57.5% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=64, BLOCK_N=128, EVEN_K=False, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_73347 0.8356 ms 57.4% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=8 + triton_mm_73351 0.8754 ms 54.8% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=8 + triton_mm_73350 0.8891 ms 53.9% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 +SingleProcess AUTOTUNE benchmarking takes 0.9518 seconds and 0.0003 seconds precompiling for 20 choices +AUTOTUNE mm(16384x3872, 3872x2048) + mm 0.4818 ms 100.0% + triton_mm_73391 0.5397 ms 89.3% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_73392 0.5638 ms 85.5% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=128, BLOCK_N=128, EVEN_K=False, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_73393 0.6107 ms 78.9% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=128, BLOCK_N=128, EVEN_K=False, GROUP_M=8, num_stages=5, num_warps=8 + triton_mm_73384 0.7385 ms 65.2% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_73387 0.7576 ms 63.6% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=128, BLOCK_M=64, BLOCK_N=128, EVEN_K=False, GROUP_M=8, num_stages=4, num_warps=4 + triton_mm_73386 0.8040 ms 59.9% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=64, BLOCK_N=128, EVEN_K=False, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_73388 0.8161 ms 59.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_73385 0.8286 ms 58.1% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=8 + triton_mm_73389 0.8481 ms 56.8% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=8 +SingleProcess AUTOTUNE benchmarking takes 0.8835 seconds and 0.0003 seconds precompiling for 20 choices +AUTOTUNE mm(3872x16384, 16384x2048) + mm 0.4923 ms 100.0% + triton_mm_73412 0.6420 ms 76.7% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=8 + triton_mm_73411 0.6843 ms 71.9% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_73410 0.7096 ms 69.4% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_73406 0.7460 ms 66.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=128, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=4 + triton_mm_73405 0.8121 ms 60.6% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_73404 0.8273 ms 59.5% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=8 + triton_mm_73403 0.9172 ms 53.7% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_73402 0.9224 ms 53.4% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=128, BLOCK_M=64, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=4 + triton_mm_73408 0.9330 ms 52.8% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=8 +SingleProcess AUTOTUNE benchmarking takes 0.9045 seconds and 0.0003 seconds precompiling for 20 choices +AUTOTUNE mm(1024x4, 4x1024) + triton_mm_89783 0.0065 ms 100.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=32, BLOCK_N=32, EVEN_K=False, GROUP_M=8, num_stages=2, num_warps=4 + triton_mm_89784 0.0065 ms 99.5% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=32, BLOCK_N=64, EVEN_K=False, GROUP_M=8, num_stages=5, num_warps=8 + triton_mm_89782 0.0065 ms 99.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=32, BLOCK_N=32, EVEN_K=False, GROUP_M=8, num_stages=1, num_warps=2 + triton_mm_89789 0.0070 ms 91.8% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=64, BLOCK_N=64, EVEN_K=False, GROUP_M=8, num_stages=5, num_warps=4 + triton_mm_89788 0.0071 ms 91.4% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=64, BLOCK_N=64, EVEN_K=False, GROUP_M=8, num_stages=3, num_warps=8 + triton_mm_89787 0.0072 ms 89.8% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=64, BLOCK_N=64, EVEN_K=False, GROUP_M=8, num_stages=2, num_warps=4 + triton_mm_89785 0.0072 ms 89.4% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=64, BLOCK_N=32, EVEN_K=False, GROUP_M=8, num_stages=5, num_warps=8 + triton_mm_89786 0.0073 ms 88.2% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=64, BLOCK_N=32, EVEN_K=False, GROUP_M=8, num_stages=5, num_warps=4 + mm 0.0076 ms 85.6% + triton_mm_89793 0.0079 ms 81.8% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=128, BLOCK_N=64, EVEN_K=False, GROUP_M=8, num_stages=3, num_warps=4 +SingleProcess AUTOTUNE benchmarking takes 0.2604 seconds and 0.0003 seconds precompiling for 17 choices +E1218 10:52:28.328000 204 site-packages/torch/_inductor/select_algorithm.py:2100] [2/0] Runtime error during autotuning: +E1218 10:52:28.328000 204 site-packages/torch/_inductor/select_algorithm.py:2100] [2/0] No valid triton configs. OutOfResources: out of resource: shared memory, Required: 245760, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.. +E1218 10:52:28.328000 204 site-packages/torch/_inductor/select_algorithm.py:2100] [2/0] Ignoring this choice. +AUTOTUNE mm(1024x200, 200x32) + triton_mm_89816 0.0070 ms 100.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=32, BLOCK_N=32, EVEN_K=False, GROUP_M=8, num_stages=5, num_warps=4 + triton_mm_89815 0.0081 ms 85.8% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=128, BLOCK_M=32, BLOCK_N=32, EVEN_K=False, GROUP_M=8, num_stages=2, num_warps=4 + mm 0.0085 ms 82.0% + triton_mm_89822 0.0126 ms 55.3% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=32, EVEN_K=False, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_89824 0.0133 ms 52.3% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=64, BLOCK_N=32, EVEN_K=False, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_89823 0.0140 ms 49.7% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=32, EVEN_K=False, GROUP_M=8, num_stages=4, num_warps=8 + triton_mm_89814 0.0141 ms 49.5% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=32, BLOCK_N=32, EVEN_K=False, GROUP_M=8, num_stages=1, num_warps=2 + triton_mm_89821 0.0143 ms 48.8% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=64, BLOCK_N=32, EVEN_K=False, GROUP_M=8, num_stages=3, num_warps=8 + triton_mm_89817 0.0143 ms 48.7% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=32, EVEN_K=False, GROUP_M=8, num_stages=5, num_warps=8 + triton_mm_89820 0.0152 ms 46.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=32, EVEN_K=False, GROUP_M=8, num_stages=2, num_warps=4 +SingleProcess AUTOTUNE benchmarking takes 0.2862 seconds and 0.0003 seconds precompiling for 18 choices +AUTOTUNE mm(1024x2048, 2048x1152) + mm 0.0173 ms 100.0% + triton_mm_89849 0.0181 ms 95.4% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=8 + triton_mm_89838 0.0217 ms 79.9% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=64, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=8 + triton_mm_89842 0.0222 ms 78.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_89848 0.0236 ms 73.3% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_89841 0.0249 ms 69.6% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=8 + triton_mm_89845 0.0255 ms 67.8% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=8 + triton_mm_89843 0.0256 ms 67.6% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=128, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=4 + triton_mm_89839 0.0259 ms 66.8% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=128, BLOCK_M=64, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=4 + triton_mm_89840 0.0322 ms 53.8% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 +SingleProcess AUTOTUNE benchmarking takes 0.3743 seconds and 0.0005 seconds precompiling for 20 choices +AUTOTUNE mm(1024x4304, 4304x1152) + mm 0.0302 ms 100.0% + triton_mm_89970 0.0377 ms 80.3% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=128, BLOCK_N=128, EVEN_K=False, GROUP_M=8, num_stages=5, num_warps=8 + triton_mm_89963 0.0476 ms 63.6% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=64, BLOCK_N=128, EVEN_K=False, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_89969 0.0494 ms 61.2% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=128, BLOCK_N=128, EVEN_K=False, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_89959 0.0544 ms 55.6% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=64, BLOCK_N=64, EVEN_K=False, GROUP_M=8, num_stages=3, num_warps=8 + triton_mm_89964 0.0561 ms 53.9% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=128, BLOCK_M=64, BLOCK_N=128, EVEN_K=False, GROUP_M=8, num_stages=4, num_warps=4 + triton_mm_89962 0.0576 ms 52.5% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=128, EVEN_K=False, GROUP_M=8, num_stages=4, num_warps=8 + triton_mm_89960 0.0581 ms 52.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=128, BLOCK_M=64, BLOCK_N=64, EVEN_K=False, GROUP_M=8, num_stages=5, num_warps=4 + triton_mm_89966 0.0590 ms 51.3% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=64, EVEN_K=False, GROUP_M=8, num_stages=4, num_warps=8 + triton_mm_89965 0.0716 ms 42.2% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=64, EVEN_K=False, GROUP_M=8, num_stages=3, num_warps=4 +SingleProcess AUTOTUNE benchmarking takes 0.4694 seconds and 0.0003 seconds precompiling for 20 choices +AUTOTUNE mm(1024x4304, 4304x1152) + mm 0.0305 ms 100.0% + triton_mm_92972 0.0361 ms 84.6% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=128, BLOCK_N=128, EVEN_K=False, GROUP_M=8, num_stages=5, num_warps=8 + triton_mm_92965 0.0434 ms 70.3% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=64, BLOCK_N=128, EVEN_K=False, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_92971 0.0454 ms 67.2% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=128, BLOCK_N=128, EVEN_K=False, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_92966 0.0524 ms 58.2% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=128, BLOCK_M=64, BLOCK_N=128, EVEN_K=False, GROUP_M=8, num_stages=4, num_warps=4 + triton_mm_92968 0.0547 ms 55.8% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=64, EVEN_K=False, GROUP_M=8, num_stages=4, num_warps=8 + triton_mm_92964 0.0548 ms 55.7% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=128, EVEN_K=False, GROUP_M=8, num_stages=4, num_warps=8 + triton_mm_92961 0.0554 ms 55.1% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=64, BLOCK_N=64, EVEN_K=False, GROUP_M=8, num_stages=3, num_warps=8 + triton_mm_92962 0.0565 ms 54.1% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=128, BLOCK_M=64, BLOCK_N=64, EVEN_K=False, GROUP_M=8, num_stages=5, num_warps=4 + triton_mm_92963 0.0639 ms 47.8% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=128, EVEN_K=False, GROUP_M=8, num_stages=3, num_warps=4 +SingleProcess AUTOTUNE benchmarking takes 0.4654 seconds and 0.0003 seconds precompiling for 20 choices +AUTOTUNE mm(1024x1152, 1152x1152) + mm 0.0124 ms 100.0% + triton_mm_92991 0.0135 ms 92.4% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=8 + triton_mm_92984 0.0149 ms 83.3% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_92980 0.0152 ms 82.1% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=64, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=8 + triton_mm_92983 0.0167 ms 74.7% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=8 + triton_mm_92987 0.0170 ms 73.3% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=8 + triton_mm_92985 0.0183 ms 67.9% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=128, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=4 + triton_mm_92981 0.0188 ms 66.3% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=128, BLOCK_M=64, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=4 + triton_mm_92982 0.0196 ms 63.5% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_92990 0.0205 ms 60.7% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 +SingleProcess AUTOTUNE benchmarking takes 0.3385 seconds and 0.0003 seconds precompiling for 20 choices +AUTOTUNE mm_plus_mm(2048x1024, 1024x1152, 2048x1024, 1024x1152) + _mm_plus_mm 0.0313 ms 100.0% + triton_mm_plus_mm_99118 0.0481 ms 65.1% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=8 + triton_mm_plus_mm_99117 0.0562 ms 55.7% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=2, num_warps=4 + triton_mm_plus_mm_99122 0.0632 ms 49.5% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=1, num_warps=8 + triton_mm_plus_mm_99123 0.0680 ms 46.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=64, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=1, num_warps=8 + triton_mm_plus_mm_99125 0.0888 ms 35.2% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=64, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=2, num_warps=4 + triton_mm_plus_mm_99120 0.0969 ms 32.3% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=32, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=8 + triton_mm_plus_mm_99119 0.1109 ms 28.2% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=16 + triton_mm_plus_mm_99121 0.1219 ms 25.7% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=32, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=8 + triton_mm_plus_mm_99124 0.1743 ms 18.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=128, BLOCK_M=32, BLOCK_N=32, EVEN_K=True, GROUP_M=8, num_stages=1, num_warps=8 +SingleProcess AUTOTUNE benchmarking takes 0.2807 seconds and 0.0002 seconds precompiling for 11 choices +AUTOTUNE mm_plus_mm(1152x1024, 1024x4304, 1152x1024, 1024x4304) + _mm_plus_mm 0.0539 ms 100.0% + triton_mm_plus_mm_99147 0.0955 ms 56.4% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=8 + triton_mm_plus_mm_99146 0.1108 ms 48.7% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=2, num_warps=4 + triton_mm_plus_mm_99151 0.1297 ms 41.6% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=1, num_warps=8 + triton_mm_plus_mm_99152 0.1451 ms 37.2% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=64, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=1, num_warps=8 + triton_mm_plus_mm_99154 0.1751 ms 30.8% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=64, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=2, num_warps=4 + triton_mm_plus_mm_99149 0.1982 ms 27.2% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=32, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=8 + triton_mm_plus_mm_99148 0.2026 ms 26.6% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=16 + triton_mm_plus_mm_99150 0.2466 ms 21.9% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=32, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=8 + triton_mm_plus_mm_99155 0.3235 ms 16.7% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=32, BLOCK_N=32, EVEN_K=True, GROUP_M=8, num_stages=1, num_warps=2 +SingleProcess AUTOTUNE benchmarking takes 0.3653 seconds and 0.0003 seconds precompiling for 11 choices +AUTOTUNE mm_plus_mm(4304x1024, 1024x1152, 4304x1024, 1024x1152) + _mm_plus_mm 0.0533 ms 100.0% + triton_mm_plus_mm_99176 0.0979 ms 54.5% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=8 + triton_mm_plus_mm_99175 0.1098 ms 48.6% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=2, num_warps=4 + triton_mm_plus_mm_99180 0.1208 ms 44.1% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=1, num_warps=8 + triton_mm_plus_mm_99181 0.1440 ms 37.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=64, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=1, num_warps=8 + triton_mm_plus_mm_99183 0.1759 ms 30.3% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=64, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=2, num_warps=4 + triton_mm_plus_mm_99177 0.2027 ms 26.3% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=16 + triton_mm_plus_mm_99178 0.2032 ms 26.2% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=32, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=8 + triton_mm_plus_mm_99179 0.2427 ms 22.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=32, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=8 + triton_mm_plus_mm_99184 0.3247 ms 16.4% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=32, BLOCK_N=32, EVEN_K=True, GROUP_M=8, num_stages=1, num_warps=2 +SingleProcess AUTOTUNE benchmarking takes 0.3650 seconds and 0.0003 seconds precompiling for 11 choices +AUTOTUNE mm_plus_mm(1152x1024, 1024x1152, 1152x1024, 1024x1152) + _mm_plus_mm 0.0224 ms 100.0% + triton_mm_plus_mm_99205 0.0312 ms 71.8% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=8 + triton_mm_plus_mm_99210 0.0356 ms 63.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=64, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=1, num_warps=8 + triton_mm_plus_mm_99204 0.0517 ms 43.3% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=2, num_warps=4 + triton_mm_plus_mm_99206 0.0564 ms 39.7% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=16 + triton_mm_plus_mm_99207 0.0569 ms 39.4% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=32, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=8 + triton_mm_plus_mm_99209 0.0602 ms 37.2% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=1, num_warps=8 + triton_mm_plus_mm_99208 0.0694 ms 32.3% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=32, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=8 + triton_mm_plus_mm_99213 0.0840 ms 26.7% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=32, BLOCK_N=32, EVEN_K=True, GROUP_M=8, num_stages=1, num_warps=2 + triton_mm_plus_mm_99212 0.0841 ms 26.6% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=64, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=2, num_warps=4 +SingleProcess AUTOTUNE benchmarking takes 0.2445 seconds and 0.0003 seconds precompiling for 11 choices +AUTOTUNE addmm(2048x1152, 2048x1024, 1024x1152) + triton_mm_106940 0.0207 ms 100.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_106939 0.0210 ms 98.9% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=8 + triton_mm_106943 0.0210 ms 98.8% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=8 + triton_mm_106946 0.0219 ms 94.7% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_106938 0.0238 ms 87.1% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_106942 0.0238 ms 87.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_106945 0.0245 ms 84.5% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + addmm 0.0253 ms 81.9% + triton_mm_106936 0.0254 ms 81.6% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=64, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=8 + triton_mm_106941 0.0260 ms 79.6% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=128, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=4 +SingleProcess AUTOTUNE benchmarking takes 0.3785 seconds and 0.0003 seconds precompiling for 20 choices +AUTOTUNE addmm(1152x4304, 1152x1024, 1024x4304) + triton_mm_106984 0.0419 ms 100.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + addmm 0.0422 ms 99.2% + triton_mm_106981 0.0427 ms 98.1% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=8 + triton_mm_106977 0.0458 ms 91.3% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=8 + triton_mm_106985 0.0466 ms 89.8% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=8 + triton_mm_106980 0.0483 ms 86.6% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_106983 0.0486 ms 86.2% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_106978 0.0490 ms 85.5% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_106974 0.0531 ms 78.9% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=64, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=8 + triton_mm_106979 0.0543 ms 77.1% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=128, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=4 +SingleProcess AUTOTUNE benchmarking takes 0.4412 seconds and 0.0005 seconds precompiling for 20 choices +AUTOTUNE addmm(4304x1152, 4304x1024, 1024x1152) + triton_mm_107015 0.0408 ms 100.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=8 + addmm 0.0414 ms 98.5% + triton_mm_107022 0.0417 ms 98.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_107016 0.0433 ms 94.4% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_107023 0.0448 ms 91.1% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=8 + triton_mm_107019 0.0452 ms 90.4% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=8 + triton_mm_107021 0.0454 ms 89.9% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_107014 0.0492 ms 83.1% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_107017 0.0492 ms 83.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=128, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=4 + triton_mm_107018 0.0513 ms 79.7% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 +SingleProcess AUTOTUNE benchmarking takes 0.4387 seconds and 0.0003 seconds precompiling for 20 choices +AUTOTUNE addmm(1152x1152, 1152x1024, 1024x1152) + triton_mm_107050 0.0161 ms 100.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=64, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=8 + triton_mm_107061 0.0161 ms 100.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=8 + triton_mm_107057 0.0171 ms 93.8% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=8 + triton_mm_107060 0.0172 ms 93.3% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_107053 0.0173 ms 92.8% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=8 + triton_mm_107054 0.0176 ms 91.1% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + addmm 0.0181 ms 88.7% + triton_mm_107055 0.0187 ms 86.1% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=128, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=4 + triton_mm_107056 0.0204 ms 78.9% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_107052 0.0213 ms 75.3% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 +SingleProcess AUTOTUNE benchmarking takes 0.3455 seconds and 0.0003 seconds precompiling for 20 choices +AUTOTUNE mm(15488x2048, 2048x16384) + mm 1.9597 ms 100.0% + triton_mm_122707 2.4504 ms 80.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_122706 2.6231 ms 74.7% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_122708 2.9282 ms 66.9% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=8 + triton_mm_122699 3.6172 ms 54.2% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_122702 3.7141 ms 52.8% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=128, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=4 + triton_mm_122701 3.8772 ms 50.5% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_122703 3.9248 ms 49.9% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_122704 3.9497 ms 49.6% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=8 + triton_mm_122700 4.0676 ms 48.2% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=8 +SingleProcess AUTOTUNE benchmarking takes 1.5624 seconds and 0.0005 seconds precompiling for 20 choices +AUTOTUNE mm(4096x1152, 1152x4304) + mm 0.0851 ms 100.0% + triton_mm_113204 0.1024 ms 83.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_113205 0.1120 ms 76.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=8 + triton_mm_113203 0.1141 ms 74.6% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_113198 0.1243 ms 68.4% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_113199 0.1360 ms 62.5% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=128, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=4 + triton_mm_113196 0.1425 ms 59.7% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_113197 0.1448 ms 58.8% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=8 + triton_mm_113201 0.1456 ms 58.4% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=8 + triton_mm_113200 0.1495 ms 56.9% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 +SingleProcess AUTOTUNE benchmarking takes 0.6577 seconds and 0.0002 seconds precompiling for 20 choices +AUTOTUNE mm(4096x1152, 1152x1152) + mm 0.0261 ms 100.0% + triton_mm_113127 0.0324 ms 80.5% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_113128 0.0334 ms 78.2% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_113122 0.0341 ms 76.5% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_113129 0.0355 ms 73.5% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=8 + triton_mm_113123 0.0417 ms 62.6% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=128, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=4 + triton_mm_113120 0.0424 ms 61.5% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_113121 0.0443 ms 58.9% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=8 + triton_mm_113124 0.0448 ms 58.2% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_113125 0.0450 ms 58.1% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=8 +SingleProcess AUTOTUNE benchmarking takes 0.4359 seconds and 0.0003 seconds precompiling for 20 choices +AUTOTUNE mm(800x1024, 1024x4096) + mm 0.0195 ms 100.0% + triton_mm_122800 0.0210 ms 92.5% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_122801 0.0237 ms 82.2% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=8 + triton_mm_122792 0.0258 ms 75.4% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_122799 0.0261 ms 74.5% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_122794 0.0268 ms 72.6% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_122796 0.0282 ms 69.1% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_122793 0.0285 ms 68.2% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=8 + triton_mm_122795 0.0301 ms 64.7% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=128, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=4 + triton_mm_122797 0.0302 ms 64.4% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=8 +SingleProcess AUTOTUNE benchmarking takes 0.3875 seconds and 0.0002 seconds precompiling for 20 choices +E1218 11:00:22.681000 204 site-packages/torch/_inductor/select_algorithm.py:2100] [2/1] Runtime error during autotuning: +E1218 11:00:22.681000 204 site-packages/torch/_inductor/select_algorithm.py:2100] [2/1] No valid triton configs. OutOfResources: out of resource: shared memory, Required: 245760, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.. +E1218 11:00:22.681000 204 site-packages/torch/_inductor/select_algorithm.py:2100] [2/1] Ignoring this choice. +AUTOTUNE mm(800x1024, 1024x32) + triton_mm_128519 0.0108 ms 100.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=128, BLOCK_M=64, BLOCK_N=32, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=4 + triton_mm_128510 0.0126 ms 85.8% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=32, BLOCK_N=32, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=4 + triton_mm_128511 0.0129 ms 84.3% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=32, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=8 + triton_mm_128518 0.0131 ms 82.7% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=64, BLOCK_N=32, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + mm 0.0133 ms 81.5% + triton_mm_128515 0.0136 ms 79.6% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=64, BLOCK_N=32, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=8 + triton_mm_128524 0.0138 ms 78.7% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=128, BLOCK_N=32, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=8 + triton_mm_128517 0.0142 ms 76.2% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=32, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=8 + triton_mm_128509 0.0153 ms 70.9% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=128, BLOCK_M=32, BLOCK_N=32, EVEN_K=True, GROUP_M=8, num_stages=2, num_warps=4 + triton_mm_128523 0.0156 ms 69.6% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=128, BLOCK_N=32, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 +SingleProcess AUTOTUNE benchmarking takes 0.3297 seconds and 0.0002 seconds precompiling for 18 choices +AUTOTUNE mm(16x1024, 1024x1024) + triton_mm_122437 0.0105 ms 100.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=128, BLOCK_M=16, BLOCK_N=32, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=2 + triton_mm_122441 0.0111 ms 94.2% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=128, BLOCK_M=16, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=4 + triton_mm_122436 0.0123 ms 85.4% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=16, BLOCK_N=32, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=2 + triton_mm_122435 0.0127 ms 82.2% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=16, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=4 + triton_mm_122440 0.0135 ms 77.3% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=16, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_122445 0.0136 ms 76.9% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=128, BLOCK_M=16, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=4 + mm 0.0141 ms 74.1% + triton_mm_122434 0.0144 ms 72.8% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=128, BLOCK_M=16, BLOCK_N=32, EVEN_K=True, GROUP_M=8, num_stages=2, num_warps=2 + triton_mm_122447 0.0147 ms 71.4% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=16, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=4 + triton_mm_122449 0.0149 ms 70.3% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=16, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=8 +SingleProcess AUTOTUNE benchmarking takes 0.3623 seconds and 0.0003 seconds precompiling for 18 choices +AUTOTUNE mm(15488x2048, 2048x2048) + mm 0.2533 ms 100.0% + triton_mm_122669 0.3187 ms 79.5% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_122668 0.3291 ms 77.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_122670 0.3565 ms 71.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=8 + triton_mm_122663 0.3936 ms 64.4% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_122664 0.4111 ms 61.6% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=128, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=4 + triton_mm_122661 0.4335 ms 58.4% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_122666 0.4591 ms 55.2% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=8 + triton_mm_122665 0.4605 ms 55.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_122662 0.4628 ms 54.7% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=8 +SingleProcess AUTOTUNE benchmarking takes 0.8318 seconds and 0.0002 seconds precompiling for 20 choices +AUTOTUNE mm(800x2048, 2048x1024) + mm 0.0147 ms 100.0% + triton_mm_122740 0.0153 ms 95.8% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=128, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=4 + triton_mm_122736 0.0204 ms 72.1% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=128, BLOCK_M=64, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=4 + triton_mm_122746 0.0207 ms 70.9% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=8 + triton_mm_122739 0.0209 ms 70.2% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_122735 0.0212 ms 69.4% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=64, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=8 + triton_mm_122745 0.0233 ms 63.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_122738 0.0257 ms 57.2% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=8 + triton_mm_122742 0.0260 ms 56.5% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=8 + triton_mm_122732 0.0304 ms 48.4% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=128, BLOCK_M=64, BLOCK_N=32, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=4 +SingleProcess AUTOTUNE benchmarking takes 0.3627 seconds and 0.0003 seconds precompiling for 20 choices +AUTOTUNE convolution(16x3x224x224, 1152x3x14x14) + convolution 0.2662 ms 100.0% + triton_convolution2d_113107 0.4701 ms 56.6% ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=128, BLOCK_N=128, GROUPS=1, KERNEL_H=14, KERNEL_W=14, PADDING_H=0, PADDING_W=0, STRIDE_H=14, STRIDE_W=14, UNROLL=False, num_stages=2, num_warps=8 + triton_convolution2d_113110 0.5168 ms 51.5% ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=256, BLOCK_N=64, GROUPS=1, KERNEL_H=14, KERNEL_W=14, PADDING_H=0, PADDING_W=0, STRIDE_H=14, STRIDE_W=14, UNROLL=False, num_stages=2, num_warps=8 + triton_convolution2d_113105 0.5464 ms 48.7% ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=256, BLOCK_N=64, GROUPS=1, KERNEL_H=14, KERNEL_W=14, PADDING_H=0, PADDING_W=0, STRIDE_H=14, STRIDE_W=14, UNROLL=False, num_stages=2, num_warps=4 + triton_convolution2d_113104 0.6500 ms 40.9% ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=64, BLOCK_N=256, GROUPS=1, KERNEL_H=14, KERNEL_W=14, PADDING_H=0, PADDING_W=0, STRIDE_H=14, STRIDE_W=14, UNROLL=False, num_stages=2, num_warps=4 + triton_convolution2d_113108 0.6616 ms 40.2% ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=64, BLOCK_N=64, GROUPS=1, KERNEL_H=14, KERNEL_W=14, PADDING_H=0, PADDING_W=0, STRIDE_H=14, STRIDE_W=14, UNROLL=False, num_stages=2, num_warps=4 + triton_convolution2d_113109 0.8578 ms 31.0% ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=64, BLOCK_N=256, GROUPS=1, KERNEL_H=14, KERNEL_W=14, PADDING_H=0, PADDING_W=0, STRIDE_H=14, STRIDE_W=14, UNROLL=False, num_stages=2, num_warps=8 + triton_convolution2d_113106 5.5019 ms 4.8% ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=1024, BLOCK_N=16, GROUPS=1, KERNEL_H=14, KERNEL_W=14, PADDING_H=0, PADDING_W=0, STRIDE_H=14, STRIDE_W=14, UNROLL=False, num_stages=1, num_warps=8 +SingleProcess AUTOTUNE benchmarking takes 0.3716 seconds and 0.0002 seconds precompiling for 8 choices +AUTOTUNE mm(4096x4304, 4304x1152) + mm 0.0856 ms 100.0% + triton_mm_113223 0.1207 ms 70.9% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=128, BLOCK_N=128, EVEN_K=False, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_113224 0.1292 ms 66.3% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=128, BLOCK_N=128, EVEN_K=False, GROUP_M=8, num_stages=5, num_warps=8 + triton_mm_113222 0.1323 ms 64.7% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=128, EVEN_K=False, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_113217 0.1526 ms 56.1% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=64, BLOCK_N=128, EVEN_K=False, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_113218 0.1632 ms 52.5% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=128, BLOCK_M=64, BLOCK_N=128, EVEN_K=False, GROUP_M=8, num_stages=4, num_warps=4 + triton_mm_113216 0.1820 ms 47.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=128, EVEN_K=False, GROUP_M=8, num_stages=4, num_warps=8 + triton_mm_113219 0.1826 ms 46.9% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=64, EVEN_K=False, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_113220 0.1838 ms 46.6% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=64, EVEN_K=False, GROUP_M=8, num_stages=4, num_warps=8 + triton_mm_113215 0.1852 ms 46.2% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=128, EVEN_K=False, GROUP_M=8, num_stages=3, num_warps=4 +SingleProcess AUTOTUNE benchmarking takes 0.7115 seconds and 0.0003 seconds precompiling for 20 choices +AUTOTUNE addmm(4096x2048, 4096x1152, 1152x2048) + bias_addmm 0.0438 ms 100.0% + triton_mm_116206 0.0575 ms 76.3% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_116200 0.0583 ms 75.2% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_116207 0.0612 ms 71.7% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=8 + addmm 0.0647 ms 67.8% + triton_mm_116205 0.0683 ms 64.2% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_116201 0.0715 ms 61.3% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=128, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=4 + triton_mm_116199 0.0742 ms 59.1% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=8 + triton_mm_116203 0.0772 ms 56.8% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=8 + triton_mm_116198 0.0788 ms 55.6% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 +SingleProcess AUTOTUNE benchmarking takes 0.5458 seconds and 0.0003 seconds precompiling for 21 choices +AUTOTUNE mm(800x32, 32x1024) + triton_mm_122419 0.0066 ms 100.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=32, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=8 + triton_mm_122423 0.0067 ms 99.5% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=8 + triton_mm_122424 0.0067 ms 99.5% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=4 + triton_mm_122428 0.0067 ms 99.5% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_122429 0.0067 ms 99.5% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=8 + triton_mm_122422 0.0068 ms 98.1% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=2, num_warps=4 + triton_mm_122418 0.0069 ms 95.8% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=32, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=8 + triton_mm_122426 0.0069 ms 95.4% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=8 + triton_mm_122425 0.0070 ms 94.5% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_122417 0.0071 ms 93.7% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=32, BLOCK_N=32, EVEN_K=True, GROUP_M=8, num_stages=2, num_warps=4 +SingleProcess AUTOTUNE benchmarking takes 0.3351 seconds and 0.0003 seconds precompiling for 18 choices +AUTOTUNE mm(15488x2048, 2048x256) + mm 0.0554 ms 100.0% + triton_mm_122497 0.0652 ms 84.9% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_122503 0.0704 ms 78.7% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_122502 0.0717 ms 77.3% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_122498 0.0748 ms 74.1% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=128, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=4 + triton_mm_122504 0.0751 ms 73.8% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=8 + triton_mm_122495 0.0841 ms 65.9% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_122493 0.0845 ms 65.6% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=64, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=8 + triton_mm_122499 0.0878 ms 63.1% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_122494 0.0891 ms 62.2% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=128, BLOCK_M=64, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=4 +SingleProcess AUTOTUNE benchmarking takes 0.5422 seconds and 0.0003 seconds precompiling for 20 choices +AUTOTUNE mm(16x1024, 1024x3072) + triton_mm_122528 0.0152 ms 100.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=128, BLOCK_M=16, BLOCK_N=32, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=2 + triton_mm_122532 0.0154 ms 98.8% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=128, BLOCK_M=16, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=4 + triton_mm_122540 0.0167 ms 91.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=16, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=8 + triton_mm_122536 0.0168 ms 90.6% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=128, BLOCK_M=16, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=4 + triton_mm_122527 0.0169 ms 90.1% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=16, BLOCK_N=32, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=2 + triton_mm_122526 0.0169 ms 89.8% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=16, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=4 + triton_mm_122531 0.0173 ms 87.8% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=16, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_122525 0.0181 ms 84.1% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=128, BLOCK_M=16, BLOCK_N=32, EVEN_K=True, GROUP_M=8, num_stages=2, num_warps=2 + triton_mm_122538 0.0186 ms 81.8% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=16, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=4 + triton_mm_122535 0.0186 ms 81.6% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=16, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 +SingleProcess AUTOTUNE benchmarking takes 0.3037 seconds and 0.0003 seconds precompiling for 18 choices +AUTOTUNE mm(800x1024, 1024x2048) + mm 0.0130 ms 100.0% + triton_mm_122559 0.0137 ms 95.1% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=8 + triton_mm_122552 0.0148 ms 87.7% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_122558 0.0156 ms 83.2% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_122548 0.0171 ms 76.2% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=64, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=8 + triton_mm_122553 0.0173 ms 75.1% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=128, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=4 + triton_mm_122555 0.0175 ms 74.3% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=8 + triton_mm_122551 0.0180 ms 72.5% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=8 + triton_mm_122550 0.0207 ms 62.9% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_122554 0.0216 ms 60.2% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 +SingleProcess AUTOTUNE benchmarking takes 0.3467 seconds and 0.0003 seconds precompiling for 20 choices +AUTOTUNE mm(800x1024, 1024x256) + mm 0.0087 ms 100.0% + triton_mm_122564 0.0089 ms 98.6% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=128, BLOCK_M=64, BLOCK_N=32, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=4 + triton_mm_122568 0.0092 ms 95.5% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=128, BLOCK_M=64, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=4 + triton_mm_122572 0.0101 ms 86.4% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=128, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=4 + triton_mm_122563 0.0118 ms 73.8% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=32, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=8 + triton_mm_122567 0.0122 ms 71.5% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=64, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=8 + triton_mm_122571 0.0123 ms 71.3% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_122562 0.0123 ms 70.9% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=32, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=8 + triton_mm_122578 0.0126 ms 69.3% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=8 + triton_mm_122561 0.0130 ms 67.4% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=128, BLOCK_M=32, BLOCK_N=32, EVEN_K=True, GROUP_M=8, num_stages=2, num_warps=4 +SingleProcess AUTOTUNE benchmarking takes 0.3237 seconds and 0.0003 seconds precompiling for 20 choices +AUTOTUNE mm(15488x16384, 16384x2048) + mm 1.9866 ms 100.0% + triton_mm_122726 2.2910 ms 86.7% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_122725 2.5002 ms 79.5% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_122727 2.5549 ms 77.8% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=8 + triton_mm_122721 2.8021 ms 70.9% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=128, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=4 + triton_mm_122718 3.7120 ms 53.5% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_122720 3.8475 ms 51.6% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_122717 4.1655 ms 47.7% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=128, BLOCK_M=64, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=4 + triton_mm_122719 4.1839 ms 47.5% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=8 + triton_mm_122724 4.2836 ms 46.4% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=2, num_warps=8 +SingleProcess AUTOTUNE benchmarking takes 1.4235 seconds and 0.0003 seconds precompiling for 20 choices +AUTOTUNE mm(800x4096, 4096x1024) + triton_mm_122814 0.0256 ms 100.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=128, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=4 + mm 0.0282 ms 90.8% + triton_mm_122810 0.0328 ms 78.1% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=128, BLOCK_M=64, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=4 + triton_mm_122820 0.0351 ms 72.9% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=8 + triton_mm_122813 0.0382 ms 66.9% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_122809 0.0392 ms 65.4% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=64, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=8 + triton_mm_122819 0.0417 ms 61.3% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_122806 0.0480 ms 53.3% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=128, BLOCK_M=64, BLOCK_N=32, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=4 + triton_mm_122812 0.0482 ms 53.1% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=8 + triton_mm_122816 0.0483 ms 53.1% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=8 +SingleProcess AUTOTUNE benchmarking takes 0.4281 seconds and 0.0003 seconds precompiling for 20 choices +AUTOTUNE mm(15488x2048, 2048x16384) + mm 1.9869 ms 100.0% + triton_mm_129924 2.4697 ms 80.5% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_129925 2.5382 ms 78.3% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_129926 2.7246 ms 72.9% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=8 + triton_mm_129917 3.3344 ms 59.6% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_129921 3.7297 ms 53.3% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_129919 3.8658 ms 51.4% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_129918 3.8934 ms 51.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=8 + triton_mm_129922 3.9324 ms 50.5% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=8 + triton_mm_129920 3.9861 ms 49.8% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=128, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=4 +SingleProcess AUTOTUNE benchmarking takes 1.4257 seconds and 0.0003 seconds precompiling for 20 choices +AUTOTUNE mm(4096x1152, 1152x4304) + mm 0.0865 ms 100.0% + triton_mm_149504 0.1044 ms 82.8% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_149503 0.1111 ms 77.8% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_149505 0.1172 ms 73.8% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=8 + triton_mm_149498 0.1400 ms 61.7% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_149496 0.1431 ms 60.4% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_149501 0.1477 ms 58.5% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=8 + triton_mm_149500 0.1478 ms 58.5% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_149497 0.1547 ms 55.9% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=8 + triton_mm_149502 0.1597 ms 54.1% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=2, num_warps=8 +SingleProcess AUTOTUNE benchmarking takes 0.6665 seconds and 0.0004 seconds precompiling for 20 choices +AUTOTUNE mm(800x1024, 1024x4096) + mm 0.0194 ms 100.0% + triton_mm_128906 0.0205 ms 94.4% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_128898 0.0226 ms 85.7% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_128907 0.0230 ms 84.3% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=8 + triton_mm_128905 0.0232 ms 83.3% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_128899 0.0239 ms 81.1% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=8 + triton_mm_128902 0.0255 ms 75.9% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_128900 0.0258 ms 75.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_128903 0.0285 ms 67.8% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=8 + triton_mm_128896 0.0295 ms 65.7% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=64, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=8 +SingleProcess AUTOTUNE benchmarking takes 0.3738 seconds and 0.0003 seconds precompiling for 20 choices +AUTOTUNE mm(16x1024, 1024x1024) + triton_mm_146325 0.0109 ms 100.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=128, BLOCK_M=16, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=4 + triton_mm_146321 0.0117 ms 93.2% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=128, BLOCK_M=16, BLOCK_N=32, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=2 + triton_mm_146320 0.0124 ms 88.4% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=16, BLOCK_N=32, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=2 + triton_mm_146319 0.0128 ms 85.7% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=16, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=4 + triton_mm_146324 0.0137 ms 79.7% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=16, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + mm 0.0143 ms 76.7% + triton_mm_146333 0.0143 ms 76.7% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=16, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=8 + triton_mm_146331 0.0145 ms 75.5% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=16, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=4 + triton_mm_146318 0.0162 ms 67.5% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=128, BLOCK_M=16, BLOCK_N=32, EVEN_K=True, GROUP_M=8, num_stages=2, num_warps=2 + triton_mm_146327 0.0164 ms 66.5% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=16, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=8 +SingleProcess AUTOTUNE benchmarking takes 0.2944 seconds and 0.0003 seconds precompiling for 18 choices +AUTOTUNE mm(800x32, 32x1024) + triton_mm_128527 0.0068 ms 100.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=32, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=8 + triton_mm_128526 0.0068 ms 99.5% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=32, BLOCK_N=32, EVEN_K=True, GROUP_M=8, num_stages=2, num_warps=4 + triton_mm_128528 0.0068 ms 99.1% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=32, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=8 + triton_mm_128537 0.0069 ms 98.1% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_128538 0.0069 ms 97.7% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=8 + triton_mm_128529 0.0071 ms 95.1% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=32, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=4 + triton_mm_128533 0.0073 ms 93.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=4 + triton_mm_128525 0.0073 ms 92.6% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=32, BLOCK_N=32, EVEN_K=True, GROUP_M=8, num_stages=1, num_warps=2 + triton_mm_128531 0.0074 ms 91.8% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=2, num_warps=4 + mm 0.0075 ms 91.0% +SingleProcess AUTOTUNE benchmarking takes 0.2766 seconds and 0.0004 seconds precompiling for 18 choices +E1218 11:11:07.695000 204 site-packages/torch/_inductor/select_algorithm.py:2100] [2/1] Runtime error during autotuning: +E1218 11:11:07.695000 204 site-packages/torch/_inductor/select_algorithm.py:2100] [2/1] No valid triton configs. OutOfResources: out of resource: shared memory, Required: 245760, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.. +E1218 11:11:07.695000 204 site-packages/torch/_inductor/select_algorithm.py:2100] [2/1] Ignoring this choice. +AUTOTUNE mm(32x800, 800x1024) + triton_mm_128546 0.0110 ms 100.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=128, BLOCK_M=32, BLOCK_N=32, EVEN_K=False, GROUP_M=8, num_stages=5, num_warps=4 + triton_mm_128545 0.0118 ms 93.2% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=32, BLOCK_N=32, EVEN_K=False, GROUP_M=8, num_stages=5, num_warps=4 + triton_mm_128544 0.0124 ms 88.7% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=32, BLOCK_N=64, EVEN_K=False, GROUP_M=8, num_stages=5, num_warps=8 + triton_mm_128550 0.0125 ms 88.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=128, BLOCK_M=32, BLOCK_N=64, EVEN_K=False, GROUP_M=8, num_stages=5, num_warps=4 + mm 0.0134 ms 82.1% + triton_mm_128556 0.0135 ms 81.5% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=32, BLOCK_N=64, EVEN_K=False, GROUP_M=8, num_stages=4, num_warps=8 + triton_mm_128549 0.0149 ms 73.7% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=32, BLOCK_N=64, EVEN_K=False, GROUP_M=8, num_stages=3, num_warps=8 + triton_mm_128553 0.0153 ms 71.8% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=32, BLOCK_N=128, EVEN_K=False, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_128543 0.0157 ms 70.1% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=128, BLOCK_M=32, BLOCK_N=32, EVEN_K=False, GROUP_M=8, num_stages=2, num_warps=4 + triton_mm_128555 0.0167 ms 65.8% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=32, BLOCK_N=64, EVEN_K=False, GROUP_M=8, num_stages=3, num_warps=4 +SingleProcess AUTOTUNE benchmarking takes 0.2945 seconds and 0.0003 seconds precompiling for 18 choices +AUTOTUNE mm(3072x16, 16x1024) + triton_mm_128577 0.0114 ms 100.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=32, BLOCK_N=32, EVEN_K=False, GROUP_M=8, num_stages=2, num_warps=4 + triton_mm_128576 0.0121 ms 94.2% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=32, BLOCK_N=32, EVEN_K=False, GROUP_M=8, num_stages=1, num_warps=2 + mm 0.0126 ms 90.4% + triton_mm_128578 0.0133 ms 85.8% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=32, BLOCK_N=64, EVEN_K=False, GROUP_M=8, num_stages=5, num_warps=8 + triton_mm_128589 0.0157 ms 72.6% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=128, BLOCK_N=128, EVEN_K=False, GROUP_M=8, num_stages=2, num_warps=8 + triton_mm_128581 0.0173 ms 65.9% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=64, BLOCK_N=64, EVEN_K=False, GROUP_M=8, num_stages=2, num_warps=4 + triton_mm_128590 0.0183 ms 62.4% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=128, BLOCK_N=128, EVEN_K=False, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_128582 0.0191 ms 59.9% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=64, BLOCK_N=64, EVEN_K=False, GROUP_M=8, num_stages=3, num_warps=8 + triton_mm_128587 0.0192 ms 59.6% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=128, BLOCK_N=64, EVEN_K=False, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_128584 0.0196 ms 58.3% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=64, BLOCK_N=128, EVEN_K=False, GROUP_M=8, num_stages=3, num_warps=4 +SingleProcess AUTOTUNE benchmarking takes 0.2859 seconds and 0.0002 seconds precompiling for 17 choices +AUTOTUNE mm(800x4096, 4096x1024) + mm 0.0233 ms 100.0% + triton_mm_128939 0.0265 ms 87.9% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=128, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=4 + triton_mm_128935 0.0339 ms 68.6% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=128, BLOCK_M=64, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=4 + triton_mm_128945 0.0344 ms 67.7% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=8 + triton_mm_128938 0.0362 ms 64.3% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_128934 0.0381 ms 61.1% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=64, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=8 + triton_mm_128944 0.0409 ms 56.9% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_128937 0.0437 ms 53.2% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=8 + triton_mm_128941 0.0443 ms 52.5% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=8 + triton_mm_128931 0.0500 ms 46.6% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=128, BLOCK_M=64, BLOCK_N=32, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=4 +SingleProcess AUTOTUNE benchmarking takes 0.4243 seconds and 0.0003 seconds precompiling for 20 choices +AUTOTUNE mm(1024x800, 800x4096) + mm 0.0202 ms 100.0% + triton_mm_128886 0.0245 ms 82.3% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=128, EVEN_K=False, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_128881 0.0271 ms 74.5% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=64, BLOCK_N=128, EVEN_K=False, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_128887 0.0286 ms 70.5% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=128, BLOCK_N=128, EVEN_K=False, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_128879 0.0288 ms 70.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=128, EVEN_K=False, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_128888 0.0309 ms 65.3% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=128, BLOCK_N=128, EVEN_K=False, GROUP_M=8, num_stages=5, num_warps=8 + triton_mm_128884 0.0314 ms 64.3% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=64, EVEN_K=False, GROUP_M=8, num_stages=4, num_warps=8 + triton_mm_128880 0.0323 ms 62.6% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=128, EVEN_K=False, GROUP_M=8, num_stages=4, num_warps=8 + triton_mm_128883 0.0325 ms 62.2% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=64, EVEN_K=False, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_128877 0.0358 ms 56.4% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=64, BLOCK_N=64, EVEN_K=False, GROUP_M=8, num_stages=3, num_warps=8 +SingleProcess AUTOTUNE benchmarking takes 0.4060 seconds and 0.0003 seconds precompiling for 20 choices +AUTOTUNE mm(4096x800, 800x1024) + mm 0.0197 ms 100.0% + triton_mm_128924 0.0246 ms 80.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=128, EVEN_K=False, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_128919 0.0269 ms 73.2% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=64, BLOCK_N=128, EVEN_K=False, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_128925 0.0270 ms 73.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=128, BLOCK_N=128, EVEN_K=False, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_128921 0.0282 ms 69.9% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=64, EVEN_K=False, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_128917 0.0292 ms 67.4% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=128, EVEN_K=False, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_128926 0.0301 ms 65.5% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=128, BLOCK_N=128, EVEN_K=False, GROUP_M=8, num_stages=5, num_warps=8 + triton_mm_128922 0.0311 ms 63.4% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=64, EVEN_K=False, GROUP_M=8, num_stages=4, num_warps=8 + triton_mm_128918 0.0325 ms 60.7% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=128, EVEN_K=False, GROUP_M=8, num_stages=4, num_warps=8 + triton_mm_128915 0.0356 ms 55.4% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=64, BLOCK_N=64, EVEN_K=False, GROUP_M=8, num_stages=3, num_warps=8 +SingleProcess AUTOTUNE benchmarking takes 0.4014 seconds and 0.0003 seconds precompiling for 20 choices +AUTOTUNE mm_plus_mm(16x3072, 3072x1024, 16x3072, 3072x1024) + _mm_plus_mm 0.0406 ms 100.0% + triton_mm_plus_mm_129004 0.0697 ms 58.3% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=32, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=8 + triton_mm_plus_mm_129003 0.0827 ms 49.1% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=32, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=8 + triton_mm_plus_mm_129007 0.0859 ms 47.3% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=128, BLOCK_M=32, BLOCK_N=32, EVEN_K=True, GROUP_M=8, num_stages=1, num_warps=8 + triton_mm_plus_mm_129006 0.1686 ms 24.1% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=64, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=1, num_warps=8 + triton_mm_plus_mm_129001 0.2014 ms 20.2% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=8 + triton_mm_plus_mm_129002 0.2403 ms 16.9% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=16 + triton_mm_plus_mm_129000 0.2664 ms 15.3% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=2, num_warps=4 + triton_mm_plus_mm_129009 0.3493 ms 11.6% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=32, BLOCK_N=32, EVEN_K=True, GROUP_M=8, num_stages=1, num_warps=2 + triton_mm_plus_mm_129008 0.3818 ms 10.6% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=64, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=2, num_warps=4 +SingleProcess AUTOTUNE benchmarking takes 0.3685 seconds and 0.0002 seconds precompiling for 11 choices +AUTOTUNE mm(1024x800, 800x2048) + mm 0.0134 ms 100.0% + triton_mm_129021 0.0180 ms 74.7% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=64, BLOCK_N=128, EVEN_K=False, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_129023 0.0184 ms 72.7% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=64, EVEN_K=False, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_129027 0.0185 ms 72.5% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=128, BLOCK_N=128, EVEN_K=False, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_129019 0.0189 ms 70.8% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=128, EVEN_K=False, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_129026 0.0192 ms 69.7% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=128, EVEN_K=False, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_129024 0.0209 ms 64.2% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=64, EVEN_K=False, GROUP_M=8, num_stages=4, num_warps=8 + triton_mm_129028 0.0211 ms 63.5% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=128, BLOCK_N=128, EVEN_K=False, GROUP_M=8, num_stages=5, num_warps=8 + triton_mm_129020 0.0218 ms 61.6% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=128, EVEN_K=False, GROUP_M=8, num_stages=4, num_warps=8 + triton_mm_129017 0.0220 ms 60.9% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=64, BLOCK_N=64, EVEN_K=False, GROUP_M=8, num_stages=3, num_warps=8 +SingleProcess AUTOTUNE benchmarking takes 0.3541 seconds and 0.0005 seconds precompiling for 20 choices +AUTOTUNE mm(800x1024, 1024x2048) + mm 0.0126 ms 100.0% + triton_mm_129047 0.0137 ms 91.6% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=8 + triton_mm_129040 0.0150 ms 83.8% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_129046 0.0156 ms 80.4% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_129043 0.0166 ms 75.6% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=8 + triton_mm_129039 0.0167 ms 75.3% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=8 + triton_mm_129036 0.0169 ms 74.4% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=64, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=8 + triton_mm_129041 0.0177 ms 70.9% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=128, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=4 + triton_mm_129038 0.0192 ms 65.5% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_129042 0.0206 ms 61.1% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 +SingleProcess AUTOTUNE benchmarking takes 0.4105 seconds and 0.0004 seconds precompiling for 20 choices +AUTOTUNE bmm(128x1018x1018, 128x1018x256) + triton_bmm_129064 0.2949 ms 100.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=128, EVEN_K=False, GROUP_M=8, num_stages=3, num_warps=4 + triton_bmm_129057 0.3100 ms 95.1% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=128, EVEN_K=False, GROUP_M=8, num_stages=3, num_warps=4 + triton_bmm_129066 0.3381 ms 87.2% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=128, BLOCK_N=128, EVEN_K=False, GROUP_M=8, num_stages=5, num_warps=8 + triton_bmm_129059 0.3402 ms 86.7% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=64, BLOCK_N=128, EVEN_K=False, GROUP_M=8, num_stages=3, num_warps=4 + triton_bmm_129058 0.3553 ms 83.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=128, EVEN_K=False, GROUP_M=8, num_stages=4, num_warps=8 + bmm 0.3698 ms 79.8% + triton_bmm_129061 0.3734 ms 79.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=64, EVEN_K=False, GROUP_M=8, num_stages=3, num_warps=4 + triton_bmm_129063 0.3868 ms 76.3% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=128, EVEN_K=False, GROUP_M=8, num_stages=2, num_warps=8 + triton_bmm_129062 0.4102 ms 71.9% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=64, EVEN_K=False, GROUP_M=8, num_stages=4, num_warps=8 + triton_bmm_129065 0.4475 ms 65.9% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=128, BLOCK_N=128, EVEN_K=False, GROUP_M=8, num_stages=3, num_warps=4 +SingleProcess AUTOTUNE benchmarking takes 0.8257 seconds and 0.0003 seconds precompiling for 20 choices +AUTOTUNE bmm(128x256x1018, 128x1018x1018) + triton_bmm_129102 0.3025 ms 100.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=128, EVEN_K=False, GROUP_M=8, num_stages=3, num_warps=4 + triton_bmm_129099 0.3311 ms 91.4% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=64, EVEN_K=False, GROUP_M=8, num_stages=3, num_warps=4 + triton_bmm_129100 0.3530 ms 85.7% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=64, EVEN_K=False, GROUP_M=8, num_stages=4, num_warps=8 + triton_bmm_129104 0.3605 ms 83.9% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=128, BLOCK_N=128, EVEN_K=False, GROUP_M=8, num_stages=5, num_warps=8 + triton_bmm_129095 0.3626 ms 83.4% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=128, EVEN_K=False, GROUP_M=8, num_stages=3, num_warps=4 + bmm 0.3899 ms 77.6% + triton_bmm_129101 0.3916 ms 77.2% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=128, EVEN_K=False, GROUP_M=8, num_stages=2, num_warps=8 + triton_bmm_129097 0.4307 ms 70.2% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=64, BLOCK_N=128, EVEN_K=False, GROUP_M=8, num_stages=3, num_warps=4 + triton_bmm_129096 0.4325 ms 69.9% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=128, EVEN_K=False, GROUP_M=8, num_stages=4, num_warps=8 + triton_bmm_129103 0.4771 ms 63.4% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=128, BLOCK_N=128, EVEN_K=False, GROUP_M=8, num_stages=3, num_warps=4 +SingleProcess AUTOTUNE benchmarking takes 0.8364 seconds and 0.0004 seconds precompiling for 20 choices +AUTOTUNE bmm(128x1018x1018, 128x1018x256) + bmm 0.4206 ms 100.0% + triton_bmm_129121 0.4475 ms 94.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=128, EVEN_K=False, GROUP_M=8, num_stages=3, num_warps=4 + triton_bmm_129114 0.4761 ms 88.3% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=128, EVEN_K=False, GROUP_M=8, num_stages=3, num_warps=4 + triton_bmm_129118 0.5017 ms 83.8% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=64, EVEN_K=False, GROUP_M=8, num_stages=3, num_warps=4 + triton_bmm_129123 0.5141 ms 81.8% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=128, BLOCK_N=128, EVEN_K=False, GROUP_M=8, num_stages=5, num_warps=8 + triton_bmm_129115 0.5284 ms 79.6% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=128, EVEN_K=False, GROUP_M=8, num_stages=4, num_warps=8 + triton_bmm_129119 0.5325 ms 79.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=64, EVEN_K=False, GROUP_M=8, num_stages=4, num_warps=8 + triton_bmm_129116 0.5326 ms 79.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=64, BLOCK_N=128, EVEN_K=False, GROUP_M=8, num_stages=3, num_warps=4 + triton_bmm_129112 0.6188 ms 68.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=64, BLOCK_N=64, EVEN_K=False, GROUP_M=8, num_stages=3, num_warps=8 + triton_bmm_129111 0.6293 ms 66.8% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=64, EVEN_K=False, GROUP_M=8, num_stages=2, num_warps=4 +SingleProcess AUTOTUNE benchmarking takes 0.8814 seconds and 0.0003 seconds precompiling for 20 choices +AUTOTUNE mm(256x800, 800x1024) + mm 0.0084 ms 100.0% + triton_mm_129132 0.0091 ms 92.3% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=128, BLOCK_M=64, BLOCK_N=64, EVEN_K=False, GROUP_M=8, num_stages=5, num_warps=4 + triton_mm_129128 0.0119 ms 70.6% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=128, BLOCK_M=64, BLOCK_N=32, EVEN_K=False, GROUP_M=8, num_stages=5, num_warps=4 + triton_mm_129131 0.0119 ms 70.2% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=64, BLOCK_N=64, EVEN_K=False, GROUP_M=8, num_stages=3, num_warps=8 + triton_mm_129136 0.0123 ms 68.4% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=128, BLOCK_M=64, BLOCK_N=128, EVEN_K=False, GROUP_M=8, num_stages=4, num_warps=4 + triton_mm_129142 0.0123 ms 68.4% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=128, BLOCK_N=128, EVEN_K=False, GROUP_M=8, num_stages=5, num_warps=8 + triton_mm_129135 0.0124 ms 67.5% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=64, BLOCK_N=128, EVEN_K=False, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_129127 0.0129 ms 64.9% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=32, EVEN_K=False, GROUP_M=8, num_stages=5, num_warps=8 + triton_mm_129125 0.0130 ms 64.5% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=128, BLOCK_M=32, BLOCK_N=32, EVEN_K=False, GROUP_M=8, num_stages=2, num_warps=4 + triton_mm_129134 0.0133 ms 63.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=128, EVEN_K=False, GROUP_M=8, num_stages=4, num_warps=8 +SingleProcess AUTOTUNE benchmarking takes 0.3297 seconds and 0.0003 seconds precompiling for 20 choices +AUTOTUNE mm(800x256, 256x1024) + triton_mm_129154 0.0072 ms 100.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_129155 0.0074 ms 97.8% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=128, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=4 + triton_mm_129150 0.0076 ms 95.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=64, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=8 + triton_mm_129157 0.0076 ms 94.6% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=8 + mm 0.0077 ms 93.8% + triton_mm_129153 0.0078 ms 92.2% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=8 + triton_mm_129160 0.0084 ms 86.6% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_129152 0.0084 ms 86.3% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_129161 0.0084 ms 85.6% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=8 + triton_mm_129159 0.0090 ms 80.1% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 +SingleProcess AUTOTUNE benchmarking takes 0.3073 seconds and 0.0003 seconds precompiling for 20 choices +AUTOTUNE mm(2048x800, 800x1024) + mm 0.0132 ms 100.0% + triton_mm_129211 0.0179 ms 73.8% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=64, BLOCK_N=128, EVEN_K=False, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_129209 0.0182 ms 72.5% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=128, EVEN_K=False, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_129217 0.0184 ms 71.7% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=128, BLOCK_N=128, EVEN_K=False, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_129213 0.0188 ms 70.3% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=64, EVEN_K=False, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_129216 0.0192 ms 68.6% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=128, EVEN_K=False, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_129218 0.0209 ms 63.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=128, BLOCK_N=128, EVEN_K=False, GROUP_M=8, num_stages=5, num_warps=8 + triton_mm_129214 0.0212 ms 62.1% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=64, EVEN_K=False, GROUP_M=8, num_stages=4, num_warps=8 + triton_mm_129210 0.0215 ms 61.3% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=128, EVEN_K=False, GROUP_M=8, num_stages=4, num_warps=8 + triton_mm_129207 0.0220 ms 59.9% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=64, BLOCK_N=64, EVEN_K=False, GROUP_M=8, num_stages=3, num_warps=8 +SingleProcess AUTOTUNE benchmarking takes 0.3542 seconds and 0.0002 seconds precompiling for 20 choices +AUTOTUNE mm(800x2048, 2048x1024) + mm 0.0148 ms 100.0% + triton_mm_129231 0.0165 ms 89.9% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=128, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=4 + triton_mm_129230 0.0207 ms 71.7% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_129227 0.0207 ms 71.5% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=128, BLOCK_M=64, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=4 + triton_mm_129237 0.0208 ms 71.3% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=8 + triton_mm_129226 0.0210 ms 70.7% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=64, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=8 + triton_mm_129236 0.0231 ms 64.1% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_129229 0.0244 ms 60.8% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=8 + triton_mm_129233 0.0244 ms 60.7% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=8 + triton_mm_129223 0.0307 ms 48.3% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=128, BLOCK_M=64, BLOCK_N=32, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=4 +SingleProcess AUTOTUNE benchmarking takes 0.3624 seconds and 0.0003 seconds precompiling for 20 choices +AUTOTUNE mm(16x3072, 3072x1024) + triton_mm_129246 0.0206 ms 100.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=128, BLOCK_M=16, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=4 + mm 0.0212 ms 97.1% + triton_mm_129242 0.0217 ms 95.1% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=128, BLOCK_M=16, BLOCK_N=32, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=2 + triton_mm_129241 0.0271 ms 76.2% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=16, BLOCK_N=32, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=2 + triton_mm_129240 0.0272 ms 75.8% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=16, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=4 + triton_mm_129245 0.0310 ms 66.5% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=16, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_129254 0.0316 ms 65.2% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=16, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=8 + triton_mm_129252 0.0328 ms 62.9% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=16, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=4 + triton_mm_129239 0.0368 ms 56.1% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=128, BLOCK_M=16, BLOCK_N=32, EVEN_K=True, GROUP_M=8, num_stages=2, num_warps=2 + triton_mm_129249 0.0381 ms 54.2% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=16, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 +SingleProcess AUTOTUNE benchmarking takes 0.3543 seconds and 0.0004 seconds precompiling for 18 choices +AUTOTUNE mm(256x15488, 15488x2048) + mm 0.0644 ms 100.0% + triton_mm_129289 0.1089 ms 59.2% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=128, BLOCK_N=128, EVEN_K=False, GROUP_M=8, num_stages=5, num_warps=8 + triton_mm_129283 0.1108 ms 58.2% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=128, BLOCK_M=64, BLOCK_N=128, EVEN_K=False, GROUP_M=8, num_stages=4, num_warps=4 + triton_mm_129279 0.1184 ms 54.4% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=128, BLOCK_M=64, BLOCK_N=64, EVEN_K=False, GROUP_M=8, num_stages=5, num_warps=4 + triton_mm_129278 0.1339 ms 48.1% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=64, BLOCK_N=64, EVEN_K=False, GROUP_M=8, num_stages=3, num_warps=8 + triton_mm_129282 0.1354 ms 47.6% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=64, BLOCK_N=128, EVEN_K=False, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_129275 0.1378 ms 46.8% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=128, BLOCK_M=64, BLOCK_N=32, EVEN_K=False, GROUP_M=8, num_stages=5, num_warps=4 + triton_mm_129288 0.1445 ms 44.6% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=128, BLOCK_N=128, EVEN_K=False, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_129281 0.1553 ms 41.5% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=128, EVEN_K=False, GROUP_M=8, num_stages=4, num_warps=8 + triton_mm_129285 0.1557 ms 41.4% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=64, EVEN_K=False, GROUP_M=8, num_stages=4, num_warps=8 +SingleProcess AUTOTUNE benchmarking takes 0.6684 seconds and 0.0003 seconds precompiling for 20 choices +AUTOTUNE mm(15488x256, 256x2048) + mm 0.0504 ms 100.0% + triton_mm_129307 0.0643 ms 78.4% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_129306 0.0653 ms 77.2% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_129299 0.0703 ms 71.7% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_129303 0.0743 ms 67.8% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_129301 0.0751 ms 67.1% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_129304 0.0782 ms 64.4% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=8 + triton_mm_129300 0.0786 ms 64.1% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=8 + triton_mm_129308 0.0812 ms 62.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=8 + triton_mm_129305 0.0835 ms 60.4% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=2, num_warps=8 +SingleProcess AUTOTUNE benchmarking takes 0.5526 seconds and 0.0004 seconds precompiling for 20 choices +AUTOTUNE mm(2048x15488, 15488x2048) + mm 0.2436 ms 100.0% + triton_mm_129365 0.3556 ms 68.5% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=128, BLOCK_N=128, EVEN_K=False, GROUP_M=8, num_stages=5, num_warps=8 + triton_mm_129363 0.3787 ms 64.3% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=128, EVEN_K=False, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_129358 0.4082 ms 59.7% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=64, BLOCK_N=128, EVEN_K=False, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_129359 0.4391 ms 55.5% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=128, BLOCK_M=64, BLOCK_N=128, EVEN_K=False, GROUP_M=8, num_stages=4, num_warps=4 + triton_mm_129364 0.4648 ms 52.4% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=128, BLOCK_N=128, EVEN_K=False, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_129355 0.4985 ms 48.9% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=128, BLOCK_M=64, BLOCK_N=64, EVEN_K=False, GROUP_M=8, num_stages=5, num_warps=4 + triton_mm_129356 0.5012 ms 48.6% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=128, EVEN_K=False, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_129361 0.5012 ms 48.6% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=64, EVEN_K=False, GROUP_M=8, num_stages=4, num_warps=8 + triton_mm_129357 0.5279 ms 46.1% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=128, EVEN_K=False, GROUP_M=8, num_stages=4, num_warps=8 +SingleProcess AUTOTUNE benchmarking takes 0.8434 seconds and 0.0003 seconds precompiling for 20 choices +AUTOTUNE mm(15488x2048, 2048x2048) + mm 0.2623 ms 100.0% + triton_mm_129382 0.3164 ms 82.9% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_129383 0.3174 ms 82.6% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_129384 0.3562 ms 73.6% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=8 + triton_mm_129377 0.3886 ms 67.5% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_129375 0.3988 ms 65.8% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_129376 0.4337 ms 60.5% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=8 + triton_mm_129379 0.4384 ms 59.8% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_129380 0.4540 ms 57.8% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=8 + triton_mm_129378 0.4601 ms 57.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=128, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=4 +SingleProcess AUTOTUNE benchmarking takes 0.8415 seconds and 0.0004 seconds precompiling for 20 choices +AUTOTUNE mm(2048x15488, 15488x16384) + mm 1.8604 ms 100.0% + triton_mm_129907 2.4047 ms 77.4% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=128, BLOCK_N=128, EVEN_K=False, GROUP_M=8, num_stages=5, num_warps=8 + triton_mm_129906 2.4063 ms 77.3% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=128, BLOCK_N=128, EVEN_K=False, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_129905 2.4659 ms 75.4% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=128, EVEN_K=False, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_129898 3.5297 ms 52.7% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=128, EVEN_K=False, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_129904 3.6647 ms 50.8% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=128, EVEN_K=False, GROUP_M=8, num_stages=2, num_warps=8 + triton_mm_129902 3.6759 ms 50.6% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=64, EVEN_K=False, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_129900 3.8806 ms 47.9% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=64, BLOCK_N=128, EVEN_K=False, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_129903 3.9307 ms 47.3% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=64, EVEN_K=False, GROUP_M=8, num_stages=4, num_warps=8 + triton_mm_129899 3.9777 ms 46.8% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=128, EVEN_K=False, GROUP_M=8, num_stages=4, num_warps=8 +SingleProcess AUTOTUNE benchmarking takes 1.3879 seconds and 0.0003 seconds precompiling for 20 choices +AUTOTUNE mm(16384x15488, 15488x2048) + mm 1.8860 ms 100.0% + triton_mm_129944 2.4118 ms 78.2% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=128, BLOCK_N=128, EVEN_K=False, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_129943 2.4603 ms 76.7% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=128, EVEN_K=False, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_129945 2.5251 ms 74.7% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=128, BLOCK_N=128, EVEN_K=False, GROUP_M=8, num_stages=5, num_warps=8 + triton_mm_129940 3.5040 ms 53.8% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=64, EVEN_K=False, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_129942 3.6130 ms 52.2% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=128, EVEN_K=False, GROUP_M=8, num_stages=2, num_warps=8 + triton_mm_129936 3.6204 ms 52.1% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=128, EVEN_K=False, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_129941 3.6572 ms 51.6% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=64, EVEN_K=False, GROUP_M=8, num_stages=4, num_warps=8 + triton_mm_129938 3.8307 ms 49.2% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=64, BLOCK_N=128, EVEN_K=False, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_129933 3.9016 ms 48.3% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=64, EVEN_K=False, GROUP_M=8, num_stages=2, num_warps=4 +SingleProcess AUTOTUNE benchmarking takes 1.3818 seconds and 0.0003 seconds precompiling for 20 choices +AUTOTUNE mm(15488x16384, 16384x2048) + mm 1.9378 ms 100.0% + triton_mm_129963 2.2692 ms 85.4% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_129962 2.4322 ms 79.7% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_129964 2.5928 ms 74.7% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=8 + triton_mm_129958 3.5541 ms 54.5% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=128, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=4 + triton_mm_129955 3.6398 ms 53.2% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_129957 3.8362 ms 50.5% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_129960 3.8937 ms 49.8% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=8 + triton_mm_129956 3.9433 ms 49.1% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=8 + triton_mm_129961 3.9964 ms 48.5% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=2, num_warps=8 +SingleProcess AUTOTUNE benchmarking takes 1.4202 seconds and 0.0003 seconds precompiling for 20 choices +AUTOTUNE mm(1024x16, 16x1024) + triton_mm_146335 0.0069 ms 100.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=32, BLOCK_N=32, EVEN_K=False, GROUP_M=8, num_stages=2, num_warps=4 + triton_mm_146334 0.0071 ms 96.8% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=32, BLOCK_N=32, EVEN_K=False, GROUP_M=8, num_stages=1, num_warps=2 + triton_mm_146336 0.0074 ms 93.5% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=32, BLOCK_N=64, EVEN_K=False, GROUP_M=8, num_stages=5, num_warps=8 + mm 0.0074 ms 92.7% + triton_mm_146339 0.0091 ms 75.7% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=64, BLOCK_N=64, EVEN_K=False, GROUP_M=8, num_stages=2, num_warps=4 + triton_mm_146347 0.0097 ms 71.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=128, BLOCK_N=128, EVEN_K=False, GROUP_M=8, num_stages=2, num_warps=8 + triton_mm_146340 0.0105 ms 65.3% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=64, BLOCK_N=64, EVEN_K=False, GROUP_M=8, num_stages=3, num_warps=8 + triton_mm_146348 0.0107 ms 64.4% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=128, BLOCK_N=128, EVEN_K=False, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_146345 0.0118 ms 58.3% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=128, BLOCK_N=64, EVEN_K=False, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_146342 0.0120 ms 57.5% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=64, BLOCK_N=128, EVEN_K=False, GROUP_M=8, num_stages=3, num_warps=4 +SingleProcess AUTOTUNE benchmarking takes 0.2729 seconds and 0.0003 seconds precompiling for 17 choices +E1218 11:11:26.384000 204 site-packages/torch/_inductor/select_algorithm.py:2100] [2/1] Runtime error during autotuning: +E1218 11:11:26.384000 204 site-packages/torch/_inductor/select_algorithm.py:2100] [2/1] No valid triton configs. OutOfResources: out of resource: shared memory, Required: 245760, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.. +E1218 11:11:26.384000 204 site-packages/torch/_inductor/select_algorithm.py:2100] [2/1] Ignoring this choice. +AUTOTUNE mm(1024x800, 800x32) + triton_mm_146368 0.0118 ms 100.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=32, BLOCK_N=32, EVEN_K=False, GROUP_M=8, num_stages=5, num_warps=4 + mm 0.0134 ms 88.1% + triton_mm_146367 0.0157 ms 75.5% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=128, BLOCK_M=32, BLOCK_N=32, EVEN_K=False, GROUP_M=8, num_stages=2, num_warps=4 + triton_mm_146374 0.0302 ms 39.2% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=32, EVEN_K=False, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_146376 0.0304 ms 38.9% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=64, BLOCK_N=32, EVEN_K=False, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_146373 0.0323 ms 36.7% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=64, BLOCK_N=32, EVEN_K=False, GROUP_M=8, num_stages=3, num_warps=8 + triton_mm_146377 0.0328 ms 36.1% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=128, BLOCK_M=64, BLOCK_N=32, EVEN_K=False, GROUP_M=8, num_stages=4, num_warps=4 + triton_mm_146375 0.0338 ms 35.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=32, EVEN_K=False, GROUP_M=8, num_stages=4, num_warps=8 + triton_mm_146369 0.0352 ms 33.6% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=32, EVEN_K=False, GROUP_M=8, num_stages=5, num_warps=8 + triton_mm_146366 0.0410 ms 28.9% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=32, BLOCK_N=32, EVEN_K=False, GROUP_M=8, num_stages=1, num_warps=2 +SingleProcess AUTOTUNE benchmarking takes 0.3285 seconds and 0.0003 seconds precompiling for 18 choices +AUTOTUNE mm(4096x2048, 2048x1152) + mm 0.0421 ms 100.0% + triton_mm_146399 0.0493 ms 85.3% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_146400 0.0520 ms 81.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_146401 0.0527 ms 79.8% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=8 + triton_mm_146394 0.0548 ms 76.7% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_146395 0.0697 ms 60.4% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=128, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=4 + triton_mm_146392 0.0711 ms 59.2% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_146397 0.0754 ms 55.8% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=8 + triton_mm_146396 0.0760 ms 55.4% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_146393 0.0767 ms 54.8% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=8 +SingleProcess AUTOTUNE benchmarking takes 0.5126 seconds and 0.0003 seconds precompiling for 20 choices +AUTOTUNE mm(4096x4304, 4304x1152) + mm 0.0913 ms 100.0% + triton_mm_149522 0.1137 ms 80.3% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=128, EVEN_K=False, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_149523 0.1193 ms 76.5% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=128, BLOCK_N=128, EVEN_K=False, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_149524 0.1270 ms 71.9% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=128, BLOCK_N=128, EVEN_K=False, GROUP_M=8, num_stages=5, num_warps=8 + triton_mm_149517 0.1344 ms 67.9% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=64, BLOCK_N=128, EVEN_K=False, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_149515 0.1483 ms 61.6% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=128, EVEN_K=False, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_149518 0.1487 ms 61.4% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=128, BLOCK_M=64, BLOCK_N=128, EVEN_K=False, GROUP_M=8, num_stages=4, num_warps=4 + triton_mm_149519 0.1707 ms 53.5% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=64, EVEN_K=False, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_149520 0.1774 ms 51.5% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=64, EVEN_K=False, GROUP_M=8, num_stages=4, num_warps=8 + triton_mm_149516 0.1795 ms 50.9% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=128, EVEN_K=False, GROUP_M=8, num_stages=4, num_warps=8 +SingleProcess AUTOTUNE benchmarking takes 0.6897 seconds and 0.0003 seconds precompiling for 20 choices +AUTOTUNE mm(4096x1152, 1152x1152) + mm 0.0282 ms 100.0% + triton_mm_149541 0.0311 ms 90.6% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_149542 0.0328 ms 85.8% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_149536 0.0343 ms 82.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_149543 0.0347 ms 81.2% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=8 + triton_mm_149534 0.0388 ms 72.5% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_149535 0.0389 ms 72.4% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=8 + triton_mm_149538 0.0424 ms 66.5% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_149537 0.0425 ms 66.2% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=128, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=4 + triton_mm_149539 0.0427 ms 66.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=8 +SingleProcess AUTOTUNE benchmarking takes 0.4306 seconds and 0.0004 seconds precompiling for 20 choices +AUTOTUNE mm_plus_mm(2048x4096, 4096x1152, 2048x4096, 4096x1152) + _mm_plus_mm 0.0840 ms 100.0% + triton_mm_plus_mm_155669 0.1924 ms 43.7% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=2, num_warps=4 + triton_mm_plus_mm_155670 0.2126 ms 39.5% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=8 + triton_mm_plus_mm_155674 0.2326 ms 36.1% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=1, num_warps=8 + triton_mm_plus_mm_155675 0.3197 ms 26.3% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=64, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=1, num_warps=8 + triton_mm_plus_mm_155677 0.3253 ms 25.8% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=64, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=2, num_warps=4 + triton_mm_plus_mm_155672 0.3815 ms 22.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=32, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=8 + triton_mm_plus_mm_155671 0.4030 ms 20.9% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=16 + triton_mm_plus_mm_155673 0.4604 ms 18.3% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=32, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=8 + triton_mm_plus_mm_155676 0.6881 ms 12.2% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=128, BLOCK_M=32, BLOCK_N=32, EVEN_K=True, GROUP_M=8, num_stages=1, num_warps=8 +SingleProcess AUTOTUNE benchmarking takes 0.4370 seconds and 0.0002 seconds precompiling for 11 choices +AUTOTUNE mm_plus_mm(1152x4096, 4096x4304, 1152x4096, 4096x4304) + _mm_plus_mm 0.1700 ms 100.0% + triton_mm_plus_mm_155698 0.4101 ms 41.4% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=2, num_warps=4 + triton_mm_plus_mm_155699 0.4144 ms 41.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=8 + triton_mm_plus_mm_155703 0.4939 ms 34.4% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=1, num_warps=8 + triton_mm_plus_mm_155704 0.5582 ms 30.4% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=64, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=1, num_warps=8 + triton_mm_plus_mm_155706 0.6811 ms 25.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=64, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=2, num_warps=4 + triton_mm_plus_mm_155701 0.7591 ms 22.4% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=32, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=8 + triton_mm_plus_mm_155700 0.7963 ms 21.3% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=16 + triton_mm_plus_mm_155702 0.9486 ms 17.9% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=32, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=8 + triton_mm_plus_mm_155705 1.4425 ms 11.8% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=128, BLOCK_M=32, BLOCK_N=32, EVEN_K=True, GROUP_M=8, num_stages=1, num_warps=8 +SingleProcess AUTOTUNE benchmarking takes 0.4689 seconds and 0.0003 seconds precompiling for 11 choices +AUTOTUNE mm_plus_mm(4304x4096, 4096x1152, 4304x4096, 4096x1152) + _mm_plus_mm 0.1698 ms 100.0% + triton_mm_plus_mm_155727 0.3970 ms 42.8% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=2, num_warps=4 + triton_mm_plus_mm_155728 0.4016 ms 42.3% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=8 + triton_mm_plus_mm_155732 0.4683 ms 36.3% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=1, num_warps=8 + triton_mm_plus_mm_155733 0.5557 ms 30.6% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=64, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=1, num_warps=8 + triton_mm_plus_mm_155735 0.6564 ms 25.9% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=64, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=2, num_warps=4 + triton_mm_plus_mm_155730 0.7723 ms 22.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=32, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=8 + triton_mm_plus_mm_155729 0.8172 ms 20.8% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=16 + triton_mm_plus_mm_155731 0.9306 ms 18.2% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=32, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=8 + triton_mm_plus_mm_155734 1.4413 ms 11.8% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=128, BLOCK_M=32, BLOCK_N=32, EVEN_K=True, GROUP_M=8, num_stages=1, num_warps=8 +SingleProcess AUTOTUNE benchmarking takes 0.4660 seconds and 0.0003 seconds precompiling for 11 choices +AUTOTUNE mm_plus_mm(1152x4096, 4096x1152, 1152x4096, 4096x1152) + _mm_plus_mm 0.0536 ms 100.0% + triton_mm_plus_mm_155757 0.1157 ms 46.3% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=8 + triton_mm_plus_mm_155762 0.1539 ms 34.8% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=64, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=1, num_warps=8 + triton_mm_plus_mm_155756 0.1910 ms 28.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=2, num_warps=4 + triton_mm_plus_mm_155758 0.2093 ms 25.6% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=16 + triton_mm_plus_mm_155761 0.2310 ms 23.2% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=1, num_warps=8 + triton_mm_plus_mm_155759 0.2329 ms 23.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=32, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=8 + triton_mm_plus_mm_155760 0.2606 ms 20.6% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=32, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=8 + triton_mm_plus_mm_155764 0.3231 ms 16.6% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=64, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=2, num_warps=4 + triton_mm_plus_mm_155765 0.3437 ms 15.6% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=32, BLOCK_N=32, EVEN_K=True, GROUP_M=8, num_stages=1, num_warps=2 +SingleProcess AUTOTUNE benchmarking takes 0.4030 seconds and 0.0003 seconds precompiling for 11 choices +AUTOTUNE addmm(2048x1152, 2048x4096, 4096x1152) + addmm 0.0500 ms 100.0% + triton_mm_163492 0.0564 ms 88.7% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_163498 0.0655 ms 76.4% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_163490 0.0677 ms 73.8% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_163494 0.0705 ms 70.9% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_163497 0.0726 ms 68.9% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_163493 0.0786 ms 63.7% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=128, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=4 + triton_mm_163499 0.0795 ms 62.9% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=8 + triton_mm_163495 0.0818 ms 61.2% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=8 + triton_mm_163491 0.0888 ms 56.3% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=8 +SingleProcess AUTOTUNE benchmarking takes 0.5409 seconds and 0.0003 seconds precompiling for 20 choices +AUTOTUNE addmm(1152x4304, 1152x4096, 4096x4304) + addmm 0.1057 ms 100.0% + triton_mm_163537 0.1202 ms 88.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=8 + triton_mm_163536 0.1233 ms 85.7% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_163530 0.1442 ms 73.3% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_163535 0.1543 ms 68.5% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_163532 0.1569 ms 67.4% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_163528 0.1617 ms 65.4% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_163533 0.1634 ms 64.7% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=8 + triton_mm_163531 0.1699 ms 62.2% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=128, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=4 + triton_mm_163529 0.1805 ms 58.6% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=8 +SingleProcess AUTOTUNE benchmarking takes 0.7031 seconds and 0.0004 seconds precompiling for 20 choices +AUTOTUNE addmm(4304x1152, 4304x4096, 4096x1152) + addmm 0.1062 ms 100.0% + triton_mm_163574 0.1208 ms 87.9% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_163575 0.1211 ms 87.7% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=8 + triton_mm_163568 0.1316 ms 80.7% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_163566 0.1420 ms 74.8% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_163573 0.1470 ms 72.2% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_163570 0.1540 ms 68.9% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_163569 0.1555 ms 68.3% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=128, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=4 + triton_mm_163571 0.1600 ms 66.4% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=8 + triton_mm_163567 0.1732 ms 61.3% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=8 +SingleProcess AUTOTUNE benchmarking takes 0.6985 seconds and 0.0004 seconds precompiling for 20 choices +AUTOTUNE addmm(1152x1152, 1152x4096, 4096x1152) + addmm 0.0348 ms 100.0% + triton_mm_163613 0.0372 ms 93.6% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=8 + triton_mm_163606 0.0433 ms 80.3% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_163612 0.0467 ms 74.5% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 + triton_mm_163603 0.0500 ms 69.5% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=128, BLOCK_M=64, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=4 + triton_mm_163602 0.0519 ms 67.1% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=64, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=8 + triton_mm_163607 0.0525 ms 66.3% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=128, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=4 + triton_mm_163609 0.0537 ms 64.7% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=8 + triton_mm_163605 0.0591 ms 58.8% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=8 + triton_mm_163604 0.0623 ms 55.8% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 +SingleProcess AUTOTUNE benchmarking takes 0.4654 seconds and 0.0004 seconds precompiling for 20 choices +INFO 2025-12-18 11:18:04 ot_train.py:357 step:6K smpl:102K ep:255 epch:1.02 loss:0.037 grdn:0.491 lr:2.0e-05 updt_s:13.159 data_s:0.018 +WARNING 2025-12-18 11:18:04 db_utils.py:141 WandB logging of key "loss_per_dim" was ignored as its type "" is not handled by this wrapper. +INFO 2025-12-18 11:21:20 ot_train.py:357 step:7K smpl:106K ep:263 epch:1.05 loss:0.034 grdn:0.480 lr:2.0e-05 updt_s:0.976 data_s:0.008 +WARNING 2025-12-18 11:21:20 db_utils.py:141 WandB logging of key "loss_per_dim" was ignored as its type "" is not handled by this wrapper. +INFO 2025-12-18 11:24:38 ot_train.py:357 step:7K smpl:109K ep:271 epch:1.08 loss:0.036 grdn:0.528 lr:1.9e-05 updt_s:0.980 data_s:0.008 +WARNING 2025-12-18 11:24:38 db_utils.py:141 WandB logging of key "loss_per_dim" was ignored as its type "" is not handled by this wrapper. +INFO 2025-12-18 11:27:56 ot_train.py:357 step:7K smpl:112K ep:279 epch:1.11 loss:0.034 grdn:0.531 lr:1.9e-05 updt_s:0.979 data_s:0.008 +WARNING 2025-12-18 11:27:56 db_utils.py:141 WandB logging of key "loss_per_dim" was ignored as its type "" is not handled by this wrapper. +INFO 2025-12-18 11:31:13 ot_train.py:357 step:7K smpl:115K ep:287 epch:1.14 loss:0.033 grdn:0.481 lr:1.9e-05 updt_s:0.978 data_s:0.008 +WARNING 2025-12-18 11:31:13 db_utils.py:141 WandB logging of key "loss_per_dim" was ignored as its type "" is not handled by this wrapper. +INFO 2025-12-18 11:34:30 ot_train.py:357 step:7K smpl:118K ep:295 epch:1.18 loss:0.033 grdn:0.495 lr:1.8e-05 updt_s:0.978 data_s:0.008 +WARNING 2025-12-18 11:34:30 db_utils.py:141 WandB logging of key "loss_per_dim" was ignored as its type "" is not handled by this wrapper. +INFO 2025-12-18 11:37:47 ot_train.py:357 step:8K smpl:122K ep:303 epch:1.21 loss:0.033 grdn:0.483 lr:1.8e-05 updt_s:0.978 data_s:0.008 +WARNING 2025-12-18 11:37:47 db_utils.py:141 WandB logging of key "loss_per_dim" was ignored as its type "" is not handled by this wrapper. +INFO 2025-12-18 11:41:04 ot_train.py:357 step:8K smpl:125K ep:311 epch:1.24 loss:0.033 grdn:0.494 lr:1.8e-05 updt_s:0.977 data_s:0.008 +WARNING 2025-12-18 11:41:04 db_utils.py:141 WandB logging of key "loss_per_dim" was ignored as its type "" is not handled by this wrapper. +INFO 2025-12-18 11:44:22 ot_train.py:357 step:8K smpl:128K ep:319 epch:1.27 loss:0.031 grdn:0.489 lr:1.7e-05 updt_s:0.978 data_s:0.008 +WARNING 2025-12-18 11:44:22 db_utils.py:141 WandB logging of key "loss_per_dim" was ignored as its type "" is not handled by this wrapper. +INFO 2025-12-18 11:47:39 ot_train.py:357 step:8K smpl:131K ep:327 epch:1.30 loss:0.031 grdn:0.525 lr:1.7e-05 updt_s:0.977 data_s:0.008 +WARNING 2025-12-18 11:47:39 db_utils.py:141 WandB logging of key "loss_per_dim" was ignored as its type "" is not handled by this wrapper. +INFO 2025-12-18 11:50:56 ot_train.py:357 step:8K smpl:134K ep:335 epch:1.34 loss:0.031 grdn:0.494 lr:1.7e-05 updt_s:0.978 data_s:0.008 +WARNING 2025-12-18 11:50:56 db_utils.py:141 WandB logging of key "loss_per_dim" was ignored as its type "" is not handled by this wrapper. +INFO 2025-12-18 11:54:13 ot_train.py:357 step:9K smpl:138K ep:343 epch:1.37 loss:0.030 grdn:0.482 lr:1.6e-05 updt_s:0.978 data_s:0.009 +WARNING 2025-12-18 11:54:13 db_utils.py:141 WandB logging of key "loss_per_dim" was ignored as its type "" is not handled by this wrapper. +INFO 2025-12-18 11:57:31 ot_train.py:357 step:9K smpl:141K ep:351 epch:1.40 loss:0.030 grdn:0.501 lr:1.6e-05 updt_s:0.978 data_s:0.009 +WARNING 2025-12-18 11:57:31 db_utils.py:141 WandB logging of key "loss_per_dim" was ignored as its type "" is not handled by this wrapper. +INFO 2025-12-18 12:00:48 ot_train.py:357 step:9K smpl:144K ep:359 epch:1.43 loss:0.030 grdn:0.527 lr:1.6e-05 updt_s:0.978 data_s:0.008 +WARNING 2025-12-18 12:00:48 db_utils.py:141 WandB logging of key "loss_per_dim" was ignored as its type "" is not handled by this wrapper. +INFO 2025-12-18 12:00:48 ot_train.py:367 Checkpoint policy after step 9000 +INFO 2025-12-18 12:05:01 ot_train.py:357 step:9K smpl:147K ep:367 epch:1.46 loss:0.028 grdn:0.490 lr:1.5e-05 updt_s:0.974 data_s:0.009 +WARNING 2025-12-18 12:05:01 db_utils.py:141 WandB logging of key "loss_per_dim" was ignored as its type "" is not handled by this wrapper. +INFO 2025-12-18 12:08:18 ot_train.py:357 step:9K smpl:150K ep:375 epch:1.49 loss:0.030 grdn:0.531 lr:1.5e-05 updt_s:0.977 data_s:0.009 +WARNING 2025-12-18 12:08:18 db_utils.py:141 WandB logging of key "loss_per_dim" was ignored as its type "" is not handled by this wrapper. +INFO 2025-12-18 12:11:35 ot_train.py:357 step:10K smpl:154K ep:383 epch:1.53 loss:0.028 grdn:0.496 lr:1.5e-05 updt_s:0.978 data_s:0.009 +WARNING 2025-12-18 12:11:35 db_utils.py:141 WandB logging of key "loss_per_dim" was ignored as its type "" is not handled by this wrapper. +INFO 2025-12-18 12:14:53 ot_train.py:357 step:10K smpl:157K ep:391 epch:1.56 loss:0.029 grdn:0.516 lr:1.4e-05 updt_s:0.979 data_s:0.009 +WARNING 2025-12-18 12:14:53 db_utils.py:141 WandB logging of key "loss_per_dim" was ignored as its type "" is not handled by this wrapper. +INFO 2025-12-18 12:18:10 ot_train.py:357 step:10K smpl:160K ep:399 epch:1.59 loss:0.027 grdn:0.501 lr:1.4e-05 updt_s:0.978 data_s:0.008 +WARNING 2025-12-18 12:18:10 db_utils.py:141 WandB logging of key "loss_per_dim" was ignored as its type "" is not handled by this wrapper. +INFO 2025-12-18 12:21:27 ot_train.py:357 step:10K smpl:163K ep:407 epch:1.62 loss:0.027 grdn:0.496 lr:1.4e-05 updt_s:0.976 data_s:0.008 +WARNING 2025-12-18 12:21:27 db_utils.py:141 WandB logging of key "loss_per_dim" was ignored as its type "" is not handled by this wrapper. +INFO 2025-12-18 12:24:43 ot_train.py:357 step:10K smpl:166K ep:415 epch:1.65 loss:0.028 grdn:0.508 lr:1.3e-05 updt_s:0.975 data_s:0.008 +WARNING 2025-12-18 12:24:44 db_utils.py:141 WandB logging of key "loss_per_dim" was ignored as its type "" is not handled by this wrapper. +INFO 2025-12-18 12:28:00 ot_train.py:357 step:11K smpl:170K ep:423 epch:1.68 loss:0.027 grdn:0.531 lr:1.3e-05 updt_s:0.976 data_s:0.008 +WARNING 2025-12-18 12:28:00 db_utils.py:141 WandB logging of key "loss_per_dim" was ignored as its type "" is not handled by this wrapper. +INFO 2025-12-18 12:31:17 ot_train.py:357 step:11K smpl:173K ep:431 epch:1.72 loss:0.028 grdn:0.535 lr:1.3e-05 updt_s:0.976 data_s:0.008 +WARNING 2025-12-18 12:31:17 db_utils.py:141 WandB logging of key "loss_per_dim" was ignored as its type "" is not handled by this wrapper. +INFO 2025-12-18 12:34:34 ot_train.py:357 step:11K smpl:176K ep:439 epch:1.75 loss:0.027 grdn:0.563 lr:1.2e-05 updt_s:0.975 data_s:0.008 +WARNING 2025-12-18 12:34:34 db_utils.py:141 WandB logging of key "loss_per_dim" was ignored as its type "" is not handled by this wrapper. +INFO 2025-12-18 12:37:50 ot_train.py:357 step:11K smpl:179K ep:447 epch:1.78 loss:0.027 grdn:0.518 lr:1.2e-05 updt_s:0.975 data_s:0.008 +WARNING 2025-12-18 12:37:50 db_utils.py:141 WandB logging of key "loss_per_dim" was ignored as its type "" is not handled by this wrapper. +INFO 2025-12-18 12:41:07 ot_train.py:357 step:11K smpl:182K ep:455 epch:1.81 loss:0.026 grdn:0.525 lr:1.1e-05 updt_s:0.976 data_s:0.008 +WARNING 2025-12-18 12:41:07 db_utils.py:141 WandB logging of key "loss_per_dim" was ignored as its type "" is not handled by this wrapper. +INFO 2025-12-18 12:44:24 ot_train.py:357 step:12K smpl:186K ep:463 epch:1.84 loss:0.026 grdn:0.546 lr:1.1e-05 updt_s:0.976 data_s:0.008 +WARNING 2025-12-18 12:44:24 db_utils.py:141 WandB logging of key "loss_per_dim" was ignored as its type "" is not handled by this wrapper. +INFO 2025-12-18 12:47:41 ot_train.py:357 step:12K smpl:189K ep:471 epch:1.88 loss:0.026 grdn:0.516 lr:1.1e-05 updt_s:0.975 data_s:0.008 +WARNING 2025-12-18 12:47:41 db_utils.py:141 WandB logging of key "loss_per_dim" was ignored as its type "" is not handled by this wrapper. +INFO 2025-12-18 12:50:57 ot_train.py:357 step:12K smpl:192K ep:479 epch:1.91 loss:0.026 grdn:0.554 lr:1.0e-05 updt_s:0.976 data_s:0.008 +WARNING 2025-12-18 12:50:57 db_utils.py:141 WandB logging of key "loss_per_dim" was ignored as its type "" is not handled by this wrapper. +INFO 2025-12-18 12:50:57 ot_train.py:367 Checkpoint policy after step 12000 +INFO 2025-12-18 12:55:11 ot_train.py:357 step:12K smpl:195K ep:487 epch:1.94 loss:0.026 grdn:0.507 lr:1.0e-05 updt_s:0.976 data_s:0.008 +WARNING 2025-12-18 12:55:11 db_utils.py:141 WandB logging of key "loss_per_dim" was ignored as its type "" is not handled by this wrapper. +INFO 2025-12-18 12:58:28 ot_train.py:357 step:12K smpl:198K ep:495 epch:1.97 loss:0.025 grdn:0.516 lr:9.8e-06 updt_s:0.980 data_s:0.009 +WARNING 2025-12-18 12:58:28 db_utils.py:141 WandB logging of key "loss_per_dim" was ignored as its type "" is not handled by this wrapper. +INFO 2025-12-18 13:03:11 ot_train.py:357 step:13K smpl:202K ep:503 epch:2.00 loss:0.026 grdn:0.516 lr:9.4e-06 updt_s:1.388 data_s:0.024 +WARNING 2025-12-18 13:03:11 db_utils.py:141 WandB logging of key "loss_per_dim" was ignored as its type "" is not handled by this wrapper. +INFO 2025-12-18 13:06:27 ot_train.py:357 step:13K smpl:205K ep:511 epch:2.03 loss:0.024 grdn:0.501 lr:9.1e-06 updt_s:0.975 data_s:0.008 +WARNING 2025-12-18 13:06:27 db_utils.py:141 WandB logging of key "loss_per_dim" was ignored as its type "" is not handled by this wrapper. +INFO 2025-12-18 13:09:45 ot_train.py:357 step:13K smpl:208K ep:519 epch:2.07 loss:0.024 grdn:0.534 lr:8.8e-06 updt_s:0.978 data_s:0.008 +WARNING 2025-12-18 13:09:45 db_utils.py:141 WandB logging of key "loss_per_dim" was ignored as its type "" is not handled by this wrapper. +INFO 2025-12-18 13:13:02 ot_train.py:357 step:13K smpl:211K ep:527 epch:2.10 loss:0.024 grdn:0.495 lr:8.5e-06 updt_s:0.980 data_s:0.008 +WARNING 2025-12-18 13:13:02 db_utils.py:141 WandB logging of key "loss_per_dim" was ignored as its type "" is not handled by this wrapper. +INFO 2025-12-18 13:16:19 ot_train.py:357 step:13K smpl:214K ep:535 epch:2.13 loss:0.024 grdn:0.521 lr:8.2e-06 updt_s:0.979 data_s:0.008 +WARNING 2025-12-18 13:16:19 db_utils.py:141 WandB logging of key "loss_per_dim" was ignored as its type "" is not handled by this wrapper. +INFO 2025-12-18 13:19:37 ot_train.py:357 step:14K smpl:218K ep:543 epch:2.16 loss:0.024 grdn:0.515 lr:7.9e-06 updt_s:0.979 data_s:0.008 +WARNING 2025-12-18 13:19:37 db_utils.py:141 WandB logging of key "loss_per_dim" was ignored as its type "" is not handled by this wrapper. +INFO 2025-12-18 13:22:54 ot_train.py:357 step:14K smpl:221K ep:551 epch:2.19 loss:0.024 grdn:0.511 lr:7.6e-06 updt_s:0.979 data_s:0.008 +WARNING 2025-12-18 13:22:54 db_utils.py:141 WandB logging of key "loss_per_dim" was ignored as its type "" is not handled by this wrapper. +INFO 2025-12-18 13:26:12 ot_train.py:357 step:14K smpl:224K ep:559 epch:2.23 loss:0.024 grdn:0.513 lr:7.3e-06 updt_s:0.979 data_s:0.008 +WARNING 2025-12-18 13:26:12 db_utils.py:141 WandB logging of key "loss_per_dim" was ignored as its type "" is not handled by this wrapper. +INFO 2025-12-18 13:29:29 ot_train.py:357 step:14K smpl:227K ep:567 epch:2.26 loss:0.024 grdn:0.534 lr:7.0e-06 updt_s:0.979 data_s:0.008 +WARNING 2025-12-18 13:29:29 db_utils.py:141 WandB logging of key "loss_per_dim" was ignored as its type "" is not handled by this wrapper. +INFO 2025-12-18 13:32:47 ot_train.py:357 step:14K smpl:230K ep:575 epch:2.29 loss:0.023 grdn:0.495 lr:6.7e-06 updt_s:0.979 data_s:0.008 +WARNING 2025-12-18 13:32:47 db_utils.py:141 WandB logging of key "loss_per_dim" was ignored as its type "" is not handled by this wrapper. +INFO 2025-12-18 13:36:04 ot_train.py:357 step:15K smpl:234K ep:582 epch:2.32 loss:0.024 grdn:0.547 lr:6.4e-06 updt_s:0.979 data_s:0.008 +WARNING 2025-12-18 13:36:04 db_utils.py:141 WandB logging of key "loss_per_dim" was ignored as its type "" is not handled by this wrapper. +INFO 2025-12-18 13:39:21 ot_train.py:357 step:15K smpl:237K ep:590 epch:2.35 loss:0.023 grdn:0.517 lr:6.2e-06 updt_s:0.979 data_s:0.008 +WARNING 2025-12-18 13:39:21 db_utils.py:141 WandB logging of key "loss_per_dim" was ignored as its type "" is not handled by this wrapper. +INFO 2025-12-18 13:42:39 ot_train.py:357 step:15K smpl:240K ep:598 epch:2.38 loss:0.023 grdn:0.508 lr:5.9e-06 updt_s:0.979 data_s:0.008 +WARNING 2025-12-18 13:42:39 db_utils.py:141 WandB logging of key "loss_per_dim" was ignored as its type "" is not handled by this wrapper. +INFO 2025-12-18 13:42:39 ot_train.py:367 Checkpoint policy after step 15000 +INFO 2025-12-18 13:46:50 ot_train.py:357 step:15K smpl:243K ep:606 epch:2.42 loss:0.022 grdn:0.500 lr:5.7e-06 updt_s:0.975 data_s:0.008 +WARNING 2025-12-18 13:46:50 db_utils.py:141 WandB logging of key "loss_per_dim" was ignored as its type "" is not handled by this wrapper. +INFO 2025-12-18 13:50:07 ot_train.py:357 step:15K smpl:246K ep:614 epch:2.45 loss:0.022 grdn:0.512 lr:5.4e-06 updt_s:0.979 data_s:0.008 +WARNING 2025-12-18 13:50:07 db_utils.py:141 WandB logging of key "loss_per_dim" was ignored as its type "" is not handled by this wrapper. +INFO 2025-12-18 13:53:25 ot_train.py:357 step:16K smpl:250K ep:622 epch:2.48 loss:0.023 grdn:0.525 lr:5.2e-06 updt_s:0.979 data_s:0.008 +WARNING 2025-12-18 13:53:25 db_utils.py:141 WandB logging of key "loss_per_dim" was ignored as its type "" is not handled by this wrapper. +INFO 2025-12-18 13:56:42 ot_train.py:357 step:16K smpl:253K ep:630 epch:2.51 loss:0.023 grdn:0.522 lr:5.0e-06 updt_s:0.979 data_s:0.008 +WARNING 2025-12-18 13:56:42 db_utils.py:141 WandB logging of key "loss_per_dim" was ignored as its type "" is not handled by this wrapper. +INFO 2025-12-18 13:59:59 ot_train.py:357 step:16K smpl:256K ep:638 epch:2.54 loss:0.023 grdn:0.534 lr:4.8e-06 updt_s:0.979 data_s:0.008 +WARNING 2025-12-18 13:59:59 db_utils.py:141 WandB logging of key "loss_per_dim" was ignored as its type "" is not handled by this wrapper. +INFO 2025-12-18 14:03:17 ot_train.py:357 step:16K smpl:259K ep:646 epch:2.58 loss:0.024 grdn:0.550 lr:4.5e-06 updt_s:0.978 data_s:0.008 +WARNING 2025-12-18 14:03:17 db_utils.py:141 WandB logging of key "loss_per_dim" was ignored as its type "" is not handled by this wrapper. +INFO 2025-12-18 14:06:34 ot_train.py:357 step:16K smpl:262K ep:654 epch:2.61 loss:0.022 grdn:0.508 lr:4.3e-06 updt_s:0.979 data_s:0.008 +WARNING 2025-12-18 14:06:34 db_utils.py:141 WandB logging of key "loss_per_dim" was ignored as its type "" is not handled by this wrapper. +INFO 2025-12-18 14:09:51 ot_train.py:357 step:17K smpl:266K ep:662 epch:2.64 loss:0.023 grdn:0.510 lr:4.2e-06 updt_s:0.979 data_s:0.008 +WARNING 2025-12-18 14:09:51 db_utils.py:141 WandB logging of key "loss_per_dim" was ignored as its type "" is not handled by this wrapper. +INFO 2025-12-18 14:13:09 ot_train.py:357 step:17K smpl:269K ep:670 epch:2.67 loss:0.022 grdn:0.504 lr:4.0e-06 updt_s:0.979 data_s:0.008 +WARNING 2025-12-18 14:13:09 db_utils.py:141 WandB logging of key "loss_per_dim" was ignored as its type "" is not handled by this wrapper. +INFO 2025-12-18 14:16:26 ot_train.py:357 step:17K smpl:272K ep:678 epch:2.70 loss:0.023 grdn:0.507 lr:3.8e-06 updt_s:0.979 data_s:0.008 +WARNING 2025-12-18 14:16:26 db_utils.py:141 WandB logging of key "loss_per_dim" was ignored as its type "" is not handled by this wrapper. +INFO 2025-12-18 14:19:44 ot_train.py:357 step:17K smpl:275K ep:686 epch:2.73 loss:0.022 grdn:0.492 lr:3.6e-06 updt_s:0.980 data_s:0.008 +WARNING 2025-12-18 14:19:44 db_utils.py:141 WandB logging of key "loss_per_dim" was ignored as its type "" is not handled by this wrapper. +INFO 2025-12-18 14:23:02 ot_train.py:357 step:17K smpl:278K ep:694 epch:2.77 loss:0.022 grdn:0.508 lr:3.5e-06 updt_s:0.979 data_s:0.008 +WARNING 2025-12-18 14:23:02 db_utils.py:141 WandB logging of key "loss_per_dim" was ignored as its type "" is not handled by this wrapper. +INFO 2025-12-18 14:26:19 ot_train.py:357 step:18K smpl:282K ep:702 epch:2.80 loss:0.022 grdn:0.527 lr:3.4e-06 updt_s:0.980 data_s:0.008 +WARNING 2025-12-18 14:26:19 db_utils.py:141 WandB logging of key "loss_per_dim" was ignored as its type "" is not handled by this wrapper. +INFO 2025-12-18 14:29:37 ot_train.py:357 step:18K smpl:285K ep:710 epch:2.83 loss:0.023 grdn:0.510 lr:3.2e-06 updt_s:0.980 data_s:0.008 +WARNING 2025-12-18 14:29:37 db_utils.py:141 WandB logging of key "loss_per_dim" was ignored as its type "" is not handled by this wrapper. +INFO 2025-12-18 14:32:54 ot_train.py:357 step:18K smpl:288K ep:718 epch:2.86 loss:0.022 grdn:0.520 lr:3.1e-06 updt_s:0.978 data_s:0.008 +WARNING 2025-12-18 14:32:54 db_utils.py:141 WandB logging of key "loss_per_dim" was ignored as its type "" is not handled by this wrapper. +INFO 2025-12-18 14:32:54 ot_train.py:367 Checkpoint policy after step 18000 +INFO 2025-12-18 14:37:03 ot_train.py:357 step:18K smpl:291K ep:726 epch:2.89 loss:0.022 grdn:0.523 lr:3.0e-06 updt_s:0.976 data_s:0.008 +WARNING 2025-12-18 14:37:03 db_utils.py:141 WandB logging of key "loss_per_dim" was ignored as its type "" is not handled by this wrapper. +INFO 2025-12-18 14:40:21 ot_train.py:357 step:18K smpl:294K ep:734 epch:2.92 loss:0.022 grdn:0.511 lr:2.9e-06 updt_s:0.981 data_s:0.008 +WARNING 2025-12-18 14:40:21 db_utils.py:141 WandB logging of key "loss_per_dim" was ignored as its type "" is not handled by this wrapper. +INFO 2025-12-18 14:43:39 ot_train.py:357 step:19K smpl:298K ep:742 epch:2.96 loss:0.022 grdn:0.520 lr:2.8e-06 updt_s:0.981 data_s:0.008 +WARNING 2025-12-18 14:43:39 db_utils.py:141 WandB logging of key "loss_per_dim" was ignored as its type "" is not handled by this wrapper. +INFO 2025-12-18 14:46:56 ot_train.py:357 step:19K smpl:301K ep:750 epch:2.99 loss:0.022 grdn:0.515 lr:2.7e-06 updt_s:0.980 data_s:0.008 +WARNING 2025-12-18 14:46:56 db_utils.py:141 WandB logging of key "loss_per_dim" was ignored as its type "" is not handled by this wrapper. +INFO 2025-12-18 14:51:21 ot_train.py:357 step:19K smpl:304K ep:758 epch:3.02 loss:0.021 grdn:0.497 lr:2.7e-06 updt_s:1.298 data_s:0.023 +WARNING 2025-12-18 14:51:21 db_utils.py:141 WandB logging of key "loss_per_dim" was ignored as its type "" is not handled by this wrapper. +INFO 2025-12-18 14:54:38 ot_train.py:357 step:19K smpl:307K ep:766 epch:3.05 loss:0.022 grdn:0.498 lr:2.6e-06 updt_s:0.978 data_s:0.007 +WARNING 2025-12-18 14:54:38 db_utils.py:141 WandB logging of key "loss_per_dim" was ignored as its type "" is not handled by this wrapper. +INFO 2025-12-18 14:57:55 ot_train.py:357 step:19K smpl:310K ep:774 epch:3.08 loss:0.023 grdn:0.499 lr:2.6e-06 updt_s:0.979 data_s:0.008 +WARNING 2025-12-18 14:57:55 db_utils.py:141 WandB logging of key "loss_per_dim" was ignored as its type "" is not handled by this wrapper. +INFO 2025-12-18 15:01:13 ot_train.py:357 step:20K smpl:314K ep:782 epch:3.12 loss:0.022 grdn:0.527 lr:2.5e-06 updt_s:0.980 data_s:0.008 +WARNING 2025-12-18 15:01:13 db_utils.py:141 WandB logging of key "loss_per_dim" was ignored as its type "" is not handled by this wrapper. +INFO 2025-12-18 15:04:30 ot_train.py:357 step:20K smpl:317K ep:790 epch:3.15 loss:0.023 grdn:0.523 lr:2.5e-06 updt_s:0.980 data_s:0.008 +WARNING 2025-12-18 15:04:30 db_utils.py:141 WandB logging of key "loss_per_dim" was ignored as its type "" is not handled by this wrapper. +INFO 2025-12-18 15:07:48 ot_train.py:357 step:20K smpl:320K ep:798 epch:3.18 loss:0.020 grdn:0.519 lr:2.5e-06 updt_s:0.979 data_s:0.008 +WARNING 2025-12-18 15:07:48 db_utils.py:141 WandB logging of key "loss_per_dim" was ignored as its type "" is not handled by this wrapper. +INFO 2025-12-18 15:07:48 ot_train.py:367 Checkpoint policy after step 20000 +INFO 2025-12-18 15:08:40 ot_train.py:438 End of training +Traceback (most recent call last): + File "/opt/conda/envs/lerobot/lib/python3.10/runpy.py", line 196, in _run_module_as_main + return _run_code(code, main_globals, None, + File "/opt/conda/envs/lerobot/lib/python3.10/runpy.py", line 86, in _run_code + exec(code, run_globals) + File "/opt/lerobot/src/lerobot/scripts/lerobot_train.py", line 457, in + main() + File "/opt/lerobot/src/lerobot/scripts/lerobot_train.py", line 453, in main + train() + File "/opt/lerobot/src/lerobot/configs/parser.py", line 233, in wrapper_inner + response = fn(cfg, *args, **kwargs) + File "/opt/lerobot/src/lerobot/scripts/lerobot_train.py", line 442, in train + unwrapped_policy.push_model_to_hub(cfg) + File "/opt/lerobot/src/lerobot/policies/pretrained.py", line 219, in push_model_to_hub + self.save_pretrained(saved_path) # Calls _save_pretrained and stores model tensors + File "/opt/lerobot/src/lerobot/utils/hub.py", line 67, in save_pretrained + self._save_pretrained(save_directory) + File "/opt/lerobot/src/lerobot/policies/pretrained.py", line 72, in _save_pretrained + save_model_as_safetensor(model_to_save, str(save_directory / SAFETENSORS_SINGLE_FILE)) + File "/opt/conda/envs/lerobot/lib/python3.10/site-packages/safetensors/torch.py", line 183, in save_model + save_file(state_dict, filename, metadata=metadata) + File "/opt/conda/envs/lerobot/lib/python3.10/site-packages/safetensors/torch.py", line 307, in save_file + serialize_file(_flatten(tensors), filename, metadata=metadata) +safetensors_rust.SafetensorError: Error while serializing: I/O error: No space left on device (os error 28) +Traceback (most recent call last): + File "/opt/conda/envs/lerobot/lib/python3.10/runpy.py", line 196, in _run_module_as_main + return _run_code(code, main_globals, None, + File "/opt/conda/envs/lerobot/lib/python3.10/runpy.py", line 86, in _run_code + exec(code, run_globals) + File "/opt/lerobot/src/lerobot/scripts/lerobot_train.py", line 457, in + main() + File "/opt/lerobot/src/lerobot/scripts/lerobot_train.py", line 453, in main + train() + File "/opt/lerobot/src/lerobot/configs/parser.py", line 233, in wrapper_inner + response = fn(cfg, *args, **kwargs) + File "/opt/lerobot/src/lerobot/scripts/lerobot_train.py", line 442, in train + unwrapped_policy.push_model_to_hub(cfg) + File "/opt/lerobot/src/lerobot/policies/pretrained.py", line 219, in push_model_to_hub + self.save_pretrained(saved_path) # Calls _save_pretrained and stores model tensors + File "/opt/lerobot/src/lerobot/utils/hub.py", line 67, in save_pretrained + self._save_pretrained(save_directory) + File "/opt/lerobot/src/lerobot/policies/pretrained.py", line 72, in _save_pretrained + save_model_as_safetensor(model_to_save, str(save_directory / SAFETENSORS_SINGLE_FILE)) + File "/opt/conda/envs/lerobot/lib/python3.10/site-packages/safetensors/torch.py", line 183, in save_model + save_file(state_dict, filename, metadata=metadata) + File "/opt/conda/envs/lerobot/lib/python3.10/site-packages/safetensors/torch.py", line 307, in save_file + serialize_file(_flatten(tensors), filename, metadata=metadata) +safetensors_rust.SafetensorError: Error while serializing: I/O error: No space left on device (os error 28) diff --git a/wandb/run-20251218_082501-lbhu7589/files/requirements.txt b/wandb/run-20251218_082501-lbhu7589/files/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..6ba1c54ee8f511dd6368487857409aad40779e5c --- /dev/null +++ b/wandb/run-20251218_082501-lbhu7589/files/requirements.txt @@ -0,0 +1,119 @@ +pip==25.3 +setuptools==80.9.0 +wheel==0.45.1 +lerobot==0.4.3 +tokenizers==0.21.4 +transformers==4.53.3 +inquirerpy==0.3.4 +deepdiff==8.6.1 +xxhash==3.6.0 +dill==0.4.0 +attrs==25.4.0 +accelerate==1.12.0 +ImageIO==2.37.2 +nvidia-cusparselt-cu12==0.6.3 +mpmath==1.3.0 +safetensors==0.7.0 +imageio-ffmpeg==0.6.0 +Farama-Notifications==0.0.4 +wcwidth==0.2.14 +pyyaml-include==1.4.1 +python-xlib==0.33 +yarl==1.22.0 +nvidia-cusolver-cu12==11.7.1.2 +aiosignal==1.4.0 +nvidia-cusparse-cu12==12.5.4.2 +wandb==0.21.4 +nvidia-cufile-cu12==1.11.1.6 +nvidia-cuda-runtime-cu12==12.6.77 +nvidia-nvjitlink-cu12==12.6.85 +six==1.17.0 +triton==3.3.1 +nvidia-nvtx-cu12==12.6.77 +nvidia-curand-cu12==10.3.7.77 +nvidia-cuda-nvrtc-cu12==12.6.77 +nvidia-cufft-cu12==11.3.0.4 +rerun-sdk==0.26.2 +evdev==1.9.2 +torch==2.7.1 +opencv-python-headless==4.12.0.88 +huggingface-hub==0.35.3 +importlib_metadata==8.7.0 +psutil==7.1.3 +av==15.1.0 +pfzy==0.3.4 +cmake==4.1.3 +typing-inspect==0.9.0 +datasets==4.1.1 +pandas==2.3.3 +torchcodec==0.5 +numpy==2.2.6 +sympy==1.14.0 +nvidia-cudnn-cu12==9.5.1.17 +draccus==0.10.0 +Jinja2==3.1.6 +hf_transfer==0.1.9 +pyarrow==22.0.0 +pytz==2025.2 +multiprocess==0.70.16 +python-dateutil==2.9.0.post0 +nvidia-cuda-cupti-cu12==12.6.80 +aiohappyeyeballs==2.6.1 +pyserial==3.5 +cloudpickle==3.1.2 +mypy_extensions==1.1.0 +einops==0.8.1 +torchvision==0.22.1 +networkx==3.4.2 +MarkupSafe==3.0.3 +aiohttp==3.13.2 +frozenlist==1.8.0 +mergedeep==1.3.4 +fsspec==2025.9.0 +regex==2025.11.3 +pillow==12.0.0 +jsonlines==4.0.0 +prompt_toolkit==3.0.52 +toml==0.10.2 +zipp==3.23.0 +nvidia-cublas-cu12==12.6.4.1 +propcache==0.4.1 +diffusers==0.35.2 +nvidia-nccl-cu12==2.26.2 +tzdata==2025.3 +termcolor==3.2.0 +async-timeout==5.0.1 +gymnasium==1.2.2 +pynput==1.8.1 +multidict==6.7.0 +orderly-set==5.5.0 +GitPython==3.1.45 +pydantic_core==2.41.5 +shellingham==1.5.4 +requests==2.32.5 +httpcore==1.0.9 +gitdb==4.0.12 +sentry-sdk==2.48.0 +filelock==3.20.1 +typer-slim==0.20.0 +smmap==5.0.2 +anyio==4.12.0 +packaging==25.0 +tqdm==4.67.1 +PyYAML==6.0.3 +pydantic==2.12.5 +h11==0.16.0 +certifi==2025.11.12 +urllib3==2.6.2 +click==8.3.1 +idna==3.11 +typing-inspection==0.4.2 +httpx==0.28.1 +typing_extensions==4.15.0 +protobuf==6.33.2 +annotated-types==0.7.0 +charset-normalizer==3.4.4 +platformdirs==4.5.1 +exceptiongroup==1.3.1 +hf-xet==1.2.0 +lerobot==0.4.3 diff --git a/wandb/run-20251218_082501-lbhu7589/files/wandb-metadata.json b/wandb/run-20251218_082501-lbhu7589/files/wandb-metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..a798ba6f1c52ebae405c77de661a502d9c56616f --- /dev/null +++ b/wandb/run-20251218_082501-lbhu7589/files/wandb-metadata.json @@ -0,0 +1,63 @@ +{ + "os": "Linux-6.8.0-64-generic-x86_64-with-glibc2.35", + "python": "CPython 3.10.19", + "startedAt": "2025-12-18T08:25:01.479404Z", + "args": [ + "--dataset.repo_id", + "Zasha01/lego_cube_final", + "--policy.type", + "pi05", + "--output_dir", + "/checkpoints/pi05/pi_lego_cube_final_final_20251218_082454", + "--job_name", + "pi_lego_cube_final_final", + "--steps", + "20000", + "--batch_size", + "16", + "--save_checkpoint", + "true", + "--save_freq", + "3000", + "--policy.dtype", + "bfloat16", + "--policy.device", + "cuda", + "--wandb.enable=true", + "--policy.repo_id", + "Zasha01/pi_lego_cube_final_final", + "--policy.push_to_hub=true", + "--policy.pretrained_path", + "lerobot/pi05_base", + "--policy.compile_model=true", + "--policy.gradient_checkpointing=true" + ], + "program": "-m lerobot.scripts.lerobot_train", + "root": "/checkpoints/pi05/pi_lego_cube_final_final_20251218_082454", + "host": "brev-uxf0tw16k", + "executable": "/opt/conda/envs/lerobot/bin/python3", + "cpu_count": 28, + "cpu_count_logical": 28, + "gpu": "NVIDIA H100 PCIe", + "gpu_count": 1, + "disk": { + "/": { + "total": "103865303040", + "used": "43216367616" + } + }, + "memory": { + "total": "190128013312" + }, + "gpu_nvidia": [ + { + "name": "NVIDIA H100 PCIe", + "memoryTotal": "85520809984", + "cudaCores": 14592, + "architecture": "Hopper", + "uuid": "GPU-d0e23bc7-e8b5-2dff-a2e7-119c21f8a7ac" + } + ], + "cudaVersion": "12.8", + "writerId": "0xueh9yuf1xlvkfx6jbcmvxu1bi9nb84" +} \ No newline at end of file diff --git a/wandb/run-20251218_082501-lbhu7589/files/wandb-summary.json b/wandb/run-20251218_082501-lbhu7589/files/wandb-summary.json new file mode 100644 index 0000000000000000000000000000000000000000..ce432877f41900857465e1b16772b40634c27509 --- /dev/null +++ b/wandb/run-20251218_082501-lbhu7589/files/wandb-summary.json @@ -0,0 +1 @@ +{"train/dataloading_s":0.00769950814994445,"_step":20000,"_wandb":{"runtime":24232},"_runtime":24232.867337284,"train/lr":2.501836604644694e-06,"train/episodes":797.9336379892708,"train/epochs":3.179018478044904,"_timestamp":1.766070468062329e+09,"train/grad_norm":0.5189572383463382,"train/update_s":0.9789245117499923,"train/loss":0.025076277554035187,"train/steps":20000,"train/samples":320000} \ No newline at end of file diff --git a/wandb/run-20251218_082501-lbhu7589/logs/debug-internal.log b/wandb/run-20251218_082501-lbhu7589/logs/debug-internal.log new file mode 100644 index 0000000000000000000000000000000000000000..ed0c9899904a04afc45239544936cf018981a6d4 --- /dev/null +++ b/wandb/run-20251218_082501-lbhu7589/logs/debug-internal.log @@ -0,0 +1,15 @@ +{"time":"2025-12-18T08:25:01.693088813Z","level":"INFO","msg":"stream: starting","core version":"0.21.4"} +{"time":"2025-12-18T08:25:01.91641179Z","level":"INFO","msg":"stream: created new stream","id":"lbhu7589"} +{"time":"2025-12-18T08:25:01.916449066Z","level":"INFO","msg":"stream: started","id":"lbhu7589"} +{"time":"2025-12-18T08:25:01.91647249Z","level":"INFO","msg":"writer: started","stream_id":"lbhu7589"} +{"time":"2025-12-18T08:25:01.916562906Z","level":"INFO","msg":"handler: started","stream_id":"lbhu7589"} +{"time":"2025-12-18T08:25:01.916631089Z","level":"INFO","msg":"sender: started","stream_id":"lbhu7589"} +{"time":"2025-12-18T12:52:04.148929989Z","level":"ERROR","msg":"error adding file to cache","err":"write /root/.cache/wandb/artifacts/tmp/1144795626: no space left on device"} +{"time":"2025-12-18T13:43:42.980176868Z","level":"ERROR","msg":"error adding file to cache","err":"write /root/.cache/wandb/artifacts/tmp/3041263947: no space left on device"} +{"time":"2025-12-18T14:33:55.212227787Z","level":"ERROR","msg":"error adding file to cache","err":"write /root/.cache/wandb/artifacts/tmp/3394625431: no space left on device"} +{"time":"2025-12-18T15:08:48.221381228Z","level":"ERROR","msg":"error adding file to cache","err":"write /root/.cache/wandb/artifacts/tmp/3733320415: no space left on device"} +{"time":"2025-12-18T15:08:54.968312016Z","level":"INFO","msg":"stream: closing","id":"lbhu7589"} +{"time":"2025-12-18T15:12:00.986662316Z","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"} +{"time":"2025-12-18T15:12:01.128825565Z","level":"INFO","msg":"handler: closed","stream_id":"lbhu7589"} +{"time":"2025-12-18T15:12:01.128903532Z","level":"INFO","msg":"sender: closed","stream_id":"lbhu7589"} +{"time":"2025-12-18T15:12:01.128914298Z","level":"INFO","msg":"stream: closed","id":"lbhu7589"} diff --git a/wandb/run-20251218_082501-lbhu7589/logs/debug.log b/wandb/run-20251218_082501-lbhu7589/logs/debug.log new file mode 100644 index 0000000000000000000000000000000000000000..36077de555c88e3a37b9ea5f3dfee261c62752b6 --- /dev/null +++ b/wandb/run-20251218_082501-lbhu7589/logs/debug.log @@ -0,0 +1,23 @@ +2025-12-18 08:25:01,480 INFO MainThread:204 [wandb_setup.py:_flush():81] Current SDK version is 0.21.4 +2025-12-18 08:25:01,480 INFO MainThread:204 [wandb_setup.py:_flush():81] Configure stats pid to 204 +2025-12-18 08:25:01,480 INFO MainThread:204 [wandb_setup.py:_flush():81] Loading settings from /root/.config/wandb/settings +2025-12-18 08:25:01,480 INFO MainThread:204 [wandb_setup.py:_flush():81] Loading settings from /workspace/wandb/settings +2025-12-18 08:25:01,480 INFO MainThread:204 [wandb_setup.py:_flush():81] Loading settings from environment variables +2025-12-18 08:25:01,480 INFO MainThread:204 [wandb_init.py:setup_run_log_directory():686] Logging user logs to /checkpoints/pi05/pi_lego_cube_final_final_20251218_082454/wandb/run-20251218_082501-lbhu7589/logs/debug.log +2025-12-18 08:25:01,480 INFO MainThread:204 [wandb_init.py:setup_run_log_directory():687] Logging internal logs to /checkpoints/pi05/pi_lego_cube_final_final_20251218_082454/wandb/run-20251218_082501-lbhu7589/logs/debug-internal.log +2025-12-18 08:25:01,480 INFO MainThread:204 [wandb_init.py:init():813] calling init triggers +2025-12-18 08:25:01,480 INFO MainThread:204 [wandb_init.py:init():818] wandb.init called with sweep_config: {} +config: {'dataset': {'repo_id': 'Zasha01/lego_cube_final', 'root': None, 'episodes': None, 'image_transforms': {'enable': False, 'max_num_transforms': 3, 'random_order': False, 'tfs': {'brightness': {'weight': 1.0, 'type': 'ColorJitter', 'kwargs': {'brightness': [0.8, 1.2]}}, 'contrast': {'weight': 1.0, 'type': 'ColorJitter', 'kwargs': {'contrast': [0.8, 1.2]}}, 'saturation': {'weight': 1.0, 'type': 'ColorJitter', 'kwargs': {'saturation': [0.5, 1.5]}}, 'hue': {'weight': 1.0, 'type': 'ColorJitter', 'kwargs': {'hue': [-0.05, 0.05]}}, 'sharpness': {'weight': 1.0, 'type': 'SharpnessJitter', 'kwargs': {'sharpness': [0.5, 1.5]}}, 'affine': {'weight': 1.0, 'type': 'RandomAffine', 'kwargs': {'degrees': [-5.0, 5.0], 'translate': [0.05, 0.05]}}}}, 'revision': None, 'use_imagenet_stats': True, 'video_backend': 'torchcodec', 'streaming': False}, 'env': None, 'policy': {'type': 'pi05', 'n_obs_steps': 1, 'input_features': {}, 'output_features': {}, 'device': 'cuda', 'use_amp': False, 'push_to_hub': True, 'repo_id': 'Zasha01/pi_lego_cube_final_final', 'private': None, 'tags': None, 'license': None, 'pretrained_path': 'lerobot/pi05_base', 'paligemma_variant': 'gemma_2b', 'action_expert_variant': 'gemma_300m', 'dtype': 'bfloat16', 'chunk_size': 50, 'n_action_steps': 50, 'max_state_dim': 32, 'max_action_dim': 32, 'num_inference_steps': 10, 'time_sampling_beta_alpha': 1.5, 'time_sampling_beta_beta': 1.0, 'time_sampling_scale': 0.999, 'time_sampling_offset': 0.001, 'min_period': 0.004, 'max_period': 4.0, 'rtc_config': None, 'image_resolution': [224, 224], 'empty_cameras': 0, 'tokenizer_max_length': 200, 'normalization_mapping': {'VISUAL': , 'STATE': , 'ACTION': }, 'gradient_checkpointing': True, 'compile_model': True, 'compile_mode': 'max-autotune', 'optimizer_lr': 2.5e-05, 'optimizer_betas': [0.9, 0.95], 'optimizer_eps': 1e-08, 'optimizer_weight_decay': 0.01, 'optimizer_grad_clip_norm': 1.0, 'scheduler_warmup_steps': 1000, 'scheduler_decay_steps': 30000, 'scheduler_decay_lr': 2.5e-06}, 'output_dir': '/checkpoints/pi05/pi_lego_cube_final_final_20251218_082454', 'job_name': 'pi_lego_cube_final_final', 'resume': False, 'seed': 1000, 'num_workers': 4, 'batch_size': 16, 'steps': 20000, 'eval_freq': 20000, 'log_freq': 200, 'tolerance_s': 0.0001, 'save_checkpoint': True, 'save_freq': 3000, 'use_policy_training_preset': True, 'optimizer': {'type': 'adamw', 'lr': 2.5e-05, 'weight_decay': 0.01, 'grad_clip_norm': 1.0, 'betas': [0.9, 0.95], 'eps': 1e-08}, 'scheduler': {'type': 'cosine_decay_with_warmup', 'num_warmup_steps': 1000, 'num_decay_steps': 30000, 'peak_lr': 2.5e-05, 'decay_lr': 2.5e-06}, 'eval': {'n_episodes': 50, 'batch_size': 50, 'use_async_envs': False}, 'wandb': {'enable': True, 'disable_artifact': False, 'project': 'lerobot', 'entity': None, 'notes': None, 'run_id': None, 'mode': None}, 'checkpoint_path': None, 'rename_map': {}, '_wandb': {}} +2025-12-18 08:25:01,480 INFO MainThread:204 [wandb_init.py:init():854] starting backend +2025-12-18 08:25:01,685 INFO MainThread:204 [wandb_init.py:init():857] sending inform_init request +2025-12-18 08:25:01,689 INFO MainThread:204 [wandb_init.py:init():865] backend started and connected +2025-12-18 08:25:01,691 INFO MainThread:204 [wandb_init.py:init():936] updated telemetry +2025-12-18 08:25:01,691 INFO MainThread:204 [wandb_init.py:init():960] communicating run to backend with 90.0 second timeout +2025-12-18 08:25:02,100 INFO MainThread:204 [wandb_init.py:init():1011] starting run threads in backend +2025-12-18 08:25:02,166 INFO MainThread:204 [wandb_run.py:_console_start():2506] atexit reg +2025-12-18 08:25:02,166 INFO MainThread:204 [wandb_run.py:_redirect():2354] redirect: wrap_raw +2025-12-18 08:25:02,166 INFO MainThread:204 [wandb_run.py:_redirect():2423] Wrapping output streams. +2025-12-18 08:25:02,166 INFO MainThread:204 [wandb_run.py:_redirect():2446] Redirects installed. +2025-12-18 08:25:02,169 INFO MainThread:204 [wandb_init.py:init():1049] run started, returning control to user process +2025-12-18 15:08:54,967 INFO wandb-AsyncioManager-main:204 [service_client.py:_forward_responses():84] Reached EOF. +2025-12-18 15:08:54,968 INFO wandb-AsyncioManager-main:204 [mailbox.py:close():137] Closing mailbox, abandoning 2 handles. diff --git a/wandb/run-20251218_082501-lbhu7589/run-lbhu7589.wandb b/wandb/run-20251218_082501-lbhu7589/run-lbhu7589.wandb new file mode 100644 index 0000000000000000000000000000000000000000..74113e217d4a26899947c0bd865b50d7a11a8f70 --- /dev/null +++ b/wandb/run-20251218_082501-lbhu7589/run-lbhu7589.wandb @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d7de9a0446fc80f75b438a9547690313a84dfbaa9df0069e4510bc57bf467c45 +size 2004723