ManojAlexender committed
Commit b7f66fc · verified · 1 Parent(s): e8d2bcf

Upload folder using huggingface_hub

Files changed (46)
  1. Hf_dataset/dataset_dict.json +1 -0
  2. Hf_dataset/train/data-00000-of-00037.arrow +3 -0
  3. Hf_dataset/train/data-00001-of-00037.arrow +3 -0
  4. Hf_dataset/train/data-00002-of-00037.arrow +3 -0
  5. Hf_dataset/train/data-00003-of-00037.arrow +3 -0
  6. Hf_dataset/train/data-00004-of-00037.arrow +3 -0
  7. Hf_dataset/train/data-00005-of-00037.arrow +3 -0
  8. Hf_dataset/train/data-00006-of-00037.arrow +3 -0
  9. Hf_dataset/train/data-00007-of-00037.arrow +3 -0
  10. Hf_dataset/train/data-00008-of-00037.arrow +3 -0
  11. Hf_dataset/train/data-00009-of-00037.arrow +3 -0
  12. Hf_dataset/train/data-00010-of-00037.arrow +3 -0
  13. Hf_dataset/train/data-00011-of-00037.arrow +3 -0
  14. Hf_dataset/train/data-00012-of-00037.arrow +3 -0
  15. Hf_dataset/train/data-00013-of-00037.arrow +3 -0
  16. Hf_dataset/train/data-00014-of-00037.arrow +3 -0
  17. Hf_dataset/train/data-00015-of-00037.arrow +3 -0
  18. Hf_dataset/train/data-00016-of-00037.arrow +3 -0
  19. Hf_dataset/train/data-00017-of-00037.arrow +3 -0
  20. Hf_dataset/train/data-00018-of-00037.arrow +3 -0
  21. Hf_dataset/train/data-00019-of-00037.arrow +3 -0
  22. Hf_dataset/train/data-00020-of-00037.arrow +3 -0
  23. Hf_dataset/train/data-00021-of-00037.arrow +3 -0
  24. Hf_dataset/train/data-00022-of-00037.arrow +3 -0
  25. Hf_dataset/train/data-00023-of-00037.arrow +3 -0
  26. Hf_dataset/train/data-00024-of-00037.arrow +3 -0
  27. Hf_dataset/train/data-00025-of-00037.arrow +3 -0
  28. Hf_dataset/train/data-00026-of-00037.arrow +3 -0
  29. Hf_dataset/train/data-00027-of-00037.arrow +3 -0
  30. Hf_dataset/train/data-00028-of-00037.arrow +3 -0
  31. Hf_dataset/train/data-00029-of-00037.arrow +3 -0
  32. Hf_dataset/train/data-00030-of-00037.arrow +3 -0
  33. Hf_dataset/train/data-00031-of-00037.arrow +3 -0
  34. Hf_dataset/train/data-00032-of-00037.arrow +3 -0
  35. Hf_dataset/train/data-00033-of-00037.arrow +3 -0
  36. Hf_dataset/train/data-00034-of-00037.arrow +3 -0
  37. Hf_dataset/train/data-00035-of-00037.arrow +3 -0
  38. Hf_dataset/train/data-00036-of-00037.arrow +3 -0
  39. Hf_dataset/train/dataset_info.json +70 -0
  40. Hf_dataset/train/state.json +121 -0
  41. classify.zip +3 -0
  42. classify/test.jsonl +156 -0
  43. classify/train.jsonl +0 -0
  44. classify/valid.jsonl +158 -0
  45. data1.zip +3 -0
  46. test.jsonl +0 -0
Hf_dataset/dataset_dict.json ADDED
@@ -0,0 +1 @@
+ {"splits": ["train"]}
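The single line added to `dataset_dict.json` records which splits the saved-to-disk dataset contains. A minimal stdlib sketch of reading it (the inline string stands in for the file on disk):

```python
import json

# dataset_dict.json from the diff above: it only declares the available splits.
meta = json.loads('{"splits": ["train"]}')

# This dataset was saved with exactly one split, "train".
splits = meta["splits"]
```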
Hf_dataset/train/data-00000-of-00037.arrow ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:b03aef4e355e23c0370a9db1e97a37c1b04a4e3efa43b572195b41c6fd48d92a
+ size 488419816
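Each `.arrow` entry in this commit is a Git LFS pointer (the three `version`/`oid`/`size` lines above) rather than the Arrow data itself. A minimal sketch of parsing such a pointer with the standard library, using the first shard's pointer as input:

```python
def parse_lfs_pointer(text: str) -> dict:
    """Split each 'key value' line of a Git LFS pointer into a dict."""
    fields = {}
    for line in text.strip().splitlines():
        key, _, value = line.partition(" ")
        fields[key] = value
    return fields

pointer = (
    "version https://git-lfs.github.com/spec/v1\n"
    "oid sha256:b03aef4e355e23c0370a9db1e97a37c1b04a4e3efa43b572195b41c6fd48d92a\n"
    "size 488419816\n"
)
info = parse_lfs_pointer(pointer)
```

The `oid` is the SHA-256 of the real file contents, so a downloaded shard can be verified by hashing its bytes and comparing against the pointer.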
Hf_dataset/train/data-00001-of-00037.arrow ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:71025d6432123cf319eeb9b845fb07c3a45319f6ddc159d4cfa8166e8f99ce71
+ size 488566120
Hf_dataset/train/data-00002-of-00037.arrow ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:c5d7718b71cd7e3e2b0ce0ffdeffb56e37bd0da70f99258d751bf1a3ffe2486d
+ size 486940520
Hf_dataset/train/data-00003-of-00037.arrow ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:bc666825fd6661b722d376b8bb767bb2272f97d707ddc1cd69e4d9a844ada80f
+ size 488566120
Hf_dataset/train/data-00004-of-00037.arrow ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:ecefe4e216a0d684d4016e447760bd42ea8d4eddc81cd12866da71c700a76abd
+ size 488419816
Hf_dataset/train/data-00005-of-00037.arrow ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:34bcc86724e0f06b528a2efe13a1cca23e5706d60f1cc46fe5a142e13d07841f
+ size 488533608
Hf_dataset/train/data-00006-of-00037.arrow ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:a3c1a0feccf22819d8a6b588481fd5ec9d1f528faf3c8f7360fe9994358193b2
+ size 488549864
Hf_dataset/train/data-00007-of-00037.arrow ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:546381124467ea207b83e9fb31ce7827892ef9cefcec18674b4c14541a5cc954
+ size 488419816
Hf_dataset/train/data-00008-of-00037.arrow ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:c5bd0b707800ed5fb82d2ed31a777fc3d0b7e1b3433c27fa63119e8418403a75
+ size 488549864
Hf_dataset/train/data-00009-of-00037.arrow ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:edbdf7990d22346652bcab219671f7c7ce414b85aed4f6dc6a265e98e882cc6a
+ size 488403560
Hf_dataset/train/data-00010-of-00037.arrow ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:0dec12b55b4975034cd4b3244f33061e45700ce103da2e56cfe8189ce0fc81d3
+ size 488533608
Hf_dataset/train/data-00011-of-00037.arrow ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:a389f6ffa702350f9e871e5c075fca1cfa6953f27a9bdb16b10406a7bf5011f0
+ size 488517352
Hf_dataset/train/data-00012-of-00037.arrow ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:15418679115d56c76010beddf42bf83f250c34fc1ab82628d5f662a5c49dac78
+ size 488549864
Hf_dataset/train/data-00013-of-00037.arrow ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:51d99b951d7950a9b452baba2aa649d8a4391a4a34992f5ff3f301d72342e87f
+ size 488549864
Hf_dataset/train/data-00014-of-00037.arrow ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:60c62eb8afb677ce286e358ed113ed62c984c82fda7684bc548bae39650e79d9
+ size 488566120
Hf_dataset/train/data-00015-of-00037.arrow ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:a5434056c18bef7299ff2379484b5411e6b4faa176ab5e062692a06888b61cae
+ size 488094696
Hf_dataset/train/data-00016-of-00037.arrow ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:a5bbaf609c7d880819330bdd1e712e3f330813d852faa8694b6c27b74a8c6216
+ size 488530864
Hf_dataset/train/data-00017-of-00037.arrow ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:25052f7807bf4870f0d4a84b1164385fabe1cd16bf7a920aa6cbee6371ef1064
+ size 488530864
Hf_dataset/train/data-00018-of-00037.arrow ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:e9cc2211c924b39ca3f0c640faf71342066083f0ce74ea5494b5c9451a161cc5
+ size 488530864
Hf_dataset/train/data-00019-of-00037.arrow ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:408b741ddb5a25e9e57c9494703a7c882c3988c12dc4d86667531aba7a8bbd53
+ size 488484728
Hf_dataset/train/data-00020-of-00037.arrow ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:890474ee0be80b7b3720ba4a848dbf529759a927fee5c4d41dfc9cc683d35d5a
+ size 488514608
Hf_dataset/train/data-00021-of-00037.arrow ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:55e549bf53c9993f805fb32fa8b10ca62c7dbb32b167143aebaf2d5851dde55a
+ size 488563376
Hf_dataset/train/data-00022-of-00037.arrow ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:6e8fe1aac0f7ab5377eddc7a8d51f4788cf0f23c2a09f0f74f08813d4fab6c52
+ size 488563376
Hf_dataset/train/data-00023-of-00037.arrow ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:2fe97b85cf5ef4f72b44384a5e2b0aa508a23edaad9acad84853876272ef56ef
+ size 488156976
Hf_dataset/train/data-00024-of-00037.arrow ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:ef475bed79769729c4efa536d9be85ef5b7ce6f12071231b5f4d9e9b395ed7a7
+ size 488547120
Hf_dataset/train/data-00025-of-00037.arrow ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:27f3386be18baa3a4f8bc3129606aade7377baeed3d6dbc24835414a2dd1ffb3
+ size 488465840
Hf_dataset/train/data-00026-of-00037.arrow ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:7aa035d79a661ce0cc8e62d354ecd3f329c939e8d178b229b38183a17bcc6079
+ size 488059440
Hf_dataset/train/data-00027-of-00037.arrow ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:f2cf594acefa51e5cc2086e0c5fc699f0e93a85fb6e9040229cb27287a096e19
+ size 488547120
Hf_dataset/train/data-00028-of-00037.arrow ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:70483eb9c423da73f579af7475d919245a39d7482fc7a2ea0409c7c6b900f3e1
+ size 488563376
Hf_dataset/train/data-00029-of-00037.arrow ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:9801bf07f0313f725c593f2a85d6b748faf342322fe2213d4840ba9490b33940
+ size 488547120
Hf_dataset/train/data-00030-of-00037.arrow ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:9829b110a4f30861b168fc9a938dfc30229a5cabd3dc4953063a84f9a4454b22
+ size 488482096
Hf_dataset/train/data-00031-of-00037.arrow ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:cc3404ca6e0cc30b86af8b825d4428777b199d86be26a9f5d9845980c781a35d
+ size 488547120
Hf_dataset/train/data-00032-of-00037.arrow ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:22a23266640f293568157372fec775278abb1818d58aafba12d9c73007933dd0
+ size 488563376
Hf_dataset/train/data-00033-of-00037.arrow ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:53fdca2e96b5897f01ada3a3c28af00bfecdcd6fd4a27cd851ec46ce41d27788
+ size 488189488
Hf_dataset/train/data-00034-of-00037.arrow ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:b51229ca638d1c1246b2628950ed999f8bd1c75f9142f8e6c24bb4dab81a9545
+ size 488384560
Hf_dataset/train/data-00035-of-00037.arrow ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:f9e9cfe543db0d3e4e56a55109feda4610df082c9effda8b9af61e9a802d85b0
+ size 488530864
Hf_dataset/train/data-00036-of-00037.arrow ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:dca837707633b8e88781964b33704a4105b4116db9efc0f0ade32d20e5bda21b
+ size 488498352
Hf_dataset/train/dataset_info.json ADDED
@@ -0,0 +1,70 @@
+ {
+   "builder_name": "json",
+   "citation": "",
+   "config_name": "default",
+   "dataset_name": "json",
+   "dataset_size": 2816740545,
+   "description": "",
+   "download_checksums": {
+     "/home/manojale/Documents/Data/commit_data_8m_ready_to_feed.jsonl": {
+       "num_bytes": 3110715617,
+       "checksum": null
+     }
+   },
+   "download_size": 3110715617,
+   "features": {
+     "input_ids": {
+       "feature": {
+         "dtype": "int32",
+         "_type": "Value"
+       },
+       "_type": "Sequence"
+     },
+     "attention_mask": {
+       "feature": {
+         "dtype": "int8",
+         "_type": "Value"
+       },
+       "_type": "Sequence"
+     },
+     "word_ids": {
+       "feature": {
+         "dtype": "int64",
+         "_type": "Value"
+       },
+       "_type": "Sequence"
+     },
+     "labels": {
+       "feature": {
+         "dtype": "int64",
+         "_type": "Value"
+       },
+       "_type": "Sequence"
+     }
+   },
+   "homepage": "",
+   "license": "",
+   "size_in_bytes": 5927456162,
+   "splits": {
+     "train": {
+       "name": "train",
+       "num_bytes": 2816740545,
+       "num_examples": 8345771,
+       "shard_lengths": [
+         1461651,
+         1509932,
+         1500137,
+         1478094,
+         1517113,
+         878844
+       ],
+       "dataset_name": "json"
+     }
+   },
+   "version": {
+     "version_str": "0.0.0",
+     "major": 0,
+     "minor": 0,
+     "patch": 0
+   }
+ }
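In `dataset_info.json`, the train split's `shard_lengths` are per-shard example counts and should add up to `num_examples`. A quick arithmetic check with the numbers above (note that the 37 `.arrow` files on disk need not match these six entries, since `save_to_disk` shards independently of the builder's writer shards):

```python
# Values copied from dataset_info.json above.
shard_lengths = [1461651, 1509932, 1500137, 1478094, 1517113, 878844]
num_examples = 8345771

# The per-shard counts account for every example exactly once.
total = sum(shard_lengths)
```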
Hf_dataset/train/state.json ADDED
@@ -0,0 +1,121 @@
+ {
+   "_data_files": [
+     {
+       "filename": "data-00000-of-00037.arrow"
+     },
+     {
+       "filename": "data-00001-of-00037.arrow"
+     },
+     {
+       "filename": "data-00002-of-00037.arrow"
+     },
+     {
+       "filename": "data-00003-of-00037.arrow"
+     },
+     {
+       "filename": "data-00004-of-00037.arrow"
+     },
+     {
+       "filename": "data-00005-of-00037.arrow"
+     },
+     {
+       "filename": "data-00006-of-00037.arrow"
+     },
+     {
+       "filename": "data-00007-of-00037.arrow"
+     },
+     {
+       "filename": "data-00008-of-00037.arrow"
+     },
+     {
+       "filename": "data-00009-of-00037.arrow"
+     },
+     {
+       "filename": "data-00010-of-00037.arrow"
+     },
+     {
+       "filename": "data-00011-of-00037.arrow"
+     },
+     {
+       "filename": "data-00012-of-00037.arrow"
+     },
+     {
+       "filename": "data-00013-of-00037.arrow"
+     },
+     {
+       "filename": "data-00014-of-00037.arrow"
+     },
+     {
+       "filename": "data-00015-of-00037.arrow"
+     },
+     {
+       "filename": "data-00016-of-00037.arrow"
+     },
+     {
+       "filename": "data-00017-of-00037.arrow"
+     },
+     {
+       "filename": "data-00018-of-00037.arrow"
+     },
+     {
+       "filename": "data-00019-of-00037.arrow"
+     },
+     {
+       "filename": "data-00020-of-00037.arrow"
+     },
+     {
+       "filename": "data-00021-of-00037.arrow"
+     },
+     {
+       "filename": "data-00022-of-00037.arrow"
+     },
+     {
+       "filename": "data-00023-of-00037.arrow"
+     },
+     {
+       "filename": "data-00024-of-00037.arrow"
+     },
+     {
+       "filename": "data-00025-of-00037.arrow"
+     },
+     {
+       "filename": "data-00026-of-00037.arrow"
+     },
+     {
+       "filename": "data-00027-of-00037.arrow"
+     },
+     {
+       "filename": "data-00028-of-00037.arrow"
+     },
+     {
+       "filename": "data-00029-of-00037.arrow"
+     },
+     {
+       "filename": "data-00030-of-00037.arrow"
+     },
+     {
+       "filename": "data-00031-of-00037.arrow"
+     },
+     {
+       "filename": "data-00032-of-00037.arrow"
+     },
+     {
+       "filename": "data-00033-of-00037.arrow"
+     },
+     {
+       "filename": "data-00034-of-00037.arrow"
+     },
+     {
+       "filename": "data-00035-of-00037.arrow"
+     },
+     {
+       "filename": "data-00036-of-00037.arrow"
+     }
+   ],
+   "_fingerprint": "9acc45c1bac1ef12",
+   "_format_columns": null,
+   "_format_kwargs": {},
+   "_format_type": null,
+   "_output_all_columns": false,
+   "_split": "train"
+ }
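`state.json` lists the 37 shard files in order; the `data-XXXXX-of-NNNNN.arrow` names follow a fixed zero-padded pattern that can be regenerated (and checked against the listing) directly:

```python
num_shards = 37

# Regenerate the shard names listed under "_data_files" in state.json above.
filenames = [f"data-{i:05d}-of-{num_shards:05d}.arrow" for i in range(num_shards)]
```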
classify.zip ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:b8b79d21e1943b5c0f7c4664427ada3d12f04d058485ce466abf8c7c5493f567
+ size 168907
classify/test.jsonl ADDED
@@ -0,0 +1,156 @@
+ {"target": 0, "func": "[PATCH] PME-gather: Use templated functor instead of preprocessor\n\nAdded restrict in several places, but this does not affect performance\nwith gcc and icc.\n\nChange-Id: Id366621fa3ad02ca182b8a4da48cae940059cf46", "idx": 686}
+ {"target": 0, "func": "[PATCH] Clean up documentation slightly for the case of slow runs.", "idx": 1528}
+ {"target": 0, "func": "[PATCH] Adding Changelog for Release 2.8.00\n\nPart of Kokkos C++ Performance Portability Programming EcoSystem 2.8", "idx": 1536}
+ {"target": 0, "func": "[PATCH] Adding Changelog for Release 2.9.99\n\nPart of Kokkos C++ Performance Portability Programming EcoSystem 2.9.99", "idx": 227}
+ {"target": 0, "func": "[PATCH] Simplify the usage of the performance function by reducing\n the template parameter.", "idx": 839}
+ {"target": 1, "func": "[PATCH] Attempt to work-around old gcc bugs in a more efficient\n fashion that does not lose performance on newer gcc's. [empty commit message]", "idx": 1272}
+ {"target": 0, "func": "[PATCH] the mistery of the missing MIC performance on Intel 16 might\n be fixed with -qopt-assume-safe-padding", "idx": 1032}
+ {"target": 0, "func": "[PATCH] Move clang-tidy build\n\nThis has seemed too slow when combined with the ASAN build.\n\nChange-Id: I45ea5856ca05edbb6107b62f219e8afd3cdbda3f", "idx": 743}
+ {"target": 0, "func": "[PATCH] Use AVX512 also for DGEMM\n\nthis required switching to the generic gemm_beta code (which is faster anyway on SKX)\nfor both DGEMM and SGEMM\n\nPerformance for the not-retuned version is in the 30% range", "idx": 927}
+ {"target": 0, "func": "[PATCH] Add Elem::loose_bounding_box()\n\nThe default implementation is what we were using previously to\ncalculate bounding boxes for PointLocatorTree. For higher order\nelements we add fast approximations that are strict in the case of\nlinear geometry but that should still be bounds in the case of higher\norder geometry.", "idx": 1525}
+ {"target": 0, "func": "[PATCH] Added hacks for SIMD rvec/load store in lincs & bondeds\n\nWe have added proper gather/scatter operations to work on\nrvecs for all SIMD architectures, but that will not make it into\nGromacs-5.1. Since Berk already wrote a few routines to use\nmaskloads at least for AVX & AVX2, this is a bit of a hack to\nget the performance benefits of that code already in Gromacs-5.1\n(for AVX/AVX2), without altering the SIMD module. This is definitely\na hack, and the code will be replaced once the extended SIMD\nmodule is in place.\n\nChange-Id: I385acb5f989b2ecf463948be84947fe1f6dfd19b", "idx": 1499}
+ {"target": 0, "func": "[PATCH] Benchmark can now export performance data to an XML file +\n added Tanglecube function to benchmark", "idx": 577}
+ {"target": 1, "func": "[PATCH] replaced std::endl with \\n in all file IO and stringstreams. \n std::endl forces a flush, which kills performance on some machines\n\ngit-svn-id: file:///Users/petejw/Documents/libmesh_svn_bak@834 434f946d-2f3d-0410-ba4c-cb9f52fb0dbf", "idx": 440}
+ {"target": 0, "func": "[PATCH] USER-DPD: specialize PairTableRXKokkos's compute_all_items()\n on NEWTON_PAIR No noticable performance change, but it does eliminate a deep\n conditional.", "idx": 1294}
+ {"target": 0, "func": "[PATCH] Cleaning code, Sylvain should take a look to the previous\n version of this file and Root_of_2.h because there is an obvious slow down in\n the execution of all the filtering kernels", "idx": 972}
+ {"target": 0, "func": "[PATCH] Add a single-tree depth-first traverser. It is not as fast\n as it could be.", "idx": 360}
+ {"target": 0, "func": "[PATCH] In the performance miniapps, add options to set the mesh\n refinements.\n\nRun the miniapps/performance tests at coarser mesh resolutions, so that\nthe tests run faster.\n\nExplicitly remove and ignore the temporary files created by the tests,\nbecause, in some cases, they may not be removed automatically.", "idx": 239}
+ {"target": 0, "func": "[PATCH] Fixed FAST on Mac OS X", "idx": 1342}
+ {"target": 0, "func": "[PATCH] Update to Kokkos r2.04.04 and add workaround for performance\n regression", "idx": 279}
+ {"target": 0, "func": "[PATCH] Accelerate 3D version of inexact_locate as we do it for 2D\n\nsee commit 4c477c853c82bb1ca77bb496cdce993178c3dfa2", "idx": 1071}
+ {"target": 0, "func": "[PATCH] ClangCuda: Make Cuda compilation with clang 3.9.0 work\n\nThis makes all the unit and performance tests work, with only\none test disabled (view of views).\nOn the other hand it brakes the cuda build because clang is ok with\n__device__ void foo() {}\nvoid foo() {}\nnvcc sees that as a redeclaration.", "idx": 819}
+ {"target": 0, "func": "[PATCH] PetscMatrix::print_personal now prints to file when requested\n (rather than just cout). The implementation is not particularly efficient\n (since print_personal gets passed an ostream) but it does work. And how\n efficient do you need to be if you are printing out matrices anyway?\n\ngit-svn-id: file:///Users/petejw/Documents/libmesh_svn_bak@4244 434f946d-2f3d-0410-ba4c-cb9f52fb0dbf", "idx": 1219}
+ {"target": 0, "func": "[PATCH] nonUniformTableThermophysicalFunction: New non-uniform table\n thermophysicalFunction for liquid properties\n\nDescription\n    Non-uniform tabulated property function that linearly interpolates between\n    the values.\n\n    To speed-up the search of the non-uniform table a uniform jump-table is\n    created on construction which is used for fast indirect addressing into\n    the table.\n\nUsage\n    \\nonUniformTable\n        Property            | Description\n        values              | List of (temperature property) value pairs\n    \\endnonUniformTable\n\n    Example for the density of water between 280 and 350K\n    \\verbatim\n    rho\n    {\n        type    nonUniformTable;\n\n        values\n        (\n            (280 999.87)\n            (300 995.1)\n            (350 973.7)\n        );\n    }\n    \\endverbatim", "idx": 1044}
+ {"target": 0, "func": "[PATCH] Adding Changelog for Release 3.2.00\n\nPart of Kokkos C++ Performance Portability Programming EcoSystem 3.2", "idx": 1244}
+ {"target": 0, "func": "[PATCH] First experimental prototype of Efficient Ransac (written by\n Yannick)\n\nPlane detection only.\nSome work to do to make it CGAL-conforming.", "idx": 1439}
+ {"target": 1, "func": "[PATCH] subCycle: Add special treatment for nSubCycles = 1\n\nNow running sub-cycling with nSubCycles = 1 is as efficient as running the same\ncode without the sub-cycling loop.", "idx": 208}
+ {"target": 0, "func": "[PATCH] Prevent PME tuning excessive grid scaling\n\nWe limit the maximum grid scaling to a factor 1.8. This allows\nplenty of room for shifting work from PME on CPU to short-range\nGPU kernels, but avoids excessive scaling for diminishing return\nin performance for a significant increase in power consumption,\ncommunication volume (which may with fluctuating network load not\nshow up during tuning) as well as limiting load balancing.\n\nChange-Id: I85c02478faa6b67c063b6e1b45a9ac1755b2d81e", "idx": 184}
+ {"target": 0, "func": "[PATCH] This was the version of code used for the FastMKS benchmarks\n in the recently submitted paper, \"Dual-tree Fast Exact Max-Kernel Search\".", "idx": 1250}
+ {"target": 0, "func": "[PATCH] Update testing matrices for coverage and speed\n\nMoved slow aspect of pre-submit matrix to nightly (icc with release\nmode and SIMD support).\n\nRemoved slow gcc-7 config adjusting similar builds to achieve its\nformer objectives.\n\nRemoved outdated TODOs, noted new ones\n\nAdded hwloc test specifier, and todo for hwloc 2\n\nAdded tng test logic, and a specfier for each non-default case. Fixed\nmissing return values for no-tng case, and clarified the docs.\n\nChange-Id: I340b9a64dc4e4958f260657d3d82480be62ef979", "idx": 1179}
+ {"target": 1, "func": "[PATCH] Wrote the compute_children_node_keys() function in elem.h\n which allows one to generate appropriate node keys while reading in a mesh\n with multiple refinement levels. This allows us to avoid a linear search in\n the MeshRefinement::add_point routine since all the nodes can now be found in\n the nodes hash table. The resulting performance improvement was significant.\n\ngit-svn-id: file:///Users/petejw/Documents/libmesh_svn_bak@1263 434f946d-2f3d-0410-ba4c-cb9f52fb0dbf", "idx": 174}
+ {"target": 0, "func": "[PATCH] Remove RangeSet, DenseIntMap, and fast allocation routines. \n They were not used anywhere.", "idx": 445}
+ {"target": 0, "func": "[PATCH] BJP: Cleaned up code a little to make it easier to use for\n performance testing.", "idx": 996}
+ {"target": 0, "func": "[PATCH] Create dedicated subcounter for nonbonded FEP\n\nNow all nonbonded work has their own separate subcoutners which allows\nmeasuring the performance of each task separately.\n\nRefs #2997\n\nChange-Id: I601445364592923d08087a858da4629b0b58ae76", "idx": 863}
+ {"target": 0, "func": "[PATCH] Added FAST unit tests.", "idx": 59}
+ {"target": 0, "func": "[PATCH] Some performance issues due to profiling", "idx": 540}
+ {"target": 0, "func": "[PATCH] Lazily Init AWS ClientConfiguration\n\nAWS changes the ClientConfiguration in the 1.8 SDK to do the checking for env\nvariables and ec2 metadata in [1]. This can cause TileDB to behavior slow if\nS3 support is built but the environment is not configured. The AWS SDK\ncheck for the ec2 metadata and has to wait for a timeout. We need to\nlazily init the ClientConfiguration.\n\n[1] https://github.com/aws/aws-sdk-cpp/commit/147469373c9fec1037bd2d75d7cd949250c6f7c5", "idx": 1116}
+ {"target": 0, "func": "[PATCH] Fixed OpenMP compile and added a tux regression test\n\nThe MGR OpenMP code has not been tested yet, so I commented it out for now\nAdded a tux OpenMP compile test and reorganized the tux tests from fast to slow (more or less)", "idx": 115}
+ {"target": 1, "func": "[PATCH] Sparse refactored readers: Better vectorization for tile\n bitmaps calculations. (#2711) (#2734)\n\n* Sparse unordered with duplicates: Better vectorization for tile bitmaps.\n\nWhen computing tile bitmaps, there are a few places where we were not\nvectorizing properly and it caused performance impacts on some customer\nscenario. Fixing the use of covered/overlap in the dimension class to\nenable proper vectorization.", "idx": 1163}
+ {"target": 0, "func": "[PATCH] Some performance tuning", "idx": 254}
+ {"target": 0, "func": "[PATCH] fixed a bug with efficient IMVJ and reuse of old data - need\n to clear the matrix Wtil after convergence of the current time step, as\n columns are the result of J*V and J is outdated now.", "idx": 1552}
+ {"target": 0, "func": "[PATCH] Update unit tests and performance tests\n\nCo-authored-by: Daniel Arndt <arndtd@ornl.gov>", "idx": 966}
+ {"target": 0, "func": "[PATCH] add SDG Linf fast examples to doc file", "idx": 1083}
+ {"target": 0, "func": "[PATCH] PERF Batching + Blocks images in rotate and transform\n\n* Batching all images in single block set is slow at high blocks\n* Divide batches of images into sets and launch blocks for that", "idx": 1491}
+ {"target": 0, "func": "[PATCH] .EXPORT_ALL_VARIABLES: commented out since it seems to slow\n down the compilation", "idx": 535}
+ {"target": 0, "func": "[PATCH] Recursive ThreadPools (#1774)\n\nCurrently, ThreadPool instances are only recursive with their own isntance. This\npatch allows multiple ThreadPool instances to recurse among themselves without\nthe potential for bottlenecking (either in performance reduction or a deadlock)\non available threads.\n\nFor instance, ThreadPool instances A and B may be called in any order:\nA->execute([&]() {\n  B->execute([&]() {\n    A->execute([&]() {\n      foo();\n    });\n  });\n});\n\nThe motiviation for this patch is to allow us to use our `compute_tp` and\n`io_tp` without worrying about their order in a call stack. This patch reduces\nthe runtime of our `bench_large_io` from 280587ms to 269621.\n\nSee the new unit tests in `unit-threadpool.cc` for more examples.", "idx": 756}
+ {"target": 0, "func": "[PATCH] Introduce HostAllocationPolicy\n\nThis permits host-side standard containers and smart pointers to have\ntheir contents placed in memory suitable for efficient GPU transfer.\n\nThe behaviour can be configured at run time during simulation setup,\nso that if we are not running on a GPU, then none of the buffers that\nmight be affected actually are. The downside is that all such\ncontainers now have state.\n\nChange-Id: I9367d0f996de04c21312cef2081cc08148f80561", "idx": 1144}
+ {"target": 0, "func": "[PATCH] Changed innerloop optimization options, and made SSE/3dnow\n loops default together with fast truncation on linux.", "idx": 1428}
+ {"target": 0, "func": "[PATCH] for three star segments, return fast common point\n\nSigned-off-by: Panagiotis Cheilaris <philaris@cs.ntua.gr>", "idx": 1196}
+ {"target": 0, "func": "[PATCH] Split up travis 'full' target; it became too slow", "idx": 1330}
+ {"target": 0, "func": "[PATCH] Use Array<T> objects instead of Param objects in several\n functions\n\n* Update SIFT with new RAII memAlloc\n* Workaround for function resolution in ternary operator\n* Fix Fast and Orb functions", "idx": 173}
+ {"target": 1, "func": "[PATCH] Force field updates.\n\nNew order for SWM4-NDP and SWM6 topologies, with SETTLE\ninstead of 3 constraints. Performance is slightly better\nin this case. Uploading a SWM4-NDP water box that has\nbeen equilibrated for 100 ps to serve as input for gmx\nsolvate.\n\nChange-Id: I67e10693ca76e77b99b371ea9887402e7ac0acc1", "idx": 982}
+ {"target": 0, "func": "[PATCH] RJH: Tweaked slow convergence threshold and improved\n stability of RJH: the line search", "idx": 285}
+ {"target": 0, "func": "[PATCH] removed call to fast localization for the time being...EJB", "idx": 1200}
+ {"target": 0, "func": "[PATCH] Restore locking optimizations for OpenMP case\n\nrestore another accidentally dropped part of #1468 that was missed in #2004 to address performance regression reported in #1461", "idx": 1460}
+ {"target": 0, "func": "[PATCH] fixed issue with slow insertion in the presence of dummy\n points", "idx": 1259}
+ {"target": 0, "func": "[PATCH] document \"slow\" and \"unstable\" labels for unit tests", "idx": 904}
+ {"target": 0, "func": "[PATCH] Update GMM code. It should be a little faster training, but\n it is still too slow for my preferences. I am not sure what is making it so\n slow.", "idx": 936}
+ {"target": 0, "func": "[PATCH] IA64: efc with Optimiz break moints2x and moints6x", "idx": 1465}
+ {"target": 0, "func": "[PATCH] Keep COARSEN_INACTIVE flags in sync in corner case\n\nThis isn't the most efficient solution - I think we could probably let\nthese flags stay out of sync for a while, weaken the assertions that\ncomplain, and trust to make_coarsening_compatible to eventually fix\nthe inconsistencies.\n\nThis is the safest solution, though.", "idx": 620}
+ {"target": 0, "func": "[PATCH] HvD: Adding a few performance tests to see if certain\n optimizations are needed. It turns out that aa**ii is about 20 times faster\n than aa**bb, where aa and bb are double precision numbers and ii is an\n integer. Whereas aa*ii is about as fast as aa*bb (measured by running \"time\n <program>\"). So it seems as if we need to add exponentiation to an integer\n power to the NWAD module.", "idx": 662}
+ {"target": 0, "func": "[PATCH] Finally, a fully parallel version of the refinement. Not very\n efficient, though, but the idea was to identify all data races and to protect\n it using locks, atomics, TLS... Needs some tests now, to check if we didn't\n miss any rare data race.", "idx": 1017}
+ {"target": 1, "func": "[PATCH] Fix nblib pairlist update function\n\nPreviously the function put_atoms_in_box was called only by the\nnblib integrator, or when constructing a system via a SimulationState\nobject. In the case of the integrator, this is a needless performance\ndegradation. When using an nb force calculator without\nfirst putting atoms in the box, this could lead to a cryptic\nerror from nbnxm grid search failure. Both of these issues are\nrectified with this change, which also adds a member to the\nnon-bonded force calculator to hold the requested number of OMP\nthreads to use in a call to put_atoms_in_box_omp, which is faster\nthan the non OMP version.", "idx": 373}
+ {"target": 0, "func": "[PATCH] Initial fast reciprocal space LJPME implementation, with\n test.", "idx": 339}
+ {"target": 0, "func": "[PATCH] Got parallelization working with CollocationIntegrator, very\n slow", "idx": 223}
+ {"target": 0, "func": "[PATCH] default to 1 thread, make the user manually specify the\n number of threads to avoid mpi/thread contention\n\ngit-svn-id: file:///Users/petejw/Documents/libmesh_svn_bak@2672 434f946d-2f3d-0410-ba4c-cb9f52fb0dbf", "idx": 1166}
+ {"target": 0, "func": "[PATCH] performance updates...EJB", "idx": 702}
+ {"target": 0, "func": "[PATCH] Further work on fast diagonalization", "idx": 1417}
+ {"target": 0, "func": "[PATCH] Sort cell slab ranges for ND arrays (#1736)\n\nThe selective decompression intersection algorithm requires cell slab ranges\nto be sorted in ascending order. This is true for 1D arrays, but not for ND\narrays. In this scenario, we must sort them.\n\nThe ranges are sorted in vectors that have already been partitioned per-tile.\nThis should keep the sort runtimes relatively quick. In the future, we can\nbenchmark this on large arrays+queries and measure its timing. With this sort,\nwe now may coalesce cell ranges as a future optimization.\n\nIf the N*LOG(N) sorting is too slow, the alternative approach is to leave them\nunsorted and perform O(N*M) range-chunk intersection comparisons, where M is\nthe number of chunks. If the number of chunks is less than LOG(N), this may\nbe faster.\n\nThis solves the following error message:\n[TileDB::ChunkedBuffer] Error: Chunk read error; chunk unallocated error\n\nCo-authored-by: Joe Maley <joe@tiledb.com>", "idx": 1091}
+ {"target": 0, "func": "[PATCH] Restore evGW performance", "idx": 1430}
+ {"target": 0, "func": "[PATCH] Adding logic for detecting whether or not a Mac must link\n vecLib or Accelerate", "idx": 398}
+ {"target": 0, "func": "[PATCH] Use pre-trained network to accelerate test", "idx": 992}
+ {"target": 0, "func": "[PATCH] need to slow down in some changes ... cancel eckart for the\n time being", "idx": 178}
+ {"target": 0, "func": "[PATCH] Change from single Performance test Executable to different\n executables per backend.", "idx": 300}
+ {"target": 0, "func": "[PATCH] Auto cache compiled CUDA kernels on disk to speed up\n compilation (#2848)\n\n* Adds CMake variable AF_CACHE_KERNELS_TO_DISK to enable kernel caching. It is turned ON by default.\n* cuda::buildKernel() now dumps cubin to disk for reuse\n* Adds cuda::loadKernel() for loading cached cubin files\n* cuda::loadKernel() returns empty kernel on failure\n* Uses XDG_CACHE_HOME as cache directory for Linux\n* Adds common::deterministicHash() - This uses the FNV-1a hashing algorithm for fast and reproducible hashing of string or binary data. This is meant to replace the use of std::hash in some place, since std::hash does not guarantee its return value will be the same in subsequence executions of the program.\n* Write cached kernel to temporary file before moving into final file. This prevents data races where two threads or two processes might write to the same file.\n* Uses deterministicHash() for hashing kernel names and kernel binary data.\n* Adds kernel binary data file integrity check upon loading from disk", "idx": 122}
+ {"target": 0, "func": "[PATCH] Auto cache compiled CUDA kernels on disk to speed up\n compilation (#2848)\n\n* Adds CMake variable AF_CACHE_KERNELS_TO_DISK to enable kernel caching. It is turned ON by default.\n* cuda::buildKernel() now dumps cubin to disk for reuse\n* Adds cuda::loadKernel() for loading cached cubin files\n* cuda::loadKernel() returns empty kernel on failure\n* Uses XDG_CACHE_HOME as cache directory for Linux\n* Adds common::deterministicHash() - This uses the FNV-1a hashing algorithm for fast and reproducible hashing of string or binary data. This is meant to replace the use of std::hash in some place, since std::hash does not guarantee its return value will be the same in subsequence executions of the program.\n* Write cached kernel to temporary file before moving into final file. This prevents data races where two threads or two processes might write to the same file.\n* Uses deterministicHash() for hashing kernel names and kernel binary data.\n* Adds kernel binary data file integrity check upon loading from disk", "idx": 122}
75
+ {"target": 0, "func": "[PATCH] add generic Mac toolchain file\n\nI prefer that CMake find the MPI compiler wrappers instead of the base\ncompilers, if for no other reason than this often means mixing clang and\ngfortran, since clang will likely be found first in PATH.\n\nthis toolchain assumes MPI wrappers are in the PATH but one can set the\nbase directory (with a trailing \"/\") if desired.\n\nthe BLAS/LAPACK used is \"-framework Accelerate\"", "idx": 627}
76
+ {"target": 0, "func": "[PATCH] Added performance data", "idx": 867}
77
+ {"target": 0, "func": "[PATCH] Workaround for Visual Studio bug that causes very slow\n compilation", "idx": 1394}
78
+ {"target": 0, "func": "[PATCH] Adding BoundaryInfo::add_elements()\n\nThis allows us to easily add boundary elements to an existing mesh\nwithout creating a new mesh for them.\n\nFactoring out _find_id_maps simplifies the code a bit. If\nBoundaryInfo::sync() is too slow we might also factor out\n_add_elements to avoid a redundant _find_id_maps call, but I doubt\nthe redundancy will ever show up in profiling.", "idx": 699}
79
+ {"target": 0, "func": "[PATCH] Make mallocs uniformly an error in PetscMatrix\n\nThis adds the setting to the other two init methods that\nif a new malloc occurs during MatSetValues then it is an\nerror. I think this is an improvement because it establishes\nuniformity across our init methods and it can prevent users\nfrom running extremely slow simulations.\n\nI fully expect this to cause failures in MOOSE...", "idx": 1307}
80
+ {"target": 0, "func": "[PATCH] Add the possibility to remove the far points\n\nThe far points are added by the parallel version to reduce the contention\non the infinite vertex", "idx": 30}
81
+ {"target": 0, "func": "[PATCH] Use more accurate error message.\n\nAs discussed over in idaholab/moose#9097, Nanoflann does not actually\nimplement an *approximate* nearest node search algorithm. In contrast\nto \"exact\" nearest node searches, approximate nearest node searches\nare not guaranteed to return the nearest node, in return for potential\nperformance improvements.\n\nSee, for example, their README file [0], which states that Nanoflann\ncan \"[Build] KD-trees with a single index (no randomized KD-trees, no\napproximate searches).\"\n\n[0]: https://github.com/jlblancoc/nanoflann/blob/v1.2.3/README.md", "idx": 531}
82
+ {"target": 0, "func": "[PATCH] Update performance section", "idx": 315}
83
+ {"target": 0, "func": "[PATCH] Made FAST CPU results match CUDA results", "idx": 32}
84
+ {"target": 0, "func": "[PATCH] Remove OpenMP compile flag in CUDA backend\n\nThis flag isn't needed based on recent tests. If it is causing any\nperformance regression, it will be reverted and the following\nflag to disable two-phase lookup for cuda backend on windows will\nbe added back.\n\n/permissive flag does not work with two-phase-lookup enabled\nfor projects with openmp support enabled.", "idx": 423}
85
+ {"target": 1, "func": "[PATCH] WIP Fixed performance of multi-range subarray result\n estimation", "idx": 1248}
86
+ {"target": 1, "func": "[PATCH] Improve performance of random number generator calls", "idx": 1514}
87
+ {"target": 1, "func": "[PATCH] QN update is now in Base class - subclasses onla compute the\n update. Changed computation of QN-Update for MVQN: use QR decomposition of\n matrix V, instead of LU decomposition of VTV. More efficient and more robust\n implementation", "idx": 954}
88
+ {"target": 1, "func": "[PATCH] new implemenation using boost CSR graph, it can be 1.5x\n faster from prev implementation but there is a performance problem that I\n couldn't solve using public functionality of graph (however there might be a\n solution) will look it back.", "idx": 1266}
89
+ {"target": 1, "func": "[PATCH] Improve meanshift filter performance on CPU\n\nImproved the meanshift filter performance on the CPU backend by replacing\nvectors with std::arrays and moving them out of the for loops. Also\nreduced a few conversion operations.", "idx": 288}
90
+ {"target": 1, "func": "[PATCH] 128-bit AVX2 SIMD for AMD Ryzen\n\nWhile Ryzen supports 256-bit AVX2, the internal units are organized\nto execute either a single 256-bit instruction or two 128-bit SIMD\ninstruction per cycle. Since most of our kernels are slightly\nless efficient for wider SIMD, this improves performance by roughly\n10%.\n\nChange-Id: Ie601b1dbe13d70334cdf9284e236ad9132951ec9", "idx": 1453}
91
+ {"target": 1, "func": "[PATCH] gauge_field::FloatNOrder can now use __ldg loads. Generally\n improves performance across the board, but some regressions at 12/8\n reconstruct so left switched off for now (USE_LDG macro in\n include/gauge_field_order.h).", "idx": 962}
92
+ {"target": 1, "func": "[PATCH] Better performance (~10-15% better)\n\nBy removing several tests (and use CGAL::max instead), the generated\nassembly is more efficient.", "idx": 772}
93
+ {"target": 1, "func": "[PATCH] Replacing dynamic_cast with libmesh_cast where appropriate -\n depending on the error checking replaced this will either lead to slightly\n more efficient NDEBUG runs or slightly more run-time checking in debug mode\n runs.", "idx": 1291}
94
+ {"target": 1, "func": "[PATCH] Improve performance of PFMG", "idx": 1357}
95
+ {"target": 1, "func": "[PATCH] Adds sfence and lfence options if user builds with assembly\n to enable more efficient use of out of order engines", "idx": 1384}
96
+ {"target": 1, "func": "[PATCH] Fix IR issue causing very slow to no convergence when using\n an inaccurate inner solver", "idx": 1560}
97
+ {"target": 1, "func": "[PATCH] Add new constructor to Iso_rectangle_2(Point_2, Point_2,\n int). The additional dummy \"int\" specifies that the 2 points are the\n lower-left and upper-right corner. This is more efficient when one knows\n they are already in this configuration.\n\nSame thing for Iso_cuboid_3, and the functors.\n\nUse them in Cartesian_converter and Homogeneous_converter.", "idx": 1393}
98
+ {"target": 1, "func": "[PATCH] improved performance of free energy runs in water\n significantly by allowing water-water loops and added a slight speed up for\n neighborsearching for free-energy runs", "idx": 859}
99
+ {"target": 1, "func": "[PATCH] Adjusted scheduling, removed slow atomic section", "idx": 820}
100
+ {"target": 1, "func": "[PATCH] Enforced rotation: Minor performance optimization in\n do_flex[1,2]_lowlevel", "idx": 1141}
101
+ {"target": 1, "func": "[PATCH] allows requesting of multiple moments. restructures blocks\n for performance improvements", "idx": 1099}
102
+ {"target": 1, "func": "[PATCH] some more performance improvements", "idx": 1537}
103
+ {"target": 0, "func": "[PATCH] Refactor threading model\n\nRemoves:\n- global_tp_ (the replacement TBB thread pool)\n- StorageManager::async_thread_pool_\n- StorageManager::reader_thread_pool_\n- StorageManager::writer_thread_pool_\n- VFS::thread_pool_\n\nAdds:\n- StorageManager::compute_tp_\n- StorageManager::io_tp_\n\nUsage changes:\n1. Our three parallel functions (`parallel_sort`, `parallel_for[_2d]`) now use\n the `StorageManager::compute_tp_`.\n2. Both the `Reader::read_tiles()` and `Writer::write_tiles()` now execute on\n `StorageManager::io_tp_`.\n3. The VFS is now initalized with a thread pool, where the storage manager\n initializes it with the `StorageManager::io_tp_`. This means that both the\n VFS and Reader/Writer io paths execute on the same thread pool. There was\n previously a deadlock scenario if both used the same thread pool, but that\n is no longer an issue now that the threadpools are recursive.\n4. The async queries are executed on `StorageManager::compute_tp_`.\n\nConfig changes:\n- Adds configuration parameters for the compute and IO thread pool \"concurrency\n levels\". A level of \"1\" is serial execution while all other levels have a\n maximum concurrency of N but allocate N-1 OS threads.\n- Deprecate the async/reader/writer/vfs thread num configurations. If any of\n these are set and larger than the new \"sm.compute_concurrency_level\" and\n \"sm.io_concurrency_level\", the old values will be used instead. The motiviation\n is so that existing users will not see a drop in performance if they are\n currently using larger-than-default values.", "idx": 192}
104
+ {"target": 1, "func": "[PATCH] Modified nb_accv to improve performance of accumulates to\n processors on the same SMP node.", "idx": 1504}
105
+ {"target": 1, "func": "[PATCH] fast dynamic_cast in Lazy_kernel::Construct_point_3", "idx": 773}
106
+ {"target": 1, "func": "[PATCH] Disable CUDA textures on NVIDIA Volta\n\nThis has significant performance benefit for the nbnxm kernels with\ntabulated Ewald correction and it has negligible impact on the PME kernels.\n\nPartially addresses #3845", "idx": 542}
107
+ {"target": 1, "func": "[PATCH] Increase memory allocation for NPT simulation in Ewald.cpp.\n Assign linear molecule with 3 and more atoms to DCGraph at higher level to\n improve code performance", "idx": 986}
108
+ {"target": 1, "func": "[PATCH] #1295 Refactored MX::setSub(single argument) This should be\n significantly more efficient, though I didn't do any performance testing", "idx": 959}
109
+ {"target": 1, "func": "[PATCH] Performance improvments. BuildWeightMatrix() is probably\n unnecessary entirely.", "idx": 171}
110
+ {"target": 1, "func": "[PATCH] Move orb LUT in CUDA backend to texture memory\n\ncuda::kernel::extract_orb is the CUDA kernel that uses the orb\nlookup table. Shared below is performance of the kernel using constant\nmemory vs texture memory. There is neglible to no difference between two\nversions. Hence, shifted to texture memory LUT to reduce global constant\nmemory usage.\n\nPerformance using constant memory LUT\n-------------------------------------\n\nTime(%) Time Calls Avg Min Max Name\n\n3.02% 292.26us 24 12.177us 11.360us 14.528us void cuda::kernel::extract_orb<float>\n2.16% 209.00us 16 13.062us 11.616us 16.033us void cuda::kernel::extract_orb<double>\n\nPerformance using texture LUT\n-----------------------------\n\nTime(%) Time Calls Avg Min Max Name\n\n2.84% 270.63us 24 11.276us 9.6970us 15.040us void cuda::kernel::extract_orb<float>\n2.20% 209.28us 16 13.080us 10.688us 16.960us void cuda::kernel::extract_orb<double>", "idx": 1566}
111
+ {"target": 1, "func": "[PATCH] thermophysicalModels: Added caching of Cp and Cv for\n efficiency\n\nIn multi-specie systems the calculation of Cp and Cv is relatively expensive due\nto the cost of mixing the coefficients of the specie thermo model, e.g. JANAF,\nand it is significantly more efficient if these are calculated at the same time\nas the rest of the thermo-physical properties following the energy solution.\n\nAlso the need for CpByCpv is also avoided by the specie thermo providing the\nenergy type in the form of a boolean 'enthalpy()' function which returns true if\nthe energy type is an enthalpy form.", "idx": 42}
112
+ {"target": 1, "func": "[PATCH] performance improvements on grid_ssw", "idx": 1060}
113
+ {"target": 1, "func": "[PATCH] fast pool allocator", "idx": 560}
114
+ {"target": 1, "func": "[PATCH] Improved performance mostly by using hints to insert to the\n status line.", "idx": 744}
115
+ {"target": 1, "func": "[PATCH] Made gmx_numzero static for performance reasons.", "idx": 565}
116
+ {"target": 1, "func": "[PATCH] PERFFIX: improved 2d convolve perf in cuda by 33%\n\n* templating cuda kernel for filter lengths increased\n performance by 30% which is 93% of closed-source ArrayFire\n implementation of 2d convolution\n* templating separable cuda kernel improved performance by 20%\n* separated separable convolution kernel and wrapper into their\n own file to speed up compilation time", "idx": 393}
117
+ {"target": 1, "func": "[PATCH] modified to have 3c and 2c two electron eri sums also prints\n outer loop index to show user about where the computation is. Also\n statically (modulo) parallelized for better performance\n\nRick Kendall", "idx": 790}
118
+ {"target": 1, "func": "[PATCH] lagrangian: Rationalized the handling of multi-component\n liquids and solids Ensures consistency between the mixture thermodynamics and\n composition specifications for the parcels. Simpler more efficient\n implementation. Resolves bug-report\n http://www.openfoam.org/mantisbt/view.php?id=1395 as well as other\n consistency issues not yet reported.", "idx": 362}
119
+ {"target": 1, "func": "[PATCH] Optimize the performance of rot by using universal intrinsics", "idx": 109}
120
+ {"target": 1, "func": "[PATCH] reverted commit 76d5bddd5c3dfdef76beaab8222231624eb75e89.\n Split ga_acc in moints2x_trf2K in smaller ga_acc on MPI-PR since gives large\n performance improvement on NERSC Cori", "idx": 1516}
121
+ {"target": 1, "func": "[PATCH] This commit introduces VariableGroups as an optimization when\n there are repeated variables of the same type inside a system. Presently,\n these are only activated through the system.add_variables() API, but in the\n future there may be provisions for automatically identifying groups.\n\nThe memory usage for DofObjects now scales like\nN_sys+N_var_group_per_sys instead of N_sys+N_vars. The DofMap\ndistribution code has been refactored to use VariableGroups.\n\nAll existing loops over Variables within a system will work unchanged,\nbut can be replaced with more efficient loops over VariableGroups.", "idx": 1452}
122
+ {"target": 1, "func": "[PATCH] Simplifying ARMv8 build parameters\n\nARMv8 builds were a bit mixed up, with ThunderX2 code in ARMv8 mode\n(which is not right because TX2 is ARMv8.1) as well as requiring a few\nredundancies in the defines, making it harder to maintain and understand\nwhat core has what. A few other minor issues were also fixed.\n\nTests were made on the following cores: A53, A57, A72, Falkor, ThunderX,\nThunderX2, and XGene.\n\nTests were: OpenBLAS/test, OpenBLAS/benchmark, BLAS-Tester.\n\nA summary:\n * Removed TX2 code from ARMv8 build, to make sure it is compatible with\n all ARMv8 cores, not just v8.1. Also, the TX2 code has actually\n harmed performance on big cores.\n * Commoned up ARMv8 architectures' defines in params.h, to make sure\n that all will benefit from ARMv8 settings, in addition to their own.\n * Adding a few more cores, using ARMv8's include strategy, to benefit\n from compiler optimisations using mtune. Also updated cache\n information from the manuals, making sure we set good conservative\n values by default. Removed Vulcan, as it's an alias to TX2.\n * Auto-detecting most of those cores, but also updating the forced\n compilation in getarch.c, to make sure the parameters are the same\n whether compiled natively or forced arch.\n\nBenefits:\n * ARMv8 build is now guaranteed to work on all ARMv8 cores\n * Improved performance for ARMv8 builds on some cores (A72, Falkor,\n ThunderX1 and 2: up to 11%) over current develop\n * Improved performance for *all* cores comparing to develop branch\n before TX2's patch (9% ~ 36%)\n * ThunderX1 builds are 14% faster than ARMv8 on TX1, 9% faster than\n current develop's branch and 8% faster than deveop before tx2 patches\n\nIssues:\n * Regression from current develop branch for A53 (-12%) and A57 (-3%)\n with ARMv8 builds, but still faster than before TX2's commit (+15%\n and +24% respectively). This can be improved with a simplification of\n TX2's code, to be done in future patches. At least the code is\n guaranteed to be ARMv8.0 now.\n\nComments:\n * CortexA57 builds are unchanged on A57 hardware from develop's branch,\n which makes sense, as it's untouched.\n * CortexA72 builds improve over A57 on A72 hardware, even if they're\n using the same includes due to new compiler tunning in the makefile.", "idx": 932}
123
+ {"target": 1, "func": "[PATCH] Fixed performance regression on Kepler", "idx": 1422}
124
+ {"target": 1, "func": "[PATCH] Optimize load_tile_offsets for only relevant frags\n\nThis optimization is to adjust the `Reader::load_tile_offsets` class to\nonly loop over the relevant fragments as computed by the subarray class\nbased on intersection. This is a performance optimization for arrays\nwhich have a large number of fragments and which we are incorrectly\nfetching a large amount of unneeded data.", "idx": 775}
125
+ {"target": 1, "func": "[PATCH] Removed debug code with would slow down mdrun when writing\n trajectories", "idx": 1082}
126
+ {"target": 1, "func": "[PATCH] Fix performances of Triangulation_2 with EPEC\n\nThere was a performance degradation between CGAL-3.7 and CGAL-3.8, when\nTriangulation_2 is used with EPEC. This patch fixes the issue. Using a\nfunctor that is specialized for EPEC, in inexact_orientation, to_double is\nnot called on p.x() but on p.approx().x().", "idx": 268}
127
+ {"target": 1, "func": "[PATCH] USER-DPD: performance optimizations to ssa_update() in\n fix_shardlow Overall improvements range from 2% to 18% on our benchmarks 1)\n Newton has to be turned on for SSA, so remove those conditionals 2) Rework\n the math in ssa_update() to eliminate many ops and temporaries 3) Split\n ssa_update() into two versions, based on DPD vs. DPDE 4) Reorder code in\n ssa_update_*() to reduce register pressure", "idx": 152}
128
+ {"target": 1, "func": "[PATCH] Sparse refactored readers: disable filtered buffer tile\n cache. (#2702)\n\nFrom tests, it's been found that writing the cache for the filter\npipeline takes a significant amount of time for the tile unfiltering\noperation. For example, 2.25 seconds with and 1.88 seconds without in\nsome cases. The cache improved performance before multi-range subarrays\nwere implemented, so dropping it is fine at least for the refactored\nreaders.", "idx": 499}
129
+ {"target": 1, "func": "[PATCH] more efficient jacobian of mapping modes", "idx": 934}
130
+ {"target": 1, "func": "[PATCH] rocSPARSE does not require sorted columns for csrgemm\n\nThis is specific to the rocSPARSE CSR implementation of SpGEMM.\nBut this is a substantial performance savings.", "idx": 408}
131
+ {"target": 1, "func": "[PATCH] improve the ell performance", "idx": 1197}
132
+ {"target": 1, "func": "[PATCH] Step towards better performance regarding second convergence\n test in bicgstab", "idx": 917}
133
+ {"target": 1, "func": "[PATCH] Added code to improve performance for the structure factor\n calculations.....Works in serial...Still need to check parallel\n performance......EJB", "idx": 512}
134
+ {"target": 1, "func": "[PATCH] EA:increased MAXMEM size to 512 since it gives large\n performance improvement on big matrix multiplies", "idx": 1215}
135
+ {"target": 0, "func": "[PATCH] Robust retries for S3 request-limit retries (#1651)\n\nThis patch provides a retry handler that is identical to the existing, default\nhandler with an exception for CoreErrors::SLOW_DOWN. For this error, we will\nunconditionally retry every 1.25-1.75 seconds.\n\nThe motivation for this patch is to allow the TileDB client to remain functional\neven when performance may be bottlenecked on this error. The server returns\nthis error when we exceed a fixed number of requests per second. The client\nwill eventually make progress.", "idx": 1088}
136
+ {"target": 1, "func": "[PATCH] Improved GB performance in mixed precision", "idx": 62}
137
+ {"target": 1, "func": "[PATCH] added option to keep transpose to hybrid for better gpu\n performance", "idx": 612}
138
+ {"target": 1, "func": "[PATCH] changed DofMap::build_constraint_matrix to be more efficient\n in the (usual) case that the element has no constraints. Also fixed for the\n case that an element has constraints in terms of its *own* dofs, (not others)\n\ngit-svn-id: file:///Users/petejw/Documents/libmesh_svn_bak@870 434f946d-2f3d-0410-ba4c-cb9f52fb0dbf", "idx": 1429}
139
+ {"target": 1, "func": "[PATCH] Use more efficient iterators in MeshCommunication", "idx": 442}
140
+ {"target": 1, "func": "[PATCH] Check in changes to improve performance for nb_puts and\n nb_gets on GPU-hosted global arrays.", "idx": 783}
141
+ {"target": 1, "func": "[PATCH] Fixed race condition in GPU restrictor. Improved performance\n of both prolongator and restrictor using compile-time evaluated fine-spin ->\n coarse-spin mapper instead of an array.", "idx": 1410}
142
+ {"target": 0, "func": "[PATCH] Improve mdrun performance user guide\n\n- Extended GPU update related section;\n- Tweaked the bonded GPU offload related section to reflect the current\n state of the code.\n\nChange-Id: I89ed39750d449df8b273f36ee6530df29ff31b7e", "idx": 27}
143
+ {"target": 1, "func": "[PATCH] Modified locate region to use more efficient algorithms for\n most block-cyclic distributions.", "idx": 902}
144
+ {"target": 1, "func": "[PATCH] MultiReduce kernels now instantiate a series of power of two\n block sizes: this improves performance significantly", "idx": 225}
145
+ {"target": 1, "func": "[PATCH] Unroll dynamic indexing in SYCL gather kernel\n\nThread ID-based dynamic indexing of constant memory data in the SYCL\ngather kernel caused a large amount of register spills and poor\nperformance of the gather kernel on AMD. Avoiding dynamic indexing\neliminates spills and improves performance of this kernel >10x.\n\nRefs #3927", "idx": 244}
146
+ {"target": 1, "func": "[PATCH] More efficient generation of random numbers", "idx": 1206}
147
+ {"target": 1, "func": "[PATCH] replaced std::endl with \\n in all file IO and stringstreams. \n std::endl forces a flush, which kills performance on some machines", "idx": 711}
148
+ {"target": 1, "func": "[PATCH] Performance improvements for find_*_neighbors, and a new\n find_point_neighbors version for finding neighbors at just one point", "idx": 378}
149
+ {"target": 1, "func": "[PATCH] Improved performance of Python State objects", "idx": 921}
150
+ {"target": 1, "func": "[PATCH] On behalf of Ryan Olson: Checking in the changes for server\n side registration to improve performance", "idx": 395}
151
+ {"target": 1, "func": "[PATCH] pathf90 v2.1 better performance with ro=1 vs. ro=2", "idx": 1538}
152
+ {"target": 1, "func": "[PATCH] reworked the project_vector to be more efficient for Lagrange\n elements. Changed the corresponding calls in reinit(). amr.cc now tests the\n projection stuff.\n\ngit-svn-id: file:///Users/petejw/Documents/libmesh_svn_bak@279 434f946d-2f3d-0410-ba4c-cb9f52fb0dbf", "idx": 1346}
153
+ {"target": 1, "func": "[PATCH] use local variables for more efficient force accumulation", "idx": 1112}
154
+ {"target": 1, "func": "[PATCH] Optimize the performance of daxpy by using universal\n intrinsics", "idx": 493}
155
+ {"target": 1, "func": "[PATCH] Improved consistency of the ApplyPackedReflectors routines\n (as well as performance in several cases) and several more implementations,\n fixed mistakes in the build section of the documentation, added a short\n description of the new SVD function, and fixed mistakes in HouseholderSolve\n after adding a simple example driver.", "idx": 1230}
156
+ {"target": 1, "func": "[PATCH] issue #2389: re-adding MapSum Function fore efficient\n reduce_in/out", "idx": 891}
classify/train.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
classify/valid.jsonl ADDED
@@ -0,0 +1,158 @@
1
+ {"target": 1, "func": "[PATCH] s390x: Use new sgemm kernel also for strmm on Z14 and newer\n\nEmploy the newly added GEMM kernel also for STRMM on Z14. The\nimplementation in C with vector intrinsics exploits FP32 SIMD operations\nand thereby gains performance over the existing assembly code. Extend\nthe implementation for handling triangular matrix multiplication,\naccordingly. As added benefit, the more flexible C code enables us to\nadjust register blocking in the subsequent commit.\n\nTested via make -C test / ctest / utest and by a couple of additional\nunit tests that exercise blocking.\n\nSigned-off-by: Marius Hillenbrand <mhillen@linux.ibm.com>", "idx": 495}
2
+ {"target": 0, "func": "[PATCH] added benchmark comparing the performance of the two traits\n classes (CK vs CORE::Expr)", "idx": 1040}
3
+ {"target": 0, "func": "[PATCH] Auto cache compiled CUDA kernels on disk to speed up\n compilation (#2848)\n\n* Adds CMake variable AF_CACHE_KERNELS_TO_DISK to enable kernel caching. It is turned ON by default.\n* cuda::buildKernel() now dumps cubin to disk for reuse\n* Adds cuda::loadKernel() for loading cached cubin files\n* cuda::loadKernel() returns empty kernel on failure\n* Uses XDG_CACHE_HOME as cache directory for Linux\n* Adds common::deterministicHash() - This uses the FNV-1a hashing algorithm for fast and reproducible hashing of string or binary data. This is meant to replace the use of std::hash in some place, since std::hash does not guarantee its return value will be the same in subsequence executions of the program.\n* Write cached kernel to temporary file before moving into final file. This prevents data races where two threads or two processes might write to the same file.\n* Uses deterministicHash() for hashing kernel names and kernel binary data.\n* Adds kernel binary data file integrity check upon loading from disk", "idx": 719}
4
+ {"target": 0, "func": "[PATCH] Workaround for libHilbert bug until we manage to get it\n fixed. Part of this workaround may be permanent - the long term fix may\n require efficient user code to call some reinitialization, renumbering\n function manually after reading in a mesh and solution.", "idx": 561}
5
+ {"target": 1, "func": "[PATCH] Wrote the compute_children_node_keys() function in elem.h\n which allows one to generate appropriate node keys while reading in a mesh\n with multiple refinement levels. This allows us to avoid a linear search in\n the MeshRefinement::add_point routine since all the nodes can now be found in\n the nodes hash table. The resulting performance improvement was significant.", "idx": 287}
6
+ {"target": 0, "func": "[PATCH] Mesh_3: Address TBB performance warning on hashing", "idx": 12}
7
+ {"target": 0, "func": "[PATCH] Minor fixes to mdrun performance documentation\n\nIn \"Examples for mdrun on one node,\" third example description, the respective number\nof thread-mpi ranks and OpenMP threads per rank were reversed.\n\nIn \"Examples for mdrun on one node,\" 6th example. For 12 logical cores, the pinoffsets\nshould be 0 and 6, respectively (I think)\n\nA few command line examples of running mdrun with more than 1 node used gmx rather\nthan gmx_mpi\n\nSeveral spelling/grammar/tense error/linking issues addressed.\n\nChange-Id: I014bc52d55cda1cbd05843cb8e960c2a2d7cbb47", "idx": 925}
8
+ {"target": 0, "func": "[PATCH] Added workaround for performance issue in CasADi 2.4", "idx": 1161}
9
+ {"target": 0, "func": "[PATCH] performance analysis, use of multipoles for long range bits,\n lots of additional screening", "idx": 1353}
10
+ {"target": 0, "func": "[PATCH] In the performance miniapps, print the SIMD width in terms of\n \"doubles\".", "idx": 1194}
11
+ {"target": 0, "func": "[PATCH] Refactor UpdateTree() to sometimes Hamerly prune. We aren't\n properly retaining pruned nodes between iterations, but this is definitely a\n start and it's basically as fast as any of these attempted algorithms I've\n written.", "idx": 233}
12
+ {"target": 0, "func": "[PATCH] Only use AVX512 in own-FFTW if GROMACS also uses it\n\nBuilding the own FFTW with AVX512 enabled for all AVX-flavors means that\nan AVX2 build can end up loosing a significant amount of performance due\nto clock throttle if the FFTW auto-tuner inadvertently picks and AVX512\nkernel. This is not unlikely as measurements at startup are very noisy\nand often lead to inconsistent kernel choice (observed in practice).\n\nChange-Id: I857326a13a7c4dd1a6f5ab44360211301b05d3ac", "idx": 666}
13
+ {"target": 0, "func": "[PATCH] Performance updates...EJB", "idx": 384}
14
+ {"target": 0, "func": "[PATCH] Fix clang-3.7 build warnings in core unit and performance\n tests\n\nFix mismatched use of array new and scalar delete\nT * data = new T[1];\n...\ndelete [] data;", "idx": 1114}
15
+ {"target": 0, "func": "[PATCH] Code for ordinary, slow, ewald electrostatics", "idx": 1546}
16
+ {"target": 0, "func": "[PATCH] Accelerate KSInitialization tests by using fewer CV folds and\n fewer training epochs as well as relaxed tolerances.", "idx": 1364}
17
+ {"target": 1, "func": "[PATCH] changed the ndelta to setting the number of cells per cut-off\n radius to per cut-off diameter and changed the default value to 3 in\n do_inputrec, this gives at most 64 cells per icg iso 125, this gives a few\n percent performance increase in ns and domain decomposition (the cg sorting)", "idx": 56}
18
+ {"target": 0, "func": "[PATCH] CDGW now working; slow", "idx": 372}
19
+ {"target": 0, "func": "[PATCH] Increase release build timeout, macos is being a bit slow on\n Azure (#2495)", "idx": 1165}
+ {"target": 0, "func": "[PATCH] Initial check in of performance test for read-only property", "idx": 858}
+ {"target": 1, "func": "[PATCH] Possible performance enhancement in Mesh::delete_elem. We use\n the passed Elems id() as a guess for the location of the Elem in the\n _elements vector. If the guess does not succeed, then we revert to the linear\n search.", "idx": 1029}
+ {"target": 0, "func": "[PATCH] rPolynomial: New equation of state for liquids and solids\n\nDescription\n Reciprocal polynomial equation of state for liquids and solids\n\n \\f[\n 1/\\rho = C_0 + C_1 T + C_2 T^2 - C_3 p - C_4 p T\n \\f]\n\n This polynomial for the reciprocal of the density provides a much better fit\n than the equivalent polynomial for the density and has the advantage that it\n support coefficient mixing to support liquid and solid mixtures in an\n efficient manner.\n\nUsage\n \\table\n Property | Description\n C | Density polynomial coefficients\n \\endtable\n\n Example of the specification of the equation of state for pure water:\n \\verbatim\n equationOfState\n {\n C (0.001278 -2.1055e-06 3.9689e-09 4.3772e-13 -2.0225e-16);\n }\n \\endverbatim\n Note: This fit is based on the small amount of data which is freely\n available for the range 20-65degC and 1-100bar.\n\nThis equation of state is a much better fit for water and other liquids than\nperfectFluid and in general polynomials for the reciprocal of the density\nconverge much faster than polynomials of the density. Currently rPolynomial is\nquadratic in the temperature and linear in the pressure which is sufficient for\nmodest ranges of pressure typically encountered in CFD but could be extended to\nhigher order in pressure and/temperature if necessary. The other huge advantage\nin formulating the equation of state in terms of the reciprocal of the density\nis that coefficient mixing is simple.\n\nGiven these advantages over the perfectFluid equation of state the libraries and\ntutorial cases have all been updated to us rPolynomial rather than perfectFluid\nfor liquids and water in particular.", "idx": 46}
+ {"target": 0, "func": "[PATCH] Adding performance patch for trmm, just like #2836", "idx": 723}
+ {"target": 0, "func": "[PATCH] fewer qas for slow mpi-pt", "idx": 910}
+ {"target": 0, "func": "[PATCH] BJP: Initial checkin of test program for performance\n evaluation of DRA routines in 3 dimensions.", "idx": 643}
+ {"target": 0, "func": "[PATCH] Add TaskDAG performance test to CMake", "idx": 137}
+ {"target": 0, "func": "[PATCH] Random engines & distributions as proper C++11 classes\n\nThis change implements the ThreeFry2x64 random engine with flexible\nnumber of encryption rounds and internal counter bits. The class is\ncompatible with the C++11 random number generators, and the GROMACS\ntabulated normal distribution has likewise been turned into a\nrandom distribution compatible with C++11, meaning they can be used in\nalmost any combination with the standard library distributions.\n- The ThreeFry2x64 implementation uses John Salmon's idea of a template-\n selected internal counter so a number of bits are reserved to generate\n an arbitrary random stream. This makes it possible to use ThreeFry as\n a normal random engine, and even in counter mode it is possible to\n draw an arbitrary amount of random numbers before restarting counters.\n- Both accurate (20-round) and fast (13-round) versions are available.\n- There is a gmx::DefaultRandomEngine when we don't care about details.\n- gmx::GammaDistribution has been added to work around bugs in\n libstdc++-4.4.7 headers, and to avoid getting different results\n for libstdc++ vs. libc++.\n- Custom Uniform, normal, and exponential distributions have been added\n to make all results reproducible across platforms since stdlibc++ and\n libc++ do not use the same generating algorithms.\n- Code using random numbers has been updated, but no changes have been\n made to turn random seeds into 64bits yet.\n- The selection nbsearch unit test was a bit fragile and very sensitive\n to the coordinate specific values; this has been fixed so it should\n be resilient no matter what RNG is used in the future.\n\nChange-Id: I47a04d03e2f264e1a6ef0aa0a2174cb464ed9af7", "idx": 1155}
+ {"target": 0, "func": "[PATCH] performance miniapps: don't enable -march=native on ARM\n\nSee e.g. https://stackoverflow.com/questions/65966969/why-does-march-native-not-work-on-apple-m1", "idx": 1553}
+ {"target": 0, "func": "[PATCH] Made a pass on Performance tutorials in docs.", "idx": 622}
+ {"target": 0, "func": "[PATCH] Add Z-batch to fast kernels", "idx": 1052}
+ {"target": 0, "func": "[PATCH] a slow version of triangle split", "idx": 204}
+ {"target": 0, "func": "[PATCH] Enable fp-exceptions\n\nThis can help with finding errors quicker because mdrun crashes as soon\nas a floating point value overflows or is invalid. fp-exceptions are\nonly enabled for builds with asserts (without NDEBUG), mainly because\nit isn't always possible to avoid invalid fp operations for SIMD math\nwithout a performance penalty.\n\nAlso, fix a few places where we had 1/0 or other invalid fp operations.\n\nFixes #1582\n\nChange-Id: Ib1b3afc525706f4b171564fcaf08ebf3b2be3122", "idx": 763}
+ {"target": 0, "func": "[PATCH] SYCL NBNXM offload support\n\nAssociated changes:\n\n- Added function stubs to PME: necessary for compilation.\n- Stricter SYCL hardware compatibility checks: limits on subgroup size\n and the availability of local memory.\n- The kernel implementation and overall logic closely follow the OpenCL\n implementation. Divergences are documented locally.\n\nLimitations:\n\n- No fine-grained timings yet.\n- Code-duplication with CUDA and OpenCL: see #2608.\n- Minor differences in local/nonlocal synchronization: see #3895,\n related to #2608.\n- Only the OpenCL backend was extensively tested. LevelZero works fine\n without MPI but stalls due to a known bug. The fix for DPCPP runtime\n is available, but not yet part of any OneAPI release:\n https://github.com/intel/llvm/pull/3045.\n- The complex/position-restraints regression test fails: see #3846.\n- No performance tuning: see #3847.\n\nPerformance on rnase-cubic system is similar to OpenCL implementation.", "idx": 1277}
+ {"target": 0, "func": "[PATCH] Added parentheses to performance logging messages\n\ngit-svn-id: file:///Users/petejw/Documents/libmesh_svn_bak@1518 434f946d-2f3d-0410-ba4c-cb9f52fb0dbf", "idx": 496}
+ {"target": 0, "func": "[PATCH] Modified operations so that MPI_Type_free is called after\n wait. Also tried adding and MPI_Win_flush_local call to force both local and\n remote completion before calling MPI_Type_free, but does not seem to get rid\n of failures in some of the performance tests.", "idx": 1130}
+ {"target": 0, "func": "[PATCH] Updated article reference. Added configuration flags to\n support altivec on powerpc. Updated configuration files from gnu.org Removed\n the fast x86 truncation when double precision was used - it would just crash\n since the registers are 32bit only. Added powerpc altivec innerloop support\n to fnbf.c Removed x86 assembly truncation when double precision is used - the\n registers can only handle 32 bits. Dont use general solvent loops when we\n have altivec support in the normal ones. Added workaround for clash with\n altivec keyword. Added workaround for clash with altivec keyword.", "idx": 104}
+ {"target": 0, "func": "[PATCH] MDRange: Minor perf test fixes for KNL\n\nTests are still commented out for now, but removed running a slow test,\nreduced the number of tests, and made minor fixes when checking results\nduring the first iteration (vectorization may have occurred and made\nthe check no longer bitwise correct; epsilon comparison should be added\nas well as option to not check correctness which will speed up the tests)", "idx": 1556}
+ {"target": 0, "func": "[PATCH] Knowing when the tree fails and we're stuck with a linear\n search is useful for performance testing too", "idx": 886}
+ {"target": 0, "func": "[PATCH] Continued stats refactor + subarray stats (#2200)\n\nTYPE: IMPROVEMENT\nDESC: Added additional stats for subarrays and subarray partitioners", "idx": 255}
+ {"target": 0, "func": "[PATCH] Fixup re-enable core performance tests\n\nThese had been inadvertently disbaled in #3839\n\nCo-Authored-By: Nick Curtis <nicholas.curtis@amd.com>\nCo-Authored-By: Bruno Turcksin <bruno.turcksin@gmail.com>", "idx": 281}
+ {"target": 0, "func": "[PATCH] Fixing a bug in measuring performance (didn't reset the\n timer)", "idx": 731}
+ {"target": 0, "func": "[PATCH] Mark closed flag in EigenSparseVector regardless of METHOD\n\nI don't think it's a performance hit to set these flags\nregardless of METHOD, and it's a nice state flag that the user\ncan check", "idx": 810}
+ {"target": 1, "func": "[PATCH] Fix performance for range copy", "idx": 1320}
+ {"target": 0, "func": "[PATCH] Update performance test case (to use polar_decomposition)\n Make polar default closest rotation computer", "idx": 426}
+ {"target": 0, "func": "[PATCH] Added upper triangular fast tridiagonalization routines.", "idx": 704}
+ {"target": 0, "func": "[PATCH] fast nary union in OFF2nef3 also constructor finds more\n problems and handles them", "idx": 541}
+ {"target": 0, "func": "[PATCH] wallDist/patchDistMethods/Poisson: New method for fast\n calculation of an approximate wall-distance field by solving Poisson's\n equation", "idx": 660}
+ {"target": 0, "func": "[PATCH] Added support for gettimeofday() when available to get\n microsecond resolution for wallclock time. This enables accurate performance\n benchmarks from short simulations even in parallel. When gettimeofday() is\n not available, we use time(). Time is still stored as seconds, but now as\n double instead of time_t.", "idx": 852}
+ {"target": 0, "func": "[PATCH] some more prints to debug comex_malloc performance", "idx": 700}
+ {"target": 0, "func": "[PATCH] Avoiding redundant metadata calculations in order to\n accelerate AbstractDistMatrix::QueueUpdate and\n AbstractDistMatrix::ProcessQueues (as well as generalizing\n AbstractDistMatrix::ProcessPullQueue to support not including 'viewer'\n processes)", "idx": 515}
+ {"target": 0, "func": "[PATCH] Expanded the GetIrTest behavior to enable efficient testing", "idx": 251}
+ {"target": 0, "func": "[PATCH] IQN-ILS modified such that the coomunication of the rhs uses\n the efficient MPI_Reduce operation if MasterSlave comm is configured with\n mpi-single, i.e., MPIDirect. All tests are working ... still check coupling\n iterations, i.e., performance of IQN-ILS", "idx": 893}
+ {"target": 0, "func": "[PATCH] Added support for manual load balancing with a -load option\n for grompp. It takes the relative performance of each of the processors in\n your system in arbitrary units, and normalizes it.", "idx": 1343}
+ {"target": 0, "func": "[PATCH] Performance updates ....EJB", "idx": 818}
+ {"target": 0, "func": "[PATCH] added driver for testing performance", "idx": 1168}
+ {"target": 0, "func": "[PATCH] WIP fast single-element neighbor calculation.", "idx": 1359}
+ {"target": 0, "func": "[PATCH] Move performance logging of solve()s into solver classes", "idx": 792}
+ {"target": 0, "func": "[PATCH] Rename GPU launch/wait cycle counters\n\nIn preparation for the PME GPU task and GPU launch overhead to be\ncounted together in the same counter for all GPU tasks, the current main\ncounters have been renamed to be more general. The label of GPU waits in\nthe performance table have also been renamed to reflect the task name.\nAdditionally a non-bonded specific sub-counter is been added.\n\nChange-Id: I65a15b0090c1ccebb300cf425c7b3be4100e17a0", "idx": 1360}
+ {"target": 0, "func": "[PATCH] 1 : Store performance function 2 : Pass error into Error\n function 3 : Pass network into Error function 4 : Add public api to access\n underlying network 5 : Use perfect forwarding to accept LayerTypes", "idx": 990}
+ {"target": 0, "func": "[PATCH] Add a CMake warning about FFTW with --enable-avx\n\nFFTW_MEASURE runs single-threaded tests for FFT performance, which is\nvery different from the GROMACS usage pattern, particularly with how\nthe cache access pattern works. In practice, with FFTW 3.3.2 and\n3.3.3, the performance of FFTW with --enable-avx is considerably\nworse than that of FFTW with --enable-sse or --enable-sse2. It's\nunlikely but theoretically possible that performance might change,\nso we prompt the user both to avoid --enable-avx now, and to\nperhaps consider it in the future.\n\nChange-Id: Ib4906645587cfc6a6306a7f7f46d612a6446b156", "idx": 952}
+ {"target": 0, "func": "[PATCH] Create even less contention\n\nAll the readers are waiting on our writer thread to mark our condition\n(`_array_is_present`) as ready before they can even attempt to read, so\nthere is no data race for our writer thread. Hence we can remove the\nlock on the mutex the readers are using. And it's much better this way\nbecause one could imagine that our writer hits the `std::unique_lock`\nfirst which would then prevent our reader threads from even getting to\nthe condition variable `wait`, which is the logical place we want them\nto get to while the writer thread is doing its job.\n\nAnd the second change is unlocking as soon as we're through waiting\nbecause we are through the read-write portion of the program and are\nonly reading so it's safe to let everyone through at once", "idx": 1507}
+ {"target": 0, "func": "[PATCH] remove int to bool conversion performance warning with VC", "idx": 86}
+ {"target": 0, "func": "[PATCH] Read tiles: fixing preallocation size for var and validity\n buffers. (#2781) (#2782)\n\nPreallocation size for var buffer and validity buffer was not using the\ncorrect size, which will have a performance impact.", "idx": 47}
+ {"target": 0, "func": "[PATCH] --enable-distmesh, --with-mapvector-chunk-size\n\nThe parmesh argument probably should have been deprecated when the\nParallelMesh name was.\n\nWe'll want to select chunked_mapvector array size at configure time,\nsince the exact performance optimization/pessimization results are\nlikely to be system-dependent.", "idx": 812}
+ {"target": 0, "func": "[PATCH] vector<vector> specialization for parallel_sync\n\nWe can't do this the efficient way in the general algorithm, so let's\ndo it as best we can right now, with blocking receives.\n\nThis should be replaced by Derek's algorithm in #1684 as soon as we\ncan support that.", "idx": 282}
+ {"target": 0, "func": "[PATCH] Interactive Molecular Dynamics (IMD)\n\nIMD allows to interact with and to monitor a running molecular dynamics\nsimulation. The protocol goes back to 2001 (\"A system for interactive\nmolecular dynamics simulation\", JE Stone, J Gullingsrud, K Schulten,\nP Grayson, in: ACM symposium on interactive 3D graphics, Ed. JF Hughes\nand CH Sequin, pp. 191--194, ACM SIGGRAPH). The user can watch the\nrunning simulation (e.g. using VMD) and optionally interact with\nit by pulling atoms or residues with a mouse or a force-feedback\ndevice.\nCommunitcation between GROMACS and VMD is achieved via TCP sockets\nand thus enables controlling an mdrun locally or one running on a\nremote cluster. Every N steps, mdrun receives the applied forces from\nthe VMD client and sends the new positions to VMD.\nOther features:\n- correct PBC treatment, molecules of a (parallel) simulation are made\n whole (with respect to the configuration found in the .tpr file)\n- in the .mdp file, one can define an IMD group (including the protein\n but not the water for example is useful). Only the coordinates of\n atoms belonging to this group are then transferred between mdrun and\n VMD. This can be used to reduce the performance impact to an almost\n negligible level.\n- adds only two single-line function calls in the main MD loop\n- and mdrun test fixture checks whether grompp and mdrun understand\n the IMD options\n\nChange-Id: I235e07e204f2fb77f05c2f06a14b37efca5e70ea", "idx": 854}
+ {"target": 0, "func": "[PATCH] Python 3 does not have dict.itervalues\n\nReally we should move to using six and importing the iterator versions of dict\nmethods and range/zip from there, but for now just use \"values\" since using a\nraw list doesn't seem like it will cause memory or performance problems here.", "idx": 578}
+ {"target": 0, "func": "[PATCH] DLB can now turn off, when slower\n\nUnder certain conditions, especially with (shared) GPUs, DLB can\ndecrease the performance. We now measure the cycles per step before\nturning on DLB. When the running average of cycles per step with DLB\ngets above the average without DLB, we turn off DLB. We then measure\nagain without DLB. DLB can then turn on again. If we turn on DLB of\nDLB multiple times in close succesion and we measure performance loss,\nwe keep DLB off for the remainder of the run. This procedure ensures\nthat the performance will never deteriorate due to DLB.\nUpdated and expanded the DLB section in the manual.\n\nChange-Id: I6e0291c1a41adf6da94fae46d36e0fcb95585a02", "idx": 689}
+ {"target": 0, "func": "[PATCH] Added configure flag to disable FFTW measurements, to enable\n binary reproducible runs. Note that this typically WILL deteriorate\n performance, so it is usually better to run the optimized versions and use\n the -reprod flag to mdrun when you need binary identity. However, if you\n compile FFTW3 with SSE support (which is NOT the default) the selected\n kernels seems to be close-to-optimal even without measurements, and then you\n can use this option to always get binary reproducible runs.", "idx": 1317}
+ {"target": 0, "func": "[PATCH] reformatted the flops and performance output", "idx": 1413}
+ {"target": 0, "func": "[PATCH] Fix fast for CUDA 9. Use CUB library for reductions\n\nFAST was failing on CUDA 9 because of insufficient synchronization in the\nreduction of the non_max_count function. The reduction is now implemented\nusing BlockReduce from CUB.\n\nThis also adds CUB as a dependency which is brought in as a submodule.", "idx": 953}
+ {"target": 0, "func": "[PATCH] 1. BoomerAMG keeps track of the number of iterations\n accumulated over all calls. This is needed for user-level performance\n monitoring if it is a preconditioner for a Krylov method such as PCG. The\n regular iteration count only tells you about the last time PCG invoked\n BoomerAMG. There are ifdefs so you can eliminate this if you like - remove\n #define CUMNUMIT.\n\n2. very minor code fixes, comments, etc.", "idx": 729}
+ {"target": 0, "func": "[PATCH] problem with c-macro? but slow fabs works fine", "idx": 624}
+ {"target": 0, "func": "[PATCH] Use Evaluate(const arma::mat& parameters) instead of\n Evaluate(const arma::mat& parameters, const size_t i) to calculate the\n objective and to accelerate the evaluation process.", "idx": 530}
+ {"target": 1, "func": "[PATCH] Better workaround of g++ 4.1 optimizer bug:\n -fno-strict-aliasing. Performance penalty is 5% vs 24% with -O", "idx": 971}
+ {"target": 0, "func": "[PATCH] Reducing dependencies. Print functions are generally not\n fast anyway, inlining them leads to unnecessary dependencies and larger\n headers. Removing print functions from headers.", "idx": 286}
+ {"target": 0, "func": "[PATCH] -added support for the new version of RS -fixed some minor\n bugs -now the kernel uses directly the extremely fast RS refinement function\n -updated the generic kernel tests accordingly", "idx": 922}
+ {"target": 0, "func": "[PATCH] additional pre-compiler command for performance testing", "idx": 16}
+ {"target": 1, "func": "[PATCH] Add doSetup parameter to Matrix::init\n\nNot calling doSetup can give you performance gains when using\npreallocation on the matrix.", "idx": 1054}
+ {"target": 0, "func": "[PATCH] Don't resize() as that assembles the matrix and makes\n add_coef() slow", "idx": 200}
+ {"target": 0, "func": "[PATCH] update test (Cactus_deformation_session.cpp): make it\n suitable for test performance (not active by default) make it suitable for\n test suite (precomputed mesh difs active by default)", "idx": 201}
+ {"target": 0, "func": "[PATCH] output detailed multi-thread performance data only with\n \"timer full\"", "idx": 1177}
+ {"target": 0, "func": "[PATCH] log_name would be unused without perf_log on\n\nSo if we're not performance logging, we need to comment out that\nvariable entirely to avoid an unused variable warning.", "idx": 529}
+ {"target": 0, "func": "[PATCH] Cleanup of the performance test", "idx": 1265}
+ {"target": 0, "func": "[PATCH] Disks are too fast, fix formatting of speed as *****", "idx": 397}
+ {"target": 1, "func": "[PATCH] Deprecate version of BoundaryInfo::boundary_ids(const Node*)\n that returns a vector.\n\nAdd new version that must be passed a std::set. The new version\nshould be more efficient for making repeated calls to boundary_ids(),\nsince the container does not need to be created and destroyed\nrepeatedly...", "idx": 1391}
+ {"target": 1, "func": "[PATCH] performance improvement through avoiding function call and\n dereference overhead\n\n- make i_to_potl() and ij_to_potl() functions inline and const\n- don't dereference inside the functions, but cache, if possible in external variables\n=> up to 15% speedup.", "idx": 1372}
+ {"target": 0, "func": "[PATCH] turned performance fix for 7.30 compilers off by default", "idx": 806}
+ {"target": 1, "func": "[PATCH] Use range for in dof_map.C\n\nWe can use more efficient iterators in a couple places here too.", "idx": 191}
+ {"target": 1, "func": "[PATCH] Greatly improved the performance of copy::GeneralPurpose by\n exploiting the tensor product structure of the integer metadata calculations", "idx": 626}
+ {"target": 1, "func": "[PATCH] Update RAS-IR.\n\nNotes:\n1. Overlap and row ordering have significant effects on convergence.\n2. Lower the matrix bandwidth, better the performance ?\n3. Sync is lower, but communication becomes higher, tradeoff!\n4. Parallelism is higher, but overhead is also higher, tradeoff!\n5. Very high accurate solves, do not provide anything.\n6. Two level and multi-levels might significantly reduce number of\niterations to converge.", "idx": 657}
+ {"target": 1, "func": "[PATCH] resolved performance degrating changed introduced in revision\n 1319", "idx": 1022}
+ {"target": 1, "func": "[PATCH] Tiny libmesh_assert_valid_boundary_ids speedup\n\nThis is actually pretty slow in parallel. Truly making it faster\nwould require lumping multiple verification communications together,\nbut eliminating redundant verification is a start, at a slightly lower\npenalty to readability and usability.", "idx": 3}
+ {"target": 1, "func": "[PATCH] Added preconditions and made it more efficient", "idx": 655}
+ {"target": 1, "func": "[PATCH] Added single-accuracy SIMD double math functions\n\nApart from double SIMD variables typically being\nhalf the width of single, the math functions are\nconsiderably more expensive due to higher-order\npolynomials, which can drop the throughput to 25%\nof single. In some cases we do not need the full\ndouble precision in SIMD operations, so these\nnew math functions use double precision\nSIMD variables but only target single precision\naccuracy, which can improve performance twofold.\nThe patch also makes the target precision in\nsingle and double SIMD an advanced CMake variable,\nand the unit test tolerance is set based on these\nvariables. This can be used (decided by the user)\nfor a few platforms where the rsqrt/inv table\nlookups provide one bit too little to get by\nwith a single N-R iteration based on our default\ntarget accuracy of 22 bits.\n\nChange-Id: Id4b1c7800e16cb0eb3d564e89a368b4db6eede3e", "idx": 89}
+ {"target": 1, "func": "[PATCH] Performance improvements to kd_tree_test, added peer bounds\n checking.", "idx": 1021}
+ {"target": 1, "func": "[PATCH] Reworked function 'get_best_weight()' of Slivers_exuder.h\n\n- Avoid computing incident cells multiple times\n- Make it more efficient for P3M3 by not having to use\n tr.min_squared_distance() to compute the distance\n between neighboring vertices", "idx": 716}
+ {"target": 1, "func": "[PATCH] #1295 Refactored Matrix::setSub(IMatrix,IMatrix) The new\n implementation should be much more efficient and handle non-monotone indices\n correctly", "idx": 117}
+ {"target": 1, "func": "[PATCH] add (T) kernels optimized for OpenMP+SIMD\n\nThese kernels are taken from https://github.com/jeffhammond/nwchem-tce-triples-kernels/,\nwhich were previously part of private development branch of NWChem hosted by Argonne.\nThe code was developed by Jeff Hammond from 2013-2014 with help from Karol Kowalski.\n\nThese kernels have been tested on Intel Xeon, Intel Xeon Phi, IBM Blue Gene/Q,\nIBM POWER7, AMD Bulldozer and ARM32 processors using the Intel, Cray, IBM XL,\nand GCC compilers. In rare instances, the optimal loop order is different between\nIntel, Cray and IBM compilers. In such cases, we default to the Intel compiler case\nbecause it is the most commonly used Fortran compiler for NWChem. In particular,\nNWChem as a whole cannot be compiled with Cray Fortran, so the only context in which\nit would be used for these kernels is if someone did a mixed build.\nThe performance differences with XLF were observed on POWER7, which is a relatively\nrare platform for NWChem.\n\nIn any case, these optimizations are better than the serial version any time OpenMP\nis used. Detailed performance information for some platforms can be found at\nhttps://github.com/jeffhammond/nwchem-tce-triples-kernels/tree/master/results.\n\nFinally, it should be noted that all of Jeff Hammond's developments for non-Intel\narchitectures were done prior to his employment at Intel, which can be verified\nfrom the Github commit log associated with the aforementioned repo.", "idx": 737}
+ {"target": 1, "func": "[PATCH] Improved performance of interaction groups on CPU", "idx": 1419}
+ {"target": 1, "func": "[PATCH] reverted commit 76d5bddd5c3dfdef76beaab8222231624eb75e89.\n Split ga_acc in moints2x_trf2K in smaller ga_acc on MPI-PR since gives large\n performance improvement on NERSC Cori", "idx": 989}
+ {"target": 1, "func": "[PATCH] wmkdep: Added path string substitution support\n\nto avoid the need for sed'ing the output. This improves performance by avoiding\nthe need for calling additional commands and generating a temporary file.", "idx": 1554}
+ {"target": 1, "func": "[PATCH] Replaced Vertex_handle and Cell_handle by const & versions \n in order to regain performance", "idx": 338}
+ {"target": 1, "func": "[PATCH] performance improvement for DD assignment of settles with\n cg's", "idx": 1039}
+ {"target": 0, "func": "[PATCH] AABB tree: the projection does not construct the KD-tree at\n the first projection query anymore. for efficient projection queries either\n the user calls for its explicit construction during the AABB construction or\n calls \"construct_search_Tree()\". otherwise the first primitive reference\n point is used as (naive) hint.", "idx": 235}
+ {"target": 1, "func": "[PATCH] Improved ORB performance and memory usage on CUDA backend", "idx": 635}
+ {"target": 1, "func": "[PATCH] Introduce self-pairs search in nbsearch\n\nMake it possible to search for all pairs within a single set of\npositions using AnalysisNeighborhood. This effectively excludes half of\nthe pairs from the search, speeding things up.\n\nNot used yet anywhere, but this makes the code a better reference for\nperformance comparisons, and for places where this is applicable it has\npotential for speeding things up quite a bit.\n\nChange-Id: Ib0e6f36460b8dbda97704447222c864c149d8e56", "idx": 127}
+ {"target": 1, "func": "[PATCH] Improve the performance of the `Record` logger by using\n deques of `std::unique_ptr` instead of plain object.", "idx": 811}
+ {"target": 1, "func": "[PATCH] Used sparse identity for more efficient memory utilization", "idx": 1502}
+ {"target": 1, "func": "[PATCH] bond/react: efficient competing reactions", "idx": 280}
+ {"target": 1, "func": "[PATCH] Fixed typo in HegstRLVar3/HegstRUVar3 and improved\n performance of Trmm and Symm for relatively small numbers of right-hand\n sides.", "idx": 399}
+ {"target": 0, "func": "[PATCH] Add initial reinit_func(1.) call in Euler2Solver\n\nWe need the call to reinit_func to set the time, t, in the context\nto the correct value. Also added clarifying comments in EulerSolver\nand Euler2Solver that we're also setting the time in addition to\npossibly resetting the mesh if there's mesh motion.\n\nThere is probably a way to make this more efficient such that we\nonly call reinit_func twice in Euler2Solver, but I didn't put any\nthought into it.", "idx": 371}
+ {"target": 1, "func": "[PATCH] Changed number of nonbonded thread blocks to improve\n performance", "idx": 788}
+ {"target": 0, "func": "[PATCH] Removed nbnxn kernel blendv optimization\n\nThe nbnxn simd kernel blendv optmization, which was accidentally\ndeactivated since 5.0, has been removed. It made assumptions about\nthe internal storage of SIMD representations. With gcc 4.x blendv\nwould give a small performance improvement, but with gcc 5 performance\nis equal or deteriorates.\n\nChange-Id: I2b07895257a2fde0ade2a627369ed22683dd89e1", "idx": 1158}
+ {"target": 1, "func": "[PATCH] Fix performance problems for large molecular systems", "idx": 1061}
+ {"target": 1, "func": "[PATCH] Optimize the script (from 20 minutes to 0.3 seconds!)\n\n- avoid opening and reading the file `processed_test_results` thousands\n of time: its content is stored once in a hash, for fast lookup,\n\n- do not call `fuser` for files that are already processed", "idx": 978}
+ {"target": 1, "func": "[PATCH] more efficient live variables in SX virtual machine", "idx": 226}
+ {"target": 1, "func": "[PATCH] #1295 Refactored MX::setSub(IMatrix,IMatrix) The new\n implementation should be much more efficient and handle non-monotone indices\n correctly", "idx": 210}
+ {"target": 1, "func": "[PATCH] LJ combination rule kernels for OpenCL\n\nThe current implementation enables combination rules for both AMD and\nNVIDIA OpenCL (also ports the changes to the \"nowarp\" test/CPU kernel).\n\nLike in the CUDA implementation, all kernels support it, but only for\nplain cut-off are combination rules used.\n\nNotes:\n- On AMD tested on Hawaii, Fiji, Spectre and Oland devices;\n combination rules in all cases improve performance, although combined\n with the i-prefetching, the improvement is typically only ~10%.\n- On NVIDIA tested on Kepler and Maxwell; in most cases the combination\n rule kernels are fastest.\n However, with certain inputs these kernels are 25% slower on Maxwell\n (e.g. pure water box, cut-off LJ, pot shift), but not on Kepler.\n This is likely a compiler mis-optimization, so we'll just leave the\n defaults the same as AMD.\n\nChange-Id: I05396e000cdf93c1d872729e6b477192af152495", "idx": 1337}
+ {"target": 1, "func": "[PATCH] Performance Improvements - changed Cartesian to\n Simple_cartesian in the examples - changed list to vector in the code -\n removed unnecessary includes - introduced multipass_distance", "idx": 883}
+ {"target": 1, "func": "[PATCH] more efficient comparison function", "idx": 99}
+ {"target": 1, "func": "[PATCH] new, more efficient jacobian calculation for integrator", "idx": 447}
+ {"target": 1, "func": "[PATCH] Fix AMD OpenCL float3 array optimization bug\n\nBecause float3 by OpenCL spec is 16-byte, when used as an array type\nthe allocation needs to optimized to avoid unnecessary register use.\nThe nbnxm kernels use a float3 i-force accumulator array in registers.\n\nStarting with ROCm 2.3 the AMD OpenCL compiler regressed and lost\nits ability to effectively optimize code that uses float3 register\narrays. The large amount of extra registers used limits the kernel\noccupancy and significantly impacts performance.\nOnly the AMD platform is affected, other vendors' compilers are able to\ndo the necessary transformations to avoid the extra register use.\n\nThis change converts the float3 array to a float[3] saving 8*4 bytes\nregister space. This improves nonbonded kernel performance\non an AMD Vega GPU by 25% and 40% for the most common flavor of the\nEwald and RF force-only kernels, respectively.\n\nNote that eliminating the rest of the non-array use of float3 has no\nsignificant impact.", "idx": 1545}
+ {"target": 1, "func": "[PATCH] increased granularity of performance logging, fixed a bug in\n DofMap::add_neighbors_to_send_list() which caused the _send_list to become\n excessively large. Further, this slowed the DofMap::sort_send_list() method\n considerably.", "idx": 881}
+ {"target": 1, "func": "[PATCH] Removed unnecassary flush of trn,xtc,edr. Important for\n performance for very frequent writes (on small systems) Fixed a bug related\n to setting the duty of pp/pme/io", "idx": 1145}
+ {"target": 1, "func": "[PATCH] Use a fixed-length arrays, avoid heap allocation.\n\nAlso reduce the default number of elements so that it runs fast enough in DBG mode.", "idx": 840}
+ {"target": 1, "func": "[PATCH] Improve performance of GEMM for small matrices when SMP is\n defined.\n\nAlways checking num_cpu_avail() regardless of whether threading will actually\nbe used adds noticeable overhead for small matrices. Most other uses of\nnum_cpu_avail() do so only if threading will be used, so do the same here.", "idx": 1492}
+ {"target": 0, "func": "[PATCH] Grid-based utility nbsearch implementation.\n\nMore efficient implementations are possible, but the present one should\nwork reasonably well in most cases, also for triclinic cells, without too\nmuch complexity.", "idx": 517}
+ {"target": 1, "func": "[PATCH] Reduced the cost of the pull communication\n\nWith more than 32 ranks, a sub-communicator will be used\nfor the pull communication. This reduces the pull communication\nsignificantly with small pull groups. With large pull groups the total\nsimulation performance might not improve much, because ranks\nthat are not in the sub-communicator will later wait for the pull\nranks during the communication for the constraints.\n\nAdded a pull_comm_t struct to separate the data used for communication.\n\nChange-Id: I92b64d098b508b11718ef3ae175b771032ad7be2", "idx": 1008}
+ {"target": 1, "func": "[PATCH] make skylakex sgemm code more friendly for readers\n\nBTW some kernels were adjusted to improve performance", "idx": 1457}
+ {"target": 1, "func": "[PATCH] Minor performance improments\n\nMostly useful as lesson learned.\n\n1) The double precision constant forces the compiler to convert\n the single precision variable to double, then do the multiplication\n in double and then convert back. Using the single precsion\n constant in double reduces the accuracy (the calculation is still done\n double but the constant has only single precision).\n2) Using a temporary array instead of a temporary scalar causes ICC14 to\n generate an extra store.\n\nChange-Id: Ib320ac2ae4ff80ce48277544abff468c483cc83a", "idx": 20}
+ {"target": 1, "func": "[PATCH] efficient sparsity pattern computation for the case when the\n user specifies the DOF coupling", "idx": 793}
+ {"target": 1, "func": "[PATCH] Beginning to add support for freezing the sparsity pattern of\n graphs and sparse matrices to improve the performance of subsequent updates", "idx": 289}
+ {"target": 1, "func": "[PATCH] Improving performance of BigInt/BigFloat routines (such as\n Cholesky) by more than a factor of three by avoiding allocations within the\n templated BLAS routines", "idx": 559}
+ {"target": 0, "func": "[PATCH] Encapsulate code in ifdef NUMPY clauses. Efficient pythoncode\n for toArray.", "idx": 71}
+ {"target": 1, "func": "[PATCH] Enforce memory alignment to improve performance of vector\n operations. Also fixed bugs in an earlier optimization.", "idx": 1458}
+ {"target": 0, "func": "[PATCH] propagate lower bound for culling on TM1 to accelerate\n symmetric distance", "idx": 1011}
+ {"target": 1, "func": "[PATCH] Removed Reaction-Field-nec\n\nThe RF no exclusion correction option was only introduced for\nbackward compatibility and a performance advantage for systems\nwith only rigid molecules (e.g. water). For all other systems\nthe forces are incorrect. The Verlet scheme did not support this\noption and if it would, it wouldn't even improve performance.\n\nChange-Id: Ic22ccf76d50b5bb7951fcac2293621b5eef285c5", "idx": 1443}
+ {"target": 1, "func": "[PATCH] Replace vpermpd with vpermilpd in the Haswell DTRMM kernel\n\nto improve performance on AMD Zen (#2180) applying wjc404's improvement of the DGEMM kernel from #2186", "idx": 142}
+ {"target": 0, "func": "[PATCH] query t on side of bounded square\n\nI moved a lot of the functionality for deciding the Linf incircle\ntest for four points to the side of bounded square predicate.\n\nIn the case of query point t being on one of the sides of the\nbounded square, I use the predicate test1d. Maybe even this can\nbe optimized, or made even more robust with some more checks.\n\nA bug that is fixed with the current commit is in the following\ninput:\n\n$ cat ~/Dropbox/cgal/sdg/panos/sqch1a.cin\np -51 -180\np -180 -30\np -180 20\np -7 -180\n\nI also fixed a small bug when expanding both sides of the bounded\nsquare.\n\nThe next step is to completely remove the slow \"side of oriented\nsquare\" test.\n\nSigned-off-by: Panagiotis Cheilaris <philaris@cs.ntua.gr>", "idx": 762}
+ {"target": 0, "func": "[PATCH] all gonzalez stuff uploaded for trying to fix gonzalez, make\n it fast and accurate", "idx": 754}
+ {"target": 1, "func": "[PATCH] #1285 Refactred substituteInPlace. Now more efficient and has\n same signature for SX and MX.", "idx": 418}
+ {"target": 1, "func": "[PATCH] made GMX_FORCE_ENERGY a separate flag\n\nGMX_FORCE_ENERGY was (temporarily) defined as GMX_FORCE_VIRIAL.\nNow it is a separate flags, which is less confusing. This allows\nnstcalcenergy to be larger than nstpcouple, which improves performance\nwith the SSE and CUDA kernels.", "idx": 950}
+ {"target": 1, "func": "[PATCH] sbgemm: cooperlake: reorder ptr increase for performance", "idx": 1005}
+ {"target": 1, "func": "[PATCH] Adds a URIManager to manage all URIs within an array\n directory. This introduces several performance improvements, especially\n around redundant URI listings, parallelizing URI listings, etc. Also makes\n VFS::ls a noop for POSIX and HDFS when the listed directory does not exist\n instead of throwing an error, matching the functionality of the object\n stores. Finally, it removes partial vacuuming, as that leads to incorrect\n behavior with time traveling.", "idx": 1561}
+ {"target": 1, "func": "[PATCH] using int instead of size_t should be more efficient and\n range doesn't seem to be needed", "idx": 801}
+ {"target": 1, "func": "[PATCH] Stage bonded kernel atomics through shared memory\n\nFixes performance bug introduced in 01b2f20bd5 by staging energy step\natomics through shared memory rather than have all threads write\natomically directly to global memory.\n\nFixes #3443", "idx": 654}
+ {"target": 1, "func": "[PATCH] improved performance MatvecCommPkgCreate", "idx": 870}
+ {"target": 1, "func": "[PATCH] 2d convolve performance improvements\n\nchanged the shared memory loading access pattern in 2d convolve\nkernel for cuda and opencl backends", "idx": 1297}
+ {"target": 1, "func": "[PATCH] Use new style with make_array(), more compact and efficient", "idx": 1395}
+ {"target": 1, "func": "[PATCH] Restructured nonbonded calculation to allow more efficient\n vectorization", "idx": 1356}
+ {"target": 1, "func": "[PATCH] tutorials: Changed compressed ascii output to binary to\n improve IO performance\n\nalso rationalized the writeCompression specification", "idx": 1257}
+ {"target": 1, "func": "[PATCH] NumericVector::add_vector refactoring\n\nSimilar to #411 and #413\n\nThis was originally intended to be just another additional T* API plus\na refactoring; however, the new PetscVector::add_vector(DenseVector)\ncode path should be a performance improvement as well.", "idx": 5}
+ {"target": 1, "func": "[PATCH] Edit for faster performance", "idx": 1238}
+ {"target": 1, "func": "[PATCH] Made some performance improvements and fixed a bug when\n running on a single processor but compiled with mpi.", "idx": 435}
+ {"target": 1, "func": "[PATCH] bond/react: performance improvement", "idx": 1396}
+ {"target": 1, "func": "[PATCH] 128-bit AVX2 SIMD support\n\nAdd 128 bit support for AVX2. Similar to AVX-128, this\nimproves slightly on SSE2 due to more efficient instructions,\nand the shorter SIMD width is beneficial in some cases. Both\n128- and 256-bit flavors will be built automatically with\n--enable-avx2, and the timing routines will chose the best one\nautomatically.", "idx": 452}
+ {"target": 1, "func": "[PATCH] Use std::make_shared instead of new...\n\nIt is more efficient, since it requires only one memory allocation in\ncontrast to two.", "idx": 548}
data1.zip ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:ba93763a8a0cfc650cf16cfe0bab2f4e89ec24a700c59c297928ead8429a6eb0
+ size 71201
test.jsonl ADDED
The diff for this file is too large to render. See raw diff