| | #include "ggml.h"
|
| | #include "ggml-cpu.h"
|
| |
|
| | #include <chrono>
|
| | #include <iostream>
|
| | #include <cstdio>
|
| | #include <cstdlib>
|
| | #include <cassert>
|
| | #include <vector>
|
| | #include <thread>
|
| |
|
| | #define MAX_NARGS 2
|
| |
|
| | static void test_barrier(int n_threads, int n_rounds) {
|
| | struct ggml_init_params params = {
|
| | 1024*1024*1024,
|
| | NULL,
|
| | false,
|
| | };
|
| |
|
| | struct ggml_context * ctx = ggml_init(params);
|
| |
|
| |
|
| | struct ggml_cgraph * gf = ggml_new_graph(ctx);
|
| |
|
| |
|
| | struct ggml_tensor * out = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 64);
|
| | for (int i = 0; i < 1000; i++) {
|
| | struct ggml_tensor * a = ggml_new_tensor_2d(ctx, GGML_TYPE_Q4_0, 64, 128);
|
| | out = ggml_mul_mat(ctx, a, out);
|
| |
|
| | struct ggml_tensor * d = ggml_new_tensor_2d(ctx, GGML_TYPE_Q4_0, 128, 64);
|
| | out = ggml_mul_mat(ctx, d, out);
|
| | }
|
| |
|
| | ggml_build_forward_expand(gf, out);
|
| | int n_nodes = ggml_graph_n_nodes(gf);
|
| |
|
| |
|
| | struct ggml_threadpool_params tpp = ggml_threadpool_params_default(n_threads);
|
| | struct ggml_threadpool* threadpool = ggml_threadpool_new(&tpp);
|
| | if (!threadpool) {
|
| | fprintf(stderr, "threadpool create failed : n_threads %d\n", n_threads);
|
| | exit(1);
|
| | }
|
| |
|
| |
|
| | struct ggml_cplan cplan = ggml_graph_plan(gf, n_threads, threadpool);
|
| |
|
| | std::vector<uint8_t> work_data(cplan.work_size);
|
| | cplan.work_data = work_data.data();
|
| |
|
| | std::cerr << "graph-compute with"
|
| | << "\n n_threads: " << n_threads
|
| | << "\n n_nodes: " << n_nodes
|
| | << "\n n_rounds: " << n_rounds
|
| | << "\n";
|
| |
|
| |
|
| |
|
| | ggml_graph_compute(gf, &cplan);
|
| |
|
| | auto t0 = std::chrono::high_resolution_clock::now();
|
| |
|
| | for (int i=0; i < n_rounds; i++) {
|
| | ggml_graph_compute(gf, &cplan);
|
| | }
|
| |
|
| | auto t1 = std::chrono::high_resolution_clock::now();
|
| |
|
| | auto usec = std::chrono::duration_cast<std::chrono::microseconds>(t1-t0).count();
|
| | auto nsec = std::chrono::duration_cast<std::chrono::nanoseconds>(t1-t0).count();
|
| | std::cerr << "graph-compute took " << usec << " usec "
|
| | << "\n " << (float) usec / n_rounds << " usec per-iter"
|
| | << "\n " << (float) nsec / (n_rounds * n_nodes) << " nsec per-node"
|
| | << "\n";
|
| |
|
| | ggml_threadpool_free(threadpool);
|
| | ggml_free(ctx);
|
| | }
|
| |
|
| | static void test_active(int n_threads, int n_rounds) {
|
| | struct ggml_init_params params = {
|
| | 1024*1024*1024,
|
| | NULL,
|
| | false,
|
| | };
|
| |
|
| | struct ggml_context * ctx = ggml_init(params);
|
| |
|
| |
|
| | struct ggml_cgraph * gf = ggml_new_graph(ctx);
|
| |
|
| |
|
| | struct ggml_tensor * out = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 64);
|
| | for (int i = 0; i < 2; i++) {
|
| | struct ggml_tensor * a = ggml_new_tensor_2d(ctx, GGML_TYPE_Q4_0, 64, 128);
|
| | out = ggml_mul_mat(ctx, a, out);
|
| |
|
| | struct ggml_tensor * d = ggml_new_tensor_2d(ctx, GGML_TYPE_Q4_0, 128, 64);
|
| | out = ggml_mul_mat(ctx, d, out);
|
| | }
|
| |
|
| | ggml_build_forward_expand(gf, out);
|
| | int n_nodes = ggml_graph_n_nodes(gf);
|
| |
|
| |
|
| | struct ggml_threadpool_params tpp = ggml_threadpool_params_default(n_threads);
|
| | struct ggml_threadpool* threadpool = ggml_threadpool_new(&tpp);
|
| | if (!threadpool) {
|
| | fprintf(stderr, "threadpool create failed : n_threads %d\n", n_threads);
|
| | exit(1);
|
| | }
|
| |
|
| | std::cerr << "graph-compute with"
|
| | << "\n n_threads: " << n_threads
|
| | << "\n n_nodes: " << n_nodes
|
| | << "\n n_rounds: " << n_rounds
|
| | << "\n";
|
| |
|
| |
|
| |
|
| |
|
| |
|
| | for (int i=0; i < n_rounds; i++) {
|
| | struct ggml_cplan cplan = ggml_graph_plan(gf, (i % 4) == 0 ? 1 : n_threads, threadpool);
|
| |
|
| | std::vector<uint8_t> work_data(cplan.work_size);
|
| | cplan.work_data = work_data.data();
|
| |
|
| | ggml_graph_compute(gf, &cplan);
|
| | }
|
| |
|
| | ggml_threadpool_free(threadpool);
|
| | ggml_free(ctx);
|
| | }
|
| |
|
| | static void test_multi_graph(int n_threads, int n_rounds) {
|
| | struct ggml_init_params params = {
|
| | 1024*1024*1024,
|
| | NULL,
|
| | false,
|
| | };
|
| |
|
| | struct ggml_context * ctx = ggml_init(params);
|
| |
|
| |
|
| | struct ggml_cgraph * gf0 = ggml_new_graph(ctx);
|
| | {
|
| |
|
| | struct ggml_tensor * out = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 64);
|
| | for (int i = 0; i < 2; i++) {
|
| | struct ggml_tensor * a = ggml_new_tensor_2d(ctx, GGML_TYPE_Q4_0, 64, 128);
|
| | out = ggml_mul_mat(ctx, a, out);
|
| |
|
| | struct ggml_tensor * d = ggml_new_tensor_2d(ctx, GGML_TYPE_Q4_0, 128, 64);
|
| | out = ggml_mul_mat(ctx, d, out);
|
| | }
|
| |
|
| | ggml_build_forward_expand(gf0, out);
|
| | }
|
| |
|
| | struct ggml_cgraph * gf1 = ggml_new_graph(ctx);
|
| | {
|
| |
|
| |
|
| | struct ggml_tensor * out = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 256);
|
| | for (int i = 0; i < 4; i++) {
|
| | struct ggml_tensor * a = ggml_new_tensor_2d(ctx, GGML_TYPE_Q4_0, 256, 128);
|
| | out = ggml_mul_mat(ctx, a, out);
|
| |
|
| | struct ggml_tensor * d = ggml_new_tensor_2d(ctx, GGML_TYPE_Q4_0, 128, 256);
|
| | out = ggml_mul_mat(ctx, d, out);
|
| | }
|
| |
|
| | ggml_build_forward_expand(gf1, out);
|
| | }
|
| |
|
| |
|
| |
|
| | struct ggml_threadpool_params tpp = ggml_threadpool_params_default(n_threads);
|
| | struct ggml_threadpool* threadpool = ggml_threadpool_new(&tpp);
|
| | if (!threadpool) {
|
| | fprintf(stderr, "threadpool create failed : n_threads %d\n", n_threads);
|
| | exit(1);
|
| | }
|
| |
|
| | std::cerr << "graph-compute with"
|
| | << "\n gf0 n_nodes: " << ggml_graph_n_nodes(gf0)
|
| | << "\n gf1 n_nodes: " << ggml_graph_n_nodes(gf1)
|
| | << "\n n_threads: " << n_threads
|
| | << "\n n_rounds: " << n_rounds
|
| | << "\n";
|
| |
|
| |
|
| |
|
| |
|
| | for (int i=0; i < n_rounds; i++) {
|
| | struct ggml_cplan cplan0 = ggml_graph_plan(gf0, (i % 4) == 0 ? 1 : n_threads, threadpool);
|
| | std::vector<uint8_t> work_data0(cplan0.work_size);
|
| | cplan0.work_data = work_data0.data();
|
| |
|
| | struct ggml_cplan cplan1 = ggml_graph_plan(gf1, (i % 4) == 0 ? 1 : n_threads, threadpool);
|
| | std::vector<uint8_t> work_data1(cplan1.work_size);
|
| | cplan1.work_data = work_data1.data();
|
| |
|
| | ggml_graph_compute(gf0, &cplan0);
|
| | ggml_graph_compute(gf1, &cplan1);
|
| | }
|
| |
|
| | ggml_threadpool_free(threadpool);
|
| | ggml_free(ctx);
|
| | }
|
| |
|
| |
|
| | int main(int argc, char *argv[]) {
|
| |
|
| | int n_threads = std::max(1, std::min(4, (int) std::thread::hardware_concurrency()));
|
| | int n_rounds = 100;
|
| |
|
| | if (argc > 1) {
|
| | n_threads = std::atoi(argv[1]);
|
| | }
|
| |
|
| | if (argc > 2) {
|
| | n_rounds = std::atoi(argv[2]);
|
| | }
|
| |
|
| | test_barrier(n_threads, n_rounds);
|
| |
|
| | test_active(n_threads, n_rounds * 100);
|
| |
|
| | test_multi_graph(n_threads, n_rounds * 10);
|
| |
|
| | return 0;
|
| | }
|
| |
|