// ncnn/src/cpu.cpp
// Tencent is pleased to support the open source community by making ncnn available.
//
// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
//
// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
// in compliance with the License. You may obtain a copy of the License at
//
// https://opensource.org/licenses/BSD-3-Clause
//
// Unless required by applicable law or agreed to in writing, software distributed
// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
// specific language governing permissions and limitations under the License.
#include "cpu.h"
#include "platform.h"
#include <limits.h>
#include <stdio.h>
#include <string.h>

#include <algorithm> // std::max
#include <vector>    // std::vector
#ifdef _OPENMP
#if NCNN_SIMPLEOMP
#include "simpleomp.h"
#else
#include <omp.h>
#endif
#endif
#if defined(__i386__) || defined(__x86_64__) || defined(_M_IX86) || defined(_M_X64)
#ifdef _MSC_VER
#include <intrin.h> // __cpuid()
#include <immintrin.h> // _xgetbv()
#endif
#if defined(__clang__) || defined(__GNUC__)
#include <cpuid.h> // __get_cpuid() and __cpuid_count()
#endif
#endif
#ifdef __EMSCRIPTEN__
#include <emscripten/threading.h>
#endif
#if defined _WIN32 && !(defined __MINGW32__)
#define WIN32_LEAN_AND_MEAN
#include <windows.h>
#include <powerbase.h>
#endif
#if defined __ANDROID__ || defined __linux__
#if defined __ANDROID__
#if __ANDROID_API__ >= 18
#include <sys/auxv.h> // getauxval()
#endif
#include <sys/system_properties.h> // __system_property_get()
#include <dlfcn.h>
#endif
#include <stdint.h>
#include <sys/syscall.h>
#include <unistd.h>
#endif
#if __APPLE__
#include <mach/mach.h>
#include <mach/machine.h>
#include <mach/thread_act.h>
#include <sys/sysctl.h>
#include <sys/types.h>
#include "TargetConditionals.h"
#if TARGET_OS_IPHONE
#define __IOS__ 1
#endif
// define missing cpu family constants for old sdks
#ifndef CPUFAMILY_ARM_HURRICANE
#define CPUFAMILY_ARM_HURRICANE 0x67ceee93
#endif
// A11
#ifndef CPUFAMILY_ARM_MONSOON_MISTRAL
#define CPUFAMILY_ARM_MONSOON_MISTRAL 0xe81e7ef6
#endif
// A12
#ifndef CPUFAMILY_ARM_VORTEX_TEMPEST
#define CPUFAMILY_ARM_VORTEX_TEMPEST 0x07d34b9f
#endif
// A13
#ifndef CPUFAMILY_ARM_LIGHTNING_THUNDER
#define CPUFAMILY_ARM_LIGHTNING_THUNDER 0x462504d2
#endif
// A14
#ifndef CPUFAMILY_ARM_FIRESTORM_ICESTORM
#define CPUFAMILY_ARM_FIRESTORM_ICESTORM 0x1b588bb3
#endif
// A15
#ifndef CPUFAMILY_ARM_AVALANCHE_BLIZZARD
#define CPUFAMILY_ARM_AVALANCHE_BLIZZARD 0xda33d83d
#endif
// A16
#ifndef CPUFAMILY_ARM_EVEREST_SAWTOOTH
#define CPUFAMILY_ARM_EVEREST_SAWTOOTH 0x8765edea
#endif
// M1
#ifndef CPUFAMILY_AARCH64_FIRESTORM_ICESTORM
#define CPUFAMILY_AARCH64_FIRESTORM_ICESTORM 0x1b588bb3
#endif
// M2
#ifndef CPUFAMILY_AARCH64_AVALANCHE_BLIZZARD
#define CPUFAMILY_AARCH64_AVALANCHE_BLIZZARD 0xda33d83d
#endif
#endif // __APPLE__
#if defined(__SSE3__)
#include <immintrin.h>
#endif
// topology info
static int g_cpucount;
static int g_physical_cpucount;
static int g_powersave;
static ncnn::CpuSet g_cpu_affinity_mask_all;
static ncnn::CpuSet g_cpu_affinity_mask_little;
static ncnn::CpuSet g_cpu_affinity_mask_big;
// isa info
#if defined __ANDROID__ || defined __linux__
static unsigned int g_hwcaps;
static unsigned int g_hwcaps2;
#endif // defined __ANDROID__ || defined __linux__
#if __APPLE__
static unsigned int g_hw_cpufamily;
static cpu_type_t g_hw_cputype;
static cpu_subtype_t g_hw_cpusubtype;
static int g_hw_optional_arm_FEAT_FP16;
static int g_hw_optional_arm_FEAT_DotProd;
static int g_hw_optional_arm_FEAT_FHM;
static int g_hw_optional_arm_FEAT_BF16;
static int g_hw_optional_arm_FEAT_I8MM;
#endif // __APPLE__
#if defined(__i386__) || defined(__x86_64__) || defined(_M_IX86) || defined(_M_X64)
static int g_cpu_support_x86_avx;
static int g_cpu_support_x86_fma;
static int g_cpu_support_x86_xop;
static int g_cpu_support_x86_f16c;
static int g_cpu_support_x86_avx2;
static int g_cpu_support_x86_avx_vnni;
static int g_cpu_support_x86_avx512;
static int g_cpu_support_x86_avx512_vnni;
static int g_cpu_support_x86_avx512_bf16;
static int g_cpu_support_x86_avx512_fp16;
#endif // defined(__i386__) || defined(__x86_64__) || defined(_M_IX86) || defined(_M_X64)
static int g_cpu_level2_cachesize;
static int g_cpu_level3_cachesize;
// misc info
#if defined __ANDROID__ || defined __linux__
#if __aarch64__
static int g_cpu_is_arm_a53_a55;
#endif // __aarch64__
#endif // defined __ANDROID__ || defined __linux__
#if defined __ANDROID__ || defined __linux__
#define AT_HWCAP 16
#define AT_HWCAP2 26
#if __aarch64__
// from arch/arm64/include/uapi/asm/hwcap.h
#define HWCAP_ASIMD (1 << 1)
#define HWCAP_ASIMDHP (1 << 10)
#define HWCAP_CPUID (1 << 11)
#define HWCAP_ASIMDDP (1 << 20)
#define HWCAP_SVE (1 << 22)
#define HWCAP_ASIMDFHM (1 << 23)
#define HWCAP2_SVE2 (1 << 1)
#define HWCAP2_SVEI8MM (1 << 9)
#define HWCAP2_SVEF32MM (1 << 10)
#define HWCAP2_SVEBF16 (1 << 12)
#define HWCAP2_I8MM (1 << 13)
#define HWCAP2_BF16 (1 << 14)
#else
// from arch/arm/include/uapi/asm/hwcap.h
#define HWCAP_EDSP (1 << 7)
#define HWCAP_NEON (1 << 12)
#define HWCAP_VFPv4 (1 << 16)
#endif
#if __mips__
// from arch/mips/include/uapi/asm/hwcap.h
#define HWCAP_MIPS_MSA (1 << 1)
#define HWCAP_LOONGSON_MMI (1 << 11)
#endif
#if __loongarch64
// from arch/loongarch/include/uapi/asm/hwcap.h
#define HWCAP_LOONGARCH_LSX (1 << 4)
#define HWCAP_LOONGARCH_LASX (1 << 5)
#endif
#if __riscv
// from arch/riscv/include/uapi/asm/hwcap.h
#define COMPAT_HWCAP_ISA_F (1 << ('F' - 'A'))
#define COMPAT_HWCAP_ISA_V (1 << ('V' - 'A'))
#endif
#if defined __ANDROID__
// Probe the system's C library for a 'getauxval' function and call it if
// it exists, or return 0 on failure. This function has been available since
// API level 18.
//
// Note that getauxval() can't really be re-implemented here, because
// its implementation does not parse /proc/self/auxv. Instead it depends
// on values that are passed by the kernel at process-init time to the
// C runtime initialization layer.
static unsigned int get_elf_hwcap_from_getauxval(unsigned int type)
{
#if __ANDROID_API__ >= 18
unsigned int hwcap = getauxval(type);
if (hwcap)
return hwcap;
#endif
typedef unsigned long getauxval_func_t(unsigned long);
dlerror();
void* libc_handle = dlopen("libc.so", RTLD_NOW);
if (!libc_handle)
{
NCNN_LOGE("dlopen libc.so failed %s", dlerror());
return 0;
}
unsigned int result = 0;
getauxval_func_t* func = (getauxval_func_t*)dlsym(libc_handle, "getauxval");
if (!func)
{
NCNN_LOGE("dlsym getauxval failed");
}
else
{
// Note: getauxval() returns 0 on failure. Doesn't touch errno.
result = (unsigned int)(*func)(type);
}
dlclose(libc_handle);
return result;
}
#endif // defined __ANDROID__
// extract the ELF HW capabilities bitmap from /proc/self/auxv
static unsigned int get_elf_hwcap_from_proc_self_auxv(unsigned int type)
{
FILE* fp = fopen("/proc/self/auxv", "rb");
if (!fp)
{
NCNN_LOGE("fopen /proc/self/auxv failed");
return 0;
}
#if __aarch64__ || __mips64 || __riscv_xlen == 64 || __loongarch64
struct
{
uint64_t tag;
uint64_t value;
} entry;
#else
struct
{
unsigned int tag;
unsigned int value;
} entry;
#endif
unsigned int result = 0;
while (!feof(fp))
{
int nread = fread((char*)&entry, sizeof(entry), 1, fp);
if (nread != 1)
break;
if (entry.tag == 0 && entry.value == 0)
break;
if (entry.tag == type)
{
result = entry.value;
break;
}
}
fclose(fp);
return result;
}
static unsigned int get_elf_hwcap(unsigned int type)
{
unsigned int hwcap = 0;
#if defined __ANDROID__
hwcap = get_elf_hwcap_from_getauxval(type);
#endif
if (!hwcap)
hwcap = get_elf_hwcap_from_proc_self_auxv(type);
#if defined __ANDROID__
#if __aarch64__
if (type == AT_HWCAP)
{
            // samsung exynos9810 on android pre-9 reports armv8.2 hwcaps from
            // its little cores, but the big cores only support armv8.0
            // drop all armv8.2 features used by ncnn to prevent SIGILLs
// ref https://reviews.llvm.org/D114523
char arch[PROP_VALUE_MAX];
int len = __system_property_get("ro.arch", arch);
if (len > 0 && strncmp(arch, "exynos9810", 10) == 0)
{
hwcap &= ~HWCAP_ASIMDHP;
hwcap &= ~HWCAP_ASIMDDP;
}
}
#endif // __aarch64__
#endif // defined __ANDROID__
return hwcap;
}
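// usage sketch, mirroring how initialize_global_cpu_info() and the
// cpu_support_arm_* helpers below consume the bitmaps:
//   unsigned int hwcaps = get_elf_hwcap(AT_HWCAP);
//   int has_asimd = hwcaps & HWCAP_ASIMD; // aarch64 only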
#endif // defined __ANDROID__ || defined __linux__
#if __APPLE__
static unsigned int get_hw_cpufamily()
{
unsigned int value = 0;
size_t len = sizeof(value);
sysctlbyname("hw.cpufamily", &value, &len, NULL, 0);
return value;
}
static cpu_type_t get_hw_cputype()
{
cpu_type_t value = 0;
size_t len = sizeof(value);
sysctlbyname("hw.cputype", &value, &len, NULL, 0);
return value;
}
static cpu_subtype_t get_hw_cpusubtype()
{
cpu_subtype_t value = 0;
size_t len = sizeof(value);
sysctlbyname("hw.cpusubtype", &value, &len, NULL, 0);
return value;
}
static int get_hw_capability(const char* cap)
{
int64_t value = 0;
size_t len = sizeof(value);
sysctlbyname(cap, &value, &len, NULL, 0);
return value;
}
#endif // __APPLE__
#if defined(__i386__) || defined(__x86_64__) || defined(_M_IX86) || defined(_M_X64)
static inline void x86_cpuid(int level, unsigned int out[4])
{
#if defined(_MSC_VER) && !defined(__clang__)
__cpuid((int*)out, level);
#elif defined(__clang__) || defined(__GNUC__)
__get_cpuid(level, out, out + 1, out + 2, out + 3);
#else
NCNN_LOGE("x86_cpuid is unknown for current compiler");
out[0] = 0;
out[1] = 0;
out[2] = 0;
out[3] = 0;
#endif
}
static inline void x86_cpuid_sublevel(int level, int sublevel, unsigned int out[4])
{
#if defined(_MSC_VER)
__cpuidex((int*)out, level, sublevel);
#elif defined(__clang__) || defined(__GNUC__)
__cpuid_count(level, sublevel, out[0], out[1], out[2], out[3]);
#else
NCNN_LOGE("x86_cpuid_sublevel is unknown for current compiler");
out[0] = 0;
out[1] = 0;
out[2] = 0;
out[3] = 0;
#endif
}
static inline int x86_get_xcr0()
{
#if defined(_MSC_FULL_VER) && (_MSC_FULL_VER >= 160040219)
return _xgetbv(0);
#elif defined(__i386__) || defined(__x86_64__)
int xcr0 = 0;
asm(".byte 0x0f, 0x01, 0xd0"
: "=a"(xcr0)
: "c"(0)
: "%edx");
return xcr0;
#else
NCNN_LOGE("x86_get_xcr0 is unknown for current compiler");
return 0xffffffff; // assume it will work
#endif
}
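// XCR0 bits tested below: bit 1 = SSE (XMM) state and bit 2 = AVX (YMM) state,
// so mask 0x6 means the os saves/restores xmm+ymm registers on context switch;
// bits 5-7 cover the opmask/ZMM state needed for avx512, hence the 0xe0 check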
static int get_cpu_support_x86_avx()
{
#if !NCNN_AVX
return 0;
#endif
unsigned int cpu_info[4] = {0};
x86_cpuid(0, cpu_info);
int nIds = cpu_info[0];
if (nIds < 1)
return 0;
x86_cpuid(1, cpu_info);
// check AVX XSAVE OSXSAVE
if (!(cpu_info[2] & (1u << 28)) || !(cpu_info[2] & (1u << 26)) || !(cpu_info[2] & (1u << 27)))
return 0;
// check XSAVE enabled by kernel
if ((x86_get_xcr0() & 6) != 6)
return 0;
return 1;
}
static int get_cpu_support_x86_fma()
{
#if !NCNN_FMA
return 0;
#endif
unsigned int cpu_info[4] = {0};
x86_cpuid(0, cpu_info);
int nIds = cpu_info[0];
if (nIds < 7)
return 0;
x86_cpuid(1, cpu_info);
// check AVX XSAVE OSXSAVE
if (!(cpu_info[2] & (1u << 28)) || !(cpu_info[2] & (1u << 26)) || !(cpu_info[2] & (1u << 27)))
return 0;
// check XSAVE enabled by kernel
if ((x86_get_xcr0() & 6) != 6)
return 0;
return cpu_info[2] & (1u << 12);
}
static int get_cpu_support_x86_xop()
{
#if !NCNN_XOP
return 0;
#endif
unsigned int cpu_info[4] = {0};
x86_cpuid(0x80000000, cpu_info);
if (cpu_info[0] < 0x80000001)
return 0;
x86_cpuid(0x80000001, cpu_info);
return cpu_info[2] & (1u << 11);
}
static int get_cpu_support_x86_f16c()
{
#if !NCNN_F16C
return 0;
#endif
unsigned int cpu_info[4] = {0};
x86_cpuid(0, cpu_info);
int nIds = cpu_info[0];
if (nIds < 1)
return 0;
x86_cpuid(1, cpu_info);
return cpu_info[2] & (1u << 29);
}
static int get_cpu_support_x86_avx2()
{
#if !NCNN_AVX2
return 0;
#endif
unsigned int cpu_info[4] = {0};
x86_cpuid(0, cpu_info);
int nIds = cpu_info[0];
if (nIds < 7)
return 0;
x86_cpuid(1, cpu_info);
// check AVX XSAVE OSXSAVE
if (!(cpu_info[2] & (1u << 28)) || !(cpu_info[2] & (1u << 26)) || !(cpu_info[2] & (1u << 27)))
return 0;
// check XSAVE enabled by kernel
if ((x86_get_xcr0() & 6) != 6)
return 0;
x86_cpuid_sublevel(7, 0, cpu_info);
return cpu_info[1] & (1u << 5);
}
static int get_cpu_support_x86_avx_vnni()
{
#if !NCNN_AVXVNNI
return 0;
#endif
unsigned int cpu_info[4] = {0};
x86_cpuid(0, cpu_info);
int nIds = cpu_info[0];
if (nIds < 7)
return 0;
x86_cpuid(1, cpu_info);
// check AVX XSAVE OSXSAVE
if (!(cpu_info[2] & (1u << 28)) || !(cpu_info[2] & (1u << 26)) || !(cpu_info[2] & (1u << 27)))
return 0;
// check XSAVE enabled by kernel
if ((x86_get_xcr0() & 6) != 6)
return 0;
x86_cpuid_sublevel(7, 1, cpu_info);
return cpu_info[0] & (1u << 4);
}
static int get_cpu_support_x86_avx512()
{
#if !NCNN_AVX512
return 0;
#endif
unsigned int cpu_info[4] = {0};
x86_cpuid(0, cpu_info);
int nIds = cpu_info[0];
if (nIds < 7)
return 0;
x86_cpuid(1, cpu_info);
// check AVX XSAVE OSXSAVE
if (!(cpu_info[2] & (1u << 28)) || !(cpu_info[2] & (1u << 26)) || !(cpu_info[2] & (1u << 27)))
return 0;
// check XSAVE enabled by kernel
if ((x86_get_xcr0() & 6) != 6)
return 0;
// check avx512 XSAVE enabled by kernel
if ((x86_get_xcr0() & 0xe0) != 0xe0)
return 0;
x86_cpuid_sublevel(7, 0, cpu_info);
return (cpu_info[1] & (1u << 16)) && (cpu_info[1] & (1u << 17)) && (cpu_info[1] & (1u << 28)) && (cpu_info[1] & (1u << 30)) && (cpu_info[1] & (1u << 31));
}
static int get_cpu_support_x86_avx512_vnni()
{
#if !NCNN_AVX512VNNI
return 0;
#endif
unsigned int cpu_info[4] = {0};
x86_cpuid(0, cpu_info);
int nIds = cpu_info[0];
if (nIds < 7)
return 0;
x86_cpuid(1, cpu_info);
// check AVX XSAVE OSXSAVE
if (!(cpu_info[2] & (1u << 28)) || !(cpu_info[2] & (1u << 26)) || !(cpu_info[2] & (1u << 27)))
return 0;
// check XSAVE enabled by kernel
if ((x86_get_xcr0() & 6) != 6)
return 0;
// check avx512 XSAVE enabled by kernel
if ((x86_get_xcr0() & 0xe0) != 0xe0)
return 0;
x86_cpuid_sublevel(7, 0, cpu_info);
return cpu_info[2] & (1u << 11);
}
static int get_cpu_support_x86_avx512_bf16()
{
#if !NCNN_AVX512BF16
return 0;
#endif
unsigned int cpu_info[4] = {0};
x86_cpuid(0, cpu_info);
int nIds = cpu_info[0];
if (nIds < 7)
return 0;
x86_cpuid(1, cpu_info);
// check AVX XSAVE OSXSAVE
if (!(cpu_info[2] & (1u << 28)) || !(cpu_info[2] & (1u << 26)) || !(cpu_info[2] & (1u << 27)))
return 0;
// check XSAVE enabled by kernel
if ((x86_get_xcr0() & 6) != 6)
return 0;
x86_cpuid_sublevel(7, 1, cpu_info);
return cpu_info[0] & (1u << 5);
}
static int get_cpu_support_x86_avx512_fp16()
{
#if !NCNN_AVX512FP16
return 0;
#endif
unsigned int cpu_info[4] = {0};
x86_cpuid(0, cpu_info);
int nIds = cpu_info[0];
if (nIds < 7)
return 0;
x86_cpuid(1, cpu_info);
// check AVX XSAVE OSXSAVE
if (!(cpu_info[2] & (1u << 28)) || !(cpu_info[2] & (1u << 26)) || !(cpu_info[2] & (1u << 27)))
return 0;
// check XSAVE enabled by kernel
if ((x86_get_xcr0() & 6) != 6)
return 0;
// check avx512 XSAVE enabled by kernel
if ((x86_get_xcr0() & 0xe0) != 0xe0)
return 0;
x86_cpuid_sublevel(7, 0, cpu_info);
return cpu_info[3] & (1u << 23);
}
#endif // defined(__i386__) || defined(__x86_64__) || defined(_M_IX86) || defined(_M_X64)
static int get_cpucount()
{
int count = 0;
#ifdef __EMSCRIPTEN__
if (emscripten_has_threading_support())
count = emscripten_num_logical_cores();
else
count = 1;
#elif (defined _WIN32 && !(defined __MINGW32__))
SYSTEM_INFO system_info;
GetSystemInfo(&system_info);
count = system_info.dwNumberOfProcessors;
#elif defined __ANDROID__ || defined __linux__
// get cpu count from /proc/cpuinfo
FILE* fp = fopen("/proc/cpuinfo", "rb");
if (!fp)
return 1;
char line[1024];
while (!feof(fp))
{
char* s = fgets(line, 1024, fp);
if (!s)
break;
if (memcmp(line, "processor", 9) == 0)
{
count++;
}
}
fclose(fp);
#elif __APPLE__
size_t len = sizeof(count);
sysctlbyname("hw.ncpu", &count, &len, NULL, 0);
#else
#ifdef _OPENMP
count = omp_get_max_threads();
#else
count = 1;
#endif // _OPENMP
#endif
if (count < 1)
count = 1;
return count;
}
#if defined __ANDROID__ || defined __linux__
static int get_thread_siblings(int cpuid)
{
char path[256];
sprintf(path, "/sys/devices/system/cpu/cpu%d/topology/thread_siblings", cpuid);
FILE* fp = fopen(path, "rb");
if (!fp)
return -1;
int thread_siblings = -1;
int nscan = fscanf(fp, "%x", &thread_siblings);
if (nscan != 1)
{
// ignore
}
fclose(fp);
return thread_siblings;
}
#endif // defined __ANDROID__ || defined __linux__
static int get_physical_cpucount()
{
int count = 0;
#if (defined _WIN32 && !(defined __MINGW32__))
typedef BOOL(WINAPI * LPFN_GLPI)(PSYSTEM_LOGICAL_PROCESSOR_INFORMATION, PDWORD);
LPFN_GLPI glpi = (LPFN_GLPI)GetProcAddress(GetModuleHandle(TEXT("kernel32")), "GetLogicalProcessorInformation");
if (glpi == NULL)
{
NCNN_LOGE("GetLogicalProcessorInformation is not supported");
return g_cpucount;
}
DWORD return_length = 0;
glpi(NULL, &return_length);
PSYSTEM_LOGICAL_PROCESSOR_INFORMATION buffer = (PSYSTEM_LOGICAL_PROCESSOR_INFORMATION)malloc(return_length);
glpi(buffer, &return_length);
PSYSTEM_LOGICAL_PROCESSOR_INFORMATION ptr = buffer;
DWORD byte_offset = 0;
while (byte_offset + sizeof(SYSTEM_LOGICAL_PROCESSOR_INFORMATION) <= return_length)
{
if (ptr->Relationship == RelationProcessorCore)
{
count++;
}
byte_offset += sizeof(SYSTEM_LOGICAL_PROCESSOR_INFORMATION);
ptr++;
}
free(buffer);
#elif defined __ANDROID__ || defined __linux__
std::vector<int> thread_set;
for (int i = 0; i < g_cpucount; i++)
{
int thread_siblings = get_thread_siblings(i);
if (thread_siblings == -1)
{
// ignore malformed one
continue;
}
bool thread_siblings_exists = false;
for (size_t j = 0; j < thread_set.size(); j++)
{
if (thread_set[j] == thread_siblings)
{
thread_siblings_exists = true;
break;
}
}
if (!thread_siblings_exists)
{
thread_set.push_back(thread_siblings);
count++;
}
}
#elif __APPLE__
size_t len = sizeof(count);
sysctlbyname("hw.physicalcpu_max", &count, &len, NULL, 0);
#else
count = g_cpucount;
#endif
    if (count > g_cpucount)
        count = g_cpucount;
    // fall back to the logical count if detection yielded nothing
    if (count < 1)
        count = g_cpucount;
    return count;
}
#if defined __ANDROID__ || defined __linux__
static int get_data_cache_size(int cpuid, int level)
{
char path[256];
// discover sysfs cache entry
int indexid = -1;
for (int i = 0;; i++)
{
// check level
{
sprintf(path, "/sys/devices/system/cpu/cpu%d/cache/index%d/level", cpuid, i);
FILE* fp = fopen(path, "rb");
if (!fp)
break;
int cache_level = -1;
int nscan = fscanf(fp, "%d", &cache_level);
fclose(fp);
if (nscan != 1 || cache_level != level)
continue;
}
// check type
{
sprintf(path, "/sys/devices/system/cpu/cpu%d/cache/index%d/type", cpuid, i);
FILE* fp = fopen(path, "rb");
if (!fp)
break;
char type[32];
int nscan = fscanf(fp, "%31s", type);
fclose(fp);
if (nscan != 1 || (strcmp(type, "Data") != 0 && strcmp(type, "Unified") != 0))
continue;
}
indexid = i;
break;
}
if (indexid == -1)
{
// no sysfs entry
return 0;
}
// get size
int cache_size_K = 0;
{
sprintf(path, "/sys/devices/system/cpu/cpu%d/cache/index%d/size", cpuid, indexid);
FILE* fp = fopen(path, "rb");
if (!fp)
return 0;
int nscan = fscanf(fp, "%dK", &cache_size_K);
fclose(fp);
if (nscan != 1)
{
NCNN_LOGE("fscanf cache_size_K error %d", nscan);
return 0;
}
}
// parse shared_cpu_map mask
ncnn::CpuSet shared_cpu_map;
{
sprintf(path, "/sys/devices/system/cpu/cpu%d/cache/index%d/shared_cpu_map", cpuid, indexid);
FILE* fp = fopen(path, "rb");
if (!fp)
return 0;
char shared_cpu_map_str[256];
int nscan = fscanf(fp, "%255s", shared_cpu_map_str);
fclose(fp);
if (nscan != 1)
{
NCNN_LOGE("fscanf shared_cpu_map error %d", nscan);
return 0;
}
        int len = strlen(shared_cpu_map_str);
        int start = 0;
        if (len >= 2 && shared_cpu_map_str[0] == '0' && shared_cpu_map_str[1] == 'x')
        {
            // skip leading 0x if present
            start = 2;
        }
        int ci = 0;
        for (int i = len - 1; i >= start; i--)
        {
            char c = shared_cpu_map_str[i];
            // skip ',' group separators used by masks longer than 32 bits
            if (c == ',')
                continue;
            // decode the ascii hex digit into its 4-bit value before testing bits
            int x = 0;
            if (c >= '0' && c <= '9') x = c - '0';
            if (c >= 'a' && c <= 'f') x = c - 'a' + 10;
            if (c >= 'A' && c <= 'F') x = c - 'A' + 10;
            if (x & 1) shared_cpu_map.enable(ci + 0);
            if (x & 2) shared_cpu_map.enable(ci + 1);
            if (x & 4) shared_cpu_map.enable(ci + 2);
            if (x & 8) shared_cpu_map.enable(ci + 3);
            ci += 4;
        }
}
if (shared_cpu_map.num_enabled() == 1)
return cache_size_K * 1024;
// resolve physical cpu count in the shared_cpu_map
int shared_physical_cpu_count = 0;
{
std::vector<int> thread_set;
for (int i = 0; i < g_cpucount; i++)
{
if (!shared_cpu_map.is_enabled(i))
continue;
int thread_siblings = get_thread_siblings(i);
if (thread_siblings == -1)
{
// ignore malformed one
continue;
}
bool thread_siblings_exists = false;
for (size_t j = 0; j < thread_set.size(); j++)
{
if (thread_set[j] == thread_siblings)
{
thread_siblings_exists = true;
break;
}
}
if (!thread_siblings_exists)
{
thread_set.push_back(thread_siblings);
shared_physical_cpu_count++;
}
}
    }
    // return per-physical-core cache size with 4K aligned
    // guard against division by zero when topology info was unavailable
    if (shared_physical_cpu_count == 0)
        shared_physical_cpu_count = shared_cpu_map.num_enabled();
    cache_size_K = (cache_size_K / shared_physical_cpu_count + 3) / 4 * 4;
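    // e.g. a 1024K unified l2 shared by 4 physical cores yields 256K here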
return cache_size_K * 1024;
}
static int get_big_cpu_data_cache_size(int level)
{
if (g_cpu_affinity_mask_big.num_enabled() == 0)
{
// smp cpu
return get_data_cache_size(0, level);
}
for (int i = 0; i < g_cpucount; i++)
{
if (g_cpu_affinity_mask_big.is_enabled(i))
{
return get_data_cache_size(i, level);
}
}
// should never reach here, fallback to cpu0
return get_data_cache_size(0, level);
}
#endif // defined __ANDROID__ || defined __linux__
static int get_cpu_level2_cachesize()
{
int size = 0;
#if (defined _WIN32 && !(defined __MINGW32__))
typedef BOOL(WINAPI * LPFN_GLPI)(PSYSTEM_LOGICAL_PROCESSOR_INFORMATION, PDWORD);
LPFN_GLPI glpi = (LPFN_GLPI)GetProcAddress(GetModuleHandle(TEXT("kernel32")), "GetLogicalProcessorInformation");
if (glpi != NULL)
{
DWORD return_length = 0;
glpi(NULL, &return_length);
PSYSTEM_LOGICAL_PROCESSOR_INFORMATION buffer = (PSYSTEM_LOGICAL_PROCESSOR_INFORMATION)malloc(return_length);
glpi(buffer, &return_length);
PSYSTEM_LOGICAL_PROCESSOR_INFORMATION ptr = buffer;
DWORD byte_offset = 0;
while (byte_offset + sizeof(SYSTEM_LOGICAL_PROCESSOR_INFORMATION) <= return_length)
{
if (ptr->Relationship == RelationCache)
{
PCACHE_DESCRIPTOR Cache = &ptr->Cache;
if (Cache->Level == 2)
{
size = std::max(size, (int)Cache->Size);
}
}
byte_offset += sizeof(SYSTEM_LOGICAL_PROCESSOR_INFORMATION);
ptr++;
}
free(buffer);
}
#elif defined __ANDROID__ || defined __linux__
size = get_big_cpu_data_cache_size(2);
#if defined(_SC_LEVEL2_CACHE_SIZE)
if (size <= 0)
size = sysconf(_SC_LEVEL2_CACHE_SIZE);
#endif
#elif __APPLE__
// perflevel 0 is the higher performance cluster
int cpusperl2 = get_hw_capability("hw.perflevel0.cpusperl2");
int l2cachesize = get_hw_capability("hw.perflevel0.l2cachesize");
size = cpusperl2 > 1 ? l2cachesize / cpusperl2 : l2cachesize;
#endif
// fallback to a common value
if (size <= 0)
{
#if defined(__i386__) || defined(__x86_64__) || defined(_M_IX86) || defined(_M_X64)
size = 64 * 1024;
if (g_cpu_support_x86_avx)
size = 128 * 1024;
if (g_cpu_support_x86_avx2)
size = 256 * 1024;
if (g_cpu_support_x86_avx512)
size = 1024 * 1024;
#elif __aarch64__
size = 256 * 1024;
#elif __arm__
size = 128 * 1024;
#else
// is 64k still too large here ?
size = 64 * 1024;
#endif
}
return size;
}
static int get_cpu_level3_cachesize()
{
int size = 0;
#if (defined _WIN32 && !(defined __MINGW32__))
typedef BOOL(WINAPI * LPFN_GLPI)(PSYSTEM_LOGICAL_PROCESSOR_INFORMATION, PDWORD);
LPFN_GLPI glpi = (LPFN_GLPI)GetProcAddress(GetModuleHandle(TEXT("kernel32")), "GetLogicalProcessorInformation");
if (glpi != NULL)
{
DWORD return_length = 0;
glpi(NULL, &return_length);
PSYSTEM_LOGICAL_PROCESSOR_INFORMATION buffer = (PSYSTEM_LOGICAL_PROCESSOR_INFORMATION)malloc(return_length);
glpi(buffer, &return_length);
PSYSTEM_LOGICAL_PROCESSOR_INFORMATION ptr = buffer;
DWORD byte_offset = 0;
while (byte_offset + sizeof(SYSTEM_LOGICAL_PROCESSOR_INFORMATION) <= return_length)
{
if (ptr->Relationship == RelationCache)
{
PCACHE_DESCRIPTOR Cache = &ptr->Cache;
if (Cache->Level == 3)
{
size = std::max(size, (int)Cache->Size);
}
}
byte_offset += sizeof(SYSTEM_LOGICAL_PROCESSOR_INFORMATION);
ptr++;
}
free(buffer);
}
#elif defined __ANDROID__ || defined __linux__
size = get_big_cpu_data_cache_size(3);
#if defined(_SC_LEVEL3_CACHE_SIZE)
if (size <= 0)
size = sysconf(_SC_LEVEL3_CACHE_SIZE);
#endif
#elif __APPLE__
// perflevel 0 is the higher performance cluster
// get the size shared among all cpus
size = get_hw_capability("hw.perflevel0.l3cachesize");
#endif
// l3 cache size can be zero
return size;
}
#if (defined _WIN32 && !(defined __MINGW32__))
static ncnn::CpuSet get_smt_cpu_mask()
{
ncnn::CpuSet smt_cpu_mask;
typedef BOOL(WINAPI * LPFN_GLPI)(PSYSTEM_LOGICAL_PROCESSOR_INFORMATION, PDWORD);
LPFN_GLPI glpi = (LPFN_GLPI)GetProcAddress(GetModuleHandle(TEXT("kernel32")), "GetLogicalProcessorInformation");
if (glpi == NULL)
{
NCNN_LOGE("GetLogicalProcessorInformation is not supported");
return smt_cpu_mask;
}
DWORD return_length = 0;
glpi(NULL, &return_length);
PSYSTEM_LOGICAL_PROCESSOR_INFORMATION buffer = (PSYSTEM_LOGICAL_PROCESSOR_INFORMATION)malloc(return_length);
glpi(buffer, &return_length);
PSYSTEM_LOGICAL_PROCESSOR_INFORMATION ptr = buffer;
DWORD byte_offset = 0;
while (byte_offset + sizeof(SYSTEM_LOGICAL_PROCESSOR_INFORMATION) <= return_length)
{
if (ptr->Relationship == RelationProcessorCore)
{
ncnn::CpuSet smt_set;
smt_set.mask = ptr->ProcessorMask;
if (smt_set.num_enabled() > 1)
{
// this core is smt
smt_cpu_mask.mask |= smt_set.mask;
}
}
byte_offset += sizeof(SYSTEM_LOGICAL_PROCESSOR_INFORMATION);
ptr++;
}
free(buffer);
return smt_cpu_mask;
}
static std::vector<int> get_max_freq_mhz()
{
typedef struct _PROCESSOR_POWER_INFORMATION
{
ULONG Number;
ULONG MaxMhz;
ULONG CurrentMhz;
ULONG MhzLimit;
ULONG MaxIdleState;
ULONG CurrentIdleState;
} PROCESSOR_POWER_INFORMATION, *PPROCESSOR_POWER_INFORMATION;
HMODULE powrprof = LoadLibrary(TEXT("powrprof.dll"));
typedef LONG(WINAPI * LPFN_CNPI)(POWER_INFORMATION_LEVEL, PVOID, ULONG, PVOID, ULONG);
LPFN_CNPI cnpi = (LPFN_CNPI)GetProcAddress(powrprof, "CallNtPowerInformation");
if (cnpi == NULL)
{
NCNN_LOGE("CallNtPowerInformation is not supported");
FreeLibrary(powrprof);
return std::vector<int>(g_cpucount, 0);
}
DWORD return_length = sizeof(PROCESSOR_POWER_INFORMATION) * g_cpucount;
PPROCESSOR_POWER_INFORMATION buffer = (PPROCESSOR_POWER_INFORMATION)malloc(return_length);
cnpi(ProcessorInformation, NULL, 0, buffer, return_length);
std::vector<int> ret;
for (int i = 0; i < g_cpucount; i++)
{
ULONG max_mhz = buffer[i].MaxMhz;
ret.push_back(max_mhz);
}
free(buffer);
FreeLibrary(powrprof);
return ret;
}
static int set_sched_affinity(const ncnn::CpuSet& thread_affinity_mask)
{
DWORD_PTR prev_mask = SetThreadAffinityMask(GetCurrentThread(), thread_affinity_mask.mask);
if (prev_mask == 0)
{
NCNN_LOGE("SetThreadAffinityMask failed %d", GetLastError());
return -1;
}
return 0;
}
#endif // (defined _WIN32 && !(defined __MINGW32__))
#if defined __ANDROID__ || defined __linux__
static int get_max_freq_khz(int cpuid)
{
// first try, for all possible cpu
char path[256];
sprintf(path, "/sys/devices/system/cpu/cpufreq/stats/cpu%d/time_in_state", cpuid);
FILE* fp = fopen(path, "rb");
if (!fp)
{
// second try, for online cpu
sprintf(path, "/sys/devices/system/cpu/cpu%d/cpufreq/stats/time_in_state", cpuid);
fp = fopen(path, "rb");
if (fp)
{
int max_freq_khz = 0;
while (!feof(fp))
{
int freq_khz = 0;
int nscan = fscanf(fp, "%d %*d", &freq_khz);
if (nscan != 1)
break;
if (freq_khz > max_freq_khz)
max_freq_khz = freq_khz;
}
fclose(fp);
if (max_freq_khz != 0)
return max_freq_khz;
fp = NULL;
}
if (!fp)
{
// third try, for online cpu
sprintf(path, "/sys/devices/system/cpu/cpu%d/cpufreq/cpuinfo_max_freq", cpuid);
fp = fopen(path, "rb");
if (!fp)
return -1;
int max_freq_khz = -1;
int nscan = fscanf(fp, "%d", &max_freq_khz);
if (nscan != 1)
{
NCNN_LOGE("fscanf cpuinfo_max_freq error %d", nscan);
}
fclose(fp);
return max_freq_khz;
}
}
int max_freq_khz = 0;
while (!feof(fp))
{
int freq_khz = 0;
int nscan = fscanf(fp, "%d %*d", &freq_khz);
if (nscan != 1)
break;
if (freq_khz > max_freq_khz)
max_freq_khz = freq_khz;
}
fclose(fp);
return max_freq_khz;
}
static bool is_smt_cpu(int cpuid)
{
// https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/stable/sysfs-devices-system-cpu#L68-72
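    // the file lists all hardware threads of this core, e.g. "0" for a
    // non-smt core and "0-1" or "0,4" for an smt one, hence the ','/'-' probe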
char path[256];
sprintf(path, "/sys/devices/system/cpu/cpu%d/topology/core_cpus_list", cpuid);
FILE* fp = fopen(path, "rb");
if (!fp)
{
sprintf(path, "/sys/devices/system/cpu/cpu%d/topology/thread_siblings_list", cpuid);
fp = fopen(path, "rb");
if (!fp)
return false;
}
bool is_smt = false;
while (!feof(fp))
{
char ch = fgetc(fp);
if (ch == ',' || ch == '-')
{
is_smt = true;
break;
}
}
fclose(fp);
return is_smt;
}
static int set_sched_affinity(const ncnn::CpuSet& thread_affinity_mask)
{
// set affinity for thread
#if defined(__BIONIC__)
pid_t pid = gettid();
#else
pid_t pid = syscall(SYS_gettid);
#endif
int syscallret = syscall(__NR_sched_setaffinity, pid, sizeof(cpu_set_t), &thread_affinity_mask.cpu_set);
if (syscallret)
{
NCNN_LOGE("syscall error %d", syscallret);
return -1;
}
return 0;
}
#endif // defined __ANDROID__ || defined __linux__
#if __APPLE__
static int set_sched_affinity(const ncnn::CpuSet& thread_affinity_mask)
{
// https://developer.apple.com/library/archive/releasenotes/Performance/RN-AffinityAPI/index.html
// http://www.hybridkernel.com/2015/01/18/binding_threads_to_cores_osx.html
// https://gist.github.com/Coneko/4234842
    // Note: these documents are quite outdated. Apple does not let developers pin threads to specific cores.
    // Affinity tags worked in OS X 10.5, later became only a hint to the scheduler, and since around 10.10 macOS ignores them entirely.
    // see https://github.com/Tencent/ncnn/pull/2335#discussion_r528233919 --- AmeAkio
int affinity_tag = THREAD_AFFINITY_TAG_NULL;
for (int i = 0; i < (int)sizeof(thread_affinity_mask.policy) * 8; i++)
{
if (thread_affinity_mask.is_enabled(i))
{
affinity_tag = i + 1;
break;
}
}
mach_port_t tid = pthread_mach_thread_np(pthread_self());
thread_affinity_policy_data_t policy_data;
policy_data.affinity_tag = affinity_tag;
int ret = thread_policy_set(tid, THREAD_AFFINITY_POLICY, (thread_policy_t)&policy_data, THREAD_AFFINITY_POLICY_COUNT);
if (ret && ret != KERN_NOT_SUPPORTED)
{
NCNN_LOGE("thread_policy_set error %d", ret);
return -1;
}
return 0;
}
#endif // __APPLE__
static void initialize_cpu_thread_affinity_mask(ncnn::CpuSet& mask_all, ncnn::CpuSet& mask_little, ncnn::CpuSet& mask_big)
{
mask_all.disable_all();
#if (defined _WIN32 && !(defined __MINGW32__))
// get max freq mhz for all cores
int max_freq_mhz_min = INT_MAX;
int max_freq_mhz_max = 0;
std::vector<int> cpu_max_freq_mhz = get_max_freq_mhz();
for (int i = 0; i < g_cpucount; i++)
{
int max_freq_mhz = cpu_max_freq_mhz[i];
// NCNN_LOGE("%d max freq = %d khz", i, max_freq_mhz);
if (max_freq_mhz > max_freq_mhz_max)
max_freq_mhz_max = max_freq_mhz;
if (max_freq_mhz < max_freq_mhz_min)
max_freq_mhz_min = max_freq_mhz;
}
int max_freq_mhz_medium = (max_freq_mhz_min + max_freq_mhz_max) / 2;
if (max_freq_mhz_medium == max_freq_mhz_max)
{
mask_little.disable_all();
mask_big = mask_all;
return;
}
ncnn::CpuSet smt_cpu_mask = get_smt_cpu_mask();
for (int i = 0; i < g_cpucount; i++)
{
if (smt_cpu_mask.is_enabled(i))
{
// always treat smt core as big core
mask_big.enable(i);
continue;
}
if (cpu_max_freq_mhz[i] < max_freq_mhz_medium)
mask_little.enable(i);
else
mask_big.enable(i);
}
#elif defined __ANDROID__ || defined __linux__
int max_freq_khz_min = INT_MAX;
int max_freq_khz_max = 0;
std::vector<int> cpu_max_freq_khz(g_cpucount);
for (int i = 0; i < g_cpucount; i++)
{
int max_freq_khz = get_max_freq_khz(i);
// NCNN_LOGE("%d max freq = %d khz", i, max_freq_khz);
cpu_max_freq_khz[i] = max_freq_khz;
if (max_freq_khz > max_freq_khz_max)
max_freq_khz_max = max_freq_khz;
if (max_freq_khz < max_freq_khz_min)
max_freq_khz_min = max_freq_khz;
}
int max_freq_khz_medium = (max_freq_khz_min + max_freq_khz_max) / 2;
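    // e.g. max freqs of 1804800 and 2841600 khz give a midpoint of 2323200 khz,
    // cores below the midpoint are classified as little and the rest as big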
if (max_freq_khz_medium == max_freq_khz_max)
{
mask_little.disable_all();
mask_big = mask_all;
return;
}
for (int i = 0; i < g_cpucount; i++)
{
if (is_smt_cpu(i))
{
// always treat smt core as big core
mask_big.enable(i);
continue;
}
if (cpu_max_freq_khz[i] < max_freq_khz_medium)
mask_little.enable(i);
else
mask_big.enable(i);
}
#elif __APPLE__
int nperflevels = get_hw_capability("hw.nperflevels");
if (nperflevels == 1)
{
// smp models
mask_little.disable_all();
mask_big = mask_all;
}
else
{
// two or more clusters, level0 is the high-performance cluster
int perflevel0_logicalcpu = get_hw_capability("hw.perflevel0.logicalcpu_max");
for (int i = 0; i < perflevel0_logicalcpu; i++)
{
mask_big.enable(i);
}
for (int i = perflevel0_logicalcpu; i < g_cpucount; i++)
{
mask_little.enable(i);
}
}
#else
// TODO implement me for other platforms
mask_little.disable_all();
mask_big = mask_all;
#endif
}
#if defined __ANDROID__ || defined __linux__
#if __aarch64__
union midr_info_t
{
struct __attribute__((packed))
{
unsigned int revision : 4;
unsigned int part : 12;
unsigned int architecture : 4;
unsigned int variant : 4;
unsigned int implementer : 8;
};
unsigned int midr;
midr_info_t(unsigned int _midr)
: midr(_midr)
{
}
};
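// e.g. midr 0x410fd034 decodes as implementer 0x41 (arm), variant 0x0,
// architecture 0xf, part 0xd03 (cortex-a53), revision 0x4, i.e. a cortex-a53 r0p4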
static unsigned int get_midr_from_sysfs(int cpuid)
{
char path[256];
sprintf(path, "/sys/devices/system/cpu/cpu%d/regs/identification/midr_el1", cpuid);
FILE* fp = fopen(path, "rb");
if (!fp)
return 0;
unsigned int midr_el1 = 0;
int nscan = fscanf(fp, "%x", &midr_el1);
if (nscan != 1)
{
// ignore
}
fclose(fp);
return midr_el1;
}
static int get_midr_from_proc_cpuinfo(std::vector<unsigned int>& midrs)
{
FILE* fp = fopen("/proc/cpuinfo", "rb");
if (!fp)
return -1;
midrs.resize(g_cpucount, 0);
int cpuid = -1;
midr_info_t midr_info(0);
char line[1024];
while (!feof(fp))
{
char* s = fgets(line, 1024, fp);
if (!s)
break;
if (memcmp(line, "processor", 9) == 0)
{
// processor : 4
int id = -1;
int nscan = sscanf(line, "%*[^:]: %d", &id);
if (nscan != 1)
continue;
if (cpuid >= 0 && cpuid < g_cpucount)
{
if (midr_info.midr == 0)
{
// shared midr
midrs[cpuid] = (unsigned int)-1;
}
else
{
// save midr and reset
midrs[cpuid] = midr_info.midr;
for (int i = 0; i < g_cpucount; i++)
{
if (midrs[i] == (unsigned int)-1)
midrs[i] = midr_info.midr;
}
}
midr_info.midr = 0;
}
cpuid = id;
}
if (cpuid == -1)
continue;
if (memcmp(line, "CPU implementer", 15) == 0)
{
// CPU implementer : 0x51
unsigned int id = 0;
int nscan = sscanf(line, "%*[^:]: %x", &id);
if (nscan != 1)
continue;
midr_info.implementer = id;
}
else if (memcmp(line, "CPU architecture", 16) == 0)
{
// CPU architecture: 8
int id = 0;
int nscan = sscanf(line, "%*[^:]: %d", &id);
if (nscan != 1)
continue;
midr_info.architecture = id;
}
else if (memcmp(line, "CPU variant", 11) == 0)
{
// CPU variant : 0xd
int id = 0;
int nscan = sscanf(line, "%*[^:]: %x", &id);
if (nscan != 1)
continue;
midr_info.variant = id;
}
else if (memcmp(line, "CPU part", 8) == 0)
{
// CPU part : 0x804
int id = 0;
int nscan = sscanf(line, "%*[^:]: %x", &id);
if (nscan != 1)
continue;
midr_info.part = id;
}
else if (memcmp(line, "CPU revision", 12) == 0)
{
// CPU revision : 14
int id = 0;
int nscan = sscanf(line, "%*[^:]: %d", &id);
if (nscan != 1)
continue;
midr_info.revision = id;
}
}
fclose(fp);
if (cpuid >= 0 && cpuid < g_cpucount)
{
if (midr_info.midr == 0)
{
// shared midr
midrs[cpuid] = (unsigned int)-1;
}
else
{
// save midr and reset
midrs[cpuid] = midr_info.midr;
for (int i = 0; i < g_cpucount; i++)
{
if (midrs[i] == (unsigned int)-1)
midrs[i] = midr_info.midr;
}
}
midr_info.midr = 0;
}
    // /proc/cpuinfo may only report little/online cores on old kernels
if (g_cpu_affinity_mask_big.num_enabled() == g_cpucount)
{
// assign the remaining unknown midrs for smp cpu
for (int i = 0; i < g_cpucount; i++)
{
if (midrs[i] == 0)
midrs[i] = midr_info.midr;
}
}
else
{
// clear the big core midrs for hmp cpu if they are the same as little cores
unsigned int little_midr = 0;
for (int i = 0; i < g_cpucount; i++)
{
if (g_cpu_affinity_mask_little.is_enabled(i))
{
little_midr = midrs[i];
break;
}
}
for (int i = 0; i < g_cpucount; i++)
{
if (g_cpu_affinity_mask_big.is_enabled(i))
{
if (midrs[i] == little_midr)
{
midrs[i] = 0;
}
}
}
}
return 0;
}
// return midr for the current running core
static unsigned int get_midr_from_register()
{
uint64_t midr;
asm volatile("mrs %0, MIDR_EL1"
: "=r"(midr));
return (unsigned int)midr;
}
static int get_sched_affinity(ncnn::CpuSet& thread_affinity_mask)
{
// get affinity for thread
#if defined(__BIONIC__)
pid_t pid = gettid();
#else
pid_t pid = syscall(SYS_gettid);
#endif
thread_affinity_mask.disable_all();
int syscallret = syscall(__NR_sched_getaffinity, pid, sizeof(cpu_set_t), &thread_affinity_mask.cpu_set);
if (syscallret)
{
// handle get error silently
return -1;
}
return 0;
}
static int midr_is_a53_a55(unsigned int midr)
{
// 0x 41 ? f d03 ? = arm cortex-a53
// 0x 51 ? f 801 ? = qcom kryo200 a53
// 0x 41 ? f d04 ? = arm cortex-a35
// 0x 41 ? f d05 ? = arm cortex-a55
// 0x 51 ? f 803 ? = qcom kryo300 a55
// 0x 51 ? f 805 ? = qcom kryo400 a55
midr_info_t midr_info(midr);
return (midr_info.implementer == 0x41 && midr_info.part == 0xd03)
|| (midr_info.implementer == 0x51 && midr_info.part == 0x801)
|| (midr_info.implementer == 0x41 && midr_info.part == 0xd04)
|| (midr_info.implementer == 0x41 && midr_info.part == 0xd05)
|| (midr_info.implementer == 0x51 && midr_info.part == 0x803)
|| (midr_info.implementer == 0x51 && midr_info.part == 0x805);
}
static int detect_cpu_is_arm_a53_a55()
{
int a53_a55_cpu_count = 0;
// first try, iterate /sys/devices/system/cpu/cpuX/regs/identification/midr_el1
bool sysfs_midr = true;
for (int i = 0; i < g_cpucount; i++)
{
unsigned int midr = 0;
// for kernel 4.7+
midr = get_midr_from_sysfs(i);
if (midr == 0)
{
sysfs_midr = false;
break;
}
if (midr_is_a53_a55(midr))
{
a53_a55_cpu_count++;
}
}
if (!sysfs_midr)
{
// second try, collect midr from /proc/cpuinfo
std::vector<unsigned int> midrs;
int ret = get_midr_from_proc_cpuinfo(midrs);
if (ret == 0 && (int)midrs.size() == g_cpucount)
{
for (int i = 0; i < g_cpucount; i++)
{
if (midr_is_a53_a55(midrs[i]))
{
a53_a55_cpu_count++;
}
}
}
else
{
// third try, assume all aarch64 little cores are a53/a55
a53_a55_cpu_count = g_cpu_affinity_mask_little.num_enabled();
}
}
if (a53_a55_cpu_count == 0)
return 0; // all non a53/a55
if (a53_a55_cpu_count == g_cpucount)
return 1; // all a53/a55
// little cores are a53/a55
return 2;
}
#endif // __aarch64__
#endif // defined __ANDROID__ || defined __linux__
// the initialization
static void initialize_global_cpu_info()
{
g_cpucount = get_cpucount();
g_physical_cpucount = get_physical_cpucount();
g_powersave = 0;
initialize_cpu_thread_affinity_mask(g_cpu_affinity_mask_all, g_cpu_affinity_mask_little, g_cpu_affinity_mask_big);
#if defined __ANDROID__ || defined __linux__
g_hwcaps = get_elf_hwcap(AT_HWCAP);
g_hwcaps2 = get_elf_hwcap(AT_HWCAP2);
#endif // defined __ANDROID__ || defined __linux__
#if __APPLE__
g_hw_cpufamily = get_hw_cpufamily();
g_hw_cputype = get_hw_cputype();
g_hw_cpusubtype = get_hw_cpusubtype();
g_hw_optional_arm_FEAT_FP16 = get_hw_capability("hw.optional.arm.FEAT_FP16");
g_hw_optional_arm_FEAT_DotProd = get_hw_capability("hw.optional.arm.FEAT_DotProd");
g_hw_optional_arm_FEAT_FHM = get_hw_capability("hw.optional.arm.FEAT_FHM");
g_hw_optional_arm_FEAT_BF16 = get_hw_capability("hw.optional.arm.FEAT_BF16");
g_hw_optional_arm_FEAT_I8MM = get_hw_capability("hw.optional.arm.FEAT_I8MM");
#endif // __APPLE__
#if defined(__i386__) || defined(__x86_64__) || defined(_M_IX86) || defined(_M_X64)
g_cpu_support_x86_avx = get_cpu_support_x86_avx();
g_cpu_support_x86_fma = get_cpu_support_x86_fma();
g_cpu_support_x86_xop = get_cpu_support_x86_xop();
g_cpu_support_x86_f16c = get_cpu_support_x86_f16c();
g_cpu_support_x86_avx2 = get_cpu_support_x86_avx2();
g_cpu_support_x86_avx_vnni = get_cpu_support_x86_avx_vnni();
g_cpu_support_x86_avx512 = get_cpu_support_x86_avx512();
g_cpu_support_x86_avx512_vnni = get_cpu_support_x86_avx512_vnni();
g_cpu_support_x86_avx512_bf16 = get_cpu_support_x86_avx512_bf16();
g_cpu_support_x86_avx512_fp16 = get_cpu_support_x86_avx512_fp16();
#endif // defined(__i386__) || defined(__x86_64__) || defined(_M_IX86) || defined(_M_X64)
g_cpu_level2_cachesize = get_cpu_level2_cachesize();
g_cpu_level3_cachesize = get_cpu_level3_cachesize();
#if defined __ANDROID__ || defined __linux__
#if __aarch64__
g_cpu_is_arm_a53_a55 = detect_cpu_is_arm_a53_a55();
#endif // __aarch64__
#endif // defined __ANDROID__ || defined __linux__
}
static int g_cpu_info_initialized = 0;
static inline void try_initialize_global_cpu_info()
{
if (!g_cpu_info_initialized)
{
initialize_global_cpu_info();
g_cpu_info_initialized = 1;
}
}
namespace ncnn {
#if (defined _WIN32 && !(defined __MINGW32__))
CpuSet::CpuSet()
{
disable_all();
}
void CpuSet::enable(int cpu)
{
mask |= (1 << cpu);
}
void CpuSet::disable(int cpu)
{
mask &= ~(1 << cpu);
}
void CpuSet::disable_all()
{
mask = 0;
}
bool CpuSet::is_enabled(int cpu) const
{
return mask & (1 << cpu);
}
int CpuSet::num_enabled() const
{
int num_enabled = 0;
for (int i = 0; i < (int)sizeof(mask) * 8; i++)
{
if (is_enabled(i))
num_enabled++;
}
return num_enabled;
}
#elif defined __ANDROID__ || defined __linux__
CpuSet::CpuSet()
{
disable_all();
}
void CpuSet::enable(int cpu)
{
CPU_SET(cpu, &cpu_set);
}
void CpuSet::disable(int cpu)
{
CPU_CLR(cpu, &cpu_set);
}
void CpuSet::disable_all()
{
CPU_ZERO(&cpu_set);
}
bool CpuSet::is_enabled(int cpu) const
{
return CPU_ISSET(cpu, &cpu_set);
}
int CpuSet::num_enabled() const
{
int num_enabled = 0;
for (int i = 0; i < (int)sizeof(cpu_set_t) * 8; i++)
{
if (is_enabled(i))
num_enabled++;
}
return num_enabled;
}
#elif __APPLE__
CpuSet::CpuSet()
{
disable_all();
}
void CpuSet::enable(int cpu)
{
policy |= (1 << cpu);
}
void CpuSet::disable(int cpu)
{
policy &= ~(1 << cpu);
}
void CpuSet::disable_all()
{
policy = 0;
}
bool CpuSet::is_enabled(int cpu) const
{
return policy & (1 << cpu);
}
int CpuSet::num_enabled() const
{
int num_enabled = 0;
for (int i = 0; i < (int)sizeof(policy) * 8; i++)
{
if (is_enabled(i))
num_enabled++;
}
return num_enabled;
}
#else
CpuSet::CpuSet()
{
}
void CpuSet::enable(int /* cpu */)
{
}
void CpuSet::disable(int /* cpu */)
{
}
void CpuSet::disable_all()
{
}
bool CpuSet::is_enabled(int /* cpu */) const
{
return true;
}
int CpuSet::num_enabled() const
{
return get_cpu_count();
}
#endif
int cpu_support_arm_edsp()
{
try_initialize_global_cpu_info();
#if defined __ANDROID__ || defined __linux__
#if __aarch64__
return 0;
#else
return g_hwcaps & HWCAP_EDSP;
#endif
#elif __APPLE__
#if __aarch64__
return 0;
#else
return g_hw_cputype == CPU_TYPE_ARM;
#endif
#else
return 0;
#endif
}
int cpu_support_arm_neon()
{
try_initialize_global_cpu_info();
#if defined __ANDROID__ || defined __linux__
#if __aarch64__
return g_hwcaps & HWCAP_ASIMD;
#else
return g_hwcaps & HWCAP_NEON;
#endif
#elif __APPLE__
#if __aarch64__
return g_hw_cputype == CPU_TYPE_ARM64;
#else
return g_hw_cputype == CPU_TYPE_ARM && g_hw_cpusubtype > CPU_SUBTYPE_ARM_V7;
#endif
#else
return 0;
#endif
}
int cpu_support_arm_vfpv4()
{
try_initialize_global_cpu_info();
#if defined __ANDROID__ || defined __linux__
#if __aarch64__
// neon always enable fma and fp16
return g_hwcaps & HWCAP_ASIMD;
#else
return g_hwcaps & HWCAP_VFPv4;
#endif
#elif __APPLE__
#if __aarch64__
return g_hw_cputype == CPU_TYPE_ARM64;
#else
return g_hw_cputype == CPU_TYPE_ARM && g_hw_cpusubtype > CPU_SUBTYPE_ARM_V7S;
#endif
#else
return 0;
#endif
}
int cpu_support_arm_asimdhp()
{
try_initialize_global_cpu_info();
#if defined __ANDROID__ || defined __linux__
#if __aarch64__
return g_hwcaps & HWCAP_ASIMDHP;
#else
return 0;
#endif
#elif __APPLE__
#if __aarch64__
return g_hw_optional_arm_FEAT_FP16
|| g_hw_cpufamily == CPUFAMILY_ARM_MONSOON_MISTRAL
|| g_hw_cpufamily == CPUFAMILY_ARM_VORTEX_TEMPEST
|| g_hw_cpufamily == CPUFAMILY_ARM_LIGHTNING_THUNDER
|| g_hw_cpufamily == CPUFAMILY_ARM_FIRESTORM_ICESTORM
|| g_hw_cpufamily == CPUFAMILY_ARM_AVALANCHE_BLIZZARD
|| g_hw_cpufamily == CPUFAMILY_ARM_EVEREST_SAWTOOTH;
#else
return 0;
#endif
#else
return 0;
#endif
}
int cpu_support_arm_cpuid()
{
try_initialize_global_cpu_info();
#if defined __ANDROID__ || defined __linux__
#if __aarch64__
return g_hwcaps & HWCAP_CPUID;
#else
return 0;
#endif
#elif __APPLE__
return 0;
#else
return 0;
#endif
}
int cpu_support_arm_asimddp()
{
try_initialize_global_cpu_info();
#if defined __ANDROID__ || defined __linux__
#if __aarch64__
return g_hwcaps & HWCAP_ASIMDDP;
#else
return 0;
#endif
#elif __APPLE__
#if __aarch64__
return g_hw_optional_arm_FEAT_DotProd
|| g_hw_cpufamily == CPUFAMILY_ARM_LIGHTNING_THUNDER
|| g_hw_cpufamily == CPUFAMILY_ARM_FIRESTORM_ICESTORM
|| g_hw_cpufamily == CPUFAMILY_ARM_AVALANCHE_BLIZZARD
|| g_hw_cpufamily == CPUFAMILY_ARM_EVEREST_SAWTOOTH;
#else
return 0;
#endif
#else
return 0;
#endif
}
int cpu_support_arm_asimdfhm()
{
try_initialize_global_cpu_info();
#if defined __ANDROID__ || defined __linux__
#if __aarch64__
return g_hwcaps & HWCAP_ASIMDFHM;
#else
return 0;
#endif
#elif __APPLE__
#if __aarch64__
return g_hw_optional_arm_FEAT_FHM
|| g_hw_cpufamily == CPUFAMILY_ARM_LIGHTNING_THUNDER
|| g_hw_cpufamily == CPUFAMILY_ARM_FIRESTORM_ICESTORM
|| g_hw_cpufamily == CPUFAMILY_ARM_AVALANCHE_BLIZZARD
|| g_hw_cpufamily == CPUFAMILY_ARM_EVEREST_SAWTOOTH;
#else
return 0;
#endif
#else
return 0;
#endif
}
int cpu_support_arm_bf16()
{
try_initialize_global_cpu_info();
#if defined __ANDROID__ || defined __linux__
#if __aarch64__
return g_hwcaps2 & HWCAP2_BF16;
#else
return 0;
#endif
#elif __APPLE__
#if __aarch64__
return g_hw_optional_arm_FEAT_BF16
|| g_hw_cpufamily == CPUFAMILY_ARM_AVALANCHE_BLIZZARD
|| g_hw_cpufamily == CPUFAMILY_ARM_EVEREST_SAWTOOTH;
#else
return 0;
#endif
#else
return 0;
#endif
}
int cpu_support_arm_i8mm()
{
try_initialize_global_cpu_info();
#if defined __ANDROID__ || defined __linux__
#if __aarch64__
return g_hwcaps2 & HWCAP2_I8MM;
#else
return 0;
#endif
#elif __APPLE__
#if __aarch64__
return g_hw_optional_arm_FEAT_I8MM
|| g_hw_cpufamily == CPUFAMILY_ARM_AVALANCHE_BLIZZARD
|| g_hw_cpufamily == CPUFAMILY_ARM_EVEREST_SAWTOOTH;
#else
return 0;
#endif
#else
return 0;
#endif
}
int cpu_support_arm_sve()
{
try_initialize_global_cpu_info();
#if defined __ANDROID__ || defined __linux__
#if __aarch64__
return g_hwcaps & HWCAP_SVE;
#else
return 0;
#endif
#elif __APPLE__
#if __aarch64__
return 0; // no known apple cpu support armv8.6 sve
#else
return 0;
#endif
#else
return 0;
#endif
}
int cpu_support_arm_sve2()
{
try_initialize_global_cpu_info();
#if defined __ANDROID__ || defined __linux__
#if __aarch64__
return g_hwcaps2 & HWCAP2_SVE2;
#else
return 0;
#endif
#elif __APPLE__
#if __aarch64__
return 0; // no known apple cpu support armv8.6 sve2
#else
return 0;
#endif
#else
return 0;
#endif
}
int cpu_support_arm_svebf16()
{
try_initialize_global_cpu_info();
#if defined __ANDROID__ || defined __linux__
#if __aarch64__
return g_hwcaps2 & HWCAP2_SVEBF16;
#else
return 0;
#endif
#elif __APPLE__
#if __aarch64__
return 0; // no known apple cpu support armv8.6 svebf16
#else
return 0;
#endif
#else
return 0;
#endif
}
int cpu_support_arm_svei8mm()
{
try_initialize_global_cpu_info();
#if defined __ANDROID__ || defined __linux__
#if __aarch64__
return g_hwcaps2 & HWCAP2_SVEI8MM;
#else
return 0;
#endif
#elif __APPLE__
#if __aarch64__
return 0; // no known apple cpu support armv8.6 svei8mm
#else
return 0;
#endif
#else
return 0;
#endif
}
int cpu_support_arm_svef32mm()
{
try_initialize_global_cpu_info();
#if defined __ANDROID__ || defined __linux__
#if __aarch64__
return g_hwcaps2 & HWCAP2_SVEF32MM;
#else
return 0;
#endif
#elif __APPLE__
#if __aarch64__
return 0; // no known apple cpu support armv8.6 svef32mm
#else
return 0;
#endif
#else
return 0;
#endif
}
int cpu_support_x86_avx()
{
try_initialize_global_cpu_info();
#if defined(__i386__) || defined(__x86_64__) || defined(_M_IX86) || defined(_M_X64)
return g_cpu_support_x86_avx;
#else
return 0;
#endif
}
int cpu_support_x86_fma()
{
try_initialize_global_cpu_info();
#if defined(__i386__) || defined(__x86_64__) || defined(_M_IX86) || defined(_M_X64)
return g_cpu_support_x86_fma;
#else
return 0;
#endif
}
int cpu_support_x86_xop()
{
try_initialize_global_cpu_info();
#if defined(__i386__) || defined(__x86_64__) || defined(_M_IX86) || defined(_M_X64)
return g_cpu_support_x86_xop;
#else
return 0;
#endif
}
int cpu_support_x86_f16c()
{
try_initialize_global_cpu_info();
#if defined(__i386__) || defined(__x86_64__) || defined(_M_IX86) || defined(_M_X64)
return g_cpu_support_x86_f16c;
#else
return 0;
#endif
}
int cpu_support_x86_avx2()
{
try_initialize_global_cpu_info();
#if defined(__i386__) || defined(__x86_64__) || defined(_M_IX86) || defined(_M_X64)
return g_cpu_support_x86_avx2;
#else
return 0;
#endif
}
int cpu_support_x86_avx_vnni()
{
try_initialize_global_cpu_info();
#if defined(__i386__) || defined(__x86_64__) || defined(_M_IX86) || defined(_M_X64)
return g_cpu_support_x86_avx_vnni;
#else
return 0;
#endif
}
int cpu_support_x86_avx512()
{
try_initialize_global_cpu_info();
#if defined(__i386__) || defined(__x86_64__) || defined(_M_IX86) || defined(_M_X64)
return g_cpu_support_x86_avx512;
#else
return 0;
#endif
}
int cpu_support_x86_avx512_vnni()
{
try_initialize_global_cpu_info();
#if defined(__i386__) || defined(__x86_64__) || defined(_M_IX86) || defined(_M_X64)
return g_cpu_support_x86_avx512_vnni;
#else
return 0;
#endif
}
int cpu_support_x86_avx512_bf16()
{
try_initialize_global_cpu_info();
#if defined(__i386__) || defined(__x86_64__) || defined(_M_IX86) || defined(_M_X64)
return g_cpu_support_x86_avx512_bf16;
#else
return 0;
#endif
}
int cpu_support_x86_avx512_fp16()
{
try_initialize_global_cpu_info();
#if defined(__i386__) || defined(__x86_64__) || defined(_M_IX86) || defined(_M_X64)
return g_cpu_support_x86_avx512_fp16;
#else
return 0;
#endif
}
int cpu_support_mips_msa()
{
try_initialize_global_cpu_info();
#if defined __ANDROID__ || defined __linux__
#if __mips__
return g_hwcaps & HWCAP_MIPS_MSA;
#else
return 0;
#endif
#else
return 0;
#endif
}
int cpu_support_loongarch_lsx()
{
try_initialize_global_cpu_info();
#if defined __ANDROID__ || defined __linux__
#if __loongarch64
return g_hwcaps & HWCAP_LOONGARCH_LSX;
#else
return 0;
#endif
#else
return 0;
#endif
}
int cpu_support_loongarch_lasx()
{
try_initialize_global_cpu_info();
#if defined __ANDROID__ || defined __linux__
#if __loongarch64
return g_hwcaps & HWCAP_LOONGARCH_LASX;
#else
return 0;
#endif
#else
return 0;
#endif
}
int cpu_support_loongson_mmi()
{
try_initialize_global_cpu_info();
#if defined __ANDROID__ || defined __linux__
#if __mips__
return g_hwcaps & HWCAP_LOONGSON_MMI;
#else
return 0;
#endif
#else
return 0;
#endif
}
int cpu_support_riscv_v()
{
try_initialize_global_cpu_info();
#if defined __ANDROID__ || defined __linux__
#if __riscv
return g_hwcaps & COMPAT_HWCAP_ISA_V;
#else
return 0;
#endif
#else
return 0;
#endif
}
int cpu_support_riscv_zfh()
{
try_initialize_global_cpu_info();
#if defined __ANDROID__ || defined __linux__
#if __riscv
    // v + f does not imply zfh, but how to discover zfh properly?
    // upstream issue https://github.com/riscv/riscv-isa-manual/issues/414
    return (g_hwcaps & COMPAT_HWCAP_ISA_V) && (g_hwcaps & COMPAT_HWCAP_ISA_F);
#else
return 0;
#endif
#else
return 0;
#endif
}
int cpu_riscv_vlenb()
{
try_initialize_global_cpu_info();
#if __riscv
if (!cpu_support_riscv_v())
return 0;
int a = 0;
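    // the vlenb csr (0xc22) holds VLEN/8, the vector register width in bytes;
    // it is read via a raw .word encoding so this assembles even when the
    // toolchain does not know the v extension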
asm volatile(
".word 0xc22026f3 \n" // csrr a3, vlenb
"mv %0, a3 \n"
: "=r"(a)
:
: "memory", "a3");
return a;
#else
return 0;
#endif
}
int get_cpu_count()
{
try_initialize_global_cpu_info();
return g_cpucount;
}
int get_little_cpu_count()
{
try_initialize_global_cpu_info();
return get_cpu_thread_affinity_mask(1).num_enabled();
}
int get_big_cpu_count()
{
try_initialize_global_cpu_info();
int big_cpu_count = get_cpu_thread_affinity_mask(2).num_enabled();
return big_cpu_count ? big_cpu_count : g_cpucount;
}
int get_physical_cpu_count()
{
try_initialize_global_cpu_info();
return g_physical_cpucount;
}
int get_physical_little_cpu_count()
{
try_initialize_global_cpu_info();
if (g_physical_cpucount == g_cpucount)
return get_little_cpu_count();
return g_physical_cpucount * 2 - g_cpucount;
}
int get_physical_big_cpu_count()
{
try_initialize_global_cpu_info();
if (g_physical_cpucount == g_cpucount)
return get_big_cpu_count();
return g_cpucount - g_physical_cpucount;
}
int get_cpu_level2_cache_size()
{
try_initialize_global_cpu_info();
return g_cpu_level2_cachesize;
}
int get_cpu_level3_cache_size()
{
try_initialize_global_cpu_info();
return g_cpu_level3_cachesize;
}
int get_cpu_powersave()
{
try_initialize_global_cpu_info();
return g_powersave;
}
int set_cpu_powersave(int powersave)
{
try_initialize_global_cpu_info();
if (powersave < 0 || powersave > 2)
{
NCNN_LOGE("powersave %d not supported", powersave);
return -1;
}
const CpuSet& thread_affinity_mask = get_cpu_thread_affinity_mask(powersave);
int ret = set_cpu_thread_affinity(thread_affinity_mask);
if (ret != 0)
return ret;
g_powersave = powersave;
return 0;
}
const CpuSet& get_cpu_thread_affinity_mask(int powersave)
{
try_initialize_global_cpu_info();
if (powersave == 0)
return g_cpu_affinity_mask_all;
if (powersave == 1)
return g_cpu_affinity_mask_little;
if (powersave == 2)
return g_cpu_affinity_mask_big;
NCNN_LOGE("powersave %d not supported", powersave);
// fallback to all cores anyway
return g_cpu_affinity_mask_all;
}
int set_cpu_thread_affinity(const CpuSet& thread_affinity_mask)
{
try_initialize_global_cpu_info();
#if defined __ANDROID__ || defined __linux__ || (defined _WIN32 && !(defined __MINGW32__))
#ifdef _OPENMP
int num_threads = thread_affinity_mask.num_enabled();
// set affinity for each thread
set_omp_num_threads(num_threads);
std::vector<int> ssarets(num_threads, 0);
#pragma omp parallel for num_threads(num_threads)
for (int i = 0; i < num_threads; i++)
{
ssarets[i] = set_sched_affinity(thread_affinity_mask);
}
for (int i = 0; i < num_threads; i++)
{
if (ssarets[i] != 0)
return -1;
}
#else
int ssaret = set_sched_affinity(thread_affinity_mask);
if (ssaret != 0)
return -1;
#endif
return 0;
#elif __APPLE__
#ifdef _OPENMP
int num_threads = thread_affinity_mask.num_enabled();
// set affinity for each thread
set_omp_num_threads(num_threads);
std::vector<int> ssarets(num_threads, 0);
#pragma omp parallel for num_threads(num_threads)
for (int i = 0; i < num_threads; i++)
{
// assign one core for each thread
int core = -1 - i;
for (int j = 0; j < (int)sizeof(thread_affinity_mask.policy) * 8; j++)
{
if (thread_affinity_mask.is_enabled(j))
{
if (core == -1)
{
core = j;
break;
}
else
{
core++;
}
}
}
CpuSet this_thread_affinity_mask;
if (core != -1 - i)
{
this_thread_affinity_mask.enable(core);
}
ssarets[i] = set_sched_affinity(this_thread_affinity_mask);
}
for (int i = 0; i < num_threads; i++)
{
if (ssarets[i] != 0)
return -1;
}
#else
int ssaret = set_sched_affinity(thread_affinity_mask);
if (ssaret != 0)
return -1;
#endif
return 0;
#else
// TODO
(void)thread_affinity_mask;
return -1;
#endif
}
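// usage sketch, equivalent to what set_cpu_powersave(2) does above:
//   ncnn::set_cpu_thread_affinity(ncnn::get_cpu_thread_affinity_mask(2));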
int is_current_thread_running_on_a53_a55()
{
try_initialize_global_cpu_info();
#if defined __ANDROID__ || defined __linux__
#if __aarch64__
if (g_cpu_is_arm_a53_a55 == 0)
return 0; // all non a53/a55
if (g_cpu_is_arm_a53_a55 == 1)
return 1; // all a53/a55
if (g_powersave == 2)
return 0; // big clusters
if (g_powersave == 1)
return 1; // little clusters
// little cores are a53/a55
// use cpuid for retrieving midr since kernel 4.7+
if (cpu_support_arm_cpuid())
{
unsigned int midr = get_midr_from_register();
if (midr)
return midr_is_a53_a55(midr);
}
// check if affinity cpuid is in the little ones
CpuSet thread_cs;
int ret = get_sched_affinity(thread_cs);
if (ret != 0)
{
// no affinity capability
return 0;
}
const CpuSet& little_cs = get_cpu_thread_affinity_mask(1);
for (int i = 0; i < g_cpucount; i++)
{
if (!thread_cs.is_enabled(i))
continue;
if (!little_cs.is_enabled(i))
return 0;
}
// all affinity cpuids are little core
return 1;
#else
return 0;
#endif // __aarch64__
#else
return 0;
#endif // defined __ANDROID__ || defined __linux__
}
int get_omp_num_threads()
{
#ifdef _OPENMP
return omp_get_num_threads();
#else
return 1;
#endif
}
void set_omp_num_threads(int num_threads)
{
#ifdef _OPENMP
omp_set_num_threads(num_threads);
#else
(void)num_threads;
#endif
}
int get_omp_dynamic()
{
#ifdef _OPENMP
return omp_get_dynamic();
#else
return 0;
#endif
}
void set_omp_dynamic(int dynamic)
{
#ifdef _OPENMP
omp_set_dynamic(dynamic);
#else
(void)dynamic;
#endif
}
int get_omp_thread_num()
{
#ifdef _OPENMP
return omp_get_thread_num();
#else
return 0;
#endif
}
int get_kmp_blocktime()
{
#if defined(_OPENMP) && __clang__
return kmp_get_blocktime();
#else
return 0;
#endif
}
void set_kmp_blocktime(int time_ms)
{
#if defined(_OPENMP) && __clang__
kmp_set_blocktime(time_ms);
#else
(void)time_ms;
#endif
}
static ncnn::ThreadLocalStorage tls_flush_denormals;
int get_flush_denormals()
{
#if defined(__SSE3__)
return (int)reinterpret_cast<size_t>(tls_flush_denormals.get());
#else
return 0;
#endif
}
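// flush_denormals modes, applied to the calling thread's MXCSR on x86:
// 0 = off, 1 = denormals-are-zero (DAZ), 2 = flush-to-zero (FTZ), 3 = both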
int set_flush_denormals(int flush_denormals)
{
if (flush_denormals < 0 || flush_denormals > 3)
{
NCNN_LOGE("denormals_zero %d not supported", flush_denormals);
return -1;
}
#if defined(__SSE3__)
if (flush_denormals == 0)
{
_MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_OFF);
_MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_OFF);
}
else if (flush_denormals == 1)
{
_MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON);
_MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_OFF);
}
else if (flush_denormals == 2)
{
_MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_OFF);
_MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON);
}
else if (flush_denormals == 3)
{
_MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON);
_MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON);
}
tls_flush_denormals.set(reinterpret_cast<void*>((size_t)flush_denormals));
return 0;
#else
return 0;
#endif
}
} // namespace ncnn
#if defined __ANDROID__ && defined(_OPENMP) && __clang__
#ifdef __cplusplus
extern "C" {
#endif
void __wrap___kmp_affinity_determine_capable(const char* /*env_var*/)
{
    // the internal affinity routines in llvm openmp call abort() when __NR_sched_getaffinity / __NR_sched_setaffinity fail
    // ref KMPNativeAffinity::get_system_affinity/set_system_affinity in openmp/runtime/src/kmp_affinity.h
    // cpu cores can go offline in powersave mode on android, which triggers that abort
    // ATM there is no known api for controlling the abort behavior
    // override __kmp_affinity_determine_capable with an empty body to disable affinity regardless of the KMP_AFFINITY env_var
    // ugly hack works >.< --- nihui
}
#ifdef __cplusplus
} // extern "C"
#endif
#endif