// Tencent is pleased to support the open source community by making ncnn available.
//
// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
//
// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
// in compliance with the License. You may obtain a copy of the License at
//
// https://opensource.org/licenses/BSD-3-Clause
//
// Unless required by applicable law or agreed to in writing, software distributed
// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
// specific language governing permissions and limitations under the License.

#include "cpu.h"

#include "platform.h"

#include <limits.h>
#include <stdio.h>
#include <string.h>
#include <vector>

#ifdef _OPENMP
#if NCNN_SIMPLEOMP
#include "simpleomp.h"
#else
#include <omp.h>
#endif
#endif

#if defined(__i386__) || defined(__x86_64__) || defined(_M_IX86) || defined(_M_X64)
#ifdef _MSC_VER
#include <intrin.h>    // __cpuid()
#include <immintrin.h> // _xgetbv()
#endif
#if defined(__clang__) || defined(__GNUC__)
#include <cpuid.h> // __get_cpuid() and __cpuid_count()
#endif
#endif

#ifdef __EMSCRIPTEN__
#include <emscripten/threading.h>
#endif

#if defined _WIN32 && !(defined __MINGW32__)
#define WIN32_LEAN_AND_MEAN
#include <windows.h>
#include <powerbase.h>
#endif

#if defined __ANDROID__ || defined __linux__
#if defined __ANDROID__
#if __ANDROID_API__ >= 18
#include <sys/auxv.h> // getauxval()
#endif
#include <sys/system_properties.h> // __system_property_get()
#include <dlfcn.h>
#endif
#include <stdint.h>
#include <sys/syscall.h>
#include <unistd.h>
#endif

#if __APPLE__
#include <mach/mach.h>
#include <mach/machine.h>
#include <mach/thread_act.h>
#include <sys/sysctl.h>
#include <sys/types.h>
#include "TargetConditionals.h"
#if TARGET_OS_IPHONE
#define __IOS__ 1
#endif
// define missing cpu model for old sdk
#ifndef CPUFAMILY_ARM_HURRICANE
#define CPUFAMILY_ARM_HURRICANE 0x67ceee93
#endif
// A11
#ifndef CPUFAMILY_ARM_MONSOON_MISTRAL
#define CPUFAMILY_ARM_MONSOON_MISTRAL 0xe81e7ef6
#endif
// A12
#ifndef CPUFAMILY_ARM_VORTEX_TEMPEST
#define CPUFAMILY_ARM_VORTEX_TEMPEST 0x07d34b9f
#endif
// A13
#ifndef CPUFAMILY_ARM_LIGHTNING_THUNDER
#define CPUFAMILY_ARM_LIGHTNING_THUNDER 0x462504d2
#endif
// A14
#ifndef CPUFAMILY_ARM_FIRESTORM_ICESTORM
#define CPUFAMILY_ARM_FIRESTORM_ICESTORM 0x1b588bb3
#endif
// A15
#ifndef CPUFAMILY_ARM_AVALANCHE_BLIZZARD
#define CPUFAMILY_ARM_AVALANCHE_BLIZZARD 0xda33d83d
#endif
// A16
#ifndef CPUFAMILY_ARM_EVEREST_SAWTOOTH
#define CPUFAMILY_ARM_EVEREST_SAWTOOTH 0x8765edea
#endif
// M1
#ifndef CPUFAMILY_AARCH64_FIRESTORM_ICESTORM
#define CPUFAMILY_AARCH64_FIRESTORM_ICESTORM 0x1b588bb3
#endif
// M2
#ifndef CPUFAMILY_AARCH64_AVALANCHE_BLIZZARD
#define CPUFAMILY_AARCH64_AVALANCHE_BLIZZARD 0xda33d83d
#endif
#endif // __APPLE__

#if defined(__SSE3__)
#include <immintrin.h>
#endif

// topology info
static int g_cpucount;
static int g_physical_cpucount;
static int g_powersave;
static ncnn::CpuSet g_cpu_affinity_mask_all;
static ncnn::CpuSet g_cpu_affinity_mask_little;
static ncnn::CpuSet g_cpu_affinity_mask_big;

// isa info
#if defined __ANDROID__ || defined __linux__
static unsigned int g_hwcaps;
static unsigned int g_hwcaps2;
#endif // defined __ANDROID__ || defined __linux__

#if __APPLE__
static unsigned int g_hw_cpufamily;
static cpu_type_t g_hw_cputype;
static cpu_subtype_t g_hw_cpusubtype;
static int g_hw_optional_arm_FEAT_FP16;
static int g_hw_optional_arm_FEAT_DotProd;
static int g_hw_optional_arm_FEAT_FHM;
static int g_hw_optional_arm_FEAT_BF16;
static int g_hw_optional_arm_FEAT_I8MM;
#endif // __APPLE__

#if defined(__i386__) || defined(__x86_64__) || defined(_M_IX86) || defined(_M_X64)
static int g_cpu_support_x86_avx;
static int g_cpu_support_x86_fma;
static int g_cpu_support_x86_xop;
static int g_cpu_support_x86_f16c;
static int g_cpu_support_x86_avx2;
static int g_cpu_support_x86_avx_vnni;
static int g_cpu_support_x86_avx512;
static int g_cpu_support_x86_avx512_vnni;
static int g_cpu_support_x86_avx512_bf16;
static int g_cpu_support_x86_avx512_fp16;
#endif // defined(__i386__) || defined(__x86_64__) || defined(_M_IX86) || defined(_M_X64)

static int g_cpu_level2_cachesize;
static int g_cpu_level3_cachesize;

// misc info
#if defined __ANDROID__ || defined __linux__
#if __aarch64__
static int g_cpu_is_arm_a53_a55;
#endif // __aarch64__
#endif // defined __ANDROID__ || defined __linux__

#if defined __ANDROID__ || defined __linux__

#define AT_HWCAP 16
#define AT_HWCAP2 26

#if __aarch64__
// from arch/arm64/include/uapi/asm/hwcap.h
#define HWCAP_ASIMD (1 << 1)
#define HWCAP_ASIMDHP (1 << 10)
#define HWCAP_CPUID (1 << 11)
#define HWCAP_ASIMDDP (1 << 20)
#define HWCAP_SVE (1 << 22)
#define HWCAP_ASIMDFHM (1 << 23)
#define HWCAP2_SVE2 (1 << 1)
#define HWCAP2_SVEI8MM (1 << 9)
#define HWCAP2_SVEF32MM (1 << 10)
#define HWCAP2_SVEBF16 (1 << 12)
#define HWCAP2_I8MM (1 << 13)
#define HWCAP2_BF16 (1 << 14)
#else
// from arch/arm/include/uapi/asm/hwcap.h
#define HWCAP_EDSP (1 << 7)
#define HWCAP_NEON (1 << 12)
#define HWCAP_VFPv4 (1 << 16)
#endif

#if __mips__
// from arch/mips/include/uapi/asm/hwcap.h
#define HWCAP_MIPS_MSA (1 << 1)
#define HWCAP_LOONGSON_MMI (1 << 11)
#endif

#if __loongarch64
// from arch/loongarch/include/uapi/asm/hwcap.h
#define HWCAP_LOONGARCH_LSX (1 << 4)
#define HWCAP_LOONGARCH_LASX (1 << 5)
#endif

#if __riscv
// from arch/riscv/include/uapi/asm/hwcap.h
#define COMPAT_HWCAP_ISA_F (1 << ('F' - 'A'))
#define COMPAT_HWCAP_ISA_V (1 << ('V' - 'A'))
#endif

#if defined __ANDROID__
// Probe the system's C library for a 'getauxval' function and call it if
// it exists, or return 0 for failure. This function is available since API
// level 18.
//
// Note that getauxval() can't really be re-implemented here, because
// its implementation does not parse /proc/self/auxv. Instead it depends
// on values that are passed by the kernel at process-init time to the
// C runtime initialization layer.
static unsigned int get_elf_hwcap_from_getauxval(unsigned int type)
{
#if __ANDROID_API__ >= 18
    unsigned int hwcap = getauxval(type);
    if (hwcap)
        return hwcap;
#endif

    typedef unsigned long getauxval_func_t(unsigned long);

    dlerror();
    void* libc_handle = dlopen("libc.so", RTLD_NOW);
    if (!libc_handle)
    {
        NCNN_LOGE("dlopen libc.so failed %s", dlerror());
        return 0;
    }

    unsigned int result = 0;
    getauxval_func_t* func = (getauxval_func_t*)dlsym(libc_handle, "getauxval");
    if (!func)
    {
        NCNN_LOGE("dlsym getauxval failed");
    }
    else
    {
        // Note: getauxval() returns 0 on failure. Doesn't touch errno.
result = (unsigned int)(*func)(type); } dlclose(libc_handle); return result; } #endif // defined __ANDROID__ // extract the ELF HW capabilities bitmap from /proc/self/auxv static unsigned int get_elf_hwcap_from_proc_self_auxv(unsigned int type) { FILE* fp = fopen("/proc/self/auxv", "rb"); if (!fp) { NCNN_LOGE("fopen /proc/self/auxv failed"); return 0; } #if __aarch64__ || __mips64 || __riscv_xlen == 64 || __loongarch64 struct { uint64_t tag; uint64_t value; } entry; #else struct { unsigned int tag; unsigned int value; } entry; #endif unsigned int result = 0; while (!feof(fp)) { int nread = fread((char*)&entry, sizeof(entry), 1, fp); if (nread != 1) break; if (entry.tag == 0 && entry.value == 0) break; if (entry.tag == type) { result = entry.value; break; } } fclose(fp); return result; } static unsigned int get_elf_hwcap(unsigned int type) { unsigned int hwcap = 0; #if defined __ANDROID__ hwcap = get_elf_hwcap_from_getauxval(type); #endif if (!hwcap) hwcap = get_elf_hwcap_from_proc_self_auxv(type); #if defined __ANDROID__ #if __aarch64__ if (type == AT_HWCAP) { // samsung exynos9810 on android pre-9 incorrectly reports armv8.2 // for little cores, but big cores only support armv8.0 // drop all armv8.2 features used by ncnn for preventing SIGILLs // ref https://reviews.llvm.org/D114523 char arch[PROP_VALUE_MAX]; int len = __system_property_get("ro.arch", arch); if (len > 0 && strncmp(arch, "exynos9810", 10) == 0) { hwcap &= ~HWCAP_ASIMDHP; hwcap &= ~HWCAP_ASIMDDP; } } #endif // __aarch64__ #endif // defined __ANDROID__ return hwcap; } #endif // defined __ANDROID__ || defined __linux__ #if __APPLE__ static unsigned int get_hw_cpufamily() { unsigned int value = 0; size_t len = sizeof(value); sysctlbyname("hw.cpufamily", &value, &len, NULL, 0); return value; } static cpu_type_t get_hw_cputype() { cpu_type_t value = 0; size_t len = sizeof(value); sysctlbyname("hw.cputype", &value, &len, NULL, 0); return value; } static cpu_subtype_t get_hw_cpusubtype() { cpu_subtype_t value = 0; size_t len = sizeof(value); sysctlbyname("hw.cpusubtype", &value, &len, NULL, 0); return value; } static int get_hw_capability(const char* cap) { int64_t value = 0; size_t len = sizeof(value); sysctlbyname(cap, &value, &len, NULL, 0); return value; } #endif // __APPLE__ #if defined(__i386__) || defined(__x86_64__) || defined(_M_IX86) || defined(_M_X64) static inline void x86_cpuid(int level, unsigned int out[4]) { #if defined(_MSC_VER) && !defined(__clang__) __cpuid((int*)out, level); #elif defined(__clang__) || defined(__GNUC__) __get_cpuid(level, out, out + 1, out + 2, out + 3); #else NCNN_LOGE("x86_cpuid is unknown for current compiler"); out[0] = 0; out[1] = 0; out[2] = 0; out[3] = 0; #endif } static inline void x86_cpuid_sublevel(int level, int sublevel, unsigned int out[4]) { #if defined(_MSC_VER) __cpuidex((int*)out, level, sublevel); #elif defined(__clang__) || defined(__GNUC__) __cpuid_count(level, sublevel, out[0], out[1], out[2], out[3]); #else NCNN_LOGE("x86_cpuid_sublevel is unknown for current compiler"); out[0] = 0; out[1] = 0; out[2] = 0; out[3] = 0; #endif } static inline int x86_get_xcr0() { #if defined(_MSC_FULL_VER) && (_MSC_FULL_VER >= 160040219) return _xgetbv(0); #elif defined(__i386__) || defined(__x86_64__) int xcr0 = 0; asm(".byte 0x0f, 0x01, 0xd0" : "=a"(xcr0) : "c"(0) : "%edx"); return xcr0; #else NCNN_LOGE("x86_get_xcr0 is unknown for current compiler"); return 0xffffffff; // assume it will work #endif } static int get_cpu_support_x86_avx() { #if !NCNN_AVX return 0; #endif unsigned int 
cpu_info[4] = {0}; x86_cpuid(0, cpu_info); int nIds = cpu_info[0]; if (nIds < 1) return 0; x86_cpuid(1, cpu_info); // check AVX XSAVE OSXSAVE if (!(cpu_info[2] & (1u << 28)) || !(cpu_info[2] & (1u << 26)) || !(cpu_info[2] & (1u << 27))) return 0; // check XSAVE enabled by kernel if ((x86_get_xcr0() & 6) != 6) return 0; return 1; } static int get_cpu_support_x86_fma() { #if !NCNN_FMA return 0; #endif unsigned int cpu_info[4] = {0}; x86_cpuid(0, cpu_info); int nIds = cpu_info[0]; if (nIds < 7) return 0; x86_cpuid(1, cpu_info); // check AVX XSAVE OSXSAVE if (!(cpu_info[2] & (1u << 28)) || !(cpu_info[2] & (1u << 26)) || !(cpu_info[2] & (1u << 27))) return 0; // check XSAVE enabled by kernel if ((x86_get_xcr0() & 6) != 6) return 0; return cpu_info[2] & (1u << 12); } static int get_cpu_support_x86_xop() { #if !NCNN_XOP return 0; #endif unsigned int cpu_info[4] = {0}; x86_cpuid(0x80000000, cpu_info); if (cpu_info[0] < 0x80000001) return 0; x86_cpuid(0x80000001, cpu_info); return cpu_info[2] & (1u << 11); } static int get_cpu_support_x86_f16c() { #if !NCNN_F16C return 0; #endif unsigned int cpu_info[4] = {0}; x86_cpuid(0, cpu_info); int nIds = cpu_info[0]; if (nIds < 1) return 0; x86_cpuid(1, cpu_info); return cpu_info[2] & (1u << 29); } static int get_cpu_support_x86_avx2() { #if !NCNN_AVX2 return 0; #endif unsigned int cpu_info[4] = {0}; x86_cpuid(0, cpu_info); int nIds = cpu_info[0]; if (nIds < 7) return 0; x86_cpuid(1, cpu_info); // check AVX XSAVE OSXSAVE if (!(cpu_info[2] & (1u << 28)) || !(cpu_info[2] & (1u << 26)) || !(cpu_info[2] & (1u << 27))) return 0; // check XSAVE enabled by kernel if ((x86_get_xcr0() & 6) != 6) return 0; x86_cpuid_sublevel(7, 0, cpu_info); return cpu_info[1] & (1u << 5); } static int get_cpu_support_x86_avx_vnni() { #if !NCNN_AVXVNNI return 0; #endif unsigned int cpu_info[4] = {0}; x86_cpuid(0, cpu_info); int nIds = cpu_info[0]; if (nIds < 7) return 0; x86_cpuid(1, cpu_info); // check AVX XSAVE OSXSAVE if (!(cpu_info[2] & (1u << 28)) || !(cpu_info[2] & (1u << 26)) || !(cpu_info[2] & (1u << 27))) return 0; // check XSAVE enabled by kernel if ((x86_get_xcr0() & 6) != 6) return 0; x86_cpuid_sublevel(7, 1, cpu_info); return cpu_info[0] & (1u << 4); } static int get_cpu_support_x86_avx512() { #if !NCNN_AVX512 return 0; #endif unsigned int cpu_info[4] = {0}; x86_cpuid(0, cpu_info); int nIds = cpu_info[0]; if (nIds < 7) return 0; x86_cpuid(1, cpu_info); // check AVX XSAVE OSXSAVE if (!(cpu_info[2] & (1u << 28)) || !(cpu_info[2] & (1u << 26)) || !(cpu_info[2] & (1u << 27))) return 0; // check XSAVE enabled by kernel if ((x86_get_xcr0() & 6) != 6) return 0; // check avx512 XSAVE enabled by kernel if ((x86_get_xcr0() & 0xe0) != 0xe0) return 0; x86_cpuid_sublevel(7, 0, cpu_info); return (cpu_info[1] & (1u << 16)) && (cpu_info[1] & (1u << 17)) && (cpu_info[1] & (1u << 28)) && (cpu_info[1] & (1u << 30)) && (cpu_info[1] & (1u << 31)); } static int get_cpu_support_x86_avx512_vnni() { #if !NCNN_AVX512VNNI return 0; #endif unsigned int cpu_info[4] = {0}; x86_cpuid(0, cpu_info); int nIds = cpu_info[0]; if (nIds < 7) return 0; x86_cpuid(1, cpu_info); // check AVX XSAVE OSXSAVE if (!(cpu_info[2] & (1u << 28)) || !(cpu_info[2] & (1u << 26)) || !(cpu_info[2] & (1u << 27))) return 0; // check XSAVE enabled by kernel if ((x86_get_xcr0() & 6) != 6) return 0; // check avx512 XSAVE enabled by kernel if ((x86_get_xcr0() & 0xe0) != 0xe0) return 0; x86_cpuid_sublevel(7, 0, cpu_info); return cpu_info[2] & (1u << 11); } static int get_cpu_support_x86_avx512_bf16() { #if !NCNN_AVX512BF16 return 0; 
#endif unsigned int cpu_info[4] = {0}; x86_cpuid(0, cpu_info); int nIds = cpu_info[0]; if (nIds < 7) return 0; x86_cpuid(1, cpu_info); // check AVX XSAVE OSXSAVE if (!(cpu_info[2] & (1u << 28)) || !(cpu_info[2] & (1u << 26)) || !(cpu_info[2] & (1u << 27))) return 0; // check XSAVE enabled by kernel if ((x86_get_xcr0() & 6) != 6) return 0; x86_cpuid_sublevel(7, 1, cpu_info); return cpu_info[0] & (1u << 5); } static int get_cpu_support_x86_avx512_fp16() { #if !NCNN_AVX512FP16 return 0; #endif unsigned int cpu_info[4] = {0}; x86_cpuid(0, cpu_info); int nIds = cpu_info[0]; if (nIds < 7) return 0; x86_cpuid(1, cpu_info); // check AVX XSAVE OSXSAVE if (!(cpu_info[2] & (1u << 28)) || !(cpu_info[2] & (1u << 26)) || !(cpu_info[2] & (1u << 27))) return 0; // check XSAVE enabled by kernel if ((x86_get_xcr0() & 6) != 6) return 0; // check avx512 XSAVE enabled by kernel if ((x86_get_xcr0() & 0xe0) != 0xe0) return 0; x86_cpuid_sublevel(7, 0, cpu_info); return cpu_info[3] & (1u << 23); } #endif // defined(__i386__) || defined(__x86_64__) || defined(_M_IX86) || defined(_M_X64) static int get_cpucount() { int count = 0; #ifdef __EMSCRIPTEN__ if (emscripten_has_threading_support()) count = emscripten_num_logical_cores(); else count = 1; #elif (defined _WIN32 && !(defined __MINGW32__)) SYSTEM_INFO system_info; GetSystemInfo(&system_info); count = system_info.dwNumberOfProcessors; #elif defined __ANDROID__ || defined __linux__ // get cpu count from /proc/cpuinfo FILE* fp = fopen("/proc/cpuinfo", "rb"); if (!fp) return 1; char line[1024]; while (!feof(fp)) { char* s = fgets(line, 1024, fp); if (!s) break; if (memcmp(line, "processor", 9) == 0) { count++; } } fclose(fp); #elif __APPLE__ size_t len = sizeof(count); sysctlbyname("hw.ncpu", &count, &len, NULL, 0); #else #ifdef _OPENMP count = omp_get_max_threads(); #else count = 1; #endif // _OPENMP #endif if (count < 1) count = 1; return count; } #if defined __ANDROID__ || defined __linux__ static int get_thread_siblings(int cpuid) { char path[256]; sprintf(path, "/sys/devices/system/cpu/cpu%d/topology/thread_siblings", cpuid); FILE* fp = fopen(path, "rb"); if (!fp) return -1; int thread_siblings = -1; int nscan = fscanf(fp, "%x", &thread_siblings); if (nscan != 1) { // ignore } fclose(fp); return thread_siblings; } #endif // defined __ANDROID__ || defined __linux__ static int get_physical_cpucount() { int count = 0; #if (defined _WIN32 && !(defined __MINGW32__)) typedef BOOL(WINAPI * LPFN_GLPI)(PSYSTEM_LOGICAL_PROCESSOR_INFORMATION, PDWORD); LPFN_GLPI glpi = (LPFN_GLPI)GetProcAddress(GetModuleHandle(TEXT("kernel32")), "GetLogicalProcessorInformation"); if (glpi == NULL) { NCNN_LOGE("GetLogicalProcessorInformation is not supported"); return g_cpucount; } DWORD return_length = 0; glpi(NULL, &return_length); PSYSTEM_LOGICAL_PROCESSOR_INFORMATION buffer = (PSYSTEM_LOGICAL_PROCESSOR_INFORMATION)malloc(return_length); glpi(buffer, &return_length); PSYSTEM_LOGICAL_PROCESSOR_INFORMATION ptr = buffer; DWORD byte_offset = 0; while (byte_offset + sizeof(SYSTEM_LOGICAL_PROCESSOR_INFORMATION) <= return_length) { if (ptr->Relationship == RelationProcessorCore) { count++; } byte_offset += sizeof(SYSTEM_LOGICAL_PROCESSOR_INFORMATION); ptr++; } free(buffer); #elif defined __ANDROID__ || defined __linux__ std::vector thread_set; for (int i = 0; i < g_cpucount; i++) { int thread_siblings = get_thread_siblings(i); if (thread_siblings == -1) { // ignore malformed one continue; } bool thread_siblings_exists = false; for (size_t j = 0; j < thread_set.size(); j++) { if (thread_set[j] 
== thread_siblings) { thread_siblings_exists = true; break; } } if (!thread_siblings_exists) { thread_set.push_back(thread_siblings); count++; } } #elif __APPLE__ size_t len = sizeof(count); sysctlbyname("hw.physicalcpu_max", &count, &len, NULL, 0); #else count = g_cpucount; #endif if (count > g_cpucount) count = g_cpucount; return count; } #if defined __ANDROID__ || defined __linux__ static int get_data_cache_size(int cpuid, int level) { char path[256]; // discover sysfs cache entry int indexid = -1; for (int i = 0;; i++) { // check level { sprintf(path, "/sys/devices/system/cpu/cpu%d/cache/index%d/level", cpuid, i); FILE* fp = fopen(path, "rb"); if (!fp) break; int cache_level = -1; int nscan = fscanf(fp, "%d", &cache_level); fclose(fp); if (nscan != 1 || cache_level != level) continue; } // check type { sprintf(path, "/sys/devices/system/cpu/cpu%d/cache/index%d/type", cpuid, i); FILE* fp = fopen(path, "rb"); if (!fp) break; char type[32]; int nscan = fscanf(fp, "%31s", type); fclose(fp); if (nscan != 1 || (strcmp(type, "Data") != 0 && strcmp(type, "Unified") != 0)) continue; } indexid = i; break; } if (indexid == -1) { // no sysfs entry return 0; } // get size int cache_size_K = 0; { sprintf(path, "/sys/devices/system/cpu/cpu%d/cache/index%d/size", cpuid, indexid); FILE* fp = fopen(path, "rb"); if (!fp) return 0; int nscan = fscanf(fp, "%dK", &cache_size_K); fclose(fp); if (nscan != 1) { NCNN_LOGE("fscanf cache_size_K error %d", nscan); return 0; } } // parse shared_cpu_map mask ncnn::CpuSet shared_cpu_map; { sprintf(path, "/sys/devices/system/cpu/cpu%d/cache/index%d/shared_cpu_map", cpuid, indexid); FILE* fp = fopen(path, "rb"); if (!fp) return 0; char shared_cpu_map_str[256]; int nscan = fscanf(fp, "%255s", shared_cpu_map_str); fclose(fp); if (nscan != 1) { NCNN_LOGE("fscanf shared_cpu_map error %d", nscan); return 0; } int len = strlen(shared_cpu_map_str); if (shared_cpu_map_str[0] == '0' && shared_cpu_map_str[1] == 'x') { // skip leading 0x len -= 2; } int ci = 0; for (int i = len - 1; i >= 0; i--) { char x = shared_cpu_map_str[i]; if (x & 1) shared_cpu_map.enable(ci + 0); if (x & 2) shared_cpu_map.enable(ci + 1); if (x & 4) shared_cpu_map.enable(ci + 2); if (x & 8) shared_cpu_map.enable(ci + 3); ci += 4; } } if (shared_cpu_map.num_enabled() == 1) return cache_size_K * 1024; // resolve physical cpu count in the shared_cpu_map int shared_physical_cpu_count = 0; { std::vector thread_set; for (int i = 0; i < g_cpucount; i++) { if (!shared_cpu_map.is_enabled(i)) continue; int thread_siblings = get_thread_siblings(i); if (thread_siblings == -1) { // ignore malformed one continue; } bool thread_siblings_exists = false; for (size_t j = 0; j < thread_set.size(); j++) { if (thread_set[j] == thread_siblings) { thread_siblings_exists = true; break; } } if (!thread_siblings_exists) { thread_set.push_back(thread_siblings); shared_physical_cpu_count++; } } } // return per-physical-core cache size with 4K aligned cache_size_K = (cache_size_K / shared_physical_cpu_count + 3) / 4 * 4; return cache_size_K * 1024; } static int get_big_cpu_data_cache_size(int level) { if (g_cpu_affinity_mask_big.num_enabled() == 0) { // smp cpu return get_data_cache_size(0, level); } for (int i = 0; i < g_cpucount; i++) { if (g_cpu_affinity_mask_big.is_enabled(i)) { return get_data_cache_size(i, level); } } // should never reach here, fallback to cpu0 return get_data_cache_size(0, level); } #endif // defined __ANDROID__ || defined __linux__ static int get_cpu_level2_cachesize() { int size = 0; #if (defined _WIN32 && 
!(defined __MINGW32__)) typedef BOOL(WINAPI * LPFN_GLPI)(PSYSTEM_LOGICAL_PROCESSOR_INFORMATION, PDWORD); LPFN_GLPI glpi = (LPFN_GLPI)GetProcAddress(GetModuleHandle(TEXT("kernel32")), "GetLogicalProcessorInformation"); if (glpi != NULL) { DWORD return_length = 0; glpi(NULL, &return_length); PSYSTEM_LOGICAL_PROCESSOR_INFORMATION buffer = (PSYSTEM_LOGICAL_PROCESSOR_INFORMATION)malloc(return_length); glpi(buffer, &return_length); PSYSTEM_LOGICAL_PROCESSOR_INFORMATION ptr = buffer; DWORD byte_offset = 0; while (byte_offset + sizeof(SYSTEM_LOGICAL_PROCESSOR_INFORMATION) <= return_length) { if (ptr->Relationship == RelationCache) { PCACHE_DESCRIPTOR Cache = &ptr->Cache; if (Cache->Level == 2) { size = std::max(size, (int)Cache->Size); } } byte_offset += sizeof(SYSTEM_LOGICAL_PROCESSOR_INFORMATION); ptr++; } free(buffer); } #elif defined __ANDROID__ || defined __linux__ size = get_big_cpu_data_cache_size(2); #if defined(_SC_LEVEL2_CACHE_SIZE) if (size <= 0) size = sysconf(_SC_LEVEL2_CACHE_SIZE); #endif #elif __APPLE__ // perflevel 0 is the higher performance cluster int cpusperl2 = get_hw_capability("hw.perflevel0.cpusperl2"); int l2cachesize = get_hw_capability("hw.perflevel0.l2cachesize"); size = cpusperl2 > 1 ? l2cachesize / cpusperl2 : l2cachesize; #endif // fallback to a common value if (size <= 0) { #if defined(__i386__) || defined(__x86_64__) || defined(_M_IX86) || defined(_M_X64) size = 64 * 1024; if (g_cpu_support_x86_avx) size = 128 * 1024; if (g_cpu_support_x86_avx2) size = 256 * 1024; if (g_cpu_support_x86_avx512) size = 1024 * 1024; #elif __aarch64__ size = 256 * 1024; #elif __arm__ size = 128 * 1024; #else // is 64k still too large here ? size = 64 * 1024; #endif } return size; } static int get_cpu_level3_cachesize() { int size = 0; #if (defined _WIN32 && !(defined __MINGW32__)) typedef BOOL(WINAPI * LPFN_GLPI)(PSYSTEM_LOGICAL_PROCESSOR_INFORMATION, PDWORD); LPFN_GLPI glpi = (LPFN_GLPI)GetProcAddress(GetModuleHandle(TEXT("kernel32")), "GetLogicalProcessorInformation"); if (glpi != NULL) { DWORD return_length = 0; glpi(NULL, &return_length); PSYSTEM_LOGICAL_PROCESSOR_INFORMATION buffer = (PSYSTEM_LOGICAL_PROCESSOR_INFORMATION)malloc(return_length); glpi(buffer, &return_length); PSYSTEM_LOGICAL_PROCESSOR_INFORMATION ptr = buffer; DWORD byte_offset = 0; while (byte_offset + sizeof(SYSTEM_LOGICAL_PROCESSOR_INFORMATION) <= return_length) { if (ptr->Relationship == RelationCache) { PCACHE_DESCRIPTOR Cache = &ptr->Cache; if (Cache->Level == 3) { size = std::max(size, (int)Cache->Size); } } byte_offset += sizeof(SYSTEM_LOGICAL_PROCESSOR_INFORMATION); ptr++; } free(buffer); } #elif defined __ANDROID__ || defined __linux__ size = get_big_cpu_data_cache_size(3); #if defined(_SC_LEVEL3_CACHE_SIZE) if (size <= 0) size = sysconf(_SC_LEVEL3_CACHE_SIZE); #endif #elif __APPLE__ // perflevel 0 is the higher performance cluster // get the size shared among all cpus size = get_hw_capability("hw.perflevel0.l3cachesize"); #endif // l3 cache size can be zero return size; } #if (defined _WIN32 && !(defined __MINGW32__)) static ncnn::CpuSet get_smt_cpu_mask() { ncnn::CpuSet smt_cpu_mask; typedef BOOL(WINAPI * LPFN_GLPI)(PSYSTEM_LOGICAL_PROCESSOR_INFORMATION, PDWORD); LPFN_GLPI glpi = (LPFN_GLPI)GetProcAddress(GetModuleHandle(TEXT("kernel32")), "GetLogicalProcessorInformation"); if (glpi == NULL) { NCNN_LOGE("GetLogicalProcessorInformation is not supported"); return smt_cpu_mask; } DWORD return_length = 0; glpi(NULL, &return_length); PSYSTEM_LOGICAL_PROCESSOR_INFORMATION buffer = 
        (PSYSTEM_LOGICAL_PROCESSOR_INFORMATION)malloc(return_length);
    glpi(buffer, &return_length);

    PSYSTEM_LOGICAL_PROCESSOR_INFORMATION ptr = buffer;
    DWORD byte_offset = 0;
    while (byte_offset + sizeof(SYSTEM_LOGICAL_PROCESSOR_INFORMATION) <= return_length)
    {
        if (ptr->Relationship == RelationProcessorCore)
        {
            ncnn::CpuSet smt_set;
            smt_set.mask = ptr->ProcessorMask;
            if (smt_set.num_enabled() > 1)
            {
                // this core is smt
                smt_cpu_mask.mask |= smt_set.mask;
            }
        }

        byte_offset += sizeof(SYSTEM_LOGICAL_PROCESSOR_INFORMATION);
        ptr++;
    }

    free(buffer);

    return smt_cpu_mask;
}

static std::vector<int> get_max_freq_mhz()
{
    typedef struct _PROCESSOR_POWER_INFORMATION
    {
        ULONG Number;
        ULONG MaxMhz;
        ULONG CurrentMhz;
        ULONG MhzLimit;
        ULONG MaxIdleState;
        ULONG CurrentIdleState;
    } PROCESSOR_POWER_INFORMATION, *PPROCESSOR_POWER_INFORMATION;

    HMODULE powrprof = LoadLibrary(TEXT("powrprof.dll"));

    typedef LONG(WINAPI * LPFN_CNPI)(POWER_INFORMATION_LEVEL, PVOID, ULONG, PVOID, ULONG);
    LPFN_CNPI cnpi = (LPFN_CNPI)GetProcAddress(powrprof, "CallNtPowerInformation");
    if (cnpi == NULL)
    {
        NCNN_LOGE("CallNtPowerInformation is not supported");
        FreeLibrary(powrprof);
        return std::vector<int>(g_cpucount, 0);
    }

    DWORD return_length = sizeof(PROCESSOR_POWER_INFORMATION) * g_cpucount;
    PPROCESSOR_POWER_INFORMATION buffer = (PPROCESSOR_POWER_INFORMATION)malloc(return_length);

    cnpi(ProcessorInformation, NULL, 0, buffer, return_length);

    std::vector<int> ret;
    for (int i = 0; i < g_cpucount; i++)
    {
        ULONG max_mhz = buffer[i].MaxMhz;
        ret.push_back(max_mhz);
    }

    free(buffer);

    FreeLibrary(powrprof);
    return ret;
}

static int set_sched_affinity(const ncnn::CpuSet& thread_affinity_mask)
{
    DWORD_PTR prev_mask = SetThreadAffinityMask(GetCurrentThread(), thread_affinity_mask.mask);
    if (prev_mask == 0)
    {
        NCNN_LOGE("SetThreadAffinityMask failed %d", GetLastError());
        return -1;
    }

    return 0;
}
#endif // (defined _WIN32 && !(defined __MINGW32__))

#if defined __ANDROID__ || defined __linux__
static int get_max_freq_khz(int cpuid)
{
    // first try, for all possible cpu
    char path[256];
    sprintf(path, "/sys/devices/system/cpu/cpufreq/stats/cpu%d/time_in_state", cpuid);

    FILE* fp = fopen(path, "rb");

    if (!fp)
    {
        // second try, for online cpu
        sprintf(path, "/sys/devices/system/cpu/cpu%d/cpufreq/stats/time_in_state", cpuid);
        fp = fopen(path, "rb");

        if (fp)
        {
            int max_freq_khz = 0;
            while (!feof(fp))
            {
                int freq_khz = 0;
                int nscan = fscanf(fp, "%d %*d", &freq_khz);
                if (nscan != 1)
                    break;

                if (freq_khz > max_freq_khz)
                    max_freq_khz = freq_khz;
            }

            fclose(fp);

            if (max_freq_khz != 0)
                return max_freq_khz;

            fp = NULL;
        }

        if (!fp)
        {
            // third try, for online cpu
            sprintf(path, "/sys/devices/system/cpu/cpu%d/cpufreq/cpuinfo_max_freq", cpuid);
            fp = fopen(path, "rb");

            if (!fp)
                return -1;

            int max_freq_khz = -1;
            int nscan = fscanf(fp, "%d", &max_freq_khz);
            if (nscan != 1)
            {
                NCNN_LOGE("fscanf cpuinfo_max_freq error %d", nscan);
            }
            fclose(fp);

            return max_freq_khz;
        }
    }

    int max_freq_khz = 0;
    while (!feof(fp))
    {
        int freq_khz = 0;
        int nscan = fscanf(fp, "%d %*d", &freq_khz);
        if (nscan != 1)
            break;

        if (freq_khz > max_freq_khz)
            max_freq_khz = freq_khz;
    }

    fclose(fp);

    return max_freq_khz;
}

static bool is_smt_cpu(int cpuid)
{
    // https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/stable/sysfs-devices-system-cpu#L68-72
    char path[256];
    sprintf(path, "/sys/devices/system/cpu/cpu%d/topology/core_cpus_list", cpuid);

    FILE* fp = fopen(path, "rb");

    if (!fp)
    {
        sprintf(path, "/sys/devices/system/cpu/cpu%d/topology/thread_siblings_list", cpuid);
        fp = fopen(path, "rb");

        if (!fp)
            return false;
    }

    bool is_smt = false;
    while
(!feof(fp)) { char ch = fgetc(fp); if (ch == ',' || ch == '-') { is_smt = true; break; } } fclose(fp); return is_smt; } static int set_sched_affinity(const ncnn::CpuSet& thread_affinity_mask) { // set affinity for thread #if defined(__BIONIC__) pid_t pid = gettid(); #else pid_t pid = syscall(SYS_gettid); #endif int syscallret = syscall(__NR_sched_setaffinity, pid, sizeof(cpu_set_t), &thread_affinity_mask.cpu_set); if (syscallret) { NCNN_LOGE("syscall error %d", syscallret); return -1; } return 0; } #endif // defined __ANDROID__ || defined __linux__ #if __APPLE__ static int set_sched_affinity(const ncnn::CpuSet& thread_affinity_mask) { // https://developer.apple.com/library/archive/releasenotes/Performance/RN-AffinityAPI/index.html // http://www.hybridkernel.com/2015/01/18/binding_threads_to_cores_osx.html // https://gist.github.com/Coneko/4234842 // This is a quite outdated document. Apple will not allow developers to set CPU affinity. // In OS X 10.5 it worked, later it became a suggestion to OS X, then in 10.10 or so (as well in later ones), macOS will ignore any affinity settings. // see https://github.com/Tencent/ncnn/pull/2335#discussion_r528233919 --- AmeAkio int affinity_tag = THREAD_AFFINITY_TAG_NULL; for (int i = 0; i < (int)sizeof(thread_affinity_mask.policy) * 8; i++) { if (thread_affinity_mask.is_enabled(i)) { affinity_tag = i + 1; break; } } mach_port_t tid = pthread_mach_thread_np(pthread_self()); thread_affinity_policy_data_t policy_data; policy_data.affinity_tag = affinity_tag; int ret = thread_policy_set(tid, THREAD_AFFINITY_POLICY, (thread_policy_t)&policy_data, THREAD_AFFINITY_POLICY_COUNT); if (ret && ret != KERN_NOT_SUPPORTED) { NCNN_LOGE("thread_policy_set error %d", ret); return -1; } return 0; } #endif // __APPLE__ static void initialize_cpu_thread_affinity_mask(ncnn::CpuSet& mask_all, ncnn::CpuSet& mask_little, ncnn::CpuSet& mask_big) { mask_all.disable_all(); #if (defined _WIN32 && !(defined __MINGW32__)) // get max freq mhz for all cores int max_freq_mhz_min = INT_MAX; int max_freq_mhz_max = 0; std::vector cpu_max_freq_mhz = get_max_freq_mhz(); for (int i = 0; i < g_cpucount; i++) { int max_freq_mhz = cpu_max_freq_mhz[i]; // NCNN_LOGE("%d max freq = %d khz", i, max_freq_mhz); if (max_freq_mhz > max_freq_mhz_max) max_freq_mhz_max = max_freq_mhz; if (max_freq_mhz < max_freq_mhz_min) max_freq_mhz_min = max_freq_mhz; } int max_freq_mhz_medium = (max_freq_mhz_min + max_freq_mhz_max) / 2; if (max_freq_mhz_medium == max_freq_mhz_max) { mask_little.disable_all(); mask_big = mask_all; return; } ncnn::CpuSet smt_cpu_mask = get_smt_cpu_mask(); for (int i = 0; i < g_cpucount; i++) { if (smt_cpu_mask.is_enabled(i)) { // always treat smt core as big core mask_big.enable(i); continue; } if (cpu_max_freq_mhz[i] < max_freq_mhz_medium) mask_little.enable(i); else mask_big.enable(i); } #elif defined __ANDROID__ || defined __linux__ int max_freq_khz_min = INT_MAX; int max_freq_khz_max = 0; std::vector cpu_max_freq_khz(g_cpucount); for (int i = 0; i < g_cpucount; i++) { int max_freq_khz = get_max_freq_khz(i); // NCNN_LOGE("%d max freq = %d khz", i, max_freq_khz); cpu_max_freq_khz[i] = max_freq_khz; if (max_freq_khz > max_freq_khz_max) max_freq_khz_max = max_freq_khz; if (max_freq_khz < max_freq_khz_min) max_freq_khz_min = max_freq_khz; } int max_freq_khz_medium = (max_freq_khz_min + max_freq_khz_max) / 2; if (max_freq_khz_medium == max_freq_khz_max) { mask_little.disable_all(); mask_big = mask_all; return; } for (int i = 0; i < g_cpucount; i++) { if (is_smt_cpu(i)) { // always treat 
smt core as big core mask_big.enable(i); continue; } if (cpu_max_freq_khz[i] < max_freq_khz_medium) mask_little.enable(i); else mask_big.enable(i); } #elif __APPLE__ int nperflevels = get_hw_capability("hw.nperflevels"); if (nperflevels == 1) { // smp models mask_little.disable_all(); mask_big = mask_all; } else { // two or more clusters, level0 is the high-performance cluster int perflevel0_logicalcpu = get_hw_capability("hw.perflevel0.logicalcpu_max"); for (int i = 0; i < perflevel0_logicalcpu; i++) { mask_big.enable(i); } for (int i = perflevel0_logicalcpu; i < g_cpucount; i++) { mask_little.enable(i); } } #else // TODO implement me for other platforms mask_little.disable_all(); mask_big = mask_all; #endif } #if defined __ANDROID__ || defined __linux__ #if __aarch64__ union midr_info_t { struct __attribute__((packed)) { unsigned int revision : 4; unsigned int part : 12; unsigned int architecture : 4; unsigned int variant : 4; unsigned int implementer : 8; }; unsigned int midr; midr_info_t(unsigned int _midr) : midr(_midr) { } }; static unsigned int get_midr_from_sysfs(int cpuid) { char path[256]; sprintf(path, "/sys/devices/system/cpu/cpu%d/regs/identification/midr_el1", cpuid); FILE* fp = fopen(path, "rb"); if (!fp) return 0; unsigned int midr_el1 = 0; int nscan = fscanf(fp, "%x", &midr_el1); if (nscan != 1) { // ignore } fclose(fp); return midr_el1; } static int get_midr_from_proc_cpuinfo(std::vector& midrs) { FILE* fp = fopen("/proc/cpuinfo", "rb"); if (!fp) return -1; midrs.resize(g_cpucount, 0); int cpuid = -1; midr_info_t midr_info(0); char line[1024]; while (!feof(fp)) { char* s = fgets(line, 1024, fp); if (!s) break; if (memcmp(line, "processor", 9) == 0) { // processor : 4 int id = -1; int nscan = sscanf(line, "%*[^:]: %d", &id); if (nscan != 1) continue; if (cpuid >= 0 && cpuid < g_cpucount) { if (midr_info.midr == 0) { // shared midr midrs[cpuid] = (unsigned int)-1; } else { // save midr and reset midrs[cpuid] = midr_info.midr; for (int i = 0; i < g_cpucount; i++) { if (midrs[i] == (unsigned int)-1) midrs[i] = midr_info.midr; } } midr_info.midr = 0; } cpuid = id; } if (cpuid == -1) continue; if (memcmp(line, "CPU implementer", 15) == 0) { // CPU implementer : 0x51 unsigned int id = 0; int nscan = sscanf(line, "%*[^:]: %x", &id); if (nscan != 1) continue; midr_info.implementer = id; } else if (memcmp(line, "CPU architecture", 16) == 0) { // CPU architecture: 8 int id = 0; int nscan = sscanf(line, "%*[^:]: %d", &id); if (nscan != 1) continue; midr_info.architecture = id; } else if (memcmp(line, "CPU variant", 11) == 0) { // CPU variant : 0xd int id = 0; int nscan = sscanf(line, "%*[^:]: %x", &id); if (nscan != 1) continue; midr_info.variant = id; } else if (memcmp(line, "CPU part", 8) == 0) { // CPU part : 0x804 int id = 0; int nscan = sscanf(line, "%*[^:]: %x", &id); if (nscan != 1) continue; midr_info.part = id; } else if (memcmp(line, "CPU revision", 12) == 0) { // CPU revision : 14 int id = 0; int nscan = sscanf(line, "%*[^:]: %d", &id); if (nscan != 1) continue; midr_info.revision = id; } } fclose(fp); if (cpuid >= 0 && cpuid < g_cpucount) { if (midr_info.midr == 0) { // shared midr midrs[cpuid] = (unsigned int)-1; } else { // save midr and reset midrs[cpuid] = midr_info.midr; for (int i = 0; i < g_cpucount; i++) { if (midrs[i] == (unsigned int)-1) midrs[i] = midr_info.midr; } } midr_info.midr = 0; } // /proc/cpuinfo may only report little/online cores on old kernel if (g_cpu_affinity_mask_big.num_enabled() == g_cpucount) { // assign the remaining unknown midrs for smp cpu 
for (int i = 0; i < g_cpucount; i++) { if (midrs[i] == 0) midrs[i] = midr_info.midr; } } else { // clear the big core midrs for hmp cpu if they are the same as little cores unsigned int little_midr = 0; for (int i = 0; i < g_cpucount; i++) { if (g_cpu_affinity_mask_little.is_enabled(i)) { little_midr = midrs[i]; break; } } for (int i = 0; i < g_cpucount; i++) { if (g_cpu_affinity_mask_big.is_enabled(i)) { if (midrs[i] == little_midr) { midrs[i] = 0; } } } } return 0; } // return midr for the current running core static unsigned int get_midr_from_register() { uint64_t midr; asm volatile("mrs %0, MIDR_EL1" : "=r"(midr)); return (unsigned int)midr; } static int get_sched_affinity(ncnn::CpuSet& thread_affinity_mask) { // get affinity for thread #if defined(__BIONIC__) pid_t pid = gettid(); #else pid_t pid = syscall(SYS_gettid); #endif thread_affinity_mask.disable_all(); int syscallret = syscall(__NR_sched_getaffinity, pid, sizeof(cpu_set_t), &thread_affinity_mask.cpu_set); if (syscallret) { // handle get error silently return -1; } return 0; } static int midr_is_a53_a55(unsigned int midr) { // 0x 41 ? f d03 ? = arm cortex-a53 // 0x 51 ? f 801 ? = qcom kryo200 a53 // 0x 41 ? f d04 ? = arm cortex-a35 // 0x 41 ? f d05 ? = arm cortex-a55 // 0x 51 ? f 803 ? = qcom kryo300 a55 // 0x 51 ? f 805 ? = qcom kryo400 a55 midr_info_t midr_info(midr); return (midr_info.implementer == 0x41 && midr_info.part == 0xd03) || (midr_info.implementer == 0x51 && midr_info.part == 0x801) || (midr_info.implementer == 0x41 && midr_info.part == 0xd04) || (midr_info.implementer == 0x41 && midr_info.part == 0xd05) || (midr_info.implementer == 0x51 && midr_info.part == 0x803) || (midr_info.implementer == 0x51 && midr_info.part == 0x805); } static int detect_cpu_is_arm_a53_a55() { int a53_a55_cpu_count = 0; // first try, iterate /sys/devices/system/cpu/cpuX/regs/identification/midr_el1 bool sysfs_midr = true; for (int i = 0; i < g_cpucount; i++) { unsigned int midr = 0; // for kernel 4.7+ midr = get_midr_from_sysfs(i); if (midr == 0) { sysfs_midr = false; break; } if (midr_is_a53_a55(midr)) { a53_a55_cpu_count++; } } if (!sysfs_midr) { // second try, collect midr from /proc/cpuinfo std::vector midrs; int ret = get_midr_from_proc_cpuinfo(midrs); if (ret == 0 && (int)midrs.size() == g_cpucount) { for (int i = 0; i < g_cpucount; i++) { if (midr_is_a53_a55(midrs[i])) { a53_a55_cpu_count++; } } } else { // third try, assume all aarch64 little cores are a53/a55 a53_a55_cpu_count = g_cpu_affinity_mask_little.num_enabled(); } } if (a53_a55_cpu_count == 0) return 0; // all non a53/a55 if (a53_a55_cpu_count == g_cpucount) return 1; // all a53/a55 // little cores are a53/a55 return 2; } #endif // __aarch64__ #endif // defined __ANDROID__ || defined __linux__ // the initialization static void initialize_global_cpu_info() { g_cpucount = get_cpucount(); g_physical_cpucount = get_physical_cpucount(); g_powersave = 0; initialize_cpu_thread_affinity_mask(g_cpu_affinity_mask_all, g_cpu_affinity_mask_little, g_cpu_affinity_mask_big); #if defined __ANDROID__ || defined __linux__ g_hwcaps = get_elf_hwcap(AT_HWCAP); g_hwcaps2 = get_elf_hwcap(AT_HWCAP2); #endif // defined __ANDROID__ || defined __linux__ #if __APPLE__ g_hw_cpufamily = get_hw_cpufamily(); g_hw_cputype = get_hw_cputype(); g_hw_cpusubtype = get_hw_cpusubtype(); g_hw_optional_arm_FEAT_FP16 = get_hw_capability("hw.optional.arm.FEAT_FP16"); g_hw_optional_arm_FEAT_DotProd = get_hw_capability("hw.optional.arm.FEAT_DotProd"); g_hw_optional_arm_FEAT_FHM = 
get_hw_capability("hw.optional.arm.FEAT_FHM"); g_hw_optional_arm_FEAT_BF16 = get_hw_capability("hw.optional.arm.FEAT_BF16"); g_hw_optional_arm_FEAT_I8MM = get_hw_capability("hw.optional.arm.FEAT_I8MM"); #endif // __APPLE__ #if defined(__i386__) || defined(__x86_64__) || defined(_M_IX86) || defined(_M_X64) g_cpu_support_x86_avx = get_cpu_support_x86_avx(); g_cpu_support_x86_fma = get_cpu_support_x86_fma(); g_cpu_support_x86_xop = get_cpu_support_x86_xop(); g_cpu_support_x86_f16c = get_cpu_support_x86_f16c(); g_cpu_support_x86_avx2 = get_cpu_support_x86_avx2(); g_cpu_support_x86_avx_vnni = get_cpu_support_x86_avx_vnni(); g_cpu_support_x86_avx512 = get_cpu_support_x86_avx512(); g_cpu_support_x86_avx512_vnni = get_cpu_support_x86_avx512_vnni(); g_cpu_support_x86_avx512_bf16 = get_cpu_support_x86_avx512_bf16(); g_cpu_support_x86_avx512_fp16 = get_cpu_support_x86_avx512_fp16(); #endif // defined(__i386__) || defined(__x86_64__) || defined(_M_IX86) || defined(_M_X64) g_cpu_level2_cachesize = get_cpu_level2_cachesize(); g_cpu_level3_cachesize = get_cpu_level3_cachesize(); #if defined __ANDROID__ || defined __linux__ #if __aarch64__ g_cpu_is_arm_a53_a55 = detect_cpu_is_arm_a53_a55(); #endif // __aarch64__ #endif // defined __ANDROID__ || defined __linux__ } static int g_cpu_info_initialized = 0; static inline void try_initialize_global_cpu_info() { if (!g_cpu_info_initialized) { initialize_global_cpu_info(); g_cpu_info_initialized = 1; } } namespace ncnn { #if (defined _WIN32 && !(defined __MINGW32__)) CpuSet::CpuSet() { disable_all(); } void CpuSet::enable(int cpu) { mask |= (1 << cpu); } void CpuSet::disable(int cpu) { mask &= ~(1 << cpu); } void CpuSet::disable_all() { mask = 0; } bool CpuSet::is_enabled(int cpu) const { return mask & (1 << cpu); } int CpuSet::num_enabled() const { int num_enabled = 0; for (int i = 0; i < (int)sizeof(mask) * 8; i++) { if (is_enabled(i)) num_enabled++; } return num_enabled; } #elif defined __ANDROID__ || defined __linux__ CpuSet::CpuSet() { disable_all(); } void CpuSet::enable(int cpu) { CPU_SET(cpu, &cpu_set); } void CpuSet::disable(int cpu) { CPU_CLR(cpu, &cpu_set); } void CpuSet::disable_all() { CPU_ZERO(&cpu_set); } bool CpuSet::is_enabled(int cpu) const { return CPU_ISSET(cpu, &cpu_set); } int CpuSet::num_enabled() const { int num_enabled = 0; for (int i = 0; i < (int)sizeof(cpu_set_t) * 8; i++) { if (is_enabled(i)) num_enabled++; } return num_enabled; } #elif __APPLE__ CpuSet::CpuSet() { disable_all(); } void CpuSet::enable(int cpu) { policy |= (1 << cpu); } void CpuSet::disable(int cpu) { policy &= ~(1 << cpu); } void CpuSet::disable_all() { policy = 0; } bool CpuSet::is_enabled(int cpu) const { return policy & (1 << cpu); } int CpuSet::num_enabled() const { int num_enabled = 0; for (int i = 0; i < (int)sizeof(policy) * 8; i++) { if (is_enabled(i)) num_enabled++; } return num_enabled; } #else CpuSet::CpuSet() { } void CpuSet::enable(int /* cpu */) { } void CpuSet::disable(int /* cpu */) { } void CpuSet::disable_all() { } bool CpuSet::is_enabled(int /* cpu */) const { return true; } int CpuSet::num_enabled() const { return get_cpu_count(); } #endif int cpu_support_arm_edsp() { try_initialize_global_cpu_info(); #if defined __ANDROID__ || defined __linux__ #if __aarch64__ return 0; #else return g_hwcaps & HWCAP_EDSP; #endif #elif __APPLE__ #if __aarch64__ return 0; #else return g_hw_cputype == CPU_TYPE_ARM; #endif #else return 0; #endif } int cpu_support_arm_neon() { try_initialize_global_cpu_info(); #if defined __ANDROID__ || defined __linux__ #if 
__aarch64__ return g_hwcaps & HWCAP_ASIMD; #else return g_hwcaps & HWCAP_NEON; #endif #elif __APPLE__ #if __aarch64__ return g_hw_cputype == CPU_TYPE_ARM64; #else return g_hw_cputype == CPU_TYPE_ARM && g_hw_cpusubtype > CPU_SUBTYPE_ARM_V7; #endif #else return 0; #endif } int cpu_support_arm_vfpv4() { try_initialize_global_cpu_info(); #if defined __ANDROID__ || defined __linux__ #if __aarch64__ // neon always enable fma and fp16 return g_hwcaps & HWCAP_ASIMD; #else return g_hwcaps & HWCAP_VFPv4; #endif #elif __APPLE__ #if __aarch64__ return g_hw_cputype == CPU_TYPE_ARM64; #else return g_hw_cputype == CPU_TYPE_ARM && g_hw_cpusubtype > CPU_SUBTYPE_ARM_V7S; #endif #else return 0; #endif } int cpu_support_arm_asimdhp() { try_initialize_global_cpu_info(); #if defined __ANDROID__ || defined __linux__ #if __aarch64__ return g_hwcaps & HWCAP_ASIMDHP; #else return 0; #endif #elif __APPLE__ #if __aarch64__ return g_hw_optional_arm_FEAT_FP16 || g_hw_cpufamily == CPUFAMILY_ARM_MONSOON_MISTRAL || g_hw_cpufamily == CPUFAMILY_ARM_VORTEX_TEMPEST || g_hw_cpufamily == CPUFAMILY_ARM_LIGHTNING_THUNDER || g_hw_cpufamily == CPUFAMILY_ARM_FIRESTORM_ICESTORM || g_hw_cpufamily == CPUFAMILY_ARM_AVALANCHE_BLIZZARD || g_hw_cpufamily == CPUFAMILY_ARM_EVEREST_SAWTOOTH; #else return 0; #endif #else return 0; #endif } int cpu_support_arm_cpuid() { try_initialize_global_cpu_info(); #if defined __ANDROID__ || defined __linux__ #if __aarch64__ return g_hwcaps & HWCAP_CPUID; #else return 0; #endif #elif __APPLE__ return 0; #else return 0; #endif } int cpu_support_arm_asimddp() { try_initialize_global_cpu_info(); #if defined __ANDROID__ || defined __linux__ #if __aarch64__ return g_hwcaps & HWCAP_ASIMDDP; #else return 0; #endif #elif __APPLE__ #if __aarch64__ return g_hw_optional_arm_FEAT_DotProd || g_hw_cpufamily == CPUFAMILY_ARM_LIGHTNING_THUNDER || g_hw_cpufamily == CPUFAMILY_ARM_FIRESTORM_ICESTORM || g_hw_cpufamily == CPUFAMILY_ARM_AVALANCHE_BLIZZARD || g_hw_cpufamily == CPUFAMILY_ARM_EVEREST_SAWTOOTH; #else return 0; #endif #else return 0; #endif } int cpu_support_arm_asimdfhm() { try_initialize_global_cpu_info(); #if defined __ANDROID__ || defined __linux__ #if __aarch64__ return g_hwcaps & HWCAP_ASIMDFHM; #else return 0; #endif #elif __APPLE__ #if __aarch64__ return g_hw_optional_arm_FEAT_FHM || g_hw_cpufamily == CPUFAMILY_ARM_LIGHTNING_THUNDER || g_hw_cpufamily == CPUFAMILY_ARM_FIRESTORM_ICESTORM || g_hw_cpufamily == CPUFAMILY_ARM_AVALANCHE_BLIZZARD || g_hw_cpufamily == CPUFAMILY_ARM_EVEREST_SAWTOOTH; #else return 0; #endif #else return 0; #endif } int cpu_support_arm_bf16() { try_initialize_global_cpu_info(); #if defined __ANDROID__ || defined __linux__ #if __aarch64__ return g_hwcaps2 & HWCAP2_BF16; #else return 0; #endif #elif __APPLE__ #if __aarch64__ return g_hw_optional_arm_FEAT_BF16 || g_hw_cpufamily == CPUFAMILY_ARM_AVALANCHE_BLIZZARD || g_hw_cpufamily == CPUFAMILY_ARM_EVEREST_SAWTOOTH; #else return 0; #endif #else return 0; #endif } int cpu_support_arm_i8mm() { try_initialize_global_cpu_info(); #if defined __ANDROID__ || defined __linux__ #if __aarch64__ return g_hwcaps2 & HWCAP2_I8MM; #else return 0; #endif #elif __APPLE__ #if __aarch64__ return g_hw_optional_arm_FEAT_I8MM || g_hw_cpufamily == CPUFAMILY_ARM_AVALANCHE_BLIZZARD || g_hw_cpufamily == CPUFAMILY_ARM_EVEREST_SAWTOOTH; #else return 0; #endif #else return 0; #endif } int cpu_support_arm_sve() { try_initialize_global_cpu_info(); #if defined __ANDROID__ || defined __linux__ #if __aarch64__ return g_hwcaps & HWCAP_SVE; #else return 0; #endif #elif 
__APPLE__ #if __aarch64__ return 0; // no known apple cpu support armv8.6 sve #else return 0; #endif #else return 0; #endif } int cpu_support_arm_sve2() { try_initialize_global_cpu_info(); #if defined __ANDROID__ || defined __linux__ #if __aarch64__ return g_hwcaps2 & HWCAP2_SVE2; #else return 0; #endif #elif __APPLE__ #if __aarch64__ return 0; // no known apple cpu support armv8.6 sve2 #else return 0; #endif #else return 0; #endif } int cpu_support_arm_svebf16() { try_initialize_global_cpu_info(); #if defined __ANDROID__ || defined __linux__ #if __aarch64__ return g_hwcaps2 & HWCAP2_SVEBF16; #else return 0; #endif #elif __APPLE__ #if __aarch64__ return 0; // no known apple cpu support armv8.6 svebf16 #else return 0; #endif #else return 0; #endif } int cpu_support_arm_svei8mm() { try_initialize_global_cpu_info(); #if defined __ANDROID__ || defined __linux__ #if __aarch64__ return g_hwcaps2 & HWCAP2_SVEI8MM; #else return 0; #endif #elif __APPLE__ #if __aarch64__ return 0; // no known apple cpu support armv8.6 svei8mm #else return 0; #endif #else return 0; #endif } int cpu_support_arm_svef32mm() { try_initialize_global_cpu_info(); #if defined __ANDROID__ || defined __linux__ #if __aarch64__ return g_hwcaps2 & HWCAP2_SVEF32MM; #else return 0; #endif #elif __APPLE__ #if __aarch64__ return 0; // no known apple cpu support armv8.6 svef32mm #else return 0; #endif #else return 0; #endif } int cpu_support_x86_avx() { try_initialize_global_cpu_info(); #if defined(__i386__) || defined(__x86_64__) || defined(_M_IX86) || defined(_M_X64) return g_cpu_support_x86_avx; #else return 0; #endif } int cpu_support_x86_fma() { try_initialize_global_cpu_info(); #if defined(__i386__) || defined(__x86_64__) || defined(_M_IX86) || defined(_M_X64) return g_cpu_support_x86_fma; #else return 0; #endif } int cpu_support_x86_xop() { try_initialize_global_cpu_info(); #if defined(__i386__) || defined(__x86_64__) || defined(_M_IX86) || defined(_M_X64) return g_cpu_support_x86_xop; #else return 0; #endif } int cpu_support_x86_f16c() { try_initialize_global_cpu_info(); #if defined(__i386__) || defined(__x86_64__) || defined(_M_IX86) || defined(_M_X64) return g_cpu_support_x86_f16c; #else return 0; #endif } int cpu_support_x86_avx2() { try_initialize_global_cpu_info(); #if defined(__i386__) || defined(__x86_64__) || defined(_M_IX86) || defined(_M_X64) return g_cpu_support_x86_avx2; #else return 0; #endif } int cpu_support_x86_avx_vnni() { try_initialize_global_cpu_info(); #if defined(__i386__) || defined(__x86_64__) || defined(_M_IX86) || defined(_M_X64) return g_cpu_support_x86_avx_vnni; #else return 0; #endif } int cpu_support_x86_avx512() { try_initialize_global_cpu_info(); #if defined(__i386__) || defined(__x86_64__) || defined(_M_IX86) || defined(_M_X64) return g_cpu_support_x86_avx512; #else return 0; #endif } int cpu_support_x86_avx512_vnni() { try_initialize_global_cpu_info(); #if defined(__i386__) || defined(__x86_64__) || defined(_M_IX86) || defined(_M_X64) return g_cpu_support_x86_avx512_vnni; #else return 0; #endif } int cpu_support_x86_avx512_bf16() { try_initialize_global_cpu_info(); #if defined(__i386__) || defined(__x86_64__) || defined(_M_IX86) || defined(_M_X64) return g_cpu_support_x86_avx512_bf16; #else return 0; #endif } int cpu_support_x86_avx512_fp16() { try_initialize_global_cpu_info(); #if defined(__i386__) || defined(__x86_64__) || defined(_M_IX86) || defined(_M_X64) return g_cpu_support_x86_avx512_fp16; #else return 0; #endif } int cpu_support_mips_msa() { try_initialize_global_cpu_info(); #if 
defined __ANDROID__ || defined __linux__ #if __mips__ return g_hwcaps & HWCAP_MIPS_MSA; #else return 0; #endif #else return 0; #endif } int cpu_support_loongarch_lsx() { try_initialize_global_cpu_info(); #if defined __ANDROID__ || defined __linux__ #if __loongarch64 return g_hwcaps & HWCAP_LOONGARCH_LSX; #else return 0; #endif #else return 0; #endif } int cpu_support_loongarch_lasx() { try_initialize_global_cpu_info(); #if defined __ANDROID__ || defined __linux__ #if __loongarch64 return g_hwcaps & HWCAP_LOONGARCH_LASX; #else return 0; #endif #else return 0; #endif } int cpu_support_loongson_mmi() { try_initialize_global_cpu_info(); #if defined __ANDROID__ || defined __linux__ #if __mips__ return g_hwcaps & HWCAP_LOONGSON_MMI; #else return 0; #endif #else return 0; #endif } int cpu_support_riscv_v() { try_initialize_global_cpu_info(); #if defined __ANDROID__ || defined __linux__ #if __riscv return g_hwcaps & COMPAT_HWCAP_ISA_V; #else return 0; #endif #else return 0; #endif } int cpu_support_riscv_zfh() { try_initialize_global_cpu_info(); #if defined __ANDROID__ || defined __linux__ #if __riscv // v + f does not imply zfh, but how to discover zfh properly ? // upstream issue https://github.com/riscv/riscv-isa-manual/issues/414 return g_hwcaps & COMPAT_HWCAP_ISA_V && g_hwcaps & COMPAT_HWCAP_ISA_F; #else return 0; #endif #else return 0; #endif } int cpu_riscv_vlenb() { try_initialize_global_cpu_info(); #if __riscv if (!cpu_support_riscv_v()) return 0; int a = 0; asm volatile( ".word 0xc22026f3 \n" // csrr a3, vlenb "mv %0, a3 \n" : "=r"(a) : : "memory", "a3"); return a; #else return 0; #endif } int get_cpu_count() { try_initialize_global_cpu_info(); return g_cpucount; } int get_little_cpu_count() { try_initialize_global_cpu_info(); return get_cpu_thread_affinity_mask(1).num_enabled(); } int get_big_cpu_count() { try_initialize_global_cpu_info(); int big_cpu_count = get_cpu_thread_affinity_mask(2).num_enabled(); return big_cpu_count ? 
           big_cpu_count : g_cpucount;
}

int get_physical_cpu_count()
{
    try_initialize_global_cpu_info();
    return g_physical_cpucount;
}

int get_physical_little_cpu_count()
{
    try_initialize_global_cpu_info();
    if (g_physical_cpucount == g_cpucount)
        return get_little_cpu_count();

    return g_physical_cpucount * 2 - g_cpucount;
}

int get_physical_big_cpu_count()
{
    try_initialize_global_cpu_info();
    if (g_physical_cpucount == g_cpucount)
        return get_big_cpu_count();

    return g_cpucount - g_physical_cpucount;
}

int get_cpu_level2_cache_size()
{
    try_initialize_global_cpu_info();
    return g_cpu_level2_cachesize;
}

int get_cpu_level3_cache_size()
{
    try_initialize_global_cpu_info();
    return g_cpu_level3_cachesize;
}

int get_cpu_powersave()
{
    try_initialize_global_cpu_info();
    return g_powersave;
}

int set_cpu_powersave(int powersave)
{
    try_initialize_global_cpu_info();
    if (powersave < 0 || powersave > 2)
    {
        NCNN_LOGE("powersave %d not supported", powersave);
        return -1;
    }

    const CpuSet& thread_affinity_mask = get_cpu_thread_affinity_mask(powersave);

    int ret = set_cpu_thread_affinity(thread_affinity_mask);
    if (ret != 0)
        return ret;

    g_powersave = powersave;

    return 0;
}

const CpuSet& get_cpu_thread_affinity_mask(int powersave)
{
    try_initialize_global_cpu_info();
    if (powersave == 0)
        return g_cpu_affinity_mask_all;

    if (powersave == 1)
        return g_cpu_affinity_mask_little;

    if (powersave == 2)
        return g_cpu_affinity_mask_big;

    NCNN_LOGE("powersave %d not supported", powersave);

    // fallback to all cores anyway
    return g_cpu_affinity_mask_all;
}

int set_cpu_thread_affinity(const CpuSet& thread_affinity_mask)
{
    try_initialize_global_cpu_info();
#if defined __ANDROID__ || defined __linux__ || (defined _WIN32 && !(defined __MINGW32__))
#ifdef _OPENMP
    int num_threads = thread_affinity_mask.num_enabled();

    // set affinity for each thread
    set_omp_num_threads(num_threads);
    std::vector<int> ssarets(num_threads, 0);
    #pragma omp parallel for num_threads(num_threads)
    for (int i = 0; i < num_threads; i++)
    {
        ssarets[i] = set_sched_affinity(thread_affinity_mask);
    }
    for (int i = 0; i < num_threads; i++)
    {
        if (ssarets[i] != 0)
            return -1;
    }
#else
    int ssaret = set_sched_affinity(thread_affinity_mask);
    if (ssaret != 0)
        return -1;
#endif

    return 0;
#elif __APPLE__
#ifdef _OPENMP
    int num_threads = thread_affinity_mask.num_enabled();

    // set affinity for each thread
    set_omp_num_threads(num_threads);
    std::vector<int> ssarets(num_threads, 0);
    #pragma omp parallel for num_threads(num_threads)
    for (int i = 0; i < num_threads; i++)
    {
        // assign one core for each thread
        int core = -1 - i;
        for (int j = 0; j < (int)sizeof(thread_affinity_mask.policy) * 8; j++)
        {
            if (thread_affinity_mask.is_enabled(j))
            {
                if (core == -1)
                {
                    core = j;
                    break;
                }
                else
                {
                    core++;
                }
            }
        }
        CpuSet this_thread_affinity_mask;
        if (core != -1 - i)
        {
            this_thread_affinity_mask.enable(core);
        }

        ssarets[i] = set_sched_affinity(this_thread_affinity_mask);
    }
    for (int i = 0; i < num_threads; i++)
    {
        if (ssarets[i] != 0)
            return -1;
    }
#else
    int ssaret = set_sched_affinity(thread_affinity_mask);
    if (ssaret != 0)
        return -1;
#endif

    return 0;
#else
    // TODO
    (void)thread_affinity_mask;
    return -1;
#endif
}

int is_current_thread_running_on_a53_a55()
{
    try_initialize_global_cpu_info();
#if defined __ANDROID__ || defined __linux__
#if __aarch64__
    if (g_cpu_is_arm_a53_a55 == 0)
        return 0; // all non a53/a55

    if (g_cpu_is_arm_a53_a55 == 1)
        return 1; // all a53/a55

    if (g_powersave == 2)
        return 0; // big clusters

    if (g_powersave == 1)
        return 1; // little clusters

    // little cores are a53/a55
    // use cpuid for retrieving midr since kernel 4.7+
    if (cpu_support_arm_cpuid())
    {
        unsigned int midr = get_midr_from_register();
        if (midr)
            return midr_is_a53_a55(midr);
    }

    // check if affinity cpuid is in the little ones
    CpuSet thread_cs;
    int ret = get_sched_affinity(thread_cs);
    if (ret != 0)
    {
        // no affinity capability
        return 0;
    }

    const CpuSet& little_cs = get_cpu_thread_affinity_mask(1);
    for (int i = 0; i < g_cpucount; i++)
    {
        if (!thread_cs.is_enabled(i))
            continue;

        if (!little_cs.is_enabled(i))
            return 0;
    }

    // all affinity cpuids are little core
    return 1;
#else
    return 0;
#endif // __aarch64__
#else
    return 0;
#endif // defined __ANDROID__ || defined __linux__
}

int get_omp_num_threads()
{
#ifdef _OPENMP
    return omp_get_num_threads();
#else
    return 1;
#endif
}

void set_omp_num_threads(int num_threads)
{
#ifdef _OPENMP
    omp_set_num_threads(num_threads);
#else
    (void)num_threads;
#endif
}

int get_omp_dynamic()
{
#ifdef _OPENMP
    return omp_get_dynamic();
#else
    return 0;
#endif
}

void set_omp_dynamic(int dynamic)
{
#ifdef _OPENMP
    omp_set_dynamic(dynamic);
#else
    (void)dynamic;
#endif
}

int get_omp_thread_num()
{
#ifdef _OPENMP
    return omp_get_thread_num();
#else
    return 0;
#endif
}

int get_kmp_blocktime()
{
#if defined(_OPENMP) && __clang__
    return kmp_get_blocktime();
#else
    return 0;
#endif
}

void set_kmp_blocktime(int time_ms)
{
#if defined(_OPENMP) && __clang__
    kmp_set_blocktime(time_ms);
#else
    (void)time_ms;
#endif
}

static ncnn::ThreadLocalStorage tls_flush_denormals;

int get_flush_denormals()
{
#if defined(__SSE3__)
    return (int)reinterpret_cast<size_t>(tls_flush_denormals.get());
#else
    return 0;
#endif
}

int set_flush_denormals(int flush_denormals)
{
    if (flush_denormals < 0 || flush_denormals > 3)
    {
        NCNN_LOGE("denormals_zero %d not supported", flush_denormals);
        return -1;
    }

#if defined(__SSE3__)
    if (flush_denormals == 0)
    {
        _MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_OFF);
        _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_OFF);
    }
    else if (flush_denormals == 1)
    {
        _MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON);
        _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_OFF);
    }
    else if (flush_denormals == 2)
    {
        _MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_OFF);
        _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON);
    }
    else if (flush_denormals == 3)
    {
        _MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON);
        _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON);
    }

    tls_flush_denormals.set(reinterpret_cast<void*>((size_t)flush_denormals));
    return 0;
#else
    return 0;
#endif
}

} // namespace ncnn

#if defined __ANDROID__ && defined(_OPENMP) && __clang__
#ifdef __cplusplus
extern "C" {
#endif
void __wrap___kmp_affinity_determine_capable(const char* /*env_var*/)
{
    // the internal affinity routines in llvm openmp call abort when __NR_sched_getaffinity / __NR_sched_setaffinity fails
    // ref KMPNativeAffinity::get_system_affinity/set_system_affinity in openmp/runtime/src/kmp_affinity.h
    // and cpu core goes offline in powersave mode on android, which triggers abort
    // ATM there is no known api for controlling the abort behavior
    // override __kmp_affinity_determine_capable with empty body to disable affinity regardless of KMP_AFFINITY env_var
    // ugly hack works >.< --- nihui
}
#ifdef __cplusplus
} // extern "C"
#endif
#endif
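
// Example (illustrative sketch only): typical application-side usage of the
// public API implemented above, through the declarations in cpu.h. The helper
// name configure_inference_threads() is hypothetical and not part of ncnn.
//
//   void configure_inference_threads()
//   {
//       // run on the big cluster only; this also resizes the OpenMP pool
//       // to one thread per big core
//       ncnn::set_cpu_powersave(2);
//
//       // flush-to-zero / denormals-are-zero for faster float math on x86
//       ncnn::set_flush_denormals(3);
//   }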