File size: 7,995 Bytes
8ae5fc5 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 | // ----------------------------------------------------------------------
// CycleClock
// A CycleClock tells you the current time in Cycles. The "time"
// is actually time since power-on. This is like time() but doesn't
// involve a system call and is much more precise.
//
// NOTE: Not all cpu/platform/kernel combinations guarantee that this
// clock increments at a constant rate or is synchronized across all logical
// cpus in a system.
//
// If you need the above guarantees, please consider using a different
// API. There are efforts to provide an interface which provides a millisecond
// granularity and implemented as a memory read. A memory read is generally
// cheaper than the CycleClock for many architectures.
//
// Also, in some out of order CPU implementations, the CycleClock is not
// serializing. So if you're trying to count at cycles granularity, your
// data might be inaccurate due to out of order instruction execution.
// ----------------------------------------------------------------------
#ifndef BENCHMARK_CYCLECLOCK_H_
#define BENCHMARK_CYCLECLOCK_H_
#include <cstdint>
#include "benchmark/benchmark.h"
#include "internal_macros.h"
#if defined(BENCHMARK_OS_MACOSX)
#include <mach/mach_time.h>
#endif
// For MSVC, we want to use '_asm rdtsc' when possible (since it works
// with even ancient MSVC compilers), and when not possible the
// __rdtsc intrinsic, declared in <intrin.h>. Unfortunately, in some
// environments, <windows.h> and <intrin.h> have conflicting
// declarations of some other intrinsics, breaking compilation.
// Therefore, we simply declare __rdtsc ourselves. See also
// http://connect.microsoft.com/VisualStudio/feedback/details/262047
#if defined(COMPILER_MSVC) && !defined(_M_IX86)
extern "C" uint64_t __rdtsc();
#pragma intrinsic(__rdtsc)
#endif
#if !defined(BENCHMARK_OS_WINDOWS) || defined(BENCHMARK_OS_MINGW)
#include <sys/time.h>
#include <time.h>
#endif
#ifdef BENCHMARK_OS_EMSCRIPTEN
#include <emscripten.h>
#endif
namespace benchmark {
// NOTE: only i386 and x86_64 have been well tested.
// PPC, sparc, alpha, and ia64 are based on
// http://peter.kuscsik.com/wordpress/?p=14
// with modifications by m3b. See also
// https://setisvn.ssl.berkeley.edu/svn/lib/fftw-3.0.1/kernel/cycle.h
namespace cycleclock {
// This should return the number of cycles since power-on. Thread-safe.
inline BENCHMARK_ALWAYS_INLINE int64_t Now() {
#if defined(BENCHMARK_OS_MACOSX)
// this goes at the top because we need ALL Macs, regardless of
// architecture, to return the number of "mach time units" that
// have passed since startup. See sysinfo.cc where
// InitializeSystemInfo() sets the supposed cpu clock frequency of
// macs to the number of mach time units per second, not actual
// CPU clock frequency (which can change in the face of CPU
// frequency scaling). Also note that when the Mac sleeps, this
// counter pauses; it does not continue counting, nor does it
// reset to zero.
return mach_absolute_time();
#elif defined(BENCHMARK_OS_EMSCRIPTEN)
// this goes above x86-specific code because old versions of Emscripten
// define __x86_64__, although they have nothing to do with it.
return static_cast<int64_t>(emscripten_get_now() * 1e+6);
#elif defined(__i386__)
int64_t ret;
__asm__ volatile("rdtsc" : "=A"(ret));
return ret;
#elif defined(__x86_64__) || defined(__amd64__)
uint64_t low, high;
__asm__ volatile("rdtsc" : "=a"(low), "=d"(high));
return (high << 32) | low;
#elif defined(__powerpc__) || defined(__ppc__)
// This returns a time-base, which is not always precisely a cycle-count.
int64_t tbl, tbu0, tbu1;
asm("mftbu %0" : "=r"(tbu0));
asm("mftb %0" : "=r"(tbl));
asm("mftbu %0" : "=r"(tbu1));
tbl &= -static_cast<int64_t>(tbu0 == tbu1);
// high 32 bits in tbu1; low 32 bits in tbl (tbu0 is garbage)
return (tbu1 << 32) | tbl;
#elif defined(__sparc__)
int64_t tick;
asm(".byte 0x83, 0x41, 0x00, 0x00");
asm("mov %%g1, %0" : "=r"(tick));
return tick;
#elif defined(__ia64__)
int64_t itc;
asm("mov %0 = ar.itc" : "=r"(itc));
return itc;
#elif defined(COMPILER_MSVC) && defined(_M_IX86)
// Older MSVC compilers (like 7.x) don't seem to support the
// __rdtsc intrinsic properly, so I prefer to use _asm instead
// when I know it will work. Otherwise, I'll use __rdtsc and hope
// the code is being compiled with a non-ancient compiler.
_asm rdtsc
#elif defined(COMPILER_MSVC)
return __rdtsc();
#elif defined(BENCHMARK_OS_NACL)
// Native Client validator on x86/x86-64 allows RDTSC instructions,
// and this case is handled above. Native Client validator on ARM
// rejects MRC instructions (used in the ARM-specific sequence below),
// so we handle it here. Portable Native Client compiles to
// architecture-agnostic bytecode, which doesn't provide any
// cycle counter access mnemonics.
// Native Client does not provide any API to access cycle counter.
// Use clock_gettime(CLOCK_MONOTONIC, ...) instead of gettimeofday
// because is provides nanosecond resolution (which is noticable at
// least for PNaCl modules running on x86 Mac & Linux).
// Initialize to always return 0 if clock_gettime fails.
struct timespec ts = {0, 0};
clock_gettime(CLOCK_MONOTONIC, &ts);
return static_cast<int64_t>(ts.tv_sec) * 1000000000 + ts.tv_nsec;
#elif defined(__aarch64__)
// System timer of ARMv8 runs at a different frequency than the CPU's.
// The frequency is fixed, typically in the range 1-50MHz. It can be
// read at CNTFRQ special register. We assume the OS has set up
// the virtual timer properly.
int64_t virtual_timer_value;
asm volatile("mrs %0, cntvct_el0" : "=r"(virtual_timer_value));
return virtual_timer_value;
#elif defined(__ARM_ARCH)
// V6 is the earliest arch that has a standard cyclecount
// Native Client validator doesn't allow MRC instructions.
#if (__ARM_ARCH >= 6)
uint32_t pmccntr;
uint32_t pmuseren;
uint32_t pmcntenset;
// Read the user mode perf monitor counter access permissions.
asm volatile("mrc p15, 0, %0, c9, c14, 0" : "=r"(pmuseren));
if (pmuseren & 1) { // Allows reading perfmon counters for user mode code.
asm volatile("mrc p15, 0, %0, c9, c12, 1" : "=r"(pmcntenset));
if (pmcntenset & 0x80000000ul) { // Is it counting?
asm volatile("mrc p15, 0, %0, c9, c13, 0" : "=r"(pmccntr));
// The counter is set up to count every 64th cycle
return static_cast<int64_t>(pmccntr) * 64; // Should optimize to << 6
}
}
#endif
struct timeval tv;
gettimeofday(&tv, nullptr);
return static_cast<int64_t>(tv.tv_sec) * 1000000 + tv.tv_usec;
#elif defined(__mips__)
// mips apparently only allows rdtsc for superusers, so we fall
// back to gettimeofday. It's possible clock_gettime would be better.
struct timeval tv;
gettimeofday(&tv, nullptr);
return static_cast<int64_t>(tv.tv_sec) * 1000000 + tv.tv_usec;
#elif defined(__s390__) // Covers both s390 and s390x.
// Return the CPU clock.
uint64_t tsc;
asm("stck %0" : "=Q"(tsc) : : "cc");
return tsc;
#elif defined(__riscv) // RISC-V
// Use RDCYCLE (and RDCYCLEH on riscv32)
#if __riscv_xlen == 32
uint64_t cycles_low, cycles_hi0, cycles_hi1;
asm("rdcycleh %0" : "=r"(cycles_hi0));
asm("rdcycle %0" : "=r"(cycles_lo));
asm("rdcycleh %0" : "=r"(cycles_hi1));
// This matches the PowerPC overflow detection, above
cycles_lo &= -static_cast<int64_t>(cycles_hi0 == cycles_hi1);
return (cycles_hi1 << 32) | cycles_lo;
#else
uint64_t cycles;
asm("rdcycle %0" : "=r"(cycles));
return cycles;
#endif
#else
// The soft failover to a generic implementation is automatic only for ARM.
// For other platforms the developer is expected to make an attempt to create
// a fast implementation and use generic version if nothing better is available.
#error You need to define CycleTimer for your OS and CPU
#endif
}
} // end namespace cycleclock
} // end namespace benchmark
#endif // BENCHMARK_CYCLECLOCK_H_
|