Spaces:
Sleeping
Sleeping
| """ | |
| Memory Benchmark Module | |
| 内存性能测试:带宽测试、延迟测试、缓存性能 | |
| Optimized with ctypes for raw C-level performance | |
| """ | |
import ctypes
import mmap
import multiprocessing
import os
import subprocess
import time
from concurrent.futures import ProcessPoolExecutor
from typing import Dict, Any

import numpy as np  # Keep for latency/cache tests
| # Load C standard library | |
| try: | |
| libc = ctypes.CDLL("libc.so.6") | |
| libc.memset.argtypes = [ctypes.c_void_p, ctypes.c_int, ctypes.c_size_t] | |
| libc.memcpy.argtypes = [ctypes.c_void_p, ctypes.c_void_p, ctypes.c_size_t] | |
| libc.memchr.argtypes = [ctypes.c_void_p, ctypes.c_int, ctypes.c_size_t] | |
| except Exception: | |
| libc = None | |
| # --- C Extension Handling --- | |
| C_LIB_PATH = os.path.join(os.path.dirname(__file__), "_memory_bench_c.so") | |
| C_SRC_PATH = os.path.join(os.path.dirname(__file__), "memory_bench_c.c") | |
| def _compile_c_helper(): | |
| """Compiles the C helper library if it doesn't exist or is outdated.""" | |
| if not os.path.exists(C_SRC_PATH): | |
| return None | |
| needs_compile = False | |
| if not os.path.exists(C_LIB_PATH): | |
| needs_compile = True | |
| else: | |
| # Check timestamps | |
| if os.path.getmtime(C_SRC_PATH) > os.path.getmtime(C_LIB_PATH): | |
| needs_compile = True | |
| if needs_compile: | |
| # User requested max optimization | |
| cmd = f"gcc -O3 -shared -fPIC -o {C_LIB_PATH} {C_SRC_PATH}" | |
| if os.system(cmd) != 0: | |
| print("Failed to compile C helper.") | |
| return None | |
| try: | |
| lib = ctypes.CDLL(C_LIB_PATH) | |
| lib.measure_latency_random.argtypes = [ctypes.c_size_t, ctypes.c_size_t] | |
| lib.measure_latency_random.restype = ctypes.c_double | |
| lib.measure_latency_sequential.argtypes = [ctypes.c_size_t, ctypes.c_size_t] | |
| lib.measure_latency_sequential.restype = ctypes.c_double | |
| lib.measure_alloc_rate.argtypes = [ctypes.c_size_t, ctypes.c_size_t] | |
| lib.measure_alloc_rate.restype = ctypes.c_double | |
| return lib | |
| except Exception as e: | |
| print(f"Failed to load C helper: {e}") | |
| return None | |
| c_lib = _compile_c_helper() | |
| def _raw_memory_worker(args): | |
| """ | |
| Worker process for memory bandwidth test using raw C calls. | |
| Equivalent to sysbench memory test. | |
| """ | |
| block_size_mb, duration, mode = args | |
| block_size = block_size_mb * 1024 * 1024 | |
| # Use mmap for aligned, raw memory allocation (no Python object overhead) | |
| # Anonymous mapping | |
| src_map = mmap.mmap(-1, block_size) | |
| dst_map = None | |
| # For copy mode, we need a destination | |
| if mode == 'copy': | |
| dst_map = mmap.mmap(-1, block_size) | |
| # Get raw pointers | |
| src_addr = ctypes.addressof(ctypes.c_char.from_buffer(src_map)) | |
| dst_addr = ctypes.addressof(ctypes.c_char.from_buffer(dst_map)) if dst_map else 0 | |
| # Prepare C function calls | |
| memset = libc.memset | |
| memcpy = libc.memcpy | |
| memchr = libc.memchr | |
| start_time = time.time() | |
| iterations = 0 | |
| while time.time() - start_time < duration: | |
| if mode == 'read': | |
| # Scan memory (read access) | |
| # Find a byte that (likely) isn't there to force full scan | |
| memchr(src_addr, 1, block_size) | |
| elif mode == 'write': | |
| # Write memory | |
| memset(src_addr, 0, block_size) | |
| elif mode == 'copy': | |
| # Copy memory | |
| memcpy(dst_addr, src_addr, block_size) | |
| iterations += 1 | |
| elapsed = time.time() - start_time | |
| # Cleanup | |
| src_map.close() | |
| if dst_map: | |
| dst_map.close() | |
| return iterations, elapsed | |
| def benchmark_memory_bandwidth(block_size_mb: int = 4) -> Dict[str, Any]: | |
| """ | |
| 内存带宽测试 (Raw C Performance) | |
| Uses multiprocessing + ctypes to bypass Python overhead. | |
| """ | |
| if not libc: | |
| return {"error": "libc not found, cannot run optimized benchmark"} | |
| num_cores = multiprocessing.cpu_count() | |
| duration = 3.0 | |
| # sysbench defaults to 1KB-1MB blocks. User mentioned 1MB. | |
| # We use a slightly larger buffer per thread to amortize loop overhead if needed, | |
| # but 1MB-4MB is usually good for L3/RAM cache thrashing. | |
| # Let's stick to 4MB per thread to ensure we hit RAM. | |
| modes = ['read', 'write', 'copy'] | |
| results = {} | |
| with ProcessPoolExecutor(max_workers=num_cores) as executor: | |
| for mode in modes: | |
| # Submit tasks | |
| futures = [executor.submit(_raw_memory_worker, (block_size_mb, duration, mode)) for _ in range(num_cores)] | |
| total_iterations = 0 | |
| max_elapsed = 0 | |
| for f in futures: | |
| iters, elapsed = f.result() | |
| total_iterations += iters | |
| max_elapsed = max(max_elapsed, elapsed) | |
| # Calculate Bandwidth | |
| # Data transferred per iteration = block_size | |
| bytes_per_iter = block_size_mb * 1024 * 1024 | |
| total_bytes = total_iterations * bytes_per_iter | |
| # Note: For 'copy', sysbench counts read+write? | |
| # Usually bandwidth is defined as bytes processed. | |
| # If we copy 1GB, we read 1GB and write 1GB. | |
| # sysbench memory test reports "transferred". | |
| # For copy, let's report the amount of data moved (Payload). | |
| # Or if user wants bus bandwidth, it's 2x. | |
| # Benchmarks usually report the size of the buffer processed. | |
| # However, previous impl multiplied by 2. Let's stick to total bytes moved over bus. | |
| if mode == 'copy': | |
| total_bytes *= 2 | |
| # Avoid division by zero | |
| if max_elapsed > 0: | |
| bandwidth_gb_s = total_bytes / max_elapsed / (1024**3) | |
| else: | |
| bandwidth_gb_s = 0 | |
| results[f"{mode}_bandwidth_gb_s"] = round(bandwidth_gb_s, 3) | |
| return { | |
| "test": "memory_bandwidth", | |
| "description": f"Memory bandwidth test (Multi-core C-level, {num_cores} threads)", | |
| "block_size_mb": block_size_mb, | |
| "read_bandwidth_gb_s": results['read_bandwidth_gb_s'], | |
| "write_bandwidth_gb_s": results['write_bandwidth_gb_s'], | |
| "copy_bandwidth_gb_s": results['copy_bandwidth_gb_s'], | |
| "score": round((results['read_bandwidth_gb_s'] + results['write_bandwidth_gb_s'] + results['copy_bandwidth_gb_s']) * 10, 2), | |
| } | |
| def benchmark_memory_latency(iterations: int = 10000000) -> Dict[str, Any]: | |
| """ | |
| 内存延迟测试(随机访问) | |
| Uses C helper for precise pointer chasing. | |
| """ | |
| if not c_lib: | |
| return {"error": "C helper not available"} | |
| # Test random access latency on a large block (64MB) to hit RAM | |
| array_size_bytes = 64 * 1024 * 1024 | |
| elapsed = c_lib.measure_latency_random(array_size_bytes, iterations) | |
| if elapsed <= 0: | |
| return {"error": "Benchmark failed"} | |
| latency_ns = (elapsed / iterations) * 1e9 | |
| return { | |
| "test": "memory_latency_random", | |
| "description": "Random access latency (64MB working set, Pointer Chasing)", | |
| "iterations": iterations, | |
| "total_time_seconds": round(elapsed, 4), | |
| "average_latency_ns": round(latency_ns, 2), | |
| "score": round(100 / latency_ns * 1000, 2), # Adjusted score scale | |
| } | |
| def benchmark_sequential_latency(iterations: int = 10000000) -> Dict[str, Any]: | |
| """ | |
| 内存延迟测试(顺序访问) | |
| Uses C helper. | |
| """ | |
| if not c_lib: | |
| return {"error": "C helper not available"} | |
| # Same 64MB block | |
| array_size_bytes = 64 * 1024 * 1024 | |
| elapsed = c_lib.measure_latency_sequential(array_size_bytes, iterations) | |
| if elapsed <= 0: | |
| return {"error": "Benchmark failed"} | |
| latency_ns = (elapsed / iterations) * 1e9 | |
| return { | |
| "test": "memory_latency_sequential", | |
| "description": "Sequential access latency (64MB working set, Strided Read)", | |
| "iterations": iterations, | |
| "total_time_seconds": round(elapsed, 4), | |
| "average_latency_ns": round(latency_ns, 2), | |
| "score": round(100 / latency_ns * 1000, 2), | |
| } | |
| def benchmark_alloc_rate(iterations: int = 1000000) -> Dict[str, Any]: | |
| """ | |
| 内存分配/释放速率测试 | |
| """ | |
| if not c_lib: | |
| return {"error": "C helper not available"} | |
| # Test small allocations (e.g. 1KB) which are common | |
| alloc_size = 1024 | |
| elapsed = c_lib.measure_alloc_rate(alloc_size, iterations) | |
| if elapsed <= 0: | |
| return {"error": "Benchmark failed"} | |
| ops_per_sec = iterations / elapsed | |
| return { | |
| "test": "memory_alloc_rate", | |
| "description": f"Malloc/Free rate (Size: {alloc_size} bytes)", | |
| "iterations": iterations, | |
| "ops_per_sec": round(ops_per_sec, 2), | |
| "score": round(ops_per_sec / 10000, 2), | |
| } | |
| def benchmark_cache_latency() -> Dict[str, Any]: | |
| """ | |
| 缓存层级延迟测试 (L1/L2/L3) | |
| Uses C helper pointer chasing with smaller working sets. | |
| """ | |
| if not c_lib: | |
| return {"error": "C helper not available"} | |
| results = {} | |
| # Approximate sizes. | |
| # Must be small enough to fit in cache, but large enough to measure. | |
| # Typical: L1=32KB(use 16KB), L2=256KB(use 128KB), L3=8MB+(use 4MB) | |
| levels = [ | |
| ("L1", 16 * 1024), | |
| ("L2", 128 * 1024), | |
| ("L3", 4 * 1024 * 1024) | |
| ] | |
| iterations = 10000000 # 10M iterations | |
| for name, size in levels: | |
| elapsed = c_lib.measure_latency_random(size, iterations) | |
| latency_ns = (elapsed / iterations) * 1e9 | |
| for name, size in levels: | |
| elapsed = c_lib.measure_latency_random(size, iterations) | |
| if elapsed <= 0: | |
| # Fallback or error | |
| latency_ns = 0.0 | |
| else: | |
| latency_ns = (elapsed / iterations) * 1e9 | |
| results[name] = { | |
| "size_bytes": size, | |
| "latency_ns": round(latency_ns, 2) | |
| } | |
| l1_lat = results["L1"]["latency_ns"] | |
| score = 0 | |
| if l1_lat > 0: | |
| score = round(100 / l1_lat * 500, 2) | |
| return { | |
| "test": "cache_latency", | |
| "description": "Cache hierarchy latency (Pointer Chasing)", | |
| "levels": results, | |
| "score": score | |
| } | |
| def run_all_memory_benchmarks() -> Dict[str, Any]: | |
| """运行所有内存基准测试""" | |
| results = { | |
| "bandwidth": benchmark_memory_bandwidth(), | |
| "latency_random": benchmark_memory_latency(), | |
| "latency_sequential": benchmark_sequential_latency(), | |
| "cache_latency": benchmark_cache_latency(), | |
| "alloc_rate": benchmark_alloc_rate(), | |
| } | |
| # 计算总分 | |
| total_score = sum(r.get("score", 0) for r in results.values()) | |
| results["total_score"] = round(total_score, 2) | |
| return results | |