Spaces:

TwoBraincells
/

Elite-Trade-Sentry

Sleeping

App Files Files Community

Elite-Trade-Sentry / hf auditor /src /spsc_ring_buffer.hpp

TheRealAIGuy

E1P1 Fix Hopefully

f0023cf about 2 months ago

raw

history blame contribute delete

12 kB

	#pragma once
	// ============================================================================
	// SPSCRingBuffer — Lock-Free Single-Producer Single-Consumer Queue
	// ============================================================================
	//
	// ARCHITECTURE:
	// A bounded, lock-free ring buffer for passing trades from the ingestion
	// thread (producer) to the engine thread (consumer). This is the only
	// cross-thread data path in the entire system.
	//
	// MEMORY ORDERING:
	// Uses std::memory_order_acquire/release — the minimum fence strength
	// that guarantees correctness for SPSC. Sequential consistency
	// (memory_order_seq_cst) is avoided because it inserts unnecessary
	// MFENCE instructions on x86 that cost ~30ns each.
	//
	// Producer: writes data, then head_.store(release)
	// → ensures all writes to buffer_[idx] are visible before
	// the consumer sees the incremented head.
	//
	// Consumer: reads head_.load(acquire)
	// → ensures it sees all writes the producer made before
	// the matching head_.store(release).
	//
	// FALSE-SHARING PREVENTION:
	// head_ and tail_ are placed on separate 128-byte aligned regions.
	// Why 128 and not 64? Intel CPUs prefetch cache lines in pairs
	// (spatial prefetcher). Separating by 128 bytes ensures head and tail
	// never share a prefetch pair, eliminating false sharing even on
	// aggressive prefetch hardware.
	//
	// CAPACITY:
	// Must be a power of 2 so that index wrapping uses bitwise AND:
	// real_index = sequence_number & (Capacity - 1)
	// This compiles to a single AND instruction instead of an integer
	// division (which costs 20-90 cycles on modern x86).
	//
	// ZERO-ALLOCATION GUARANTEE:
	// The internal buffer is allocated once at construction via
	// std::vector::resize(). No heap calls during push/pop.
	//
	// ============================================================================

	#include <atomic>
	#include <vector>
	#include <cstdint>
	#include <cstddef>
	#include <cassert>
	#include <type_traits>

	// ────────────────────────────────────────────────────────────────────────────
	// RingEntry — 32-byte aligned ingestion record
	// ────────────────────────────────────────────────────────────────────────────
	//
	// This struct matches the data produced by the ingestion gateway.
	// It's intentionally identical in layout to OrderPool::TradeSlot so
	// the consumer can memcpy directly (or the compiler can optimize
	// the field-by-field copy into a single AVX store).
	//
	// Layout (total = 32 bytes):
	// [0..7] trade_id uint64_t
	// [8..15] price int64_t
	// [16..19] quantity int32_t
	// [20..23] counterparty_id uint32_t
	// [24..31] timestamp_ns uint64_t
	//
	struct alignas(32) RingEntry {
	uint64_t trade_id; // 8 bytes
	int64_t price; // 8 bytes
	int32_t quantity; // 4 bytes
	uint32_t counterparty_id; // 4 bytes
	uint64_t timestamp_ns; // 8 bytes
	// ────────────────────── = 32 bytes total
	};
	static_assert(sizeof(RingEntry) == 32, "RingEntry must be exactly 32 bytes");

	// ────────────────────────────────────────────────────────────────────────────
	// SPSCRingBuffer<Capacity> — Lock-free SPSC queue
	// ────────────────────────────────────────────────────────────────────────────
	template <size_t Capacity>
	class SPSCRingBuffer {
	static_assert((Capacity & (Capacity - 1)) == 0,
	"SPSCRingBuffer capacity must be a power of 2");
	static_assert(Capacity > 0, "Capacity must be > 0");

	// Compile-time mask for index wrapping: Capacity - 1
	static constexpr size_t MASK = Capacity - 1;

	public:
	// ────────────────────────────────────────────────────────────────────
	// Constructor: pre-allocate the ring buffer
	//
	// The buffer is heap-allocated via std::vector at construction time.
	// This is the ONLY allocation. All subsequent push/pop operations
	// are zero-allocation.
	// ────────────────────────────────────────────────────────────────────
	SPSCRingBuffer() : buffer_(Capacity) {}

	// Non-copyable, non-movable (atomics can't be copied)
	SPSCRingBuffer(const SPSCRingBuffer&) = delete;
	SPSCRingBuffer& operator=(const SPSCRingBuffer&) = delete;
	SPSCRingBuffer(SPSCRingBuffer&&) = delete;
	SPSCRingBuffer& operator=(SPSCRingBuffer&&) = delete;

	// ────────────────────────────────────────────────────────────────────
	// try_push() — Producer API
	//
	// Called from the ingestion thread. Attempts to enqueue one trade.
	// Returns false if the buffer is full (back-pressure signal to the
	// producer that the consumer isn't keeping up).
	//
	// Memory ordering:
	// 1. Load tail_ with ACQUIRE to see the consumer's latest progress
	// 2. Write the entry to buffer_[head & MASK]
	// 3. Store head_+1 with RELEASE to publish the new entry
	// ────────────────────────────────────────────────────────────────────
	bool try_push(uint64_t trade_id, int64_t price, int32_t quantity,
	uint32_t counterparty_id, uint64_t timestamp_ns)
	{
	// 1. Load the current head_ with memory_order_relaxed (we are the only writer)
	const size_t head = head_.load(std::memory_order_relaxed);

	// 2. Calculate next_head = (head + 1) & (Capacity - 1)
	const size_t next_head = (head + 1) & MASK;

	// 3. Load the tail_ using memory_order_acquire (synchronize with consumer)
	const size_t tail = tail_.load(std::memory_order_acquire);

	// Check if buffer is full (if next_head catches up to tail, one slot empty)
	if (next_head == tail) {
	return false;
	}

	// 4. Write trade fields directly into buffer_[head]
	// This is safe because the consumer has not reached head yet.
	auto& entry = buffer_[head];
	entry.trade_id = trade_id;
	entry.price = price;
	entry.quantity = quantity;
	entry.counterparty_id = counterparty_id;
	entry.timestamp_ns = timestamp_ns;

	// 5. Store the new next_head into head_ using memory_order_release
	// This publishes the write to the consumer thread.
	head_.store(next_head, std::memory_order_release);

	return true;
	}

	// ────────────────────────────────────────────────────────────────────
	template <typename Callback>
	size_t drain(Callback&& cb) {
	// 1. Load our own tail_ with memory_order_relaxed (we are the only writer)
	size_t tail = tail_.load(std::memory_order_relaxed);

	// 2. Load the producer's head_ using memory_order_acquire
	// to synchronize with published entries.
	const size_t head = head_.load(std::memory_order_acquire);

	// Nothing to drain?
	if (tail == head) {
	return 0;
	}

	// 3. Drain all available entries in a tight loop
	size_t count = 0;
	while (tail != head) {
	// 4. Invoke process_func directly (zero-copy)
	cb(buffer_[tail]);

	// 5. Increment tail (tail = (tail + 1) & (Capacity - 1))
	tail = (tail + 1) & MASK;
	++count;
	}

	// 6. After the loop, atomic store the final tail with memory_order_release
	// This notifies the producer that slots are now free.
	tail_.store(tail, std::memory_order_release);

	return count;
	}

	// ────────────────────────────────────────────────────────────────────
	// try_pop() — Consumer API (single)
	// ────────────────────────────────────────────────────────────────────
	bool try_pop(RingEntry& out) {
	const size_t tail = tail_.load(std::memory_order_relaxed);
	const size_t head = head_.load(std::memory_order_acquire);

	if (tail == head) {
	return false;
	}

	out = buffer_[tail];
	tail_.store((tail + 1) & MASK, std::memory_order_release);
	return true;
	}

	// ────────────────────────────────────────────────────────────────────
	// Status queries (Modulo arithmetic for wrapped pointers)
	// ────────────────────────────────────────────────────────────────────
	size_t size() const {
	const size_t head = head_.load(std::memory_order_acquire);
	const size_t tail = tail_.load(std::memory_order_acquire);

	if (head >= tail) {
	return head - tail;
	} else {
	return (Capacity - tail) + head;
	}
	}

	bool empty() const {
	return head_.load(std::memory_order_relaxed) == tail_.load(std::memory_order_relaxed);
	}

	bool full() const {
	return ((head_.load(std::memory_order_relaxed) + 1) & MASK) == tail_.load(std::memory_order_relaxed);
	}

	static constexpr size_t capacity() { return Capacity; }

	private:
	// ── CRITICAL: False-sharing prevention ──────────────────────────────
	//
	// head_ is written ONLY by the producer thread.
	// tail_ is written ONLY by the consumer thread.
	//
	// alignas(128) places them on separate 128-byte regions, ensuring
	// they never share a cache line pair. Without this, every producer
	// write to head_ would invalidate the consumer's cache line
	// containing tail_, and vice versa — a catastrophic performance bug
	// called "false sharing" that can reduce throughput by 10-50×.
	//
	// Why 128 instead of 64 (one cache line)?
	// Intel's spatial prefetcher fetches adjacent cache lines in pairs.
	// A 64-byte separation still allows the prefetcher to pull both
	// lines into the same core's L1, causing false sharing.
	// 128 bytes guarantees isolation.
	//
	alignas(128) std::atomic<uint64_t> head_{0}; // Producer writes
	alignas(128) std::atomic<uint64_t> tail_{0}; // Consumer writes

	// The ring buffer storage. Heap-allocated once at construction.
	// Each entry is 32-byte aligned for potential AVX loads.
	std::vector<RingEntry> buffer_;
	};