Qapdex
/

MiMo-V2.5-QEdge

text-generation-inference

Model card Files Files and versions

MiMo-V2.5-QEdge / main.cpp

Qapdex's picture

Update main.cpp

5bed82c verified 8 days ago

History Blame Contribute Delete

3.59 kB

	// project_natal/src/main.cpp
	#include <chrono>
	#include <cmath>
	#include <cstddef>
	#include <cstdint>
	#include <cstring>
	#include <iostream>
	#include <memory>
	#include <stdexcept>
	#include <string>
	#include <vector>

	/// Minimal bfloat16 storage type: keeps the upper 16 bits of a 32-bit float.
	struct bf16_t {
	    uint16_t bits;

	    /// Widens to float by placing `bits` in the high half of a 32-bit word
	    /// (the low 16 bits are zero).
	    float to_float() const {
	        static_assert(sizeof(float) == sizeof(uint32_t), "float must be 32 bits wide");
	        const uint32_t val_32 = static_cast<uint32_t>(bits) << 16;
	        float out;
	        // memcpy copies the bit pattern; casting the pointer to a value type
	        // (as the previous code did) does not compile / does not read the bits.
	        std::memcpy(&out, &val_32, sizeof(out));
	        return out;
	    }

	    /// Narrows a float to bf16 by truncation: the low 16 bits of the float's
	    /// bit pattern are dropped without rounding.
	    static bf16_t from_float(float f) {
	        static_assert(sizeof(float) == sizeof(uint32_t), "float must be 32 bits wide");
	        uint32_t val_32;
	        std::memcpy(&val_32, &f, sizeof(val_32));
	        bf16_t out;
	        out.bits = static_cast<uint16_t>(val_32 >> 16);
	        return out;
	    }
	};

	/// Weight matrix consumed by NatalEngine::mat_vec_multiply_branchless.
	struct NatalTernaryTensor {
	    std::string name;             // label for the tensor
	    std::vector<int8_t> weights;  // read row-major by the kernel: element (r, c) is weights[r * cols + c];
	                                  // the kernel treats 1 as +input, -1 as -input and multiplies anything else by zero
	    std::vector<float> scales;    // the kernel reads one scale per row (scales[r])
	    int rows = 0;                 // number of output rows; zero until set
	    int cols = 0;                 // number of input columns; zero until set
	};

	class NatalEngine {
	public:
	NatalEngine() {
	std::cout << "[Project Natal] Bit-Masking Engine aktiv. Verzweigungen eliminiert.\n";
	}

	// BRANCHLESS TERNARY KERNEL (Verhindert Pipeline-Stalls bei echten Gewichten)
	void mat_vec_multiply_branchless(const NatalTernaryTensor& matrix, const std::vector<bf16_t>& vec_in, std::vector<float>& vec_out) {
	vec_out.assign(matrix.rows, 0.0f);

	#pragma omp parallel for
	for (int r = 0; r < matrix.rows; ++r) {
	float sum = 0.0f;
	int row_offset = r * matrix.cols;
	float current_scale = matrix.scales[r];

	for (int c = 0; c < matrix.cols; ++c) {
	int8_t w = matrix.weights[row_offset + c];
	float val = vec_in[c].to_float();

	// Mathematischer Trick statt IF/ELSE:
	// Wenn w = 1 -> (1 * val) - (0 * val) = +val
	// Wenn w = -1 -> (0 * val) - (1 * val) = -val
	// Wenn w = 0 -> (0 * val) - (0 * val) = 0
	float is_pos = static_cast<float>(w == 1);
	float is_neg = static_cast<float>(w == -1);

	sum += (is_pos * val) - (is_neg * val);
	}
	vec_out[r] = sum * current_scale;
	}
	}

	void process_fused_qkv(const NatalTernaryTensor& fused_qkv, const std::vector<bf16_t>& hidden_states) {
	std::vector<float> qkv_output;
	auto start = std::chrono::high_resolution_clock::now();

	mat_vec_multiply_branchless(fused_qkv, hidden_states, qkv_output);

	auto end = std::chrono::high_resolution_clock::now();
	std::chrono::duration<double, std::milli> elapsed = end - start;

	std::cout << "[Natal - Performance] Zeit pro Schicht: " << elapsed.count() << " ms.\n";
	}
	};

	int main() {
	std::cout << "==================================================\n";
	std::cout << " PROJECT NATAL - BENCHMARK RUN (BRANCHLESS)\n";
	std::cout << "==================================================\n\n";

	NatalEngine engine;
	int hidden_dim = 4096;
	int fused_rows = hidden_dim * 3;

	NatalTernaryTensor mock_qkv;
	mock_qkv.name = "layers.0.attention.attn_qkv";
	mock_qkv.rows = fused_rows;
	mock_qkv.cols = hidden_dim;
	mock_qkv.weights.assign(fused_rows * hidden_dim, 0);
	mock_qkv.scales.assign(fused_rows, 0.02f);

	// Realistischeres Gewichtsrauschen simulieren
	for (size_t i = 0; i < mock_qkv.weights.size(); ++i) {
	if (i % 5 == 0) mock_qkv.weights[i] = 1;
	else if (i % 11 == 0) mock_qkv.weights[i] = -1;
	}

	std::vector<bf16_t> mock_hidden_states(hidden_dim);
	for (int i = 0; i < hidden_dim; ++i) {
	mock_hidden_states[i] = bf16_t::from_float(1.0f + std::cos(i * 0.05f));
	}

	engine.process_fused_qkv(mock_qkv, mock_hidden_states);
	return 0;
	}