Spaces:

NOT-OMEGA
/

Inference

Running

App Files Files Community

Inference / inference.cpp

NOT-OMEGA

Upload 4 files

472497b verified about 1 month ago

raw

history blame contribute delete

12.4 kB

	/*
	* OPTIMIZED SLM 50M INFERENCE ENGINE
	* Target: i3 11th Gen | Windows 11 | 8GB RAM
	* OpenMP Parallel + AVX2 Auto Vectorized
	*/

	#include <stdio.h>
	#include <stdlib.h>
	#include <math.h>
	#include <string.h>
	#include <time.h>
	#include <vector>
	#include <algorithm>
	#include <immintrin.h> // REQUIRED FOR AVX2 SIMD

	#ifdef _OPENMP
	#include <omp.h>
	#endif

	// ---------------------------------------------------------------------------
	// Config & Structures
	// ---------------------------------------------------------------------------

	// Model hyper-parameters.
	// main() fills this struct by reading five consecutive ints from the start
	// of model.bin, so the field order below is also the on-disk header order.
	typedef struct Config {
		int n_layer;     // number of transformer layers iterated by forward()
		int n_head;      // attention heads per layer (head size = n_embd / n_head)
		int n_embd;      // width of the embedding / residual vector
		int block_size;  // number of positions in the position table and KV cache
		int vocab_size;  // rows of the token-embedding table and length of logits
	} Config;

	// Views into the flat weight buffer loaded from model.bin.
	//
	// Single-pointer fields address one tensor. Double-pointer fields are
	// arrays holding one pointer per layer: map_weights() allocates them and
	// forward() reads them as W.field[l]. They must therefore be float**,
	// not scalar float.
	typedef struct {
		float* wte;            // token embedding table, vocab_size rows of n_embd
		float* wpe;            // position embedding table, block_size rows of n_embd

		float** ln1_w;         // [layer] -> first layer-norm weight (n_embd)
		float** ln1_b;         // [layer] -> first layer-norm bias (n_embd)
		float** c_attn_w;      // [layer] -> QKV projection matrix (3*n_embd x n_embd)
		float** c_attn_b;      // [layer] -> QKV projection bias (3*n_embd)
		float** c_proj_w;      // [layer] -> attention output projection (n_embd x n_embd)
		float** c_proj_b;      // [layer] -> attention output bias (n_embd)
		float** ln2_w;         // [layer] -> second layer-norm weight (n_embd)
		float** ln2_b;         // [layer] -> second layer-norm bias (n_embd)
		float** fc_w;          // [layer] -> MLP expansion matrix (4*n_embd x n_embd)
		float** fc_b;          // [layer] -> MLP expansion bias (4*n_embd)
		float** mlp_proj_w;    // [layer] -> MLP contraction matrix (n_embd x 4*n_embd)
		float** mlp_proj_b;    // [layer] -> MLP contraction bias (n_embd)

		float* ln_f_w;         // final layer-norm weight (n_embd)
		float* ln_f_b;         // final layer-norm bias (n_embd)
		float* lm_head_w;      // output projection, vocab_size rows of n_embd
	} Weights;

	// Attention key/value history.
	// forward() addresses both buffers as [layer][position][n_embd], i.e. the
	// row for (layer l, position t) starts at (l * block_size + t) * n_embd.
	typedef struct KVCache {
		float* k_cache;  // cached key vectors
		float* v_cache;  // cached value vectors
	} KVCache;

	// Hyper-parameters of the loaded model; main() reads them from the header
	// of model.bin before anything else uses them.
	static Config cfg;
	// Pointers into model_data_buffer for every tensor; filled by map_weights().
	static Weights W;
	// Raw weight payload of model.bin (everything after the 5-int header).
	// Allocated in main() and freed there before exit.
	static float* model_data_buffer = NULL;

	// ---------------------------------------------------------------------------
	// Math Kernels
	// ---------------------------------------------------------------------------

	// Layer normalisation of a single vector.
	//
	//   out[i] = (x[i] - mean(x)) / sqrt(var(x) + 1e-5) * w[i] + b[i]
	//
	// var(x) is the biased (divide-by-size) variance. `out`, `x`, `w` and `b`
	// each hold `size` floats. `out` may be the same buffer as `x` only if the
	// caller no longer needs the input, since it is overwritten element-wise.
	static void layer_norm(float* out, const float* x, const float* w, const float* b, int size) {
		// Pass 1: arithmetic mean of the input.
		float mu = 0.0f;
		for (int k = 0; k < size; k++) {
			mu += x[k];
		}
		mu /= size;

		// Pass 2: mean of squared deviations.
		float sq_dev = 0.0f;
		for (int k = 0; k < size; k++) {
			const float centered = x[k] - mu;
			sq_dev += centered * centered;
		}
		sq_dev /= size;

		// The small epsilon keeps the reciprocal finite for constant inputs.
		const float inv_std = 1.0f / sqrtf(sq_dev + 1e-5f);

		// Pass 3: normalise, then apply the learned gain and bias.
		for (int k = 0; k < size; k++) {
			out[k] = (x[k] - mu) * inv_std * w[k] + b[k];
		}
	}

	// OpenMP + AVX2 + FMA parallelized matmul
	static void matmul_vec(float* out, const float* mat, const float* x, int M, int K) {

	#pragma omp parallel for
	for (int i = 0; i < M; i++) {
	const float* row = mat + (long long)i * K;

	// Initialize a 256-bit vector with all zeros
	__m256 sum_vec = _mm256_setzero_ps();

	int j = 0;
	// Process 8 floats at a time
	for (; j <= K - 8; j += 8) {
	// Load 8 floats from the matrix row and the input vector
	__m256 m_val = _mm256_loadu_ps(&row[j]);
	__m256 x_val = _mm256_loadu_ps(&x[j]);

	// FMA (Fused Multiply-Add): sum_vec += m_val * x_val
	sum_vec = _mm256_fmadd_ps(m_val, x_val, sum_vec);
	}

	// Extract the 8 floats back out and sum them horizontally
	float sum_arr[8];
	_mm256_storeu_ps(sum_arr, sum_vec);
	float sum = sum_arr[0] + sum_arr[1] + sum_arr[2] + sum_arr[3] +
	sum_arr[4] + sum_arr[5] + sum_arr[6] + sum_arr[7];

	// Handle any leftover elements if K is not a multiple of 8
	for (; j < K; j++) {
	sum += row[j] * x[j];
	}

	out[i] = sum;
	}
	}

	// Adds the bias vector `b` to `x` in place; both hold N floats.
	// Iterations are independent and are split across OpenMP threads when
	// OpenMP is enabled.
	static void add_bias(float* x, const float* b, int N) {
		#pragma omp parallel for
		for (int idx = 0; idx < N; idx++) {
			x[idx] = x[idx] + b[idx];
		}
	}

	// Residual connection: accumulates `y` into `x` element-wise, in place.
	// Both buffers hold N floats; `y` is left untouched.
	static void residual_add(float* x, const float* y, int N) {
		#pragma omp parallel for
		for (int idx = 0; idx < N; idx++) {
			x[idx] = x[idx] + y[idx];
		}
	}

	// Applies the tanh approximation of GELU to each of the N floats in `x`,
	// overwriting the input:
	//
	//   gelu(v) = 0.5 * v * (1 + tanh(sqrt(2/pi) * (v + 0.044715 * v^3)))
	static void gelu_inplace(float* x, int N) {
		const float sqrt_2_over_pi = 0.7978845608f;

		#pragma omp parallel for
		for (int idx = 0; idx < N; idx++) {
			const float v = x[idx];
			const float inner = sqrt_2_over_pi * (v + 0.044715f * v * v * v);
			const float gate = tanhf(inner);
			x[idx] = 0.5f * v * (1.0f + gate);
		}
	}

	// Replaces the N values in `x` with their softmax.
	// The largest input is subtracted before exponentiation so expf() never
	// sees a positive argument. `x` must contain at least one element, because
	// x[0] is read unconditionally.
	static void softmax_inplace(float* x, int N) {
		// Locate the maximum for the numerical-stability shift.
		float peak = x[0];
		for (int idx = 1; idx < N; idx++) {
			if (peak < x[idx]) {
				peak = x[idx];
			}
		}

		// Exponentiate the shifted values and accumulate the normaliser.
		float total = 0.0f;
		for (int idx = 0; idx < N; idx++) {
			const float e = expf(x[idx] - peak);
			x[idx] = e;
			total += e;
		}

		// Normalise so the entries sum to one.
		for (int idx = 0; idx < N; idx++) {
			x[idx] /= total;
		}
	}

	// ---------------------------------------------------------------------------
	// Transformer Forward
	// ---------------------------------------------------------------------------

	// Runs the model for ONE token at sequence position `pos` and writes the
	// next-token logits.
	//
	//   token_id  row of W.wte to embed
	//   pos       row of W.wpe to add, and the KV-cache slot written by this call
	//   kv        key/value cache; rows 0..pos-1 of every layer are read, so
	//             they must already have been filled by earlier calls
	//   x         scratch, n_embd floats: the residual stream
	//   buf       scratch, n_embd floats: reused for layer-norm output,
	//             attention output and MLP output
	//   qkv_buf   scratch, 3*n_embd floats: q|k|v, later reused for the
	//             attention projection result
	//   attn_buf  scratch, n_head*block_size floats: one score row per head
	//   ff_buf    scratch, 4*n_embd floats: MLP hidden activations
	//   logits    output, vocab_size floats
	//
	// NOTE(review): token_id and pos are used as row indices without any
	// bounds check; callers must keep them inside vocab_size / block_size.
	// NOTE(review): the per-layer W.* fields are indexed with [l] and passed as
	// float* below, so they have to be arrays of per-layer pointers (float**).
	static void forward(
	int token_id,
	int pos,
	KVCache* kv,
	float* x,
	float* buf,
	float* qkv_buf,
	float* attn_buf,
	float* ff_buf,
	float* logits
	) {
	const int C = cfg.n_embd;
	const int H = cfg.n_head;
	// Per-head width (integer division of the embedding width by the head count).
	const int hs = C / H;

	// Input embedding: token row plus position row.
	float* content_row = W.wte + (long long)token_id * C;
	float* pos_row = W.wpe + (long long)pos * C;

	#pragma omp parallel for
	for (int i = 0; i < C; i++)
	x[i] = content_row[i] + pos_row[i];

	for (int l = 0; l < cfg.n_layer; l++) {

	// ---- Attention sub-block ----
	// Normalise the residual stream into buf.
	layer_norm(buf, x, W.ln1_w[l], W.ln1_b[l], C);

	// One fused projection produces q, k and v back to back in qkv_buf.
	matmul_vec(qkv_buf, W.c_attn_w[l], buf, 3 * C, C);
	add_bias(qkv_buf, W.c_attn_b[l], 3 * C);

	float* q = qkv_buf;
	float* k = qkv_buf + C;
	float* v = qkv_buf + 2 * C;

	// This layer's slice of the cache, laid out as [position][C].
	float* k_cache = kv->k_cache + (long long)l * cfg.block_size * C;
	float* v_cache = kv->v_cache + (long long)l * cfg.block_size * C;

	// Store this position's key/value so the loops below (and later calls) can read them.
	memcpy(k_cache + (long long)pos * C, k, C * sizeof(float));
	memcpy(v_cache + (long long)pos * C, v, C * sizeof(float));

	// Heads are independent: each reads its own q slice and writes a
	// disjoint slice of buf and of attn_buf.
	#pragma omp parallel for
	for (int h = 0; h < H; h++) {

	float* q_h = q + h * hs;
	float scale = 1.0f / sqrtf((float)hs);

	// Give each thread its own slice of the attention buffer
	float* local_attn = attn_buf + h * cfg.block_size;

	// Scaled dot-product scores against every cached key up to and including pos.
	for (int t = 0; t <= pos; t++) {
	float* k_h = k_cache + (long long)t * C + h * hs;
	float dot = 0.0f;
	for (int d = 0; d < hs; d++)
	dot += q_h[d] * k_h[d];

	local_attn[t] = dot * scale;
	}

	softmax_inplace(local_attn, pos + 1);

	// buf's layer-norm contents were already consumed by the QKV matmul,
	// so it is reused here to collect the per-head outputs.
	float* out_h = buf + h * hs;
	memset(out_h, 0, hs * sizeof(float));

	// Attention-weighted sum of the cached values.
	for (int t = 0; t <= pos; t++) {
	float* v_h = v_cache + (long long)t * C + h * hs;
	float a = local_attn[t];
	for (int d = 0; d < hs; d++)
	out_h[d] += a * v_h[d];
	}
	}

	// q/k/v are no longer needed, so the first C floats of qkv_buf hold the
	// projected attention output.
	float* attn_out = qkv_buf;
	matmul_vec(attn_out, W.c_proj_w[l], buf, C, C);
	add_bias(attn_out, W.c_proj_b[l], C);
	residual_add(x, attn_out, C);

	// ---- MLP sub-block ----
	layer_norm(buf, x, W.ln2_w[l], W.ln2_b[l], C);

	// Expand to 4*C, apply GELU, then project back to C.
	matmul_vec(ff_buf, W.fc_w[l], buf, 4 * C, C);
	add_bias(ff_buf, W.fc_b[l], 4 * C);
	gelu_inplace(ff_buf, 4 * C);

	matmul_vec(buf, W.mlp_proj_w[l], ff_buf, C, 4 * C);
	add_bias(buf, W.mlp_proj_b[l], C);
	residual_add(x, buf, C);
	}

	// Final layer norm followed by the output projection (no bias) to logits.
	layer_norm(buf, x, W.ln_f_w, W.ln_f_b, C);
	matmul_vec(logits, W.lm_head_w, buf, cfg.vocab_size, C);
	}

	// ---------------------------------------------------------------------------
	// Weight Mapping
	// ---------------------------------------------------------------------------

	// Allocates a table of `count` per-layer tensor pointers.
	// Prints ERROR_OOM and exits with status 1 if the allocation fails.
	static float** alloc_layer_table(int count) {
		float** table = (float**)malloc((size_t)count * sizeof(float*));
		// malloc(0) may legitimately return NULL, so only a non-empty
		// request counts as a failure.
		if (!table && count > 0) {
			printf("ERROR_OOM");
			exit(1);
		}
		return table;
	}

	// Points every field of the global `W` at its tensor inside `data`.
	//
	// `data` is the flat float payload of the model file and `cfg` must
	// already be populated. Tensors are expected in exactly this order:
	//   wte, wpe,
	//   per layer: ln1_w, ln1_b, c_attn_w, c_attn_b, c_proj_w, c_proj_b,
	//              ln2_w, ln2_b, fc_w, fc_b, mlp_proj_w, mlp_proj_b,
	//   ln_f_w, ln_f_b, lm_head_w.
	// No weights are copied; `data` must stay alive for as long as `W` is used.
	// The per-layer pointer tables are heap-allocated here and owned by `W`.
	static void map_weights(float* data) {

		const int C = cfg.n_embd;
		const int L = cfg.n_layer;
		float* ptr = data;

		// Embedding tables. Offsets are computed in 64 bits to avoid int overflow.
		W.wte = ptr; ptr += (long long)cfg.vocab_size * C;
		W.wpe = ptr; ptr += (long long)cfg.block_size * C;

		// One pointer slot per layer for each per-layer tensor.
		W.ln1_w      = alloc_layer_table(L);
		W.ln1_b      = alloc_layer_table(L);
		W.c_attn_w   = alloc_layer_table(L);
		W.c_attn_b   = alloc_layer_table(L);
		W.c_proj_w   = alloc_layer_table(L);
		W.c_proj_b   = alloc_layer_table(L);
		W.ln2_w      = alloc_layer_table(L);
		W.ln2_b      = alloc_layer_table(L);
		W.fc_w       = alloc_layer_table(L);
		W.fc_b       = alloc_layer_table(L);
		W.mlp_proj_w = alloc_layer_table(L);
		W.mlp_proj_b = alloc_layer_table(L);

		for (int l = 0; l < L; l++) {
			W.ln1_w[l] = ptr; ptr += C;
			W.ln1_b[l] = ptr; ptr += C;

			W.c_attn_w[l] = ptr; ptr += 3LL * C * C;
			W.c_attn_b[l] = ptr; ptr += 3LL * C;

			W.c_proj_w[l] = ptr; ptr += 1LL * C * C;
			W.c_proj_b[l] = ptr; ptr += C;

			W.ln2_w[l] = ptr; ptr += C;
			W.ln2_b[l] = ptr; ptr += C;

			W.fc_w[l] = ptr; ptr += 4LL * C * C;
			W.fc_b[l] = ptr; ptr += 4LL * C;

			W.mlp_proj_w[l] = ptr; ptr += 1LL * C * 4 * C;
			W.mlp_proj_b[l] = ptr; ptr += C;
		}

		// Final layer norm, then the output projection as the last tensor.
		W.ln_f_w = ptr; ptr += C;
		W.ln_f_b = ptr; ptr += C;

		W.lm_head_w = ptr;
	}

	// ---------------------------------------------------------------------------
	// MAIN
	// ---------------------------------------------------------------------------

	// Sampling this token id ends generation early.
	// NOTE(review): presumably the tokenizer's end-of-text id; confirm against
	// the tokenizer that produced the prompt ids.
	static const int EOS_TOKEN_ID = 50256;

	// Writes a short machine-readable error code to stdout (the channel the
	// existing ERROR_* codes use) and yields the process exit status 1.
	static int report_error(const char* code) {
		printf("%s", code);
		return 1;
	}

	// Command-line entry point.
	//
	//   argv[1]  prompt as comma-separated token ids, e.g. "15496,995"
	//   argv[2]  maximum number of tokens to generate
	//   argv[3]  optional sampling temperature (default 0.8, clamped to >= 0.01)
	//   argv[4]  optional top-k (default 40, clamped to [1, vocab_size])
	//
	// Loads ./model.bin (a 5-int Config header followed by float weights),
	// feeds the prompt through the model, then samples up to argv[2] tokens
	// with temperature + top-k sampling, printing each id followed by a space.
	// Generation stops early at EOS_TOKEN_ID or when the context is full.
	//
	// Returns 0 on success. On failure prints one of ERROR_ARGS,
	// ERROR_MODEL_NOT_FOUND, ERROR_MODEL_INVALID or ERROR_OOM and returns 1.
	int main(int argc, char* argv[]) {

		if (argc < 3)
			return report_error("ERROR_ARGS");

		// ------------------------------------------------------------------
		// Load and validate the model file
		// ------------------------------------------------------------------
		FILE* f = fopen("model.bin", "rb");
		if (!f)
			return report_error("ERROR_MODEL_NOT_FOUND");

		const long header_bytes = (long)(5 * sizeof(int));
		long file_size = -1;

		bool header_ok =
			fread(&cfg, sizeof(int), 5, f) == 5 &&
			fseek(f, 0, SEEK_END) == 0 &&
			(file_size = ftell(f)) > header_bytes &&
			fseek(f, header_bytes, SEEK_SET) == 0;

		// Reject headers that would cause division by zero or negative sizes.
		header_ok = header_ok &&
			cfg.n_layer >= 0 && cfg.n_head > 0 && cfg.n_embd > 0 &&
			cfg.block_size > 0 && cfg.vocab_size > 0;

		if (!header_ok) {
			fclose(f);
			return report_error("ERROR_MODEL_INVALID");
		}

		const size_t payload_bytes = (size_t)(file_size - header_bytes);

		// The payload must contain every tensor that map_weights() lays out
		// and forward() reads: wte + lm_head (2*V*C), wpe (B*C), per layer
		// 12*C*C + 13*C, and the final layer norm (2*C). Computed in double
		// so hostile header values cannot overflow integer arithmetic.
		const double width = (double)cfg.n_embd;
		const double needed_floats =
			2.0 * (double)cfg.vocab_size * width +
			(double)cfg.block_size * width +
			(double)cfg.n_layer * (12.0 * width * width + 13.0 * width) +
			2.0 * width;

		if (needed_floats > (double)(payload_bytes / sizeof(float))) {
			fclose(f);
			return report_error("ERROR_MODEL_INVALID");
		}

		model_data_buffer = (float*)malloc(payload_bytes);
		if (!model_data_buffer) {
			fclose(f);
			return report_error("ERROR_OOM");
		}

		const size_t bytes_read = fread(model_data_buffer, 1, payload_bytes, f);
		fclose(f);

		if (bytes_read != payload_bytes) {
			free(model_data_buffer);
			model_data_buffer = NULL;
			return report_error("ERROR_MODEL_INVALID");
		}

		map_weights(model_data_buffer);

		// ------------------------------------------------------------------
		// Parse arguments
		// ------------------------------------------------------------------
		std::vector<int> input_ids;
		bool prompt_ok = true;
		for (char* token = strtok(argv[1], ","); token; token = strtok(NULL, ",")) {
			const int id = atoi(token);
			// An out-of-range id would index outside the embedding table.
			if (id < 0 || id >= cfg.vocab_size) {
				prompt_ok = false;
				break;
			}
			input_ids.push_back(id);
		}

		// Leave at least one free position so a token can be generated.
		if (input_ids.size() >= (size_t)cfg.block_size)
			input_ids.resize(cfg.block_size - 1);

		// With no prompt token, forward() never runs and logits would be
		// sampled uninitialised.
		if (!prompt_ok || input_ids.empty()) {
			free(model_data_buffer);
			model_data_buffer = NULL;
			return report_error("ERROR_ARGS");
		}

		const int max_new_tokens = atoi(argv[2]);

		float temperature = (argc > 3) ? (float)atof(argv[3]) : 0.8f;
		int top_k = (argc > 4) ? atoi(argv[4]) : 40;
		if (temperature < 0.01f) temperature = 0.01f;
		if (top_k < 1) top_k = 1;
		if (top_k > cfg.vocab_size) top_k = cfg.vocab_size;

		srand((unsigned int)time(NULL));

		// ------------------------------------------------------------------
		// Allocate working buffers
		// ------------------------------------------------------------------
		const int C = cfg.n_embd;
		const size_t cache_floats = (size_t)cfg.n_layer * cfg.block_size * C;

		KVCache kv;
		kv.k_cache = (float*)calloc(cache_floats, sizeof(float));
		kv.v_cache = (float*)calloc(cache_floats, sizeof(float));

		float* x       = (float*)malloc((size_t)C * sizeof(float));
		float* buf     = (float*)malloc((size_t)C * sizeof(float));
		float* qkv_buf = (float*)malloc((size_t)3 * C * sizeof(float));

		// Allocate enough space for ALL heads to process simultaneously
		float* attn_buf = (float*)malloc((size_t)cfg.n_head * cfg.block_size * sizeof(float));

		float* ff_buf = (float*)malloc((size_t)4 * C * sizeof(float));
		float* logits = (float*)malloc((size_t)cfg.vocab_size * sizeof(float));

		auto release_buffers = [&]() {
			free(kv.k_cache);
			free(kv.v_cache);
			free(x);
			free(buf);
			free(qkv_buf);
			free(attn_buf);
			free(ff_buf);
			free(logits);
			free(model_data_buffer);
			model_data_buffer = NULL;
		};

		// calloc(0, ...) may return NULL when n_layer == 0; that is not an error.
		const bool cache_ok = (cache_floats == 0) || (kv.k_cache && kv.v_cache);
		if (!cache_ok || !x || !buf || !qkv_buf || !attn_buf || !ff_buf || !logits) {
			release_buffers();
			return report_error("ERROR_OOM");
		}

		// ------------------------------------------------------------------
		// Prefill: run the prompt through the model to populate the KV cache.
		// After the loop, logits describe the token that follows the prompt.
		// ------------------------------------------------------------------
		for (int i = 0; i < (int)input_ids.size(); i++)
			forward(input_ids[i], i, &kv, x, buf, qkv_buf, attn_buf, ff_buf, logits);

		int pos = (int)input_ids.size();

		// ------------------------------------------------------------------
		// Generation: temperature + top-k sampling
		// ------------------------------------------------------------------
		// Allocated once and refilled each step instead of reallocating.
		std::vector<std::pair<float, int>> pairs(cfg.vocab_size);

		for (int i = 0; i < max_new_tokens && pos < cfg.block_size; i++) {

			for (int v = 0; v < cfg.vocab_size; v++)
				pairs[v] = {logits[v] / temperature, v};

			// Move the top_k largest scaled logits to the front, descending.
			std::partial_sort(pairs.begin(), pairs.begin() + top_k, pairs.end(),
				[](const std::pair<float,int>& a, const std::pair<float,int>& b) {
					return a.first > b.first;
				});

			// Softmax over the top_k candidates. Subtracting the maximum
			// (pairs[0] after the sort) keeps expf() from overflowing to inf
			// at low temperatures, which would turn the probabilities to NaN.
			const float max_scaled = pairs[0].first;
			float sum = 0.0f;
			for (int j = 0; j < top_k; j++) {
				pairs[j].first = expf(pairs[j].first - max_scaled);
				sum += pairs[j].first;
			}
			for (int j = 0; j < top_k; j++)
				pairs[j].first /= sum;

			// Inverse-CDF sampling with r in [0, 1). Falls back to the most
			// likely candidate if rounding leaves the cumulative sum below r.
			float r = (float)rand() / ((float)RAND_MAX + 1.0f);
			float cum = 0.0f;
			int best = pairs[0].second;
			for (int j = 0; j < top_k; j++) {
				cum += pairs[j].first;
				if (r < cum) {
					best = pairs[j].second;
					break;
				}
			}

			printf("%d ", best);

			if (best == EOS_TOKEN_ID)
				break;

			forward(best, pos, &kv, x, buf, qkv_buf, attn_buf, ff_buf, logits);
			pos++;
		}

		release_buffers();
		return 0;
	}