| |
#include <chrono>
#include <cmath>
#include <cstddef>
#include <cstdint>
#include <cstring>
#include <iostream>
#include <memory>
#include <stdexcept>
#include <string>
#include <vector>
|
|
/// Minimal bfloat16 value: stores the upper 16 bits of a 32-bit float.
///
/// Conversions copy the object representation with std::memcpy. Reading a
/// uint32_t through a float* (or the reverse) via reinterpret_cast violates
/// the strict-aliasing rule and is undefined behaviour.
struct bf16_t {
    static_assert(sizeof(float) == sizeof(std::uint32_t),
                  "bf16_t requires a 32-bit float");

    /// Raw storage: the high half of the corresponding 32-bit float pattern.
    std::uint16_t bits;

    /// Widens to float by placing `bits` in the high 16 bits of a 32-bit
    /// word; the low 16 bits are zero, so the conversion is exact.
    float to_float() const {
        const std::uint32_t widened = static_cast<std::uint32_t>(bits) << 16;
        float result;
        std::memcpy(&result, &widened, sizeof(result));
        return result;
    }

    /// Narrows a float by keeping only its high 16 bits.
    ///
    /// The low 16 bits are discarded (truncation); no rounding is applied.
    static bf16_t from_float(float f) {
        std::uint32_t raw;
        std::memcpy(&raw, &f, sizeof(raw));
        bf16_t out;
        out.bits = static_cast<std::uint16_t>(raw >> 16);
        return out;
    }
};
|
|
/// Named weight matrix with one scale factor per row.
///
/// `rows` and `cols` default to 0 so that a default-constructed tensor is a
/// well-defined empty matrix instead of carrying indeterminate dimensions.
struct NatalTernaryTensor {
    /// Identifier of the tensor.
    std::string name;
    /// Weights stored row-major: element (r, c) lives at index r * cols + c.
    std::vector<std::int8_t> weights;
    /// Per-row scale; entry r multiplies the accumulated sum of row r.
    std::vector<float> scales;
    /// Number of matrix rows.
    int rows = 0;
    /// Number of matrix columns.
    int cols = 0;
};
|
|
| class NatalEngine { |
| public: |
| NatalEngine() { |
| std::cout << "[Project Natal] Bit-Masking Engine aktiv. Verzweigungen eliminiert.\n"; |
| } |
|
|
| |
| void mat_vec_multiply_branchless(const NatalTernaryTensor& matrix, const std::vector<bf16_t>& vec_in, std::vector<float>& vec_out) { |
| vec_out.assign(matrix.rows, 0.0f); |
|
|
| #pragma omp parallel for |
| for (int r = 0; r < matrix.rows; ++r) { |
| float sum = 0.0f; |
| int row_offset = r * matrix.cols; |
| float current_scale = matrix.scales[r]; |
|
|
| for (int c = 0; c < matrix.cols; ++c) { |
| int8_t w = matrix.weights[row_offset + c]; |
| float val = vec_in[c].to_float(); |
| |
| |
| |
| |
| |
| float is_pos = static_cast<float>(w == 1); |
| float is_neg = static_cast<float>(w == -1); |
| |
| sum += (is_pos * val) - (is_neg * val); |
| } |
| vec_out[r] = sum * current_scale; |
| } |
| } |
|
|
| void process_fused_qkv(const NatalTernaryTensor& fused_qkv, const std::vector<bf16_t>& hidden_states) { |
| std::vector<float> qkv_output; |
| auto start = std::chrono::high_resolution_clock::now(); |
| |
| mat_vec_multiply_branchless(fused_qkv, hidden_states, qkv_output); |
| |
| auto end = std::chrono::high_resolution_clock::now(); |
| std::chrono::duration<double, std::milli> elapsed = end - start; |
|
|
| std::cout << "[Natal - Performance] Zeit pro Schicht: " << elapsed.count() << " ms.\n"; |
| } |
| }; |
|
|
| int main() { |
| std::cout << "==================================================\n"; |
| std::cout << " PROJECT NATAL - BENCHMARK RUN (BRANCHLESS)\n"; |
| std::cout << "==================================================\n\n"; |
|
|
| NatalEngine engine; |
| int hidden_dim = 4096; |
| int fused_rows = hidden_dim * 3; |
|
|
| NatalTernaryTensor mock_qkv; |
| mock_qkv.name = "layers.0.attention.attn_qkv"; |
| mock_qkv.rows = fused_rows; |
| mock_qkv.cols = hidden_dim; |
| mock_qkv.weights.assign(fused_rows * hidden_dim, 0); |
| mock_qkv.scales.assign(fused_rows, 0.02f); |
|
|
| |
| for (size_t i = 0; i < mock_qkv.weights.size(); ++i) { |
| if (i % 5 == 0) mock_qkv.weights[i] = 1; |
| else if (i % 11 == 0) mock_qkv.weights[i] = -1; |
| } |
|
|
| std::vector<bf16_t> mock_hidden_states(hidden_dim); |
| for (int i = 0; i < hidden_dim; ++i) { |
| mock_hidden_states[i] = bf16_t::from_float(1.0f + std::cos(i * 0.05f)); |
| } |
|
|
| engine.process_fused_qkv(mock_qkv, mock_hidden_states); |
| return 0; |
| } |
|
|