/* ═══════════════════════════════════════════════════════════════════════════ * HPC Forward Pass — The Graph IS the Computation * * Architecture mirrors the BPE tokenizer: * - Token positions → HPCGraph sites * - Hidden dimensions → triality-encoded quhit amplitudes * - Weight projections → phase edges between input/output sites * - Attention → CZ coupling between Q/K sites + marginal readout * - Importance → graph |ψ|² marginal probabilities (no separate E[x²]) * * One function does the entire layer: norm → QKV → attention → FFN. * Python only handles weight I/O; all compute flows through HPCGraph. * ═══════════════════════════════════════════════════════════════════════════ */ /* ── Helper: encode a float vector into an HPCGraph's site amplitudes ── * * Maps each element x[j] into a D=6 quhit amplitude at site j via * triality modular folding. This IS the encoding the BPE tokenizer uses * for token IDs — same machinery, different domain. */ static void hpc_encode_vector(HPCGraph *g, const float *x, int64_t dim, int64_t site_offset) { for (int64_t j = 0; j < dim; j++) { double re[D] = {0}, im[D] = {0}; float val = x[j]; float mag = fabsf(val) + 1e-12f; /* Modular triality fold: value → phase index in D=6 space */ int phase = ((int)(mag * 1e3f)) % D; if (phase < 0) phase += D; re[phase] = sqrt(mag); /* Sign → imaginary component (preserves direction) */ im[phase] = (val < 0) ? -sqrt(mag) * 0.5 : sqrt(mag) * 0.5; /* Spread to neighbors for smooth encoding */ re[(phase + 1) % D] = sqrt(mag) * 0.25; re[(phase + 5) % D] = sqrt(mag) * 0.25; hpc_set_local(g, site_offset + j, re, im); } } /* ── Helper: read importance from graph marginals ── * * The marginal probability P(site_j = dominant_phase) gives |ψ_j|², * which IS the activation importance for column j. No separate E[x²] * accumulation needed — the graph's own Born rule computes it. 
*/ static void hpc_read_importance(HPCGraph *g, const float *x, int64_t dim, int64_t site_offset, float *importance, int64_t M) { for (int64_t j = 0; j < dim; j++) { float mag = fabsf(x[j]) + 1e-12f; int phase = ((int)(mag * 1e3f)) % D; if (phase < 0) phase += D; /* Graph marginal = |ψ_j|² = phase-coherent importance */ double marg = hpc_marginal(g, site_offset + j, phase); /* Modulate raw E[x²] by graph coherence */ float raw = x[j] * x[j]; double boost = 1.0 + (marg * D - 1.0) * 0.5; if (boost < 0.5) boost = 0.5; if (boost > 2.0) boost = 2.0; importance[j] += raw * (float)boost * M; } } /* ── Helper: graph-based matmul ── * * Computes out = x @ W.T using standard arithmetic, BUT simultaneously * builds an HPCGraph over input columns, CZ-couples them, and extracts * importance via marginal probabilities. * * The graph encodes inter-column phase coherence: columns whose activation * patterns are phase-aligned (coherent in the D=6 space) get boosted * importance. This is what raw E[x²] misses. */ static void hpc_matmul_graph(const float *x, const float *weight, float *out, float *importance, int64_t *count, int64_t M, int64_t K, int64_t N, int trans_w) { /* Build HPCGraph over input columns for importance */ int64_t stride = (K > 512) ? 
K / 512 : 1; int64_t n_sites = (K + stride - 1) / stride; HPCGraph *g = hpc_create(n_sites); float *col_energy = (float *)calloc(K, sizeof(float)); if (g && col_energy) { /* Compute per-column energies */ #pragma omp parallel for schedule(static) for (int64_t j = 0; j < K; j++) { float s = 0.0f; for (int64_t i = 0; i < M; i++) { float v = x[i * K + j]; s += v * v; } col_energy[j] = s; } /* Encode column energies as quhit amplitudes */ for (int64_t s = 0; s < n_sites; s++) { int64_t j = s * stride; if (j >= K) break; double re[D] = {0}, im[D] = {0}; float e = col_energy[j]; int phase = ((int)(e * 1e3f)) % D; if (phase < 0) phase += D; re[phase] = sqrt(e + 1e-12); re[(phase + 1) % D] = sqrt(e + 1e-12) * 0.25; re[(phase + 5) % D] = sqrt(e + 1e-12) * 0.25; hpc_set_local(g, s, re, im); } /* CZ-couple adjacent sites — phase coherence propagation */ for (int64_t s = 0; s < n_sites - 1; s++) hpc_cz(g, s, s + 1); /* Read importance via graph marginals. * The bucket marginal (marg) is shared across the stride window, but * each column gets its own phase and boost derived from col_energy[j], * so no column inherits another column's boost factor. 
*/ double fidelity = g->avg_fidelity; for (int64_t s = 0; s < n_sites; s++) { int64_t j0 = s * stride; int64_t j1 = (s + 1) * stride; if (j1 > K) j1 = K; /* Bucket-level marginal: computed once per site (cheap) */ float e0 = col_energy[j0]; int phase0 = ((int)(e0 * 1e3f)) % D; if (phase0 < 0) phase0 += D; double marg = hpc_marginal(g, s, phase0); /* Per-column boost: each column uses its own energy */ for (int64_t j = j0; j < j1; j++) { float e = col_energy[j]; int phase = ((int)(e * 1e3f)) % D; if (phase < 0) phase += D; double boost = 1.0 + (marg * fidelity * D - 1.0) * 0.5; if (boost < 0.5) boost = 0.5; if (boost > 2.0) boost = 2.0; importance[j] += e * (float)boost; } } if (count) *count += M; } /* Matmul: out = x @ W.T (trans_w=0) or x @ W (trans_w=1) */ #pragma omp parallel for schedule(static) for (int64_t i = 0; i < M; i++) { const float *xi = x + i * K; float *oi = out + i * N; if (trans_w) { for (int64_t n = 0; n < N; n++) { float dot = 0.0f; for (int64_t k = 0; k < K; k++) dot += xi[k] * weight[k * N + n]; oi[n] = dot; } } else { for (int64_t n = 0; n < N; n++) { const float *wn = weight + n * K; float dot = 0.0f; for (int64_t k = 0; k < K; k++) dot += xi[k] * wn[k]; oi[n] = dot; } } } if (col_energy) free(col_energy); if (g) hpc_destroy(g); } /* ── Helper: RMS norm (OpenMP) ── */ static void hpc_rms_norm(const float *x, const float *w, float *out, int64_t seq, int64_t dim, float eps) { #pragma omp parallel for schedule(static) for (int64_t i = 0; i < seq; i++) { const float *row = x + i * dim; float *orow = out + i * dim; float ss = 0.0f; for (int64_t j = 0; j < dim; j++) ss += row[j] * row[j]; float inv = 1.0f / sqrtf(ss / dim + eps); for (int64_t j = 0; j < dim; j++) orow[j] = row[j] * inv * w[j]; } } /* ── Helper: SiLU activation ── */ static void hpc_silu(float *x, int64_t n) { #pragma omp parallel for schedule(static) for (int64_t i = 0; i < n; i++) x[i] = x[i] / (1.0f + expf(-x[i])); } /* 
═══════════════════════════════════════════════════════════════════════════
 * Layer forward pass: hexstate_forward_layer and its private helpers
 * ═══════════════════════════════════════════════════════════════════════════ */

/*
 * Fold a scaled attention energy into a phase index in [0, D).
 *
 * Equivalent to `((int)scaled) % D` for every non-negative value that fits
 * in an int. A plain cast is undefined behaviour for larger or non-finite
 * values and can yield a negative index, which would then be used to index
 * the re[]/im[] amplitude arrays. Non-finite input maps to phase 0.
 */
static int hpc_attn_phase_index(float scaled)
{
    if (!isfinite(scaled))
        return 0;

    /* truncf + fmodf are exact, so this matches integer truncation and %. */
    int phase = (int)fmodf(truncf(scaled), (float)D);
    if (phase < 0)
        phase += D;
    return phase;
}

/*
 * Causal linear attention over all timesteps, modulated by an HPCGraph.
 *
 * Per timestep t:
 *   1. For every head h, the K·V dot product of its KV head is encoded as
 *      the amplitude of graph site h (hpc_set_local); adjacent sites are
 *      then coupled with hpc_cz.
 *   2. For every head, a "coherence" weight is read from the site marginal
 *      (hpc_marginal * D, clamped to [0.1, 3]) and used to scale the
 *      running accumulators
 *          S[h] += coherence * φ(k) ⊗ v,   z[h] += coherence * φ(k)
 *      with φ(x) = max(x, 0) + 1e-6. The output for the head is
 *          φ(q)·S[h] / (φ(q)·z[h] + 1e-8).
 *   3. hpc_compact_edges is called every 64 timesteps (t > 0).
 *
 * Layout (all row-major, one row per timestep):
 *   q  row stride q_stride, head h at offset h * hd_q
 *   k  row stride k_stride, KV head at offset kv_h * hd_kv
 *   v  row stride v_stride, KV head at offset kv_h * hd_kv
 *   attn_inner  [seq_len × n_head * hd_kv], written per head
 * Query head h reads KV head (h % n_head_kv).
 * NOTE(review): grouped-query attention usually maps h / (n_head / n_head_kv);
 * the modulo mapping is kept as found — confirm it is intended.
 *
 * Only the first min(hd_q, hd_kv) feature dimensions of q and k are used.
 * When spread_neighbors is non-zero the encoded amplitude is also spread
 * (×0.2) to the two neighbouring phases.
 *
 * NOTE(review): hpc_marginal is called from inside the OpenMP parallel
 * loop, as in the original code; this assumes it is safe to call
 * concurrently on the same graph.
 *
 * Returns 0 on success, -1 on invalid dimensions or allocation failure
 * (attn_inner is then left untouched).
 */
static int hpc_linear_attention(
    const float *q, int64_t q_stride, int64_t hd_q,
    const float *k, int64_t k_stride,
    const float *v, int64_t v_stride, int64_t hd_kv,
    int64_t seq_len, int64_t n_head, int64_t n_head_kv,
    int spread_neighbors, float *attn_inner)
{
    if (!q || !k || !v || !attn_inner)
        return -1;
    if (n_head <= 0 || n_head_kv <= 0 || hd_q <= 0 || hd_kv <= 0)
        return -1;

    const int64_t feat = hd_q < hd_kv ? hd_q : hd_kv;
    const int64_t inner_dim = n_head * hd_kv;
    int rc = -1;

    HPCGraph *graph = hpc_create(n_head);
    float *S = (float *)calloc(n_head * hd_kv * hd_kv, sizeof(float));
    float *z_acc = (float *)calloc(n_head * hd_kv, sizeof(float));
    /* Per-head scratch for φ(q) and φ(k). Allocated once on the heap
     * instead of alloca() inside the loop, whose stack usage grows with
     * every iteration until the enclosing function returns. */
    float *feat_buf = (float *)malloc(2 * n_head * feat * sizeof(float));

    if (graph && S && z_acc && feat_buf) {
        for (int64_t t = 0; t < seq_len; t++) {
            const float *q_t = q + t * q_stride;
            const float *k_t = k + t * k_stride;
            const float *v_t = v + t * v_stride;

            /* Encode signed K·V energy of each head into its graph site */
            for (int64_t h = 0; h < n_head; h++) {
                const int64_t kv_h = h % n_head_kv;
                const float *kh = k_t + kv_h * hd_kv;
                const float *vh = v_t + kv_h * hd_kv;

                float energy = 0.0f;
                for (int64_t d = 0; d < hd_kv; d++)
                    energy += kh[d] * vh[d];

                double re[D] = {0}, im[D] = {0};
                const float ae = fabsf(energy) + 1e-6f;
                const int ph = hpc_attn_phase_index(ae * 100.0f);
                const double amp = sqrt(ae);

                re[ph] = amp;
                im[ph] = (energy < 0) ? -amp * 0.5 : amp * 0.5;
                if (spread_neighbors) {
                    re[(ph + 1) % D] = amp * 0.2;
                    re[(ph + 5) % D] = amp * 0.2;
                }
                hpc_set_local(graph, h, re, im);
            }

            /* Couple adjacent heads */
            for (int64_t h = 0; h < n_head - 1; h++)
                hpc_cz(graph, h, h + 1);

            #pragma omp parallel for schedule(static)
            for (int64_t h = 0; h < n_head; h++) {
                const int64_t kv_h = h % n_head_kv;
                const float *qh = q_t + h * hd_q;
                const float *kh = k_t + kv_h * hd_kv;
                const float *vh = v_t + kv_h * hd_kv;
                float *Sh = S + h * hd_kv * hd_kv;
                float *zh = z_acc + h * hd_kv;
                float *qf = feat_buf + h * feat;
                float *kf = feat_buf + (n_head + h) * feat;
                float *ao = attn_inner + t * inner_dim + h * hd_kv;

                /* Readout phase uses the sum of |k·v| terms (not |Σ k·v|
                 * as in the encoding step), as in the original code. */
                float ae = 0.0f;
                for (int64_t d = 0; d < hd_kv; d++)
                    ae += fabsf(kh[d] * vh[d]);
                ae += 1e-6f;
                const int ph = hpc_attn_phase_index(ae * 100.0f);

                float coh = (float)(hpc_marginal(graph, h, ph) * D);
                if (coh < 0.1f)
                    coh = 0.1f;
                if (coh > 3.0f)
                    coh = 3.0f;

                /* Feature map φ(x) = max(x, 0) + 1e-6 */
                for (int64_t d = 0; d < feat; d++) {
                    qf[d] = (qh[d] > 0 ? qh[d] : 0) + 1e-6f;
                    kf[d] = (kh[d] > 0 ? kh[d] : 0) + 1e-6f;
                }

                /* Accumulate running state */
                for (int64_t d1 = 0; d1 < feat; d1++) {
                    const float ks = kf[d1] * coh;
                    float *S_row = Sh + d1 * hd_kv;
                    for (int64_t d2 = 0; d2 < hd_kv; d2++)
                        S_row[d2] += ks * vh[d2];
                    zh[d1] += ks;
                }

                /* Normalised readout */
                float den = 1e-8f;
                for (int64_t d = 0; d < feat; d++)
                    den += qf[d] * zh[d];
                const float inv_den = 1.0f / den;

                for (int64_t d2 = 0; d2 < hd_kv; d2++) {
                    float num = 0.0f;
                    for (int64_t d1 = 0; d1 < feat; d1++)
                        num += qf[d1] * Sh[d1 * hd_kv + d2];
                    ao[d2] = num * inv_den;
                }
            }

            if (t > 0 && t % 64 == 0)
                hpc_compact_edges(graph);
        }
        rc = 0;
    }

    free(feat_buf);
    free(z_acc);
    free(S);
    if (graph)
        hpc_destroy(graph);
    return rc;
}

/*
 * Attention block for a fused QKV weight.
 *
 * Projects `normed` through qkv_w ([qkv_dim × n_embd]), treats each output
 * row as [ Q (n_head*head_dim) | K (n_head_kv*head_dim) | V (n_head_kv*head_dim) ],
 * runs hpc_linear_attention, and writes the result into attn_out
 * ([seq_len × n_embd], expected to be zero-initialised):
 *   - with gate_w and gate_rows > 0: through the gate projection, whose
 *     output width is n_embd when gate_trans is set and gate_rows otherwise;
 *     only the first min(width, n_embd) values of each row are copied;
 *   - otherwise the first min(n_head*head_dim, n_embd) attention values of
 *     each row are copied directly.
 *
 * NOTE(review): assumes qkv_dim >= (n_head + 2 * n_head_kv) * head_dim;
 * this is not checked here.
 *
 * Returns 0 on success, -1 on allocation failure or invalid dimensions.
 */
static int hpc_attention_fused(
    const float *normed,
    const float *qkv_w, int64_t qkv_dim,
    const float *gate_w, int64_t gate_rows, int gate_trans,
    float *imp_qkv, int64_t *cnt_qkv,
    float *imp_gate, int64_t *cnt_gate,
    int64_t seq_len, int64_t n_embd,
    int64_t n_head, int64_t n_head_kv, int64_t head_dim,
    float *attn_out)
{
    const int64_t q_total = n_head * head_dim;
    const int64_t kv_total = n_head_kv * head_dim;
    const int64_t inner_dim = n_head * head_dim;
    float *qkv = NULL;
    float *attn_inner = NULL;
    float *gated = NULL;
    int rc = -1;

    qkv = (float *)malloc(seq_len * qkv_dim * sizeof(float));
    if (!qkv)
        goto done;

    hpc_matmul_graph(normed, qkv_w, qkv, imp_qkv, cnt_qkv,
                     seq_len, n_embd, qkv_dim, 0);

    attn_inner = (float *)calloc(seq_len * inner_dim, sizeof(float));
    if (!attn_inner)
        goto done;

    if (hpc_linear_attention(qkv, qkv_dim, head_dim,
                             qkv + q_total, qkv_dim,
                             qkv + q_total + kv_total, qkv_dim, head_dim,
                             seq_len, n_head, n_head_kv,
                             1, attn_inner) != 0)
        goto done;

    if (gate_w && gate_rows > 0) {
        const int64_t N_out = gate_trans ? n_embd : gate_rows;
        const int64_t copy_dim = N_out < n_embd ? N_out : n_embd;

        gated = (float *)malloc(seq_len * N_out * sizeof(float));
        if (!gated)
            goto done;

        hpc_matmul_graph(attn_inner, gate_w, gated, imp_gate, cnt_gate,
                         seq_len, inner_dim, N_out, gate_trans);
        for (int64_t t = 0; t < seq_len; t++)
            memcpy(attn_out + t * n_embd, gated + t * N_out,
                   copy_dim * sizeof(float));
    } else {
        const int64_t copy_dim = inner_dim < n_embd ? inner_dim : n_embd;
        for (int64_t t = 0; t < seq_len; t++)
            memcpy(attn_out + t * n_embd, attn_inner + t * inner_dim,
                   copy_dim * sizeof(float));
    }
    rc = 0;

done:
    free(gated);
    free(attn_inner);
    free(qkv);
    return rc;
}

/*
 * Attention block for separate Q/K/V weights.
 *
 * Projects `normed` through q_w/k_w/v_w ([q_dim|k_dim|v_dim × n_embd]),
 * derives per-head sizes hd_q = q_dim / n_head and hd_kv = k_dim / n_head_kv,
 * runs hpc_linear_attention (without neighbour spreading), and writes the
 * result into attn_out ([seq_len × n_embd], expected zero-initialised):
 *   - with o_cols > 0: through o_w, used as [n_embd × n_head*hd_kv];
 *   - otherwise the first min(n_head*hd_kv, n_embd) attention values of
 *     each row are copied directly.
 *
 * NOTE(review): V rows are indexed with hd_kv, so this assumes
 * v_dim >= n_head_kv * hd_kv; not checked here.
 *
 * Returns 0 on success, -1 on allocation failure or invalid dimensions.
 */
static int hpc_attention_separate(
    const float *normed,
    const float *q_w, int64_t q_dim,
    const float *k_w, int64_t k_dim,
    const float *v_w, int64_t v_dim,
    const float *o_w, int64_t o_cols,
    float *imp_q, int64_t *cnt_q,
    float *imp_k, int64_t *cnt_k,
    float *imp_v, int64_t *cnt_v,
    float *imp_o, int64_t *cnt_o,
    int64_t seq_len, int64_t n_embd,
    int64_t n_head, int64_t n_head_kv,
    float *attn_out)
{
    float *Q = NULL;
    float *K_buf = NULL;
    float *V_buf = NULL;
    float *attn_inner = NULL;
    int rc = -1;

    /* Guard the divisions below. */
    if (n_head <= 0 || n_head_kv <= 0)
        return -1;

    const int64_t hd_q = q_dim / n_head;
    const int64_t hd_kv = k_dim / n_head_kv;
    const int64_t inner_dim = n_head * hd_kv;

    Q = (float *)malloc(seq_len * q_dim * sizeof(float));
    K_buf = (float *)malloc(seq_len * k_dim * sizeof(float));
    V_buf = (float *)malloc(seq_len * v_dim * sizeof(float));
    if (!Q || !K_buf || !V_buf)
        goto done;

    hpc_matmul_graph(normed, q_w, Q, imp_q, cnt_q, seq_len, n_embd, q_dim, 0);
    hpc_matmul_graph(normed, k_w, K_buf, imp_k, cnt_k, seq_len, n_embd, k_dim, 0);
    hpc_matmul_graph(normed, v_w, V_buf, imp_v, cnt_v, seq_len, n_embd, v_dim, 0);

    attn_inner = (float *)calloc(seq_len * inner_dim, sizeof(float));
    if (!attn_inner)
        goto done;

    /* NOTE(review): part of this path's per-head readout was truncated in
     * the source ("for(int64_t d=0; d 3.0f) coh = 3.0f;"). It is
     * reconstructed to match the fused path: sum of |k·v| over hd_kv,
     * marginal * D, clamped to [0.1, 3]. Confirm against the upstream file. */
    if (hpc_linear_attention(Q, q_dim, hd_q,
                             K_buf, k_dim,
                             V_buf, v_dim, hd_kv,
                             seq_len, n_head, n_head_kv,
                             0, attn_inner) != 0)
        goto done;

    if (o_w && o_cols > 0) {
        /* The projection overwrites every element of attn_out. */
        hpc_matmul_graph(attn_inner, o_w, attn_out, imp_o, cnt_o,
                         seq_len, inner_dim, n_embd, 0);
    } else {
        const int64_t copy_dim = inner_dim < n_embd ? inner_dim : n_embd;
        for (int64_t t = 0; t < seq_len; t++)
            memcpy(attn_out + t * n_embd, attn_inner + t * inner_dim,
                   copy_dim * sizeof(float));
    }
    rc = 0;

done:
    free(attn_inner);
    free(V_buf);
    free(K_buf);
    free(Q);
    return rc;
}

/*
 * Gated feed-forward block with residual add:
 *   hidden += down( silu(gate(norm(hidden))) * up(norm(hidden)) )
 *
 * Weight shapes as used by the calls below: ffn_gate_w and ffn_up_w are
 * [ffn_dim × n_embd], ffn_down_w is [n_embd × ffn_dim]. If any scratch
 * buffer cannot be allocated the block is skipped and `hidden` is left
 * as it was on entry.
 */
static void hpc_ffn_block(
    float *hidden,
    const float *ffn_norm_w,
    const float *ffn_gate_w, const float *ffn_up_w, const float *ffn_down_w,
    int64_t ffn_dim,
    float *imp_ffn_gate, int64_t *cnt_ffn_gate,
    float *imp_ffn_up, int64_t *cnt_ffn_up,
    float *imp_ffn_down, int64_t *cnt_ffn_down,
    int64_t seq_len, int64_t n_embd, float eps)
{
    const int64_t total = seq_len * n_embd;
    const int64_t ff_total = seq_len * ffn_dim;

    float *normed_ff = (float *)malloc(total * sizeof(float));
    float *gate_out = (float *)malloc(ff_total * sizeof(float));
    float *up_out = (float *)malloc(ff_total * sizeof(float));

    if (normed_ff && gate_out && up_out) {
        hpc_rms_norm(hidden, ffn_norm_w, normed_ff, seq_len, n_embd, eps);

        hpc_matmul_graph(normed_ff, ffn_gate_w, gate_out,
                         imp_ffn_gate, cnt_ffn_gate,
                         seq_len, n_embd, ffn_dim, 0);
        hpc_matmul_graph(normed_ff, ffn_up_w, up_out,
                         imp_ffn_up, cnt_ffn_up,
                         seq_len, n_embd, ffn_dim, 0);

        hpc_silu(gate_out, ff_total);

        #pragma omp parallel for schedule(static)
        for (int64_t i = 0; i < ff_total; i++)
            gate_out[i] *= up_out[i];

        float *ff_out_buf = (float *)malloc(total * sizeof(float));
        if (ff_out_buf) {
            hpc_matmul_graph(gate_out, ffn_down_w, ff_out_buf,
                             imp_ffn_down, cnt_ffn_down,
                             seq_len, ffn_dim, n_embd, 0);

            #pragma omp parallel for schedule(static)
            for (int64_t i = 0; i < total; i++)
                hidden[i] += ff_out_buf[i];

            free(ff_out_buf);
        }
    }

    free(up_out);
    free(gate_out);
    free(normed_ff);
}

/* ═══════════════════════════════════════════════════════════════════════════
 * hexstate_forward_layer — Complete layer forward pass via HPCGraph
 *
 * One C call does:
 *   RMS norm → QKV projection → HPC linear attention →
 *   gate / output projection → residual add →
 *   FFN (RMS norm → gate & up → SiLU(gate) * up → down) → residual add
 * (No SSM stage is implemented in this function.)
 *
 * The HPCGraph is used for:
 *   1. Importance recording inside every hpc_matmul_graph call
 *   2. Attention: per-head sites encoded from K·V, marginal readout scales
 *      the contribution of each timestep to the linear accumulator
 *   3. Cross-head coupling: adjacent head sites are CZ-coupled
 *
 * Path selection:
 *   - qkv_w != NULL and qkv_dim > 0            → fused QKV path
 *   - else q_w, k_w, v_w and o_w all non-NULL  → separate QKV path
 *   - else                                     → no attention contribution
 *
 * Parameters:
 *   hidden:       [seq_len × n_embd], modified in-place
 *   norm_w:       [n_embd] attention norm weights
 *   qkv_w:        [qkv_dim × n_embd] fused QKV weights (NULL if separate)
 *   q_w/k_w/v_w:  separate QKV weights, [q_dim|k_dim|v_dim × n_embd]
 *   gate_w:       fused-path gate/output projection; gate_trans selects the
 *                 layout passed to hpc_matmul_graph (see hpc_attention_fused)
 *   o_w:          separate-path output projection; o_cols only enables it
 *                 (o_cols > 0)
 *   ffn_norm_w:   [n_embd] FFN norm weights
 *   ffn_gate/up/down: FFN weights; the FFN runs only when all four FFN
 *                 pointers are non-NULL and ffn_dim > 0
 *   imp_*:        importance accumulators, one per weight matrix; passed
 *                 through unchanged to hpc_matmul_graph (documented by the
 *                 original author as "NULL to skip" — whether NULL is safe
 *                 depends on hpc_matmul_graph)
 *   cnt_*:        sample counts per weight, passed through likewise
 *   seq_len, n_embd, n_head, n_head_kv, head_dim: architecture dimensions
 *                 (head_dim is used by the fused path only)
 *   eps:          RMS norm epsilon
 *
 * Failure handling: the function returns nothing. If a buffer needed for
 * the attention phase cannot be allocated, or the head counts are not
 * positive, it returns with `hidden` unmodified. If an FFN buffer cannot be
 * allocated, the FFN is skipped after the attention residual was applied.
 * ═══════════════════════════════════════════════════════════════════════════ */
void hexstate_forward_layer(
    float *hidden,
    /* Attention weights */
    const float *norm_w,
    const float *qkv_w, int64_t qkv_dim,
    const float *q_w, int64_t q_dim,
    const float *k_w, int64_t k_dim,
    const float *v_w, int64_t v_dim,
    const float *gate_w, int64_t gate_rows,
    const float *o_w, int64_t o_cols,
    int gate_trans, /* New: explicit transpose flag */
    /* FFN weights */
    const float *ffn_norm_w,
    const float *ffn_gate_w, const float *ffn_up_w, const float *ffn_down_w,
    int64_t ffn_dim,
    /* Importance accumulators (NULL to skip) */
    float *imp_qkv, int64_t *cnt_qkv,
    float *imp_q, int64_t *cnt_q,
    float *imp_k, int64_t *cnt_k,
    float *imp_v, int64_t *cnt_v,
    float *imp_gate, int64_t *cnt_gate,
    float *imp_o, int64_t *cnt_o,
    float *imp_ffn_gate, int64_t *cnt_ffn_gate,
    float *imp_ffn_up, int64_t *cnt_ffn_up,
    float *imp_ffn_down, int64_t *cnt_ffn_down,
    /* Architecture */
    int64_t seq_len, int64_t n_embd,
    int64_t n_head, int64_t n_head_kv, int64_t head_dim,
    float eps)
{
    const int64_t total = seq_len * n_embd;
    int attn_rc = 0;

    float *normed = (float *)malloc(total * sizeof(float));
    /* Zero-initialised so that "no attention path" adds nothing. */
    float *attn_out = (float *)calloc(total, sizeof(float));
    if (!normed || !attn_out)
        goto done;

    /* ══════════════ Phase 1: Attention Norm ══════════════ */
    hpc_rms_norm(hidden, norm_w, normed, seq_len, n_embd, eps);

    /* ══════════════ Phase 2: QKV projection + attention ══════════════ */
    if (qkv_w && qkv_dim > 0) {
        attn_rc = hpc_attention_fused(normed, qkv_w, qkv_dim,
                                      gate_w, gate_rows, gate_trans,
                                      imp_qkv, cnt_qkv, imp_gate, cnt_gate,
                                      seq_len, n_embd,
                                      n_head, n_head_kv, head_dim,
                                      attn_out);
    } else if (q_w && k_w && v_w && o_w) {
        attn_rc = hpc_attention_separate(normed,
                                         q_w, q_dim, k_w, k_dim, v_w, v_dim,
                                         o_w, o_cols,
                                         imp_q, cnt_q, imp_k, cnt_k,
                                         imp_v, cnt_v, imp_o, cnt_o,
                                         seq_len, n_embd,
                                         n_head, n_head_kv,
                                         attn_out);
    }
    if (attn_rc != 0)
        goto done;

    /* ══════════════ Phase 3: Attention residual ══════════════ */
    #pragma omp parallel for schedule(static)
    for (int64_t i = 0; i < total; i++)
        hidden[i] += attn_out[i];

    /* ══════════════ Phase 4: FFN + residual ══════════════ */
    if (ffn_norm_w && ffn_gate_w && ffn_up_w && ffn_down_w && ffn_dim > 0) {
        hpc_ffn_block(hidden, ffn_norm_w,
                      ffn_gate_w, ffn_up_w, ffn_down_w, ffn_dim,
                      imp_ffn_gate, cnt_ffn_gate,
                      imp_ffn_up, cnt_ffn_up,
                      imp_ffn_down, cnt_ffn_down,
                      seq_len, n_embd, eps);
    }

done:
    free(attn_out);
    free(normed);
}