| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
|
|
| |
| |
| |
| |
| |
| |
| static void hpc_encode_vector(HPCGraph *g, const float *x, int64_t dim, |
| int64_t site_offset) |
| { |
| for (int64_t j = 0; j < dim; j++) { |
| double re[D] = {0}, im[D] = {0}; |
| float val = x[j]; |
| float mag = fabsf(val) + 1e-12f; |
| |
| int phase = ((int)(mag * 1e3f)) % D; |
| if (phase < 0) phase += D; |
| re[phase] = sqrt(mag); |
| |
| im[phase] = (val < 0) ? -sqrt(mag) * 0.5 : sqrt(mag) * 0.5; |
| |
| re[(phase + 1) % D] = sqrt(mag) * 0.25; |
| re[(phase + 5) % D] = sqrt(mag) * 0.25; |
| hpc_set_local(g, site_offset + j, re, im); |
| } |
| } |
|
|
| |
| |
| |
| |
| |
| |
| static void hpc_read_importance(HPCGraph *g, const float *x, int64_t dim, |
| int64_t site_offset, float *importance, |
| int64_t M) |
| { |
| for (int64_t j = 0; j < dim; j++) { |
| float mag = fabsf(x[j]) + 1e-12f; |
| int phase = ((int)(mag * 1e3f)) % D; |
| if (phase < 0) phase += D; |
| |
| double marg = hpc_marginal(g, site_offset + j, phase); |
| |
| float raw = x[j] * x[j]; |
| double boost = 1.0 + (marg * D - 1.0) * 0.5; |
| if (boost < 0.5) boost = 0.5; |
| if (boost > 2.0) boost = 2.0; |
| importance[j] += raw * (float)boost * M; |
| } |
| } |
|
|
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| static void hpc_matmul_graph(const float *x, const float *weight, float *out, |
| float *importance, int64_t *count, |
| int64_t M, int64_t K, int64_t N, int trans_w) |
| { |
| |
| int64_t stride = (K > 512) ? K / 512 : 1; |
| int64_t n_sites = (K + stride - 1) / stride; |
| HPCGraph *g = hpc_create(n_sites); |
| float *col_energy = (float *)calloc(K, sizeof(float)); |
|
|
| if (g && col_energy) { |
| |
| #pragma omp parallel for schedule(static) |
| for (int64_t j = 0; j < K; j++) { |
| float s = 0.0f; |
| for (int64_t i = 0; i < M; i++) { |
| float v = x[i * K + j]; |
| s += v * v; |
| } |
| col_energy[j] = s; |
| } |
|
|
| |
| for (int64_t s = 0; s < n_sites; s++) { |
| int64_t j = s * stride; |
| if (j >= K) break; |
| double re[D] = {0}, im[D] = {0}; |
| float e = col_energy[j]; |
| int phase = ((int)(e * 1e3f)) % D; |
| if (phase < 0) phase += D; |
| re[phase] = sqrt(e + 1e-12); |
| re[(phase + 1) % D] = sqrt(e + 1e-12) * 0.25; |
| re[(phase + 5) % D] = sqrt(e + 1e-12) * 0.25; |
| hpc_set_local(g, s, re, im); |
| } |
|
|
| |
| for (int64_t s = 0; s < n_sites - 1; s++) |
| hpc_cz(g, s, s + 1); |
|
|
| |
| |
| |
| |
| double fidelity = g->avg_fidelity; |
| for (int64_t s = 0; s < n_sites; s++) { |
| int64_t j0 = s * stride; |
| int64_t j1 = (s + 1) * stride; |
| if (j1 > K) j1 = K; |
| |
| float e0 = col_energy[j0]; |
| int phase0 = ((int)(e0 * 1e3f)) % D; |
| if (phase0 < 0) phase0 += D; |
| double marg = hpc_marginal(g, s, phase0); |
| |
| for (int64_t j = j0; j < j1; j++) { |
| float e = col_energy[j]; |
| int phase = ((int)(e * 1e3f)) % D; |
| if (phase < 0) phase += D; |
| double boost = 1.0 + (marg * fidelity * D - 1.0) * 0.5; |
| if (boost < 0.5) boost = 0.5; |
| if (boost > 2.0) boost = 2.0; |
| importance[j] += e * (float)boost; |
| } |
| } |
| if (count) *count += M; |
| } |
|
|
| |
| #pragma omp parallel for schedule(static) |
| for (int64_t i = 0; i < M; i++) { |
| const float *xi = x + i * K; |
| float *oi = out + i * N; |
| if (trans_w) { |
| for (int64_t n = 0; n < N; n++) { |
| float dot = 0.0f; |
| for (int64_t k = 0; k < K; k++) |
| dot += xi[k] * weight[k * N + n]; |
| oi[n] = dot; |
| } |
| } else { |
| for (int64_t n = 0; n < N; n++) { |
| const float *wn = weight + n * K; |
| float dot = 0.0f; |
| for (int64_t k = 0; k < K; k++) |
| dot += xi[k] * wn[k]; |
| oi[n] = dot; |
| } |
| } |
| } |
|
|
| if (col_energy) free(col_energy); |
| if (g) hpc_destroy(g); |
| } |
|
|
| |
| static void hpc_rms_norm(const float *x, const float *w, float *out, |
| int64_t seq, int64_t dim, float eps) |
| { |
| #pragma omp parallel for schedule(static) |
| for (int64_t i = 0; i < seq; i++) { |
| const float *row = x + i * dim; |
| float *orow = out + i * dim; |
| float ss = 0.0f; |
| for (int64_t j = 0; j < dim; j++) ss += row[j] * row[j]; |
| float inv = 1.0f / sqrtf(ss / dim + eps); |
| for (int64_t j = 0; j < dim; j++) orow[j] = row[j] * inv * w[j]; |
| } |
| } |
|
|
| |
| static void hpc_silu(float *x, int64_t n) |
| { |
| #pragma omp parallel for schedule(static) |
| for (int64_t i = 0; i < n; i++) |
| x[i] = x[i] / (1.0f + expf(-x[i])); |
| } |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| void hexstate_forward_layer( |
| float *hidden, |
| |
| const float *norm_w, |
| const float *qkv_w, int64_t qkv_dim, |
| const float *q_w, int64_t q_dim, |
| const float *k_w, int64_t k_dim, |
| const float *v_w, int64_t v_dim, |
| const float *gate_w, int64_t gate_rows, |
| const float *o_w, int64_t o_cols, |
| int gate_trans, |
| |
| const float *ffn_norm_w, |
| const float *ffn_gate_w, const float *ffn_up_w, const float *ffn_down_w, |
| int64_t ffn_dim, |
| |
| float *imp_qkv, int64_t *cnt_qkv, |
| float *imp_q, int64_t *cnt_q, |
| float *imp_k, int64_t *cnt_k, |
| float *imp_v, int64_t *cnt_v, |
| float *imp_gate, int64_t *cnt_gate, |
| float *imp_o, int64_t *cnt_o, |
| float *imp_ffn_gate, int64_t *cnt_ffn_gate, |
| float *imp_ffn_up, int64_t *cnt_ffn_up, |
| float *imp_ffn_down, int64_t *cnt_ffn_down, |
| |
| int64_t seq_len, int64_t n_embd, int64_t n_head, int64_t n_head_kv, |
| int64_t head_dim, float eps) |
| { |
| float *normed = (float *)malloc(seq_len * n_embd * sizeof(float)); |
| if (!normed) return; |
|
|
| |
| hpc_rms_norm(hidden, norm_w, normed, seq_len, n_embd, eps); |
|
|
| |
| float *attn_out = (float *)calloc(seq_len * n_embd, sizeof(float)); |
| if (!attn_out) { free(normed); return; } |
|
|
| if (qkv_w && qkv_dim > 0) { |
| |
| float *qkv = (float *)malloc(seq_len * qkv_dim * sizeof(float)); |
| if (!qkv) { free(normed); free(attn_out); return; } |
|
|
| |
| hpc_matmul_graph(normed, qkv_w, qkv, imp_qkv, cnt_qkv, |
| seq_len, n_embd, qkv_dim, 0); |
|
|
| |
| int64_t q_total = n_head * head_dim; |
| int64_t kv_total = n_head_kv * head_dim; |
|
|
| HPCGraph *attn_graph = hpc_create(n_head); |
| float *S = (float *)calloc(n_head * head_dim * head_dim, sizeof(float)); |
| float *z_acc = (float *)calloc(n_head * head_dim, sizeof(float)); |
| int64_t inner_dim = n_head * head_dim; |
| float *attn_inner = (float *)calloc(seq_len * inner_dim, sizeof(float)); |
|
|
| if (attn_graph && S && z_acc && attn_inner) { |
| for (int64_t t = 0; t < seq_len; t++) { |
| |
| float *qt_base = qkv + t * qkv_dim; |
| float *kt_base = qt_base + q_total; |
| float *vt_base = kt_base + kv_total; |
|
|
| |
| for (int64_t h = 0; h < n_head; h++) { |
| int64_t kv_h = h % n_head_kv; |
| float *kh = kt_base + kv_h * head_dim; |
| float *vh = vt_base + kv_h * head_dim; |
| float energy = 0.0f; |
| for (int64_t d = 0; d < head_dim; d++) |
| energy += kh[d] * vh[d]; |
|
|
| double re[D] = {0}, im[D] = {0}; |
| float ae = fabsf(energy) + 1e-6f; |
| int ph = ((int)(ae * 100.0f)) % D; |
| re[ph] = sqrt(ae); |
| im[ph] = (energy < 0) ? -sqrt(ae) * 0.5 : sqrt(ae) * 0.5; |
| re[(ph+1)%D] = sqrt(ae) * 0.2; |
| re[(ph+5)%D] = sqrt(ae) * 0.2; |
| hpc_set_local(attn_graph, h, re, im); |
| } |
|
|
| for (int64_t h = 0; h < n_head - 1; h++) |
| hpc_cz(attn_graph, h, h + 1); |
|
|
| #pragma omp parallel for schedule(static) |
| for (int64_t h = 0; h < n_head; h++) { |
| int64_t kv_h = h % n_head_kv; |
| float *qh = qt_base + h * head_dim; |
| float *kh = kt_base + kv_h * head_dim; |
| float *vh = vt_base + kv_h * head_dim; |
| float *Sh = S + h * head_dim * head_dim; |
| float *zh = z_acc + h * head_dim; |
|
|
| float ae = 0.0f; |
| for (int64_t d = 0; d < head_dim; d++) |
| ae += fabsf(kh[d] * vh[d]); |
| ae += 1e-6f; |
| int ph = ((int)(ae * 100.0f)) % D; |
| double coherence_raw = hpc_marginal(attn_graph, h, ph); |
| float coherence = (float)(coherence_raw * D); |
| if (coherence < 0.1f) coherence = 0.1f; |
| if (coherence > 3.0f) coherence = 3.0f; |
|
|
| |
| float *qf = (float *)alloca(head_dim * sizeof(float)); |
| float *kf = (float *)alloca(head_dim * sizeof(float)); |
| for (int64_t d = 0; d < head_dim; d++) { |
| qf[d] = (qh[d] > 0 ? qh[d] : 0) + 1e-6f; |
| kf[d] = (kh[d] > 0 ? kh[d] : 0) + 1e-6f; |
| } |
|
|
| for (int64_t d1 = 0; d1 < head_dim; d1++) { |
| float ks = kf[d1] * coherence; |
| for (int64_t d2 = 0; d2 < head_dim; d2++) |
| Sh[d1 * head_dim + d2] += ks * vh[d2]; |
| } |
| for (int64_t d = 0; d < head_dim; d++) |
| zh[d] += kf[d] * coherence; |
|
|
| float den = 1e-8f; |
| for (int64_t d = 0; d < head_dim; d++) |
| den += qf[d] * zh[d]; |
| float inv_den = 1.0f / den; |
|
|
| float *ao = attn_inner + t * inner_dim; |
| for (int64_t d2 = 0; d2 < head_dim; d2++) { |
| float num = 0.0f; |
| for (int64_t d1 = 0; d1 < head_dim; d1++) |
| num += qf[d1] * Sh[d1 * head_dim + d2]; |
| ao[h * head_dim + d2] = num * inv_den; |
| } |
| } |
|
|
| if (t > 0 && t % 64 == 0) |
| hpc_compact_edges(attn_graph); |
| } |
| } |
|
|
| if (gate_w && gate_rows > 0) { |
| int64_t N_out = gate_trans ? n_embd : gate_rows; |
| float *gated = (float *)malloc(seq_len * N_out * sizeof(float)); |
| if (gated) { |
| hpc_matmul_graph(attn_inner, gate_w, gated, imp_gate, cnt_gate, |
| seq_len, inner_dim, N_out, gate_trans); |
| for (int64_t t = 0; t < seq_len; t++) { |
| int64_t copy_dim = N_out < n_embd ? N_out : n_embd; |
| memcpy(attn_out + t * n_embd, gated + t * N_out, copy_dim * sizeof(float)); |
| } |
| free(gated); |
| } |
| } else { |
| for (int64_t t = 0; t < seq_len; t++) { |
| int64_t copy_dim = inner_dim < n_embd ? inner_dim : n_embd; |
| memcpy(attn_out + t * n_embd, attn_inner + t * inner_dim, copy_dim * sizeof(float)); |
| } |
| } |
| if (attn_inner) free(attn_inner); |
| if (attn_graph) hpc_destroy(attn_graph); |
| free(S); free(z_acc); free(qkv); |
|
|
| } else if (q_w && k_w && v_w && o_w) { |
| |
| float *Q = (float *)malloc(seq_len * q_dim * sizeof(float)); |
| float *K_buf = (float *)malloc(seq_len * k_dim * sizeof(float)); |
| float *V_buf = (float *)malloc(seq_len * v_dim * sizeof(float)); |
| if (!Q || !K_buf || !V_buf) { |
| if(Q) free(Q); if(K_buf) free(K_buf); if(V_buf) free(V_buf); |
| free(normed); free(attn_out); |
| return; |
| } |
|
|
| hpc_matmul_graph(normed, q_w, Q, imp_q, cnt_q, seq_len, n_embd, q_dim, 0); |
| hpc_matmul_graph(normed, k_w, K_buf, imp_k, cnt_k, seq_len, n_embd, k_dim, 0); |
| hpc_matmul_graph(normed, v_w, V_buf, imp_v, cnt_v, seq_len, n_embd, v_dim, 0); |
|
|
| int64_t hd_q = q_dim / n_head; |
| int64_t hd_kv = k_dim / n_head_kv; |
| int64_t inner_dim = n_head * hd_kv; |
| HPCGraph *attn_graph = hpc_create(n_head); |
| float *S = (float *)calloc(n_head * hd_kv * hd_kv, sizeof(float)); |
| float *z_acc = (float *)calloc(n_head * hd_kv, sizeof(float)); |
| float *attn_inner = (float *)calloc(seq_len * inner_dim, sizeof(float)); |
|
|
| if (attn_graph && S && z_acc && attn_inner) { |
| for (int64_t t = 0; t < seq_len; t++) { |
| for (int64_t h = 0; h < n_head; h++) { |
| int64_t kv_h = h % n_head_kv; |
| float *kh = K_buf + t * k_dim + kv_h * hd_kv; |
| float *vh = V_buf + t * v_dim + kv_h * hd_kv; |
| float energy = 0.0f; |
| for (int64_t d = 0; d < hd_kv; d++) |
| energy += kh[d] * vh[d]; |
| double re[D] = {0}, im[D] = {0}; |
| float ae = fabsf(energy) + 1e-6f; |
| int ph = ((int)(ae * 100.0f)) % D; |
| re[ph] = sqrt(ae); |
| im[ph] = (energy < 0) ? -sqrt(ae)*0.5 : sqrt(ae)*0.5; |
| hpc_set_local(attn_graph, h, re, im); |
| } |
| for (int64_t h = 0; h < n_head - 1; h++) |
| hpc_cz(attn_graph, h, h+1); |
|
|
| #pragma omp parallel for schedule(static) |
| for (int64_t h = 0; h < n_head; h++) { |
| int64_t kv_h = h % n_head_kv; |
| float *qh = Q + t * q_dim + h * hd_q; |
| float *kh = K_buf + t * k_dim + kv_h * hd_kv; |
| float *vh = V_buf + t * v_dim + kv_h * hd_kv; |
| float *Sh = S + h * hd_kv * hd_kv; |
| float *zh = z_acc + h * hd_kv; |
| int64_t feat = hd_q < hd_kv ? hd_q : hd_kv; |
|
|
| float ae = 0.0f; |
| for(int64_t d=0; d<hd_kv; d++) ae += fabsf(kh[d]*vh[d]); |
| ae += 1e-6f; |
| int ph = ((int)(ae * 100.0f)) % D; |
| double coh_raw = hpc_marginal(attn_graph, h, ph); |
| float coh = (float)(coh_raw * D); |
| if (coh < 0.1f) coh = 0.1f; |
| if (coh > 3.0f) coh = 3.0f; |
|
|
| float *qf = (float *)alloca(feat * sizeof(float)); |
| float *kf = (float *)alloca(feat * sizeof(float)); |
| for (int64_t d = 0; d < feat; d++) { |
| qf[d] = (qh[d] > 0 ? qh[d] : 0) + 1e-6f; |
| kf[d] = (kh[d] > 0 ? kh[d] : 0) + 1e-6f; |
| } |
|
|
| for (int64_t d1 = 0; d1 < feat; d1++) { |
| float ks = kf[d1] * coh; |
| for (int64_t d2 = 0; d2 < hd_kv; d2++) |
| Sh[d1*hd_kv+d2] += ks * vh[d2]; |
| zh[d1] += kf[d1] * coh; |
| } |
|
|
| float den = 1e-8f; |
| for (int64_t d = 0; d < feat; d++) |
| den += qf[d] * zh[d]; |
| float inv_den = 1.0f / den; |
|
|
| float *ao = attn_inner + t * inner_dim; |
| for (int64_t d2 = 0; d2 < hd_kv; d2++) { |
| float num = 0.0f; |
| for (int64_t d1 = 0; d1 < feat; d1++) |
| num += qf[d1] * Sh[d1*hd_kv+d2]; |
| ao[h*hd_kv+d2] = num * inv_den; |
| } |
| } |
| if (t > 0 && t % 64 == 0) |
| hpc_compact_edges(attn_graph); |
| } |
| } |
|
|
| if (o_w && o_cols > 0) { |
| float *projected = (float *)calloc(seq_len * n_embd, sizeof(float)); |
| if (projected) { |
| hpc_matmul_graph(attn_inner, o_w, projected, imp_o, cnt_o, |
| seq_len, inner_dim, n_embd, 0); |
| memcpy(attn_out, projected, seq_len * n_embd * sizeof(float)); |
| free(projected); |
| } |
| } else { |
| for (int64_t t = 0; t < seq_len; t++) { |
| int64_t copy_dim = inner_dim < n_embd ? inner_dim : n_embd; |
| memcpy(attn_out + t * n_embd, attn_inner + t * inner_dim, copy_dim * sizeof(float)); |
| } |
| } |
| if (attn_inner) free(attn_inner); |
| if (attn_graph) hpc_destroy(attn_graph); |
| free(S); free(z_acc); |
| free(Q); free(K_buf); free(V_buf); |
| } |
|
|
| int64_t total = seq_len * n_embd; |
| #pragma omp parallel for schedule(static) |
| for (int64_t i = 0; i < total; i++) |
| hidden[i] += attn_out[i]; |
|
|
| if (ffn_norm_w && ffn_gate_w && ffn_up_w && ffn_down_w && ffn_dim > 0) { |
| float *normed_ff = (float *)malloc(seq_len * n_embd * sizeof(float)); |
| float *gate_out = (float *)malloc(seq_len * ffn_dim * sizeof(float)); |
| float *up_out = (float *)malloc(seq_len * ffn_dim * sizeof(float)); |
|
|
| if (normed_ff && gate_out && up_out) { |
| hpc_rms_norm(hidden, ffn_norm_w, normed_ff, seq_len, n_embd, eps); |
| hpc_matmul_graph(normed_ff, ffn_gate_w, gate_out, |
| imp_ffn_gate, cnt_ffn_gate, seq_len, n_embd, ffn_dim, 0); |
| hpc_matmul_graph(normed_ff, ffn_up_w, up_out, |
| imp_ffn_up, cnt_ffn_up, seq_len, n_embd, ffn_dim, 0); |
|
|
| hpc_silu(gate_out, seq_len * ffn_dim); |
| #pragma omp parallel for schedule(static) |
| for (int64_t i = 0; i < seq_len * ffn_dim; i++) |
| gate_out[i] *= up_out[i]; |
|
|
| float *ff_out_buf = (float *)malloc(seq_len * n_embd * sizeof(float)); |
| if (ff_out_buf) { |
| hpc_matmul_graph(gate_out, ffn_down_w, ff_out_buf, |
| imp_ffn_down, cnt_ffn_down, |
| seq_len, ffn_dim, n_embd, 0); |
| #pragma omp parallel for schedule(static) |
| for (int64_t i = 0; i < total; i++) |
| hidden[i] += ff_out_buf[i]; |
| free(ff_out_buf); |
| } |
| } |
| free(normed_ff); free(gate_out); free(up_out); |
| } |
| free(normed); |
| free(attn_out); |
| } |
|
|