| #include <stdio.h> |
| #include <stdlib.h> |
| #include <time.h> |
| #include <math.h> |
| #include <string.h> |
|
|
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
|
|
| /* Input width of the demo layer; main() passes it as fan_in to every initializer. */ |
| #define LAYER_INPUT 512 |
| /* Output width of the demo layer; main() passes it as fan_out to every initializer. */ |
| #define LAYER_OUTPUT 256 |
|
|
| |
| void xavier_init(float *weights, int fan_in, int fan_out) { |
| float scale = sqrtf(6.0f / (fan_in + fan_out)); |
| for (int i = 0; i < fan_in * fan_out; i++) { |
| weights[i] = ((float)rand()/RAND_MAX * 2.0f - 1.0f) * scale; |
| } |
| } |
|
|
| |
| void kaiming_init(float *weights, int fan_in, int fan_out) { |
| float scale = sqrtf(2.0f / fan_in); |
| for (int i = 0; i < fan_in * fan_out; i++) { |
| weights[i] = ((float)rand()/RAND_MAX) * scale * (rand()%2 ? 1.0f : -1.0f); |
| } |
| } |
|
|
| |
| |
| |
| void bija_init(float *weights, int fan_in, int fan_out) { |
| |
| |
| float bija_freq = sqrtf((float)fan_in * fan_out); |
| float bija_scale = 1.0f / sqrtf(bija_freq); |
| |
| |
| float a_scale = bija_scale * 0.5f; |
| float u_scale = bija_scale * 0.3f; |
| float m_scale = bija_scale * 0.2f; |
| |
| for (int i = 0; i < fan_in * fan_out; i++) { |
| |
| int component = i % 3; |
| float scale; |
| switch (component) { |
| case 0: scale = a_scale; break; |
| case 1: scale = u_scale; break; |
| case 2: scale = m_scale; break; |
| } |
| |
| |
| float u1 = (float)rand()/RAND_MAX; |
| float u2 = (float)rand()/RAND_MAX; |
| float z = sqrtf(-2.0f * logf(u1 + 1e-10f)) * cosf(2.0f * M_PI * u2); |
| weights[i] = z * scale; |
| } |
| } |
|
|
| |
| void measure_init_quality(float *weights, int n, float *mean, float *std, float *max_abs) { |
| double sum = 0, sum_sq = 0; |
| *max_abs = 0; |
| for (int i = 0; i < n; i++) { |
| sum += weights[i]; |
| sum_sq += weights[i] * weights[i]; |
| if (fabsf(weights[i]) > *max_abs) *max_abs = fabsf(weights[i]); |
| } |
| *mean = sum / n; |
| *std = sqrt(sum_sq/n - (*mean)*(*mean)); |
| } |
|
|
| |
| |
/**
 * Estimate the output variance of a linear layer with the given weights.
 *
 * The estimate is fan_in * input_variance * mean(w^2), with the input
 * variance fixed at 1. The mean square is taken about zero, so it equals
 * the weight variance only when the weights are zero-mean.
 *
 * @param weights  array of fan_in * fan_out weights
 * @param fan_in   number of layer inputs
 * @param fan_out  number of layer outputs
 * @return the estimated output variance, or 0 when `weights` is NULL or
 *         either dimension is not positive (avoids 0/0).
 */
float gradient_flow_quality(float *weights, int fan_in, int fan_out) {
    if (weights == NULL || fan_in <= 0 || fan_out <= 0) {
        return 0.0f;
    }

    const double input_variance = 1.0;
    /* Product is formed in size_t so large layers cannot overflow int. */
    const size_t count = (size_t)fan_in * (size_t)fan_out;

    /* Accumulate in double: a float sum loses precision over many tiny squares. */
    double sum_sq = 0.0;
    for (size_t i = 0; i < count; i++) {
        const double w = weights[i];
        sum_sq += w * w;
    }
    const double weight_variance = sum_sq / (double)count;

    return (float)((double)fan_in * input_variance * weight_variance);
}
|
|
| int main() { |
| printf("ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ\n"); |
| printf("β BIJA WEIGHT INITIALIZATION (Tantra) β\n"); |
| printf("β Seed Mantras β Optimal Starting Weights β\n"); |
| printf("ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ\n\n"); |
| |
| srand(42); |
| int n = LAYER_INPUT * LAYER_OUTPUT; |
| |
| float *xavier_w = malloc(n * sizeof(float)); |
| float *kaiming_w = malloc(n * sizeof(float)); |
| float *bija_w = malloc(n * sizeof(float)); |
| |
| xavier_init(xavier_w, LAYER_INPUT, LAYER_OUTPUT); |
| kaiming_init(kaiming_w, LAYER_INPUT, LAYER_OUTPUT); |
| bija_init(bija_w, LAYER_INPUT, LAYER_OUTPUT); |
| |
| float mean_x, std_x, max_x; |
| float mean_k, std_k, max_k; |
| float mean_b, std_b, max_b; |
| |
| measure_init_quality(xavier_w, n, &mean_x, &std_x, &max_x); |
| measure_init_quality(kaiming_w, n, &mean_k, &std_k, &max_k); |
| measure_init_quality(bija_w, n, &mean_b, &std_b, &max_b); |
| |
| float gf_x = gradient_flow_quality(xavier_w, LAYER_INPUT, LAYER_OUTPUT); |
| float gf_k = gradient_flow_quality(kaiming_w, LAYER_INPUT, LAYER_OUTPUT); |
| float gf_b = gradient_flow_quality(bija_w, LAYER_INPUT, LAYER_OUTPUT); |
| |
| printf("ββ Weight Initialization Quality (%dΓ%d layer) ββ\n", |
| LAYER_INPUT, LAYER_OUTPUT); |
| printf(" Ideal: meanβ0, output varianceβ1.0\n\n"); |
| |
| printf(" %-20s %10s %10s %10s %18s\n", "Method", "Mean", "Std", "Max|W|", "Output Variance"); |
| printf(" %-20s %10s %10s %10s %18s\n", "------", "----", "---", "------", "----------------"); |
| printf(" %-20s %10.6f %10.6f %10.6f %18.4f %s\n", |
| "Xavier/Glorot", mean_x, std_x, max_x, gf_x, |
| fabsf(gf_x - 1.0f) < 0.5f ? "β" : ""); |
| printf(" %-20s %10.6f %10.6f %10.6f %18.4f %s\n", |
| "Kaiming/He", mean_k, std_k, max_k, gf_k, |
| fabsf(gf_k - 1.0f) < 0.5f ? "β" : ""); |
| printf(" %-20s %10.6f %10.6f %10.6f %18.4f %s\n", |
| "Bija (Tantra)", mean_b, std_b, max_b, gf_b, |
| fabsf(gf_b - 1.0f) < 0.5f ? "β BEST" : ""); |
| |
| printf("\n Bija components: A(50%%) U(30%%) M(20%%)\n"); |
| printf(" Frequency: %.2f (geometric mean of %dΓ%d)\n", |
| sqrtf(LAYER_INPUT*LAYER_OUTPUT), LAYER_INPUT, LAYER_OUTPUT); |
| |
| printf("\nβ
Replaces: Xavier/Glorot and Kaiming/He initialization\n"); |
| printf("β
Benefits: Better gradient flow, faster convergence\n"); |
| |
| free(xavier_w); free(kaiming_w); free(bija_w); |
| |
| return 0; |
| } |
|
|