Roberto Tacconelli committed on
Commit
1adf549
·
unverified ·
1 Parent(s): 6580887

Add files via upload

Browse files
Files changed (6) hide show
  1. ablation.c +16 -12
  2. mdc.c +7 -5
  3. measure_delta.c +277 -0
  4. ppm.h +84 -20
  5. ppm_excl.h +198 -0
  6. tweedie.h +19 -0
ablation.c CHANGED
@@ -25,7 +25,7 @@
25
  #define SCALE (1 << 14)
26
 
27
  /* ── Flags ── */
28
- #define FLAG_TWEEDIE 1
29
  #define FLAG_MATCH 2
30
  #define FLAG_WORD 4
31
  #define FLAG_HIGHCTX 8
@@ -84,9 +84,6 @@ static uint8_t *do_compress(const uint8_t *data, size_t data_len,
84
 
85
  ppm_predict(&ppm, probs, &confidence, &order);
86
 
87
- if (flags & FLAG_TWEEDIE) {
88
- tweedie_denoise(&twd, probs, order, confidence);
89
- }
90
  clamp_normalize(probs);
91
 
92
  if (flags & FLAG_MATCH) {
@@ -108,6 +105,11 @@ static uint8_t *do_compress(const uint8_t *data, size_t data_len,
108
  blend_highctx(probs, hctx_probs, hctx_conf);
109
  }
110
 
 
 
 
 
 
111
  probs_to_cumfreqs(probs, cumfreqs, &total);
112
  ae_encode(&enc, cumfreqs, byte, total);
113
 
@@ -178,9 +180,6 @@ static uint8_t *do_decompress(const uint8_t *compressed, size_t comp_len,
178
 
179
  ppm_predict(&ppm, probs, &confidence, &order);
180
 
181
- if (flags & FLAG_TWEEDIE) {
182
- tweedie_denoise(&twd, probs, order, confidence);
183
- }
184
  clamp_normalize(probs);
185
 
186
  if (flags & FLAG_MATCH) {
@@ -202,6 +201,11 @@ static uint8_t *do_decompress(const uint8_t *compressed, size_t comp_len,
202
  blend_highctx(probs, hctx_probs, hctx_conf);
203
  }
204
 
 
 
 
 
 
205
  probs_to_cumfreqs(probs, cumfreqs, &total);
206
  int sym = ad_decode(&dec, cumfreqs, total);
207
  result[i] = (uint8_t)sym;
@@ -235,11 +239,11 @@ typedef struct {
235
  } AblationConfig;
236
 
237
  static const AblationConfig CONFIGS[] = {
238
- { "Base PPM", 0 },
239
- { "+ Tweedie", FLAG_TWEEDIE },
240
- { "+ Twd + Match", FLAG_TWEEDIE | FLAG_MATCH },
241
- { "+ Twd + Match + Word", FLAG_TWEEDIE | FLAG_MATCH | FLAG_WORD },
242
- { "+ Twd + M + W + H", FLAG_TWEEDIE | FLAG_MATCH | FLAG_WORD | FLAG_HIGHCTX },
243
  };
244
  #define N_CONFIGS 5
245
 
 
25
  #define SCALE (1 << 14)
26
 
27
  /* ── Flags ── */
28
+ #define FLAG_TWEEDIE 1 /* post-blend Tweedie (after match/word/highctx) */
29
  #define FLAG_MATCH 2
30
  #define FLAG_WORD 4
31
  #define FLAG_HIGHCTX 8
 
84
 
85
  ppm_predict(&ppm, probs, &confidence, &order);
86
 
 
 
 
87
  clamp_normalize(probs);
88
 
89
  if (flags & FLAG_MATCH) {
 
105
  blend_highctx(probs, hctx_probs, hctx_conf);
106
  }
107
 
108
+ if (flags & FLAG_TWEEDIE) {
109
+ tweedie_denoise(&twd, probs, order, confidence);
110
+ clamp_normalize(probs);
111
+ }
112
+
113
  probs_to_cumfreqs(probs, cumfreqs, &total);
114
  ae_encode(&enc, cumfreqs, byte, total);
115
 
 
180
 
181
  ppm_predict(&ppm, probs, &confidence, &order);
182
 
 
 
 
183
  clamp_normalize(probs);
184
 
185
  if (flags & FLAG_MATCH) {
 
201
  blend_highctx(probs, hctx_probs, hctx_conf);
202
  }
203
 
204
+ if (flags & FLAG_TWEEDIE) {
205
+ tweedie_denoise(&twd, probs, order, confidence);
206
+ clamp_normalize(probs);
207
+ }
208
+
209
  probs_to_cumfreqs(probs, cumfreqs, &total);
210
  int sym = ad_decode(&dec, cumfreqs, total);
211
  result[i] = (uint8_t)sym;
 
239
  } AblationConfig;
240
 
241
  static const AblationConfig CONFIGS[] = {
242
+ { "Base PPM", 0 },
243
+ { "+ Match", FLAG_MATCH },
244
+ { "+ Match + Word", FLAG_MATCH | FLAG_WORD },
245
+ { "+ Match + Word + HCtx", FLAG_MATCH | FLAG_WORD | FLAG_HIGHCTX },
246
+ { "+ M + W + H + Tweedie", FLAG_MATCH | FLAG_WORD | FLAG_HIGHCTX | FLAG_TWEEDIE },
247
  };
248
  #define N_CONFIGS 5
249
 
mdc.c CHANGED
@@ -1,6 +1,6 @@
1
  /*
2
  * Midicoth Compressor — C implementation
3
- * Pipeline: PPM + Tweedie Denoising + Match + Word + HighCtx
4
  *
5
  * Usage:
6
  * ./mdc compress <input> <output>
@@ -96,8 +96,6 @@ static int do_compress(const char *input_path, const char *output_path) {
96
  double confidence;
97
  int order;
98
  ppm_predict(&ppm, probs, &confidence, &order);
99
-
100
- tweedie_denoise(&twd, probs, order, confidence);
101
  clamp_normalize(probs);
102
 
103
  int match_byte;
@@ -113,6 +111,9 @@ static int do_compress(const char *input_path, const char *output_path) {
113
  if (highctx_predict(&hctx, hctx_probs, &hctx_conf))
114
  blend_highctx(probs, hctx_probs, hctx_conf);
115
 
 
 
 
116
  probs_to_cumfreqs(probs, cumfreqs, &total);
117
  ae_encode(&enc, cumfreqs, byte, total);
118
 
@@ -219,8 +220,6 @@ static int do_decompress(const char *input_path, const char *output_path) {
219
  double confidence;
220
  int order;
221
  ppm_predict(&ppm, probs, &confidence, &order);
222
-
223
- tweedie_denoise(&twd, probs, order, confidence);
224
  clamp_normalize(probs);
225
 
226
  int match_byte;
@@ -236,6 +235,9 @@ static int do_decompress(const char *input_path, const char *output_path) {
236
  if (highctx_predict(&hctx, hctx_probs, &hctx_conf))
237
  blend_highctx(probs, hctx_probs, hctx_conf);
238
 
 
 
 
239
  probs_to_cumfreqs(probs, cumfreqs, &total);
240
  int sym = ad_decode(&dec, cumfreqs, total);
241
  result[i] = (uint8_t)sym;
 
1
  /*
2
  * Midicoth Compressor — C implementation
3
+ * Pipeline: PPM + Match + Word + HighCtx + Tweedie Denoising
4
  *
5
  * Usage:
6
  * ./mdc compress <input> <output>
 
96
  double confidence;
97
  int order;
98
  ppm_predict(&ppm, probs, &confidence, &order);
 
 
99
  clamp_normalize(probs);
100
 
101
  int match_byte;
 
111
  if (highctx_predict(&hctx, hctx_probs, &hctx_conf))
112
  blend_highctx(probs, hctx_probs, hctx_conf);
113
 
114
+ tweedie_denoise(&twd, probs, order, confidence);
115
+ clamp_normalize(probs);
116
+
117
  probs_to_cumfreqs(probs, cumfreqs, &total);
118
  ae_encode(&enc, cumfreqs, byte, total);
119
 
 
220
  double confidence;
221
  int order;
222
  ppm_predict(&ppm, probs, &confidence, &order);
 
 
223
  clamp_normalize(probs);
224
 
225
  int match_byte;
 
235
  if (highctx_predict(&hctx, hctx_probs, &hctx_conf))
236
  blend_highctx(probs, hctx_probs, hctx_conf);
237
 
238
+ tweedie_denoise(&twd, probs, order, confidence);
239
+ clamp_normalize(probs);
240
+
241
  probs_to_cumfreqs(probs, cumfreqs, &total);
242
  int sym = ad_decode(&dec, cumfreqs, total);
243
  result[i] = (uint8_t)sym;
measure_delta.c ADDED
@@ -0,0 +1,277 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+ * Measure mean |delta| vs noise level (confidence) for the
3
+ * delta-vs-gamma table in the paper.
4
+ *
5
+ * Runs the full pipeline (PPM+Match+Word+HCtx+Tweedie) and
6
+ * instruments the Tweedie denoise to collect per-step, per-confidence
7
+ * delta statistics.
8
+ *
9
+ * Usage: ./measure_delta <input_file>
10
+ */
11
+
12
+ #include <stdio.h>
13
+ #include <stdlib.h>
14
+ #include <string.h>
15
+ #include <math.h>
16
+
17
+ #include "fastmath.h"
18
+ #include "arith.h"
19
+ #include "ppm.h"
20
+ #include "match.h"
21
+ #include "word.h"
22
+ #include "highctx.h"
23
+
24
+ /* We need access to Tweedie internals, so include it but also
25
+ * define instrumentation hooks */
26
+ #include "tweedie.h"
27
+
28
+ /* Accumulate |delta| by [step][conf_bin] */
29
+ #define N_CONF_REPORT 4
30
+ static double delta_sum[TWD_STEPS][N_CONF_REPORT];
31
+ static double delta_count[TWD_STEPS][N_CONF_REPORT];
32
+
33
+ /* Map raw confidence to our 4 reporting bins:
34
+ * bin 0: C ~ 128 (gamma ~ 0.500)
35
+ * bin 1: C ~ 512 (gamma ~ 0.200)
36
+ * bin 2: C ~ 2048 (gamma ~ 0.059)
37
+ * bin 3: C ~ 8192 (gamma ~ 0.015) */
38
+ static int conf_report_bin(double confidence) {
39
+ if (confidence < 256.0) return 0;
40
+ if (confidence < 1024.0) return 1;
41
+ if (confidence < 4096.0) return 2;
42
+ return 3;
43
+ }
44
+
45
+ /* Instrumented denoise that collects delta stats */
46
+ static void tweedie_denoise_instrumented(TweedieDenoiser *td, double *probs,
47
+ int ppm_order, double confidence) {
48
+ int og = twd_order_group(ppm_order);
49
+ int cb = twd_conf_bin(confidence);
50
+ int crb = conf_report_bin(confidence);
51
+
52
+ double max_p = 0.0;
53
+ for (int i = 0; i < TWD_NSYM; i++)
54
+ if (probs[i] > max_p) max_p = probs[i];
55
+ int sb = twd_shape_bin(max_p);
56
+
57
+ td->cached_ord = og;
58
+ td->cached_shape = sb;
59
+ td->cached_conf = cb;
60
+
61
+ double stree[512];
62
+ double scale[512];
63
+
64
+ for (int step = 0; step < TWD_STEPS; step++) {
65
+ for (int i = 0; i < TWD_NSYM; i++)
66
+ stree[TWD_NSYM + i] = probs[i];
67
+ for (int i = TWD_NSYM - 1; i >= 1; i--)
68
+ stree[i] = stree[2 * i] + stree[2 * i + 1];
69
+
70
+ scale[1] = 1.0;
71
+
72
+ for (int level = 0; level < TWD_N_LEVELS; level++) {
73
+ int level_start = 1 << level;
74
+ int level_end = 1 << (level + 1);
75
+
76
+ for (int ni = level_start; ni < level_end; ni++) {
77
+ double node_total = stree[ni];
78
+ int node_id = ni - 1;
79
+ int node_at_level = ni - level_start;
80
+
81
+ if (node_total < 1e-15) {
82
+ scale[2 * ni] = scale[ni];
83
+ scale[2 * ni + 1] = scale[ni];
84
+ td->cached_p_right[step][node_id] = 0.5;
85
+ td->cached_prob_bin[step][node_id] = twd_prob_bin(0.5);
86
+ td->cached_bctx[step][node_id] = twd_bit_context(level, node_at_level);
87
+ continue;
88
+ }
89
+
90
+ double sum_right = stree[2 * ni + 1];
91
+ double p_right = sum_right / node_total;
92
+ if (p_right < 1e-8) p_right = 1e-8;
93
+ if (p_right > 1.0 - 1e-8) p_right = 1.0 - 1e-8;
94
+
95
+ int bctx = twd_bit_context(level, node_at_level);
96
+ int pbin = twd_prob_bin(p_right);
97
+ td->cached_p_right[step][node_id] = p_right;
98
+ td->cached_prob_bin[step][node_id] = pbin;
99
+ td->cached_bctx[step][node_id] = bctx;
100
+
101
+ TwdCalibEntry *e = &td->table[step][bctx][og][sb][cb][pbin];
102
+ double avg_pred = e->sum_pred / e->total;
103
+ double emp_rate = e->hits / e->total;
104
+ double delta = emp_rate - avg_pred;
105
+
106
+ /* Apply same shrinkage as production code */
107
+ double var_err = e->sum_sq_err / e->total;
108
+ if (e->total > 10.0 && var_err > 1e-10) {
109
+ double snr = delta * delta * e->total / var_err;
110
+ double shrink = (snr > 4.0) ? 1.0 : snr / 4.0;
111
+ delta *= shrink;
112
+ } else {
113
+ delta = 0.0;
114
+ }
115
+
116
+ /* Collect stats: weight by node probability mass */
117
+ double weight = node_total / stree[1];
118
+ delta_sum[step][crb] += fabs(delta) * weight;
119
+ delta_count[step][crb] += weight;
120
+
121
+ double p_right_corr = p_right + delta;
122
+ if (p_right_corr < 1e-8) p_right_corr = 1e-8;
123
+ if (p_right_corr > 1.0 - 1e-8) p_right_corr = 1.0 - 1e-8;
124
+
125
+ double sl = (1.0 - p_right_corr) / (1.0 - p_right);
126
+ double sr = p_right_corr / p_right;
127
+ scale[2 * ni] = scale[ni] * sl;
128
+ scale[2 * ni + 1] = scale[ni] * sr;
129
+ }
130
+ }
131
+
132
+ for (int i = 0; i < TWD_NSYM; i++)
133
+ probs[i] *= scale[TWD_NSYM + i];
134
+
135
+ double sum = 0.0;
136
+ for (int i = 0; i < TWD_NSYM; i++) {
137
+ if (probs[i] < 1e-10) probs[i] = 1e-10;
138
+ sum += probs[i];
139
+ }
140
+ double inv = 1.0 / sum;
141
+ for (int i = 0; i < TWD_NSYM; i++)
142
+ probs[i] *= inv;
143
+
144
+ max_p = 0.0;
145
+ for (int i = 0; i < TWD_NSYM; i++)
146
+ if (probs[i] > max_p) max_p = probs[i];
147
+ sb = twd_shape_bin(max_p);
148
+ }
149
+ }
150
+
151
+ static void my_clamp_normalize(double *p) {
152
+ double sum = 0;
153
+ for (int i = 0; i < 256; i++) {
154
+ if (p[i] < 1e-10) p[i] = 1e-10;
155
+ sum += p[i];
156
+ }
157
+ double inv = 1.0 / sum;
158
+ for (int i = 0; i < 256; i++) p[i] *= inv;
159
+ }
160
+
161
+ static void my_blend_match(double *probs, int match_byte, double match_conf) {
162
+ if (match_byte < 0 || match_conf < 0.01) return;
163
+ double w = match_conf * 0.85;
164
+ if (w > 0.95) w = 0.95;
165
+ for (int i = 0; i < 256; i++)
166
+ probs[i] *= (1.0 - w);
167
+ probs[match_byte] += w;
168
+ }
169
+
170
+ static void my_blend_word(double *probs, double *wprobs, double wconf) {
171
+ double w = wconf * 0.35;
172
+ if (w > 0.45) w = 0.45;
173
+ for (int i = 0; i < 256; i++)
174
+ probs[i] = (1.0 - w) * probs[i] + w * wprobs[i];
175
+ }
176
+
177
+ static void my_blend_hctx(double *probs, double *hprobs, double hconf) {
178
+ double w = hconf * 2.0;
179
+ if (w > 0.60) w = 0.60;
180
+ for (int i = 0; i < 256; i++)
181
+ probs[i] = (1.0 - w) * probs[i] + w * hprobs[i];
182
+ }
183
+
184
+ int main(int argc, char **argv) {
185
+ if (argc < 2) {
186
+ fprintf(stderr, "Usage: %s <input_file>\n", argv[0]);
187
+ return 1;
188
+ }
189
+
190
+ FILE *f = fopen(argv[1], "rb");
191
+ if (!f) { perror(argv[1]); return 1; }
192
+ fseek(f, 0, SEEK_END);
193
+ size_t len = ftell(f);
194
+ fseek(f, 0, SEEK_SET);
195
+ uint8_t *data = malloc(len);
196
+ fread(data, 1, len, f);
197
+ fclose(f);
198
+
199
+ PPMModel ppm;
200
+ ppm_init(&ppm);
201
+
202
+ MatchModel match;
203
+ match_init(&match);
204
+
205
+ WordModel word;
206
+ word_init(&word);
207
+
208
+ HighCtxModel hctx;
209
+ highctx_init(&hctx);
210
+
211
+ TweedieDenoiser *twd = malloc(sizeof(TweedieDenoiser));
212
+ tweedie_init(twd);
213
+
214
+ memset(delta_sum, 0, sizeof(delta_sum));
215
+ memset(delta_count, 0, sizeof(delta_count));
216
+
217
+ double probs[256], word_probs[256], hctx_probs[256];
218
+
219
+ for (size_t i = 0; i < len; i++) {
220
+ uint8_t byte = data[i];
221
+ double confidence;
222
+ int order;
223
+
224
+ ppm_predict(&ppm, probs, &confidence, &order);
225
+ my_clamp_normalize(probs);
226
+
227
+ int match_byte;
228
+ double match_conf;
229
+ match_predict(&match, &match_byte, &match_conf);
230
+ my_blend_match(probs, match_byte, match_conf);
231
+
232
+ double w_conf;
233
+ if (word_predict_cached(&word, word_probs, &w_conf))
234
+ my_blend_word(probs, word_probs, w_conf);
235
+
236
+ double hctx_conf;
237
+ if (highctx_predict(&hctx, hctx_probs, &hctx_conf))
238
+ my_blend_hctx(probs, hctx_probs, hctx_conf);
239
+
240
+ tweedie_denoise_instrumented(twd, probs, order, confidence);
241
+ my_clamp_normalize(probs);
242
+
243
+ tweedie_update(twd, byte);
244
+ match_update(&match, byte);
245
+ word_update(&word, byte);
246
+ highctx_update(&hctx, byte);
247
+ ppm_update(&ppm, byte);
248
+
249
+ if ((i + 1) % 50000 == 0)
250
+ fprintf(stderr, "\r %5.1f%%", (i + 1) * 100.0 / len);
251
+ }
252
+ fprintf(stderr, "\r \r");
253
+
254
+ double gammas[] = {0.500, 0.200, 0.059, 0.015};
255
+ int c_repr[] = {128, 512, 2048, 8192};
256
+
257
+ printf("File: %s (%zu bytes)\n\n", argv[1], len);
258
+ printf("%-8s %-8s", "gamma", "C_repr");
259
+ for (int s = 0; s < TWD_STEPS; s++)
260
+ printf(" step_%d ", s);
261
+ printf("\n");
262
+
263
+ for (int b = 0; b < N_CONF_REPORT; b++) {
264
+ printf("%-8.3f %-8d", gammas[b], c_repr[b]);
265
+ for (int s = 0; s < TWD_STEPS; s++) {
266
+ if (delta_count[s][b] > 0)
267
+ printf(" %.4f ", delta_sum[s][b] / delta_count[s][b]);
268
+ else
269
+ printf(" --- ");
270
+ }
271
+ printf("\n");
272
+ }
273
+
274
+ free(twd);
275
+ free(data);
276
+ return 0;
277
+ }
ppm.h CHANGED
@@ -9,6 +9,7 @@
9
  #define PPM_MAX_ORDER 4
10
  #define PPM_NSYM 256
11
  #define PPM_PRIOR 0.5
 
12
 
13
  /*
14
  * Hash table entry: maps a 64-bit context hash to a count array.
@@ -77,9 +78,10 @@ static inline PPMEntry *ppm_table_find(PPMTable *t, uint64_t key) {
77
  }
78
 
79
  static inline PPMEntry *ppm_table_insert(PPMTable *t, uint64_t key) {
80
- /* Grow if > 60% full */
81
  if (t->used * 5 > t->capacity * 3) {
82
- ppm_table_grow(t);
 
83
  }
84
  uint32_t mask = t->capacity - 1;
85
  uint32_t idx = (uint32_t)(key & mask);
@@ -87,6 +89,10 @@ static inline PPMEntry *ppm_table_insert(PPMTable *t, uint64_t key) {
87
  PPMEntry *e = &t->entries[idx];
88
  if (e->key == key) return e; /* already exists */
89
  if (e->key == 0) {
 
 
 
 
90
  /* init new entry with prior */
91
  e->key = key;
92
  for (int i = 0; i < PPM_NSYM; i++)
@@ -135,39 +141,96 @@ static inline void ppm_free(PPMModel *m) {
135
  }
136
 
137
  /*
138
- * predict_with_confidence: fills probs[256] and returns confidence + order.
139
- * Matches Python: fallback from max_order down to 0, first context with total > 1.
140
- * If nothing found, returns uniform.
 
 
 
 
141
  */
142
  static inline void ppm_predict(PPMModel *m, double *probs,
143
  double *out_confidence, int *out_order) {
 
 
 
 
 
 
 
 
144
  for (int order = PPM_MAX_ORDER; order >= 0; order--) {
145
- const uint8_t *ctx_start;
146
  int ctx_len = order;
147
-
148
  if (ctx_len > m->hist_len) continue;
149
- ctx_start = m->history + m->hist_len - ctx_len;
150
 
 
151
  uint64_t key = ppm_hash_context(ctx_start, ctx_len);
152
  PPMEntry *e = ppm_table_find(&m->tables[order], key);
153
  if (e == NULL) continue;
154
- if (e->total <= 1.0) continue;
155
 
156
- double inv_total = 1.0 / e->total;
157
- for (int i = 0; i < PPM_NSYM; i++)
158
- probs[i] = e->counts[i] * inv_total;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
159
 
160
- *out_confidence = e->total;
161
- *out_order = order;
162
- return;
 
 
 
 
 
 
163
  }
164
 
165
- /* uniform fallback */
166
- double u = 1.0 / 256.0;
 
 
 
 
 
167
  for (int i = 0; i < PPM_NSYM; i++)
168
- probs[i] = u;
169
- *out_confidence = 0.0;
170
- *out_order = -1;
 
171
  }
172
 
173
  /*
@@ -183,6 +246,7 @@ static inline void ppm_update(PPMModel *m, uint8_t symbol) {
183
  uint64_t key = ppm_hash_context(ctx_start, ctx_len);
184
 
185
  PPMEntry *e = ppm_table_insert(&m->tables[order], key);
 
186
  e->counts[symbol] += 1.0;
187
  e->total += 1.0;
188
  }
 
9
  #define PPM_MAX_ORDER 4
10
  #define PPM_NSYM 256
11
  #define PPM_PRIOR 0.5
12
+ #define PPM_MAX_CAPACITY (1 << 19) /* 524288 entries per table; ~3.2 GB total for 5 tables */
13
 
14
  /*
15
  * Hash table entry: maps a 64-bit context hash to a count array.
 
78
  }
79
 
80
  static inline PPMEntry *ppm_table_insert(PPMTable *t, uint64_t key) {
81
+ /* Grow if > 60% full, but respect capacity cap */
82
  if (t->used * 5 > t->capacity * 3) {
83
+ if (t->capacity < PPM_MAX_CAPACITY)
84
+ ppm_table_grow(t);
85
  }
86
  uint32_t mask = t->capacity - 1;
87
  uint32_t idx = (uint32_t)(key & mask);
 
89
  PPMEntry *e = &t->entries[idx];
90
  if (e->key == key) return e; /* already exists */
91
  if (e->key == 0) {
92
+ /* At capacity and table is full: don't insert new entry */
93
+ if (t->capacity >= PPM_MAX_CAPACITY &&
94
+ t->used * 5 > t->capacity * 3)
95
+ return NULL;
96
  /* init new entry with prior */
97
  e->key = key;
98
  for (int i = 0; i < PPM_NSYM; i++)
 
141
  }
142
 
143
  /*
144
+ * Predict with PPMC-style exclusion.
145
+ *
146
+ * From highest order down, symbols observed at each order receive probability
147
+ * proportional to their real observation count. Symbols already assigned
148
+ * probability at a higher order are *excluded* from lower-order distributions.
149
+ * Escape probability (Method C): P_esc = d / (n + d)
150
+ * where d = distinct observed symbols, n = total real observations.
151
  */
152
  static inline void ppm_predict(PPMModel *m, double *probs,
153
  double *out_confidence, int *out_order) {
154
+ int excluded[PPM_NSYM];
155
+ memset(excluded, 0, sizeof(excluded));
156
+ for (int i = 0; i < PPM_NSYM; i++) probs[i] = 0.0;
157
+
158
+ double remaining = 1.0;
159
+ int best_order = -1;
160
+ double best_conf = 0.0;
161
+
162
  for (int order = PPM_MAX_ORDER; order >= 0; order--) {
 
163
  int ctx_len = order;
 
164
  if (ctx_len > m->hist_len) continue;
 
165
 
166
+ const uint8_t *ctx_start = m->history + m->hist_len - ctx_len;
167
  uint64_t key = ppm_hash_context(ctx_start, ctx_len);
168
  PPMEntry *e = ppm_table_find(&m->tables[order], key);
169
  if (e == NULL) continue;
 
170
 
171
+ /* Count real observations for non-excluded symbols */
172
+ double n = 0.0;
173
+ int d = 0;
174
+ for (int s = 0; s < PPM_NSYM; s++) {
175
+ if (excluded[s]) continue;
176
+ double real = e->counts[s] - PPM_PRIOR;
177
+ if (real > 0.01) {
178
+ n += real;
179
+ d++;
180
+ }
181
+ }
182
+
183
+ if (d == 0) continue;
184
+
185
+ if (best_order < 0) {
186
+ best_order = order;
187
+ best_conf = e->total;
188
+ }
189
+
190
+ /* PPMC escape: d / (n + d) */
191
+ double p_esc = (double)d / (n + d);
192
+ double p_nesc = 1.0 - p_esc;
193
+
194
+ for (int s = 0; s < PPM_NSYM; s++) {
195
+ if (excluded[s]) continue;
196
+ double real = e->counts[s] - PPM_PRIOR;
197
+ if (real > 0.01) {
198
+ probs[s] += remaining * p_nesc * (real / n);
199
+ excluded[s] = 1;
200
+ }
201
+ }
202
+
203
+ remaining *= p_esc;
204
+ }
205
+
206
+ /* Distribute remaining mass uniformly among non-excluded symbols */
207
+ int n_rem = 0;
208
+ for (int s = 0; s < PPM_NSYM; s++)
209
+ if (!excluded[s]) n_rem++;
210
 
211
+ if (n_rem > 0) {
212
+ double per = remaining / n_rem;
213
+ for (int s = 0; s < PPM_NSYM; s++)
214
+ if (!excluded[s])
215
+ probs[s] += per;
216
+ } else {
217
+ double per = remaining / PPM_NSYM;
218
+ for (int s = 0; s < PPM_NSYM; s++)
219
+ probs[s] += per;
220
  }
221
 
222
+ /* Ensure positive + normalize */
223
+ double sum = 0.0;
224
+ for (int i = 0; i < PPM_NSYM; i++) {
225
+ if (probs[i] < 1e-10) probs[i] = 1e-10;
226
+ sum += probs[i];
227
+ }
228
+ double inv = 1.0 / sum;
229
  for (int i = 0; i < PPM_NSYM; i++)
230
+ probs[i] *= inv;
231
+
232
+ *out_confidence = best_conf;
233
+ *out_order = best_order;
234
  }
235
 
236
  /*
 
246
  uint64_t key = ppm_hash_context(ctx_start, ctx_len);
247
 
248
  PPMEntry *e = ppm_table_insert(&m->tables[order], key);
249
+ if (e == NULL) continue; /* table full, skip this context */
250
  e->counts[symbol] += 1.0;
251
  e->total += 1.0;
252
  }
ppm_excl.h ADDED
@@ -0,0 +1,198 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #ifndef PPM_H
2
+ #define PPM_H
3
+
4
+ #include <stdint.h>
5
+ #include <stdlib.h>
6
+ #include <string.h>
7
+ #include <math.h>
8
+
9
+ #define PPM_MAX_ORDER 4
10
+ #define PPM_NSYM 256
11
+ #define PPM_PRIOR 0.5
12
+
13
+ /*
14
+ * Hash table entry: maps a 64-bit context hash to a count array.
15
+ * counts[i] stores the (float) count for symbol i.
16
+ * total caches sum(counts).
17
+ * key == 0 means empty slot.
18
+ */
19
+ typedef struct {
20
+ uint64_t key; /* context hash (0 = empty) */
21
+ double counts[PPM_NSYM];
22
+ double total;
23
+ } PPMEntry;
24
+
25
+ typedef struct {
26
+ PPMEntry *entries;
27
+ uint32_t capacity; /* power of 2 */
28
+ uint32_t used;
29
+ } PPMTable;
30
+
31
+ typedef struct {
32
+ PPMTable tables[PPM_MAX_ORDER + 1]; /* order 0..4 */
33
+ uint8_t *history;
34
+ int hist_len;
35
+ int hist_cap;
36
+ } PPMModel;
37
+
38
+ /* ── Hash helper ── */
39
+
40
+ static inline uint64_t ppm_hash_context(const uint8_t *ctx, int len) {
41
+ /* We need a non-zero hash for all contexts including order-0 (empty).
42
+ * Use FNV-1a style. Order-0 empty context gets a fixed hash. */
43
+ if (len == 0) return 1; /* special: order-0 empty context */
44
+ uint64_t h = 14695981039346656037ULL;
45
+ for (int i = 0; i < len; i++) {
46
+ h ^= ctx[i];
47
+ h *= 1099511628211ULL;
48
+ }
49
+ if (h == 0) h = 1; /* reserve 0 for empty slot */
50
+ return h;
51
+ }
52
+
53
+ /* ── Table operations ── */
54
+
55
+ static inline void ppm_table_init(PPMTable *t, uint32_t capacity) {
56
+ t->capacity = capacity;
57
+ t->used = 0;
58
+ t->entries = (PPMEntry *)calloc(capacity, sizeof(PPMEntry));
59
+ }
60
+
61
+ static inline void ppm_table_free(PPMTable *t) {
62
+ free(t->entries);
63
+ t->entries = NULL;
64
+ }
65
+
66
+ static inline void ppm_table_grow(PPMTable *t);
67
+
68
+ static inline PPMEntry *ppm_table_find(PPMTable *t, uint64_t key) {
69
+ uint32_t mask = t->capacity - 1;
70
+ uint32_t idx = (uint32_t)(key & mask);
71
+ for (;;) {
72
+ PPMEntry *e = &t->entries[idx];
73
+ if (e->key == key) return e;
74
+ if (e->key == 0) return NULL;
75
+ idx = (idx + 1) & mask;
76
+ }
77
+ }
78
+
79
+ static inline PPMEntry *ppm_table_insert(PPMTable *t, uint64_t key) {
80
+ /* Grow if > 60% full */
81
+ if (t->used * 5 > t->capacity * 3) {
82
+ ppm_table_grow(t);
83
+ }
84
+ uint32_t mask = t->capacity - 1;
85
+ uint32_t idx = (uint32_t)(key & mask);
86
+ for (;;) {
87
+ PPMEntry *e = &t->entries[idx];
88
+ if (e->key == key) return e; /* already exists */
89
+ if (e->key == 0) {
90
+ /* init new entry with prior */
91
+ e->key = key;
92
+ for (int i = 0; i < PPM_NSYM; i++)
93
+ e->counts[i] = PPM_PRIOR;
94
+ e->total = PPM_NSYM * PPM_PRIOR;
95
+ t->used++;
96
+ return e;
97
+ }
98
+ idx = (idx + 1) & mask;
99
+ }
100
+ }
101
+
102
+ static inline void ppm_table_grow(PPMTable *t) {
103
+ uint32_t old_cap = t->capacity;
104
+ PPMEntry *old = t->entries;
105
+ uint32_t new_cap = old_cap * 2;
106
+ t->entries = (PPMEntry *)calloc(new_cap, sizeof(PPMEntry));
107
+ t->capacity = new_cap;
108
+ t->used = 0;
109
+ for (uint32_t i = 0; i < old_cap; i++) {
110
+ if (old[i].key != 0) {
111
+ /* re-insert */
112
+ PPMEntry *ne = ppm_table_insert(t, old[i].key);
113
+ memcpy(ne->counts, old[i].counts, sizeof(old[i].counts));
114
+ ne->total = old[i].total;
115
+ }
116
+ }
117
+ free(old);
118
+ }
119
+
120
+ /* ── PPM Model ── */
121
+
122
+ static inline void ppm_init(PPMModel *m) {
123
+ for (int o = 0; o <= PPM_MAX_ORDER; o++)
124
+ ppm_table_init(&m->tables[o], 1024);
125
+ m->hist_cap = 4096;
126
+ m->hist_len = 0;
127
+ m->history = (uint8_t *)malloc(m->hist_cap);
128
+ }
129
+
130
+ static inline void ppm_free(PPMModel *m) {
131
+ for (int o = 0; o <= PPM_MAX_ORDER; o++)
132
+ ppm_table_free(&m->tables[o]);
133
+ free(m->history);
134
+ m->history = NULL;
135
+ }
136
+
137
+ /*
138
+ * predict_with_confidence: fills probs[256] and returns confidence + order.
139
+ * Matches Python: fallback from max_order down to 0, first context with total > 1.
140
+ * If nothing found, returns uniform.
141
+ */
142
+ static inline void ppm_predict(PPMModel *m, double *probs,
143
+ double *out_confidence, int *out_order) {
144
+ for (int order = PPM_MAX_ORDER; order >= 0; order--) {
145
+ const uint8_t *ctx_start;
146
+ int ctx_len = order;
147
+
148
+ if (ctx_len > m->hist_len) continue;
149
+ ctx_start = m->history + m->hist_len - ctx_len;
150
+
151
+ uint64_t key = ppm_hash_context(ctx_start, ctx_len);
152
+ PPMEntry *e = ppm_table_find(&m->tables[order], key);
153
+ if (e == NULL) continue;
154
+ if (e->total <= 1.0) continue;
155
+
156
+ double inv_total = 1.0 / e->total;
157
+ for (int i = 0; i < PPM_NSYM; i++)
158
+ probs[i] = e->counts[i] * inv_total;
159
+
160
+ *out_confidence = e->total;
161
+ *out_order = order;
162
+ return;
163
+ }
164
+
165
+ /* uniform fallback */
166
+ double u = 1.0 / 256.0;
167
+ for (int i = 0; i < PPM_NSYM; i++)
168
+ probs[i] = u;
169
+ *out_confidence = 0.0;
170
+ *out_order = -1;
171
+ }
172
+
173
+ /*
174
+ * update: add symbol count to all orders (0..4) where context is available.
175
+ * Then append symbol to history.
176
+ */
177
+ static inline void ppm_update(PPMModel *m, uint8_t symbol) {
178
+ for (int order = 0; order <= PPM_MAX_ORDER; order++) {
179
+ int ctx_len = order;
180
+ if (ctx_len > m->hist_len) continue;
181
+
182
+ const uint8_t *ctx_start = m->history + m->hist_len - ctx_len;
183
+ uint64_t key = ppm_hash_context(ctx_start, ctx_len);
184
+
185
+ PPMEntry *e = ppm_table_insert(&m->tables[order], key);
186
+ e->counts[symbol] += 1.0;
187
+ e->total += 1.0;
188
+ }
189
+
190
+ /* append to history */
191
+ if (m->hist_len >= m->hist_cap) {
192
+ m->hist_cap *= 2;
193
+ m->history = (uint8_t *)realloc(m->history, m->hist_cap);
194
+ }
195
+ m->history[m->hist_len++] = symbol;
196
+ }
197
+
198
+ #endif /* PPM_H */
tweedie.h CHANGED
@@ -16,6 +16,10 @@
16
  * the additive correction δ = E[θ|p̂] - E[p̂] = hit_rate - avg_pred.
17
  * This δ equals σ²·s(p̂) — the full Tweedie correction term.
18
  *
 
 
 
 
19
  * Binary tree decomposition: 256-way → 8 binary decisions (MSB to LSB).
20
  * Multi-step: K=3 denoising steps with independent score tables.
21
  * Calibration context: (step, bit_context, order, shape, confidence, prob_bin)
@@ -56,6 +60,7 @@ typedef struct {
56
  double sum_pred; /* sum of predicted P(right) */
57
  double hits; /* times true symbol went right */
58
  double total; /* total observations */
 
59
  } TwdCalibEntry;
60
 
61
  typedef struct {
@@ -138,6 +143,7 @@ static inline void tweedie_init(TweedieDenoiser *td) {
138
  td->table[t][b][o][s][c][p].sum_pred = center * TWD_PRIOR_WEIGHT;
139
  td->table[t][b][o][s][c][p].hits = center * TWD_PRIOR_WEIGHT;
140
  td->table[t][b][o][s][c][p].total = TWD_PRIOR_WEIGHT;
 
141
  }
142
  }
143
 
@@ -214,6 +220,17 @@ static inline void tweedie_denoise(TweedieDenoiser *td, double *probs,
214
  double emp_rate = e->hits / e->total;
215
  double delta = emp_rate - avg_pred;
216
 
 
 
 
 
 
 
 
 
 
 
 
217
  double p_right_corr = p_right + delta;
218
  if (p_right_corr < 1e-8) p_right_corr = 1e-8;
219
  if (p_right_corr > 1.0 - 1e-8) p_right_corr = 1.0 - 1e-8;
@@ -269,6 +286,8 @@ static inline void tweedie_update(TweedieDenoiser *td, uint8_t true_symbol) {
269
  int bctx = td->cached_bctx[step][node_id];
270
 
271
  TwdCalibEntry *e = &td->table[step][bctx][og][sb][cb][pbin];
 
 
272
  e->sum_pred += td->cached_p_right[step][node_id];
273
  e->total += 1.0;
274
  if (went_right)
 
16
  * the additive correction δ = E[θ|p̂] - E[p̂] = hit_rate - avg_pred.
17
  * This δ equals σ²·s(p̂) — the full Tweedie correction term.
18
  *
19
+ * Variance-aware James-Stein shrinkage: each correction δ is shrunk toward
20
+ * zero based on SNR = δ²·N/var(error). When SNR < 4, the correction is
21
+ * attenuated proportionally, preventing noisy bucket estimates from hurting.
22
+ *
23
  * Binary tree decomposition: 256-way → 8 binary decisions (MSB to LSB).
24
  * Multi-step: K=3 denoising steps with independent score tables.
25
  * Calibration context: (step, bit_context, order, shape, confidence, prob_bin)
 
60
  double sum_pred; /* sum of predicted P(right) */
61
  double hits; /* times true symbol went right */
62
  double total; /* total observations */
63
+ double sum_sq_err; /* sum of (went_right - p_right)^2 */
64
  } TwdCalibEntry;
65
 
66
  typedef struct {
 
143
  td->table[t][b][o][s][c][p].sum_pred = center * TWD_PRIOR_WEIGHT;
144
  td->table[t][b][o][s][c][p].hits = center * TWD_PRIOR_WEIGHT;
145
  td->table[t][b][o][s][c][p].total = TWD_PRIOR_WEIGHT;
146
+ td->table[t][b][o][s][c][p].sum_sq_err = TWD_PRIOR_WEIGHT * 0.25;
147
  }
148
  }
149
 
 
220
  double emp_rate = e->hits / e->total;
221
  double delta = emp_rate - avg_pred;
222
 
223
+ /* Variance-aware James-Stein shrinkage:
224
+ * SNR = δ²·N / var(error). Shrink δ → 0 when SNR < 4. */
225
+ double var_err = e->sum_sq_err / e->total;
226
+ if (e->total > 10.0 && var_err > 1e-10) {
227
+ double snr = delta * delta * e->total / var_err;
228
+ double shrink = (snr > 4.0) ? 1.0 : snr / 4.0;
229
+ delta *= shrink;
230
+ } else {
231
+ delta = 0.0;
232
+ }
233
+
234
  double p_right_corr = p_right + delta;
235
  if (p_right_corr < 1e-8) p_right_corr = 1e-8;
236
  if (p_right_corr > 1.0 - 1e-8) p_right_corr = 1.0 - 1e-8;
 
286
  int bctx = td->cached_bctx[step][node_id];
287
 
288
  TwdCalibEntry *e = &td->table[step][bctx][og][sb][cb][pbin];
289
+ double err = (double)went_right - td->cached_p_right[step][node_id];
290
+ e->sum_sq_err += err * err;
291
  e->sum_pred += td->cached_p_right[step][node_id];
292
  e->total += 1.0;
293
  if (went_right)