Spaces:
Sleeping
Sleeping
| /* | |
| * Midicoth Compressor — C implementation | |
| * Pipeline: PPM + Match + Word + HighCtx + Tweedie Denoising | |
| * | |
| * Usage: | |
| * ./mdc compress <input> <output> | |
| * ./mdc decompress <input> <output> | |
| */ | |
| /* ── Helpers ── */ | |
| static void probs_to_cumfreqs(const double *probs, int64_t *cumfreqs, | |
| int64_t *out_total) { | |
| cumfreqs[0] = 0; | |
| for (int i = 0; i < 256; i++) { | |
| int64_t f = (int64_t)(probs[i] * SCALE + 0.5); | |
| if (f < 1) f = 1; | |
| cumfreqs[i + 1] = cumfreqs[i] + f; | |
| } | |
| *out_total = cumfreqs[256]; | |
| } | |
/*
 * Floor every one of the 256 entries at 1e-10, then rescale the table in
 * place so that its entries sum to 1.
 */
static void clamp_normalize(double *probs) {
    const double floor_value = 1e-10;
    double *const end = probs + 256;
    double total = 0.0;
    double *p;

    /* Pass 1: apply the floor and accumulate the sum of the floored values. */
    for (p = probs; p != end; ++p) {
        if (*p < floor_value) {
            *p = floor_value;
        }
        total += *p;
    }

    /* Pass 2: multiply by the reciprocal of the sum. */
    const double scale = 1.0 / total;
    for (p = probs; p != end; ++p) {
        *p *= scale;
    }
}
| /* ── Compress ── */ | |
| static int do_compress(const char *input_path, const char *output_path) { | |
| FILE *fin = fopen(input_path, "rb"); | |
| if (!fin) { perror(input_path); return 1; } | |
| fseek(fin, 0, SEEK_END); | |
| long file_size = ftell(fin); | |
| fseek(fin, 0, SEEK_SET); | |
| uint8_t *data = (uint8_t *)malloc(file_size); | |
| if (fread(data, 1, file_size, fin) != (size_t)file_size) { | |
| fprintf(stderr, "Read error\n"); fclose(fin); return 1; | |
| } | |
| fclose(fin); | |
| uint64_t original_size = (uint64_t)file_size; | |
| printf(" Input: %s (%lu bytes)\n", input_path, (unsigned long)original_size); | |
| if (original_size == 0) { | |
| FILE *fout = fopen(output_path, "wb"); | |
| fwrite(MAGIC, 1, 4, fout); | |
| uint64_t zero = 0; | |
| fwrite(&zero, 8, 1, fout); | |
| fclose(fout); | |
| printf(" Empty file -> 12 bytes\n"); | |
| free(data); | |
| return 0; | |
| } | |
| PPMModel ppm; ppm_init(&ppm); | |
| MatchModel match; match_init(&match); | |
| WordModel word; word_init(&word); | |
| HighCtxModel hctx; highctx_init(&hctx); | |
| ArithEncoder enc; ae_init(&enc); | |
| TweedieDenoiser *twd = (TweedieDenoiser *)malloc(sizeof(TweedieDenoiser)); | |
| tweedie_init(twd); | |
| double probs[256], word_probs[256], hctx_probs[256]; | |
| int64_t cumfreqs[257]; | |
| int64_t total; | |
| struct timespec t0, t1; | |
| clock_gettime(CLOCK_MONOTONIC, &t0); | |
| for (uint64_t i = 0; i < original_size; i++) { | |
| uint8_t byte = data[i]; | |
| double confidence; | |
| int order; | |
| ppm_predict(&ppm, probs, &confidence, &order); | |
| clamp_normalize(probs); | |
| int match_byte; | |
| double match_conf; | |
| match_predict(&match, &match_byte, &match_conf); | |
| blend_match(probs, match_byte, match_conf); | |
| double w_conf; | |
| if (word_predict_cached(&word, word_probs, &w_conf)) | |
| blend_word_model(probs, word_probs, w_conf); | |
| double hctx_conf; | |
| if (highctx_predict(&hctx, hctx_probs, &hctx_conf)) | |
| blend_highctx(probs, hctx_probs, hctx_conf); | |
| tweedie_denoise(twd, probs, order, confidence); | |
| clamp_normalize(probs); | |
| probs_to_cumfreqs(probs, cumfreqs, &total); | |
| ae_encode(&enc, cumfreqs, byte, total); | |
| tweedie_update(twd, byte); | |
| match_update(&match, byte); | |
| word_update(&word, byte); | |
| highctx_update(&hctx, byte); | |
| ppm_update(&ppm, byte); | |
| if ((i + 1) % 50000 == 0) { | |
| clock_gettime(CLOCK_MONOTONIC, &t1); | |
| double elapsed = (t1.tv_sec - t0.tv_sec) + (t1.tv_nsec - t0.tv_nsec) * 1e-9; | |
| double pct = (i + 1) * 100.0 / original_size; | |
| double speed = (i + 1) / elapsed; | |
| fprintf(stderr, "\r %5.1f%% (%lu/%lu) %.0f B/s", | |
| pct, (unsigned long)(i + 1), (unsigned long)original_size, speed); | |
| } | |
| } | |
| ae_finish(&enc); | |
| clock_gettime(CLOCK_MONOTONIC, &t1); | |
| double elapsed = (t1.tv_sec - t0.tv_sec) + (t1.tv_nsec - t0.tv_nsec) * 1e-9; | |
| fprintf(stderr, "\r \r"); | |
| FILE *fout = fopen(output_path, "wb"); | |
| if (!fout) { perror(output_path); return 1; } | |
| fwrite(MAGIC, 1, 4, fout); | |
| fwrite(&original_size, 8, 1, fout); | |
| fwrite(enc.buf, 1, enc.buf_len, fout); | |
| fclose(fout); | |
| uint64_t total_size = 4 + 8 + enc.buf_len; | |
| double ratio = (double)total_size / original_size; | |
| printf(" Output: %s (%lu bytes)\n", output_path, (unsigned long)total_size); | |
| printf(" Ratio: %.4f (%.2f%%)\n", ratio, ratio * 100.0); | |
| printf(" Time: %.1fs (%.0f B/s)\n", elapsed, original_size / elapsed); | |
| ae_free(&enc); | |
| ppm_free(&ppm); | |
| match_free(&match); | |
| word_free(&word); | |
| highctx_free(&hctx); | |
| free(twd); | |
| free(data); | |
| return 0; | |
| } | |
| /* ── Decompress ── */ | |
| static int do_decompress(const char *input_path, const char *output_path) { | |
| FILE *fin = fopen(input_path, "rb"); | |
| if (!fin) { perror(input_path); return 1; } | |
| char magic[4]; | |
| if (fread(magic, 1, 4, fin) != 4 || memcmp(magic, MAGIC, 4) != 0) { | |
| fprintf(stderr, "Error: not a MDC7 file\n"); | |
| fclose(fin); | |
| return 1; | |
| } | |
| uint64_t original_size; | |
| if (fread(&original_size, 8, 1, fin) != 1) { | |
| fprintf(stderr, "Read error\n"); fclose(fin); return 1; | |
| } | |
| fseek(fin, 0, SEEK_END); | |
| long fsize = ftell(fin); | |
| fseek(fin, 12, SEEK_SET); | |
| size_t comp_len = (size_t)(fsize - 12); | |
| uint8_t *compressed = (uint8_t *)malloc(comp_len); | |
| if (fread(compressed, 1, comp_len, fin) != comp_len) { | |
| fprintf(stderr, "Read error\n"); fclose(fin); return 1; | |
| } | |
| fclose(fin); | |
| printf(" Input: %s (%ld bytes)\n", input_path, fsize); | |
| printf(" Original size: %lu bytes\n", (unsigned long)original_size); | |
| if (original_size == 0) { | |
| FILE *fout = fopen(output_path, "wb"); | |
| fclose(fout); | |
| printf(" Empty file\n"); | |
| free(compressed); | |
| return 0; | |
| } | |
| PPMModel ppm; ppm_init(&ppm); | |
| MatchModel match; match_init(&match); | |
| WordModel word; word_init(&word); | |
| HighCtxModel hctx; highctx_init(&hctx); | |
| ArithDecoder dec; ad_init(&dec, compressed, comp_len); | |
| TweedieDenoiser *twd = (TweedieDenoiser *)malloc(sizeof(TweedieDenoiser)); | |
| tweedie_init(twd); | |
| uint8_t *result = (uint8_t *)malloc(original_size); | |
| double probs[256], word_probs[256], hctx_probs[256]; | |
| int64_t cumfreqs[257]; | |
| int64_t total; | |
| struct timespec t0, t1; | |
| clock_gettime(CLOCK_MONOTONIC, &t0); | |
| for (uint64_t i = 0; i < original_size; i++) { | |
| double confidence; | |
| int order; | |
| ppm_predict(&ppm, probs, &confidence, &order); | |
| clamp_normalize(probs); | |
| int match_byte; | |
| double match_conf; | |
| match_predict(&match, &match_byte, &match_conf); | |
| blend_match(probs, match_byte, match_conf); | |
| double w_conf; | |
| if (word_predict_cached(&word, word_probs, &w_conf)) | |
| blend_word_model(probs, word_probs, w_conf); | |
| double hctx_conf; | |
| if (highctx_predict(&hctx, hctx_probs, &hctx_conf)) | |
| blend_highctx(probs, hctx_probs, hctx_conf); | |
| tweedie_denoise(twd, probs, order, confidence); | |
| clamp_normalize(probs); | |
| probs_to_cumfreqs(probs, cumfreqs, &total); | |
| int sym = ad_decode(&dec, cumfreqs, total); | |
| result[i] = (uint8_t)sym; | |
| tweedie_update(twd, (uint8_t)sym); | |
| match_update(&match, (uint8_t)sym); | |
| word_update(&word, (uint8_t)sym); | |
| highctx_update(&hctx, (uint8_t)sym); | |
| ppm_update(&ppm, (uint8_t)sym); | |
| if ((i + 1) % 50000 == 0) { | |
| clock_gettime(CLOCK_MONOTONIC, &t1); | |
| double elapsed = (t1.tv_sec - t0.tv_sec) + (t1.tv_nsec - t0.tv_nsec) * 1e-9; | |
| double pct = (i + 1) * 100.0 / original_size; | |
| double speed = (i + 1) / elapsed; | |
| fprintf(stderr, "\r %5.1f%% (%lu/%lu) %.0f B/s", | |
| pct, (unsigned long)(i + 1), (unsigned long)original_size, speed); | |
| } | |
| } | |
| clock_gettime(CLOCK_MONOTONIC, &t1); | |
| double elapsed = (t1.tv_sec - t0.tv_sec) + (t1.tv_nsec - t0.tv_nsec) * 1e-9; | |
| fprintf(stderr, "\r \r"); | |
| FILE *fout = fopen(output_path, "wb"); | |
| fwrite(result, 1, original_size, fout); | |
| fclose(fout); | |
| printf(" Output: %s (%lu bytes)\n", output_path, (unsigned long)original_size); | |
| printf(" Time: %.1fs (%.0f B/s)\n", elapsed, original_size / elapsed); | |
| ppm_free(&ppm); | |
| match_free(&match); | |
| word_free(&word); | |
| highctx_free(&hctx); | |
| free(twd); | |
| free(compressed); | |
| free(result); | |
| return 0; | |
| } | |
| /* ── Main ── */ | |
/*
 * Entry point. Expects exactly three arguments after the program name:
 * a command ("compress" or "decompress"), an input path and an output path.
 * Returns the selected routine's result, or 1 on a usage error or an
 * unrecognized command.
 */
int main(int argc, char **argv) {
    if (argc != 4) {
        fprintf(stderr, "Usage: %s compress|decompress <input> <output>\n", argv[0]);
        return 1;
    }

    const char *command = argv[1];
    const char *src_path = argv[2];
    const char *dst_path = argv[3];

    if (!strcmp(command, "compress")) {
        return do_compress(src_path, dst_path);
    }
    if (!strcmp(command, "decompress")) {
        return do_decompress(src_path, dst_path);
    }

    fprintf(stderr, "Unknown command: %s\n", command);
    return 1;
}