harshithsaiv
/

kv-cache-compression

+"""
+Generate all publication-ready graphs for both models.
+"""
+import json
+import matplotlib.pyplot as plt
+import numpy as np
+import os
+def load_results(model_name):
+    path = os.path.expanduser(f"~/kv-hack/results/{model_name}/benchmark_results.json")
+    with open(path) as f:
+        return json.load(f)
+mistral = load_results("mistral-7b")
+llama   = load_results("llama-3-8b")
+C_FP16    = "#ef4444"
+C_UNIFORM = "#f97316"
+C_MISTRAL = "#22c55e"
+C_LLAMA   = "#3b82f6"
+os.makedirs(os.path.expanduser("~/kv-hack/figures"), exist_ok=True)
+# ── GRAPH 1: Memory vs Context — Both Models ──────────
+fig, axes = plt.subplots(1, 2, figsize=(16, 6))
+for ax, results, title in [
+    (axes[0], mistral, "Mistral-7B"),
+    (axes[1], llama,   "Llama-3-8B"),
+]:
+    ctx  = [r["context_len"]        for r in results["compression"]]
+    fp16 = [r["fp16_mb"]            for r in results["compression"]]
+    uni8 = [r["uniform8_mb"]        for r in results["compression"]]
+    ours = [r["mixed_precision_mb"] for r in results["compression"]]
+    ax.plot(ctx, fp16, 'o-', color=C_FP16,    linewidth=2.5, markersize=8, label="FP16 Baseline")
+    ax.plot(ctx, uni8, 's-', color=C_UNIFORM,  linewidth=2.5, markersize=8, label="Uniform 8-bit")
+    ax.plot(ctx, ours, '^-', color=C_MISTRAL if title == "Mistral-7B" else C_LLAMA,
+            linewidth=2.5, markersize=8, label="Per-Head Mixed (Ours)")
+    # annotate at 8K
+    ax.annotate(f"{fp16[-1]:.0f} MB", xy=(8192, fp16[-1]),
+                xytext=(5500, fp16[-1]+30), color=C_FP16, fontweight='bold', fontsize=9)
+    ax.annotate(f"{uni8[-1]:.0f} MB", xy=(8192, uni8[-1]),
+                xytext=(5500, uni8[-1]+30), color=C_UNIFORM, fontweight='bold', fontsize=9)
+    ax.annotate(f"{ours[-1]:.0f} MB\n({results['summary']['compression_8k']}x vs FP16)",
+                xy=(8192, ours[-1]), xytext=(4000, ours[-1]-150),
+                color=C_MISTRAL if title == "Mistral-7B" else C_LLAMA,
+                fontweight='bold', fontsize=9)
+    ax.set_xlabel("Context Length (tokens)", fontsize=12)
+    ax.set_ylabel("KV Cache Memory (MB)", fontsize=12)
+    ax.set_title(f"{title}\nKV Cache Memory vs Context Length", fontsize=13, fontweight='bold')
+    ax.legend(fontsize=10)
+    ax.grid(True, alpha=0.3)
+    ax.set_xticks(ctx)
+plt.suptitle("Per-Head Mixed-Precision KV Cache Compression",
+             fontsize=15, fontweight='bold', y=1.02)
+plt.tight_layout()
+plt.savefig(os.path.expanduser("~/kv-hack/figures/memory_vs_context_both.png"),
+            dpi=150, bbox_inches='tight')
+print("✅ Saved figures/memory_vs_context_both.png")
+# ── GRAPH 2: Compression Bar Chart — Both Models ──────
+fig, ax = plt.subplots(figsize=(10, 6))
+x      = np.arange(3)
+width  = 0.35
+models = ["FP16\nBaseline", "Uniform\n8-bit", "Per-Head\nMixed (Ours)"]
+bars1 = ax.bar(x - width/2,
+               [1.0, 2.0, mistral["summary"]["compression_8k"]],
+               width, label="Mistral-7B", color=C_MISTRAL, edgecolor='white')
+bars2 = ax.bar(x + width/2,
+               [1.0, 2.0, llama["summary"]["compression_8k"]],
+               width, label="Llama-3-8B",  color=C_LLAMA,   edgecolor='white')
+for bar in bars1:
+    ax.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 0.03,
+            f"{bar.get_height():.2f}x", ha='center', fontweight='bold', fontsize=11)
+for bar in bars2:
+    ax.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 0.03,
+            f"{bar.get_height():.2f}x", ha='center', fontweight='bold', fontsize=11)
+ax.set_xticks(x)
+ax.set_xticklabels(models, fontsize=12)
+ax.set_ylabel("Compression vs FP16", fontsize=13)
+ax.set_title("KV Cache Compression at 8K Context\nPer-Head Mixed Precision vs Baselines",
+             fontsize=14, fontweight='bold')
+ax.set_ylim(0, 2.8)
+ax.legend(fontsize=12)
+ax.grid(True, axis='y', alpha=0.3)
+ax.axhline(y=1.0, color='gray', linestyle='--', alpha=0.4)
+plt.tight_layout()
+plt.savefig(os.path.expanduser("~/kv-hack/figures/compression_bar_both.png"), dpi=150)
+print("✅ Saved figures/compression_bar_both.png")
+# ── GRAPH 3: Hero Summary Table ───────────────────────
+fig, ax = plt.subplots(figsize=(12, 4))
+ax.axis('off')
+table_data = [
+    ["Model", "Method", "Avg Bits", "KV @ 8K", "vs FP16", "vs 8-bit", "Perplexity", "Speed"],
+    ["Mistral-7B", "FP16 Baseline",        "16",   "1073 MB", "1.0x", "—",     str(mistral["perplexity"]), f"{mistral['decode_tokens_per_sec']} t/s"],
+    ["Mistral-7B", "Uniform 8-bit",         "8",    "537 MB",  "2.0x", "1.0x",  "~same",                   "~same"],
+    ["Mistral-7B", "Per-Head Mixed (Ours)", f"{mistral['avg_bits']}", f"{mistral['summary']['ours_8k_mb']} MB", f"{mistral['summary']['compression_8k']}x", "1.15x", str(mistral["perplexity"]), f"{mistral['decode_tokens_per_sec']} t/s"],
+    ["Llama-3-8B", "FP16 Baseline",        "16",   "1073 MB", "1.0x", "—",     str(llama["perplexity"]),   f"{llama['decode_tokens_per_sec']} t/s"],
+    ["Llama-3-8B", "Uniform 8-bit",         "8",    "537 MB",  "2.0x", "1.0x",  "~same",                   "~same"],
+    ["Llama-3-8B", "Per-Head Mixed (Ours)", f"{llama['avg_bits']}",   f"{llama['summary']['ours_8k_mb']} MB",   f"{llama['summary']['compression_8k']}x", "1.02x", str(llama["perplexity"]),   f"{llama['decode_tokens_per_sec']} t/s"],
+]
+table = ax.table(
+    cellText=table_data[1:],
+    colLabels=table_data[0],
+    cellLoc='center',
+    loc='center',
+)
+table.auto_set_font_size(False)
+table.set_fontsize(9)
+table.scale(1.2, 2.2)
+# style header
+for j in range(8):
+    table[0, j].set_facecolor("#1e293b")
+    table[0, j].set_text_props(color='white', fontweight='bold')
+# highlight our rows green
+for j in range(8):
+    table[3, j].set_facecolor("#dcfce7")
+    table[6, j].set_facecolor("#dbeafe")
+plt.title("Full Results — Per-Head Mixed-Precision KV Cache",
+          fontsize=13, fontweight='bold', pad=20)
+plt.tight_layout()
+plt.savefig(os.path.expanduser("~/kv-hack/figures/results_table_both.png"),
+            dpi=150, bbox_inches='tight')
+print("✅ Saved figures/results_table_both.png")
+plt.close('all')
+print("\n🎉 All graphs saved to ~/kv-hack/figures/")