harshithsaiv committed on
Commit
c0919f1
·
1 Parent(s): 2555c0e

feat: auto-save integration results to JSON

Browse files
integrate.py CHANGED
@@ -119,3 +119,44 @@ for prompt in prompts:
119
  print(f"Output: {result['text'][len(prompt):len(prompt)+150]}")
120
 
121
  print("\n✅ Quantized inference working!")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
119
  print(f"Output: {result['text'][len(prompt):len(prompt)+150]}")
120
 
121
  print("\n✅ Quantized inference working!")
122
+
123
+ # ── save results ─────────────────────────────────────
124
+ import json
125
+ from datetime import datetime
126
+
127
+ all_results = {
128
+ "model": MODEL_NAME,
129
+ "timestamp": datetime.now().isoformat(),
130
+ "avg_bits": avg_bits,
131
+ "theoretical_compression": round(16 / avg_bits, 2),
132
+ "prompts": []
133
+ }
134
+
135
+ print("\n" + "="*60)
136
+ print("QUANTIZED INFERENCE TEST")
137
+ print("="*60)
138
+
139
+ for prompt in prompts:
140
+ print(f"\nPrompt: {prompt[:50]}...")
141
+ result = run_quantized_generation(prompt, max_new_tokens=50)
142
+ print(f"Peak memory: {result['peak_memory_gb']:.2f} GB")
143
+ print(f"KV cache: {result['fp16_kb']:.0f} KB → {result['compressed_kb']:.0f} KB")
144
+ print(f"Compression: {result['compression_ratio']:.2f}x")
145
+ print(f"Speed: {result['tokens_per_sec']:.1f} tokens/sec")
146
+ print(f"Output: {result['text'][len(prompt):len(prompt)+150]}")
147
+
148
+ all_results["prompts"].append({
149
+ "prompt": prompt,
150
+ "compression_ratio": result["compression_ratio"],
151
+ "peak_memory_gb": result["peak_memory_gb"],
152
+ "tokens_per_sec": result["tokens_per_sec"],
153
+ "fp16_kb": result["fp16_kb"],
154
+ "compressed_kb": result["compressed_kb"],
155
+ })
156
+
157
+ # save
158
+ out_path = f"{results_dir}/integrate_results.json"
159
+ with open(out_path, "w") as f:
160
+ json.dump(all_results, f, indent=2)
161
+
162
+ print(f"\n✅ Results saved to {out_path}")
results/llama-3-8b/integrate_results.json ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model": "llama-3-8b",
3
+ "timestamp": "2026-05-03T01:43:03.151972",
4
+ "avg_bits": 7.84375,
5
+ "theoretical_compression": 2.04,
6
+ "prompts": [
7
+ {
8
+ "prompt": "The history of artificial intelligence began",
9
+ "compression_ratio": 2.02,
10
+ "peak_memory_gb": 16.078,
11
+ "tokens_per_sec": 37.0,
12
+ "fp16_kb": 896.0,
13
+ "compressed_kb": 443.2
14
+ },
15
+ {
16
+ "prompt": "Explain how transformers work in deep learning:",
17
+ "compression_ratio": 2.03,
18
+ "peak_memory_gb": 16.078,
19
+ "tokens_per_sec": 37.0,
20
+ "fp16_kb": 1280.0,
21
+ "compressed_kb": 631.5
22
+ },
23
+ {
24
+ "prompt": "Write a Python function to sort a list:",
25
+ "compression_ratio": 2.03,
26
+ "peak_memory_gb": 16.078,
27
+ "tokens_per_sec": 36.6,
28
+ "fp16_kb": 1280.0,
29
+ "compressed_kb": 631.5
30
+ }
31
+ ]
32
+ }
results/mistral-7b/integrate_results.json ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model": "mistral-7b",
3
+ "timestamp": "2026-05-03T01:42:28.883064",
4
+ "avg_bits": 6.953125,
5
+ "theoretical_compression": 2.3,
6
+ "prompts": [
7
+ {
8
+ "prompt": "The history of artificial intelligence began",
9
+ "compression_ratio": 2.28,
10
+ "peak_memory_gb": 14.512,
11
+ "tokens_per_sec": 37.5,
12
+ "fp16_kb": 896.0,
13
+ "compressed_kb": 393.4
14
+ },
15
+ {
16
+ "prompt": "Explain how transformers work in deep learning:",
17
+ "compression_ratio": 2.29,
18
+ "peak_memory_gb": 14.513,
19
+ "tokens_per_sec": 37.4,
20
+ "fp16_kb": 1408.0,
21
+ "compressed_kb": 615.9
22
+ },
23
+ {
24
+ "prompt": "Write a Python function to sort a list:",
25
+ "compression_ratio": 2.28,
26
+ "peak_memory_gb": 14.513,
27
+ "tokens_per_sec": 37.7,
28
+ "fp16_kb": 1280.0,
29
+ "compressed_kb": 560.2
30
+ }
31
+ ]
32
+ }