File size: 4,849 Bytes
5885a23
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
import os
import subprocess
import sys
import threading
import time
import shutil

# Set CUDA_VISIBLE_DEVICES to an empty string for this process; child
# processes started later inherit the same environment.
# NOTE(review): presumably this is meant to keep the whole pipeline on CPU
# (Step 2 installs CPU-only torch wheels) — confirm.
os.environ["CUDA_VISIBLE_DEVICES"] = ""

def log(msg):
    """Print *msg* to stdout prefixed with the local time as ``[HH:MM:SS]``.

    The output is flushed immediately so messages show up in the kernel log
    without waiting for the stdout buffer to fill.
    """
    stamp = time.strftime("%H:%M:%S")
    print(f"[{stamp}] {msg}", flush=True)

def run_cmd(cmd):
    """Log a shell command line and then execute it through the shell.

    Args:
        cmd: The full command line as a single string; it is passed to the
            shell unmodified, so any quoting must already be in place.

    Raises:
        subprocess.CalledProcessError: If the command exits with a non-zero
            status.
    """
    log(f"Running: {cmd}")
    # check=True makes a failing command raise instead of being ignored.
    subprocess.run(cmd, shell=True, check=True)

# Heartbeat thread
def heartbeat():
    """Log a liveness message every 60 seconds, forever.

    Intended to run on a daemon thread so that the long-running steps of the
    script keep producing output. The function never returns.
    """
    # Use the monotonic clock for the elapsed time: unlike time.time(), it
    # cannot jump backwards or forwards if the system wall clock is adjusted.
    start = time.monotonic()
    while True:
        time.sleep(60)
        elapsed = int(time.monotonic() - start)
        log(f"HEARTBEAT: still alive after {elapsed}s")


# daemon=True: the heartbeat must not keep the interpreter alive once the
# main script finishes or exits with an error.
t = threading.Thread(target=heartbeat, daemon=True)
t.start()

# ============================================================
# Step 1: Find the model.tflite from the previous kernel output
# ============================================================
log("Looking for model.tflite from previous kernel output...")

# Kaggle mounts kernel sources under /kaggle/input/<kernel-slug>/
input_base = "/kaggle/input"
tflite_path = None
bytes_per_mb = 1024 * 1024

# Log every input file with its size; remember the most recently seen
# .tflite file (if several exist, the last one visited by os.walk wins).
for dirpath, _subdirs, filenames in os.walk(input_base):
    for filename in filenames:
        candidate = os.path.join(dirpath, filename)
        candidate_mb = os.path.getsize(candidate) / bytes_per_mb
        log(f"  Found: {candidate} ({candidate_mb:.1f} MB)")
        if filename.endswith(".tflite"):
            tflite_path = candidate

if tflite_path is None:
    # Nothing to quantize: dump the directory layout to help debugging,
    # then stop with a non-zero exit status.
    log("ERROR: No .tflite file found in input!")
    log("Listing all input directories:")
    for dirpath, _subdirs, filenames in os.walk(input_base):
        log(f"  DIR: {dirpath} ({len(filenames)} files)")
    sys.exit(1)

size_gb = os.path.getsize(tflite_path) / (1024 * 1024 * 1024)
log(f"Found model: {tflite_path} ({size_gb:.2f} GB)")

# ============================================================
# Step 2: Install quantization dependencies
# ============================================================
log("Installing dependencies...")

# Both installs go through the interpreter running this script, so the
# packages land in the environment that imports them afterwards.
pip_install = f"{sys.executable} -m pip install -U"

# PyTorch is pulled from the CPU-only wheel index.
run_cmd(f"{pip_install} torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu")
# The protobuf requirement is single-quoted so the shell does not treat
# ">=" as an output redirection.
run_cmd(f"{pip_install} litert-torch torchao transformers huggingface-hub mediapipe accelerate sentencepiece 'protobuf>=6.0'")

log("All dependencies installed.")

# ============================================================
# Step 3: Quantize the model
# ============================================================
log("=== QUANTIZING MODEL (dynamic_wi8_afp32) ===")

# Stage the model in the working directory first (input is read-only).
work_tflite = "/kaggle/working/model.tflite"
log(f"Copying {tflite_path} -> {work_tflite}...")
shutil.copy2(tflite_path, work_tflite)
log("Copy done.")

# Carry over every other regular file that sits next to the model in the
# previous output (embedder, tokenizer, etc.).
input_dir = os.path.dirname(tflite_path)
for name in os.listdir(input_dir):
    sibling = os.path.join(input_dir, name)
    if sibling == tflite_path or not os.path.isfile(sibling):
        continue
    log(f"Copying {name}...")
    shutil.copy2(sibling, os.path.join("/kaggle/working", name))

# Imported here rather than at the top of the file because the package is
# only installed by the dependency step that runs earlier in this script.
from ai_edge_quantizer import quantizer as quant_lib
from ai_edge_quantizer import recipe as recipe_lib

log("Starting quantization...")
quantized_path = "/kaggle/working/model_quantized.tflite"

qt = quant_lib.Quantizer(work_tflite)
recipe = recipe_lib.dynamic_wi8_afp32()
qt.load_quantization_recipe(recipe)
log("Running quantization (this will take a while)...")
quantization_result = qt.quantize()
quantization_result.export_model(quantized_path, overwrite=True)

size_gb = os.path.getsize(quantized_path) / (1024 * 1024 * 1024)
log(f"Quantized model saved: {quantized_path} ({size_gb:.2f} GB)")

# The staged unquantized copy is no longer needed; delete it to save space.
os.remove(work_tflite)
log("Removed unquantized copy.")

# ============================================================
# Step 4: Bundle into .litertlm
# ============================================================
log("=== BUNDLING INTO .litertlm ===")

# Bundling is best-effort: it might need the full export context.
# If anything here fails, the quantized tflite is still the main output,
# so every failure (including import errors) is logged and the script
# carries on instead of aborting.
try:
    # Both imports live inside the try block so that a missing or
    # incompatible litert_torch module is reported as a bundling failure
    # rather than crashing the script after quantization already succeeded.
    # NOTE(review): export_lib is not referenced below; it is kept in case
    # the import is needed for its side effects — confirm and drop if not.
    import litert_torch.generative.export_hf.export as export_lib  # noqa: F401
    from litert_torch.generative.export_hf.core import bundle_utils

    output_bundle = "/kaggle/working/gemma-4-E2B-it-uncensored.litertlm"
    bundle_utils.bundle_litert_lm(
        model_path=quantized_path,
        output_path=output_bundle,
        model_type="gemma4",
    )
    size_gb = os.path.getsize(output_bundle) / (1024 * 1024 * 1024)
    log(f"Bundle saved: {output_bundle} ({size_gb:.2f} GB)")
except Exception as e:
    # Deliberately broad: this is the top-level boundary for an optional step.
    log(f"Bundling failed (not critical): {e}")
    log("The quantized .tflite is still available as output.")

# ============================================================
# Final: List all output files
# ============================================================
log("=== OUTPUT FILES ===")
working_dir = "/kaggle/working"
for entry in os.listdir(working_dir):
    entry_path = os.path.join(working_dir, entry)
    # Only regular files are reported; sub-directories are skipped.
    if not os.path.isfile(entry_path):
        continue
    entry_mb = os.path.getsize(entry_path) / (1024 * 1024)
    log(f"  {entry} ({entry_mb:.1f} MB)")

log("SUCCESS! Quantization complete.")