import os
import subprocess
import sys
import threading
import time
import shutil
# Blank out CUDA_VISIBLE_DEVICES at the very top of the script, before any
# third-party package is installed or imported.
# NOTE(review): presumably this keeps the whole run CPU-only, in line with the
# CPU-only torch wheel index used in Step 2 — confirm.
os.environ["CUDA_VISIBLE_DEVICES"] = ""
def log(msg):
    """Print *msg* on stdout, prefixed with a ``[HH:MM:SS]`` local-time stamp.

    The stream is flushed on every call so each line shows up immediately
    rather than sitting in the output buffer.
    """
    stamp = time.strftime('%H:%M:%S')
    print(f"[{stamp}] {msg}", flush=True)
def run_cmd(cmd):
    """Announce a shell command line in the log, then execute it.

    Args:
        cmd: Complete command line as one string; it is handed to the shell
            unchanged, so any quoting must already be in place.

    Raises:
        subprocess.CalledProcessError: If the command exits with a non-zero
            status.
    """
    log("Running: {}".format(cmd))
    # shell=True because callers pass a single pre-quoted string, not an argv list.
    subprocess.check_call(cmd, shell=True)
# Heartbeat thread
def heartbeat():
    """Log a keep-alive line once a minute for as long as the process runs.

    Never returns; meant to be used as the target of a daemon thread.
    Elapsed time is taken from the monotonic clock, so wall-clock adjustments
    during a long run cannot make the reported value jump or go negative.
    """
    start = time.monotonic()
    while True:
        time.sleep(60)
        elapsed = int(time.monotonic() - start)
        log(f"HEARTBEAT: still alive after {elapsed}s")


# daemon=True: the heartbeat must not keep the interpreter alive once the
# main script has finished or exited.
t = threading.Thread(target=heartbeat, daemon=True)
t.start()
# ============================================================
# Step 1: Find the model.tflite from the previous kernel output
# ============================================================
log("Looking for model.tflite from previous kernel output...")
# Kaggle mounts kernel sources under /kaggle/input/<kernel-slug>/
input_base = "/kaggle/input"
tflite_candidates = []
for root, dirs, files in os.walk(input_base):
    # Sort in place so os.walk descends (and files are listed) in a stable
    # order instead of whatever order the filesystem happens to return.
    dirs.sort()
    for f in sorted(files):
        fpath = os.path.join(root, f)
        size_mb = os.path.getsize(fpath) / (1024*1024)
        log(f" Found: {fpath} ({size_mb:.1f} MB)")
        if f.endswith(".tflite"):
            tflite_candidates.append(fpath)

# "Last match wins" is kept, but over the sorted walk the winner is
# reproducible, and an ambiguous input is reported instead of being silent.
tflite_path = tflite_candidates[-1] if tflite_candidates else None
if len(tflite_candidates) > 1:
    log(f"WARNING: {len(tflite_candidates)} .tflite files found; using {tflite_path}")

if not tflite_path:
    log("ERROR: No .tflite file found in input!")
    log("Listing all input directories:")
    # Second walk only prints the directory layout to help diagnose the miss.
    for root, dirs, files in os.walk(input_base):
        log(f" DIR: {root} ({len(files)} files)")
    sys.exit(1)

size_gb = os.path.getsize(tflite_path) / (1024*1024*1024)
log(f"Found model: {tflite_path} ({size_gb:.2f} GB)")
# ============================================================
# Step 2: Install quantization dependencies
# ============================================================
log("Installing dependencies...")
# run_cmd hands its argument to a shell, so the interpreter path is quoted:
# an unquoted path containing spaces or shell metacharacters would be split
# or misparsed. shlex.quote returns already-safe paths unchanged.
import shlex
python_exe = shlex.quote(sys.executable)
# torch and friends are pulled from the PyTorch CPU wheel index.
run_cmd(f"{python_exe} -m pip install -U torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu")
# 'protobuf>=6.0' stays single-quoted so the shell does not treat '>' as a redirect.
run_cmd(f"{python_exe} -m pip install -U litert-torch torchao transformers huggingface-hub mediapipe accelerate sentencepiece 'protobuf>=6.0'")
log("All dependencies installed.")
# ============================================================
# Step 3: Quantize the model
# ============================================================
log("=== QUANTIZING MODEL (dynamic_wi8_afp32) ===")
# Copy the tflite to working dir first (input is read-only)
work_tflite = "/kaggle/working/model.tflite"
log(f"Copying {tflite_path} -> {work_tflite}...")
shutil.copy2(tflite_path, work_tflite)
log("Copy done.")
# The working copy now exists. Everything below runs under try/finally so the
# unquantized copy is deleted even when a later step raises, instead of being
# left behind in the output directory.
try:
    # Also copy any other files from the previous output (embedder, tokenizer, etc.)
    input_dir = os.path.dirname(tflite_path)
    for f in os.listdir(input_dir):
        src = os.path.join(input_dir, f)
        dst = os.path.join("/kaggle/working", f)
        # Skip sub-directories and the model itself (already copied above).
        if os.path.isfile(src) and src != tflite_path:
            log(f"Copying {f}...")
            shutil.copy2(src, dst)
    # NOTE(review): imported here rather than at the top of the file,
    # presumably because the package only becomes importable after the
    # Step 2 installs — confirm.
    from ai_edge_quantizer import quantizer as quant_lib
    from ai_edge_quantizer import recipe as recipe_lib
    log("Starting quantization...")
    quantized_path = "/kaggle/working/model_quantized.tflite"
    qt = quant_lib.Quantizer(work_tflite)
    recipe = recipe_lib.dynamic_wi8_afp32()
    qt.load_quantization_recipe(recipe)
    log("Running quantization (this will take a while)...")
    qt.quantize().export_model(quantized_path, overwrite=True)
    size_gb = os.path.getsize(quantized_path) / (1024*1024*1024)
    log(f"Quantized model saved: {quantized_path} ({size_gb:.2f} GB)")
finally:
    # Remove unquantized copy to save space
    os.remove(work_tflite)
    log("Removed unquantized copy.")
# ============================================================
# Step 4: Bundle into .litertlm
# ============================================================
log("=== BUNDLING INTO .litertlm ===")
# Try to bundle - this might need the full export context
# If bundling fails, the quantized tflite is still the main output
try:
    # Both imports live inside the try: bundling is best-effort, so an import
    # failure must be reported like any other bundling error rather than
    # aborting the script after quantization has already succeeded.
    import litert_torch.generative.export_hf.export as export_lib
    from litert_torch.generative.export_hf.core import bundle_utils
    output_bundle = "/kaggle/working/gemma-4-E2B-it-uncensored.litertlm"
    bundle_utils.bundle_litert_lm(
        model_path=quantized_path,
        output_path=output_bundle,
        model_type="gemma4",
    )
    size_gb = os.path.getsize(output_bundle) / (1024*1024*1024)
    log(f"Bundle saved: {output_bundle} ({size_gb:.2f} GB)")
except Exception as e:
    # Deliberately broad: any bundling problem is logged and the script
    # carries on to the output listing.
    log(f"Bundling failed (not critical): {e}")
    log("The quantized .tflite is still available as output.")
# ============================================================
# Final: List all output files
# ============================================================
log("=== OUTPUT FILES ===")
out_dir = "/kaggle/working"
for name in os.listdir(out_dir):
    path = os.path.join(out_dir, name)
    # Only regular files are reported; sub-directories are skipped.
    if not os.path.isfile(path):
        continue
    mib = os.path.getsize(path) / (1024*1024)
    log(f" {name} ({mib:.1f} MB)")
log("SUCCESS! Quantization complete.")
|