Upload full_cloud_eval.sh with huggingface_hub

9857299 verified 2 months ago

10.2 kB

	#!/bin/bash
	#
	# full_cloud_eval.sh - clone and patch QuIP#, install its dependencies and the
	# Polish lm-evaluation-harness fork, download the model named by HF_REPO and
	# run the evaluation script against it.
	#
	# Usage: bash full_cloud_eval.sh <HF_TOKEN>
	#
	# -e aborts on the first failing command, -u rejects unset variables, and
	# pipefail makes `cmd | tail` report a failure of `cmd` instead of the exit
	# status of tail.
	set -euo pipefail
	WORKDIR=/workspace
	# ${1:-} keeps the friendly usage message working under `set -u`.
	HF_TOKEN="${1:-}"
	if [[ -z "$HF_TOKEN" ]]; then
		echo "Usage: bash full_cloud_eval.sh <HF_TOKEN>" >&2
		exit 1
	fi
	HF_REPO="Jakubrd4/bielik-quip-e8p12"
	# Value handed to run_eval.py via --limit at the end of the script.
	LIMIT=200
	export HF_DATASETS_TRUST_REMOTE_CODE=1

	echo "========================================"
	echo " QuIP# Bielik Eval - FULL AUTO SETUP"
	echo " RTX 4090 / A100 / H100 (NOT Blackwell)"
	echo "========================================"
	echo "Start: $(date)"
	# torch may be missing or see no GPU at this point; fall back to 'unknown'.
	echo "GPU: $(python3 -c 'import torch; print(torch.cuda.get_device_name(0))' 2>/dev/null || echo 'unknown')"
	echo ""

	# ============================================
	# 1. Clone QuIP#
	# ============================================
	echo "[1/8] Cloning QuIP#..."
	cd "$WORKDIR"
	# Reuse an existing checkout so the script can be re-run on the same machine.
	if [[ -d quip-sharp ]]; then
		echo " Already exists, skipping clone"
	else
		git clone https://github.com/Cornell-RelaxML/quip-sharp.git
	fi
	# Later steps use paths relative to the checkout root.
	cd quip-sharp

	# ============================================
	# 2. Apply patches
	# ============================================
	printf '%s\n' "[2/8] Applying patches..."
	# Replace the lm_eval_adaptor import line in lib/utils/__init__.py with a
	# marker comment.
	sed -i -e 's|from \.lm_eval_adaptor import.*|# disabled for lm-eval 0.4.x|' lib/utils/__init__.py
	printf '%s\n' " __init__.py patched"

	#######################################
	# Patch a QuIP# unsafe_import.py so it also handles Mistral models:
	#   - import MistralForCausalLM next to LlamaForCausalLM,
	#   - add an `elif model_type == 'mistral'` branch before the
	#     `else: raise Exception` fallback,
	#   - switch attn_implementation from 'sdpa' to 'eager'.
	# Running it again on an already patched file leaves the file unchanged.
	# Arguments: $1 - path of the file to patch (rewritten in place)
	# Outputs:   status line on stdout, diagnostics on stderr
	# Returns:   non-zero if the file is missing or Python fails
	#######################################
	patch_unsafe_import() {
		local target=$1
		if [[ ! -f "$target" ]]; then
			echo "patch_unsafe_import: $target not found" >&2
			return 1
		fi
		# <<- strips leading tabs, so the heredoc may be indented with the script.
		python3 - "$target" <<-'PATCHPY'
		import re
		import sys

		path = sys.argv[1]
		with open(path) as f:
		    code = f.read()

		if 'from model.mistral' not in code:
		    code = code.replace(
		        'from model.llama import LlamaForCausalLM',
		        'from model.llama import LlamaForCausalLM\nfrom model.mistral import MistralForCausalLM'
		    )

		if "model_type == 'mistral'" not in code:
		    # Capture the indentation actually used in the file instead of
		    # hard-coding it, so the patch does not depend on exact whitespace.
		    fallback = re.compile(r"^([ \t]*)else:\n([ \t]*)raise Exception", re.M)

		    def add_mistral_branch(m):
		        outer, inner = m.group(1), m.group(2)
		        return (
		            outer + "elif model_type == 'mistral':\n"
		            + inner + "model_str = transformers.MistralConfig.from_pretrained(path)._name_or_path\n"
		            + inner + "model_cls = MistralForCausalLM\n"
		            + outer + "else:\n"
		            + inner + "raise Exception"
		        )

		    code, hits = fallback.subn(add_mistral_branch, code)
		    if hits == 0:
		        print(' WARNING: else/raise fallback not found, Mistral branch not added',
		              file=sys.stderr)

		# Also force eager attention (QuIP# fused qkv_proj breaks sdpa)
		code = code.replace("attn_implementation='sdpa'", "attn_implementation='eager'")

		with open(path, 'w') as f:
		    f.write(code)
		print(' unsafe_import.py patched for Mistral')
		PATCHPY
	}

	patch_unsafe_import lib/utils/unsafe_import.py

	#######################################
	# Wrap the AttentionMaskConverter._unmask_unattended(causal_mask, min_dtype)
	# call in a QuIP# llama.py with a hasattr() guard, so the call is skipped
	# when AttentionMaskConverter has no such attribute.
	# The call is located with a regex that captures its indentation, and a file
	# that already contains the guard is left untouched.
	# Arguments: $1 - path of the file to patch (rewritten in place)
	# Outputs:   status line on stdout, diagnostics on stderr
	# Returns:   non-zero if the file is missing or Python fails
	#######################################
	patch_llama_mask() {
		local target=$1
		if [[ ! -f "$target" ]]; then
			echo "patch_llama_mask: $target not found" >&2
			return 1
		fi
		# <<- strips leading tabs, so the heredoc may be indented with the script.
		python3 - "$target" <<-'PATCHPY2'
		import re
		import sys

		path = sys.argv[1]
		with open(path) as f:
		    code = f.read()

		guard = "if hasattr(AttentionMaskConverter, '_unmask_unattended'):"
		# \s* also spans newlines, so both the one-line and the wrapped form of
		# the call are matched.
		call = re.compile(
		    r"^(?P<ind>[ \t]*)causal_mask = AttentionMaskConverter\._unmask_unattended\("
		    r"\s*causal_mask,\s*min_dtype\s*\)",
		    re.M,
		)

		def add_guard(m):
		    ind = m.group('ind')
		    step = '    '  # assumes the file indents with four spaces
		    return (
		        ind + guard + "\n"
		        + ind + step + "causal_mask = AttentionMaskConverter._unmask_unattended(\n"
		        + ind + step * 2 + "causal_mask, min_dtype\n"
		        + ind + step + ")"
		    )

		hits = 0
		if guard not in code:
		    code, hits = call.subn(add_guard, code)

		if hits:
		    with open(path, 'w') as f:
		        f.write(code)
		    print(' llama.py patched (_unmask_unattended)')
		else:
		    print(' llama.py: patch not needed or already applied')
		PATCHPY2
	}

	patch_llama_mask model/llama.py

	# Patch: add rope_theta default for Mistral config
	# Best-effort: a missing or unwritable model/mistral.py must not abort the
	# run, but success is only reported when sed actually ran cleanly.
	if sed -i 's/self.rope_theta = config.rope_theta/self.rope_theta = getattr(config, "rope_theta", 1000000.0)/' model/mistral.py 2>/dev/null; then
		echo " rope_theta patched"
	else
		echo " rope_theta patch skipped (model/mistral.py missing or not writable)" >&2
	fi

	# ============================================
	# 3. Fix Python dependencies
	# ============================================
	echo "[3/8] Fixing Python dependencies..."
	# Run in a subshell with pipefail so a failing `pip install` is not hidden by
	# the exit status of `tail`; the subshell's failure then stops the script
	# under the top-level `set -e`.
	(
		set -eo pipefail
		pip install glog primefac protobuf 2>&1 | tail -3
		pip install 'transformers==4.38.0' 2>&1 | tail -3
		pip install 'datasets==2.20.0' 2>&1 | tail -3
		# peft compatible with transformers 4.38
		pip install 'peft==0.9.0' 2>&1 | tail -3
	)
	echo " Dependencies fixed"

	# ============================================
	# 4. Compile QuIP# CUDA kernels
	# ============================================
	echo "[4/8] Compiling QuIP# CUDA kernels..."
	cd "$WORKDIR/quip-sharp/quiptools"
	# Each pip pipeline runs in a subshell with pipefail: without it the exit
	# status is that of `tail`, so build failures go unnoticed and the git
	# fallback below could never be reached.
	( set -o pipefail; pip install --no-build-isolation -e . 2>&1 | tail -5 )
	echo " quiptools installed"
	echo " Installing fast-hadamard-transform..."
	if ! ( set -o pipefail; pip install --no-build-isolation fast-hadamard-transform 2>&1 | tail -3 ); then
		echo " PyPI install failed, trying from git..."
		( set -o pipefail; pip install --no-build-isolation git+https://github.com/Dao-AILab/fast-hadamard-transform.git 2>&1 | tail -3 )
	fi
	echo " fast-hadamard-transform installed"

	# ============================================
	# 5. Install lm-eval Polish fork
	# ============================================
	echo "[5/8] Installing lm-evaluation-harness (Polish fork)..."
	cd "$WORKDIR"
	# Reuse an existing checkout so the script can be re-run on the same machine.
	if [[ -d lm-evaluation-harness ]]; then
		echo " Already exists, skipping clone"
	else
		git clone --branch polish4_shuf https://github.com/speakleash/lm-evaluation-harness.git
	fi
	cd lm-evaluation-harness
	# pipefail in a subshell: report pip's failure rather than tail's success.
	( set -o pipefail; pip install -e . 2>&1 | tail -5 )
	echo " lm-eval installed"

	# ============================================
	# 6. Download model from HuggingFace
	# ============================================
	echo "[6/8] Downloading model from HuggingFace..."
	# The repo, target directory and token are passed through the environment
	# instead of being spliced into the Python source, so quotes or backslashes
	# in a value cannot break or alter the program. The heredoc delimiter is
	# quoted (no shell expansion) and <<- strips the leading tabs.
	HF_REPO="$HF_REPO" HF_TOKEN="$HF_TOKEN" MODEL_DIR="$WORKDIR/model" python3 <<-'DLPY'
	import os
	from huggingface_hub import snapshot_download
	print(" Starting download...")
	snapshot_download(
	    os.environ["HF_REPO"],
	    local_dir=os.environ["MODEL_DIR"],
	    token=os.environ["HF_TOKEN"],
	)
	print(" Model downloaded!")
	DLPY
	echo " Model files:"
	ls -lh "$WORKDIR/model/"

	# ============================================
	# 7. Add rope_theta to model config if missing
	# ============================================
	#######################################
	# Make sure a model config.json defines "rope_theta"; when the key is absent
	# it is added with the value 1000000.0 and the file is rewritten with
	# two-space indentation. An existing value is never changed.
	# Arguments: $1 - path to config.json
	# Outputs:   status line on stdout, diagnostics on stderr
	# Returns:   non-zero if the file is missing or is not valid JSON
	#######################################
	ensure_rope_theta() {
		local cfg=$1
		if [[ ! -f "$cfg" ]]; then
			echo "ensure_rope_theta: $cfg not found" >&2
			return 1
		fi
		# <<- strips leading tabs, so the heredoc may be indented with the script.
		python3 - "$cfg" <<-'CFGPY'
		import json
		import sys

		p = sys.argv[1]
		with open(p) as f:
		    c = json.load(f)
		if 'rope_theta' not in c:
		    c['rope_theta'] = 1000000.0
		    with open(p, 'w') as f:
		        json.dump(c, f, indent=2)
		    print(" Added rope_theta to config")
		else:
		    print(" Config OK")
		CFGPY
	}

	echo "[7/8] Checking model config..."
	# Follows WORKDIR like the download step, instead of a hard-coded path.
	ensure_rope_theta "${WORKDIR:-/workspace}/model/config.json"

	# ============================================
	# 8. Create eval script and run
	# ============================================
	#######################################
	# Write the Python evaluation driver (run_eval.py) to the given path.
	# The generated script loads the model through QuIP#'s model_from_hf_path,
	# runs the listed tasks with lm_eval's simple_evaluate (5-shot), prints raw
	# and baseline-normalized scores and saves them as JSON under
	# /workspace/eval_results.
	# Arguments: $1 - destination path of the generated script
	# Returns:   non-zero if the file cannot be written
	#######################################
	write_eval_script() {
		# Quoted delimiter: nothing in the Python source is expanded by the shell.
		# <<- strips leading tabs, so the heredoc may be indented with the script.
		cat > "$1" <<-'PYEOF'
		import sys, os, json, time, torch, argparse
		sys.path.insert(0, "/workspace/quip-sharp")
		torch.set_grad_enabled(False)
		from transformers import AutoTokenizer
		from lm_eval import evaluator
		from lm_eval.models.huggingface import HFLM
		from lib.utils.unsafe_import import model_from_hf_path

		MC_TASKS = [
		    "polemo2_in_multiple_choice", "polemo2_out_multiple_choice",
		    "polish_8tags_multiple_choice", "polish_belebele_mc",
		    "polish_dyk_multiple_choice", "polish_ppc_multiple_choice",
		    "polish_psc_multiple_choice", "polish_cbd_multiple_choice",
		    "polish_klej_ner_multiple_choice", "polish_polqa_reranking_multiple_choice",
		]
		PPL_TASKS = ["polish_poleval2018_task3_test_10k"]
		# Per-task baseline scores used to normalize the raw scores below.
		BASELINES = {
		    "polemo2_in_multiple_choice": 0.416, "polemo2_out_multiple_choice": 0.368,
		    "polish_8tags_multiple_choice": 0.143, "polish_belebele_mc": 0.279,
		    "polish_dyk_multiple_choice": 0.289, "polish_ppc_multiple_choice": 0.419,
		    "polish_psc_multiple_choice": 0.466, "polish_cbd_multiple_choice": 0.149,
		    "polish_klej_ner_multiple_choice": 0.343, "polish_polqa_reranking_multiple_choice": 0.534,
		}

		parser = argparse.ArgumentParser()
		parser.add_argument("--limit", type=int, default=None)
		parser.add_argument("--batch_size", type=int, default=1)
		parser.add_argument("--model_path", type=str, default="/workspace/model")
		args = parser.parse_args()

		ALL_TASKS = MC_TASKS + PPL_TASKS
		start = time.time()
		lstr = str(args.limit) if args.limit else "FULL"
		print("=" * 70)
		print("Open PL LLM Leaderboard - QuIP# E8P12 2-bit Instruct")
		print("Batch: %d | Limit: %s" % (args.batch_size, lstr))
		print("GPU: %s" % torch.cuda.get_device_name(0))
		print("=" * 70)

		print("Loading model...")
		model, model_str = model_from_hf_path(args.model_path, use_cuda_graph=False, use_flash_attn=False)
		tokenizer = AutoTokenizer.from_pretrained(model_str)
		tokenizer.pad_token = tokenizer.eos_token
		lm = HFLM(pretrained=model, tokenizer=tokenizer, backend="causal", batch_size=args.batch_size, max_length=4096, trust_remote_code=True)

		ekw = dict(model=lm, tasks=ALL_TASKS, num_fewshot=5, batch_size=args.batch_size, log_samples=False)
		if args.limit:
		    ekw["limit"] = args.limit

		print("Running eval...")
		results = evaluator.simple_evaluate(**ekw)

		elapsed = time.time() - start
		print("\n" + "=" * 70)
		print("RESULTS (5-shot, limit=%s)" % lstr)
		print("=" * 70)
		scores = {}
		nscores = {}
		for t in ALL_TASKS:
		    if t not in results.get("results", {}):
		        print(" %-45s MISSING" % t)
		        continue
		    tr = results["results"][t]
		    score = None
		    metric = "?"
		    # Use the first of these metrics that the task reports.
		    for mk in ["acc,none", "f1,none", "word_perplexity,none"]:
		        if mk in tr:
		            score = tr[mk]
		            metric = mk.split(",")[0]
		            break
		    if score is None:
		        continue
		    bl = BASELINES.get(t, 0)
		    is_ppl = t in PPL_TASKS
		    if is_ppl:
		        # Perplexity tasks are reported but kept out of the average.
		        norm = None
		    elif 0 < bl < 1.0:
		        # Rescale so the baseline maps to 0 and a perfect score to 1.
		        norm = max(0, (score - bl) / (1.0 - bl))
		    else:
		        norm = max(0, score)
		    scores[t] = score
		    if norm is not None:
		        nscores[t] = norm
		    ns = "norm=%.4f" % norm if norm is not None else ""
		    print(" %-45s %s=%.4f %s" % (t, metric, score, ns))

		print("-" * 70)
		avg = sum(nscores.values()) / len(nscores) if nscores else 0
		print(" %-45s %.4f (%.2f%%)" % ("Avg MC (normalized)", avg, avg * 100))
		print("=" * 70)
		print("Time: %.1f min" % (elapsed / 60))
		print("\nComparison:")
		# No %-formatting is applied to the next two strings, so a single '%'.
		print(" SpeakLeash IQ2_XXS = 61.34%")
		print(" FP16 baseline = 65.71%")
		print(" QuIP# E8P12 2-bit = %.2f%%" % (avg * 100))
		os.makedirs("/workspace/eval_results", exist_ok=True)
		fn = "/workspace/eval_results/results_limit%s.json" % (str(args.limit) if args.limit else "full")
		payload = {
		    "avg_mc": float(avg),
		    "scores": {k: float(v) for k, v in scores.items()},
		    "normalized": {k: float(v) for k, v in nscores.items()},
		    "full": results.get("results", {}),
		}
		# Context manager so the results file is flushed and closed.
		with open(fn, "w") as out:
		    json.dump(payload, out, indent=2, default=str)
		print("Saved to %s" % fn)
		PYEOF
	}

	echo "[8/8] Creating eval script and running..."
	write_eval_script "${WORKDIR:-/workspace}/run_eval.py"
	echo " Eval script created"

	echo "Running evaluation with limit=$LIMIT..."
	echo "========================================"
	# Run from the QuIP# checkout; expansions are quoted so a path or value
	# containing whitespace is passed as a single argument.
	cd "$WORKDIR/quip-sharp"
	python3 "$WORKDIR/run_eval.py" --limit "$LIMIT"

	echo ""
	echo "========================================"
	echo " ALL DONE! $(date)"
	echo "========================================"