#!/bin/bash
#
# full_cloud_eval.sh - set up QuIP# on a fresh GPU machine and evaluate the
# quantized Bielik checkpoint with lm-evaluation-harness.
#
# Usage:
#   bash full_cloud_eval.sh <HF_TOKEN>
#   HF_TOKEN=<token> bash full_cloud_eval.sh
#
# The Hugging Face token is required to download the model repository.

# -e: stop on the first failing command.
# -u: treat unset variables as errors.
# pipefail: a pipeline such as `pip install ... | tail -3` fails when pip
#           fails, instead of reporting the exit status of `tail`.
set -euo pipefail

# Everything (checkouts, model, results) lives under this directory.
WORKDIR=/workspace

# Token comes from the first argument; fall back to an HF_TOKEN that is
# already present in the environment. `${1:-...}` keeps this safe under -u.
HF_TOKEN="${1:-${HF_TOKEN:-}}"
if [[ -z "$HF_TOKEN" ]]; then
  echo "Usage: bash full_cloud_eval.sh <HF_TOKEN>" >&2
  exit 1
fi

# Hugging Face repository holding the quantized model.
HF_REPO="Jakubrd4/bielik-quip-e8p12"

# Value passed to the evaluation script as --limit.
LIMIT=200

export HF_DATASETS_TRUST_REMOTE_CODE=1

echo "========================================"
echo " QuIP# Bielik Eval - FULL AUTO SETUP"
echo " RTX 4090 / A100 / H100 (NOT Blackwell)"
echo "========================================"
echo "Start: $(date)"
# torch may not be importable yet; fall back to 'unknown' rather than abort.
echo "GPU: $(python3 -c 'import torch; print(torch.cuda.get_device_name(0))' 2>/dev/null || echo 'unknown')"
echo ""

# Step 1: make sure a quip-sharp checkout exists under $WORKDIR and enter it.
# All patch commands in step 2 use paths relative to this checkout.
echo "[1/8] Cloning QuIP#..."
mkdir -p "$WORKDIR"
cd "$WORKDIR"
if [[ -d quip-sharp ]]; then
  echo " Already exists, skipping clone"
else
  git clone https://github.com/Cornell-RelaxML/quip-sharp.git
fi
cd quip-sharp

# Step 2a: comment out the lm_eval_adaptor import in lib/utils/__init__.py
# (the replacement text marks it as disabled for lm-eval 0.4.x).
echo "[2/8] Applying patches..."
init_py=lib/utils/__init__.py
if [[ ! -f "$init_py" ]]; then
  echo " ERROR: $init_py not found (run from the quip-sharp checkout)" >&2
  exit 1
fi
# Only claim success when the import is actually present; a second run of
# this script finds it already replaced.
if grep -q 'from \.lm_eval_adaptor import' "$init_py"; then
  sed -i 's/from \.lm_eval_adaptor import.*/# disabled for lm-eval 0.4.x/' "$init_py"
  echo " __init__.py patched"
else
  echo " __init__.py: lm_eval_adaptor import not present, nothing to patch"
fi

# Step 2b: patch lib/utils/unsafe_import.py so the loader also handles
# model_type == 'mistral', and switch SDPA attention to eager.
# The heredoc delimiter is quoted, so the shell does not expand anything
# inside the Python source.
python3 << 'PATCHPY'
import re
import sys

path = 'lib/utils/unsafe_import.py'
with open(path) as f:
    code = f.read()
original = code

# 1. Add the Mistral import on its own line after the Llama import.
#    Matching the whole line (not just a substring) keeps a trailing
#    "as <alias>" attached to the Llama import where it belongs.
if 'from model.mistral' not in code:
    code, hits = re.subn(
        r'^(from model\.llama import LlamaForCausalLM[^\n]*)$',
        r'\1\nfrom model.mistral import MistralForCausalLM',
        code, count=1, flags=re.M)
    if not hits:
        print(' WARNING: LlamaForCausalLM import not found; '
              'Mistral import was not added', file=sys.stderr)

# 2. Insert a 'mistral' branch in front of the first "else: raise Exception".
#    The indentation is captured from the file itself, so the patch does not
#    depend on how deeply that if/else chain happens to be nested.
if "model_type == 'mistral'" not in code:
    fallthrough = re.search(
        r'^([ \t]*)else:[ \t]*\n([ \t]*)raise Exception', code, flags=re.M)
    if fallthrough:
        outer, inner = fallthrough.groups()
        branch = (
            outer + "elif model_type == 'mistral':\n"
            + inner + "model_str = transformers.MistralConfig"
                      ".from_pretrained(path)._name_or_path\n"
            + inner + "model_cls = MistralForCausalLM\n"
        )
        at = fallthrough.start()
        code = code[:at] + branch + code[at:]
    else:
        print(' WARNING: "else: raise Exception" not found; '
              'Mistral branch was not added', file=sys.stderr)

# 3. Request eager attention wherever the loader asked for SDPA.
code = code.replace("attn_implementation='sdpa'", "attn_implementation='eager'")

# Rewrite the file (and report success) only if something changed.
if code != original:
    with open(path, 'w') as f:
        f.write(code)
    print(' unsafe_import.py patched for Mistral')
else:
    print(' unsafe_import.py: nothing to patch (already applied?)')
PATCHPY

# Step 2c: guard the AttentionMaskConverter._unmask_unattended call in
# model/llama.py with a hasattr() check, so the call is skipped when the
# installed transformers does not provide that helper.
python3 << 'PATCHPY2'
import re

path = 'model/llama.py'
guard = "hasattr(AttentionMaskConverter, '_unmask_unattended')"

# Matches the call whether it is written on one line or wrapped over
# several, and remembers the indentation of the statement.
call = re.compile(
    r'^(?P<indent>[ \t]*)causal_mask = AttentionMaskConverter\._unmask_unattended\('
    r'\s*causal_mask,\s*min_dtype\s*\)',
    re.M)


def guarded(match):
    """Return the same call wrapped in an `if hasattr(...)` block."""
    indent = match.group('indent')
    step = '    '
    return (
        indent + "if " + guard + ":\n"
        + indent + step + "causal_mask = AttentionMaskConverter._unmask_unattended(\n"
        + indent + step + step + "causal_mask, min_dtype\n"
        + indent + step + ")"
    )


with open(path) as f:
    code = f.read()

if guard in code:
    # A previous run already wrapped the call; do not nest a second guard.
    replaced = 0
else:
    code, replaced = call.subn(guarded, code)

if replaced:
    with open(path, 'w') as f:
        f.write(code)
    print(' llama.py patched (_unmask_unattended)')
else:
    print(' llama.py: patch not needed or already applied')
PATCHPY2

# Step 2d: make model/mistral.py tolerate configs without rope_theta by
# reading it through getattr() with a default of 1000000.0.
# Best effort: a missing or unwritable file must not abort the run, but the
# outcome is reported truthfully instead of always printing success.
if sed -i 's/self\.rope_theta = config\.rope_theta/self.rope_theta = getattr(config, "rope_theta", 1000000.0)/' model/mistral.py 2>/dev/null; then
  echo " rope_theta patched"
else
  echo " WARNING: could not patch rope_theta in model/mistral.py" >&2
fi

# Step 3: install the helper packages and pin transformers/datasets/peft.
echo "[3/8] Fixing Python dependencies..."
# Output is trimmed with `tail`; pipefail makes a failing `pip install`
# fail the pipeline (and, under `set -e`, the script) instead of being
# hidden behind tail's successful exit status.
set -o pipefail
pip install glog primefac protobuf 2>&1 | tail -3
pip install 'transformers==4.38.0' 2>&1 | tail -3
pip install 'datasets==2.20.0' 2>&1 | tail -3
pip install 'peft==0.9.0' 2>&1 | tail -3
echo " Dependencies fixed"

# Step 4: build and install the quiptools extension from the checkout, then
# install fast-hadamard-transform (PyPI first, GitHub as fallback).
echo "[4/8] Compiling QuIP# CUDA kernels..."
# Without pipefail the status of `pip ... | tail` is always tail's, so a
# failed build would go unnoticed and the fallback below could never run.
set -o pipefail
cd "$WORKDIR/quip-sharp/quiptools"
pip install --no-build-isolation -e . 2>&1 | tail -5
echo " quiptools installed"

echo " Installing fast-hadamard-transform..."
if ! pip install --no-build-isolation fast-hadamard-transform 2>&1 | tail -3; then
  echo " PyPI install failed, trying from git..."
  pip install --no-build-isolation git+https://github.com/Dao-AILab/fast-hadamard-transform.git 2>&1 | tail -3
fi
echo " fast-hadamard-transform installed"

# Step 5: check out the speakleash fork of lm-evaluation-harness (branch
# polish4_shuf) under $WORKDIR and install it in editable mode.
echo "[5/8] Installing lm-evaluation-harness (Polish fork)..."
cd "$WORKDIR"
if [[ -d lm-evaluation-harness ]]; then
  echo " Already exists, skipping clone"
else
  git clone --branch polish4_shuf https://github.com/speakleash/lm-evaluation-harness.git
fi
cd lm-evaluation-harness
# pipefail: let a failed install stop the script instead of being masked
# by the exit status of `tail`.
set -o pipefail
pip install -e . 2>&1 | tail -5
echo " lm-eval installed"

# Step 6: download the model snapshot into $WORKDIR/model.
echo "[6/8] Downloading model from HuggingFace..."
# Repo, target directory and token are handed to Python through the
# environment and the heredoc is quoted: nothing is spliced into the Python
# source, so quotes or backslashes in any value cannot break or alter it.
EVAL_HF_REPO="$HF_REPO" \
EVAL_MODEL_DIR="$WORKDIR/model" \
EVAL_HF_TOKEN="$HF_TOKEN" \
python3 << 'DLPY'
import os

from huggingface_hub import snapshot_download

print(" Starting download...")
snapshot_download(
    os.environ['EVAL_HF_REPO'],
    local_dir=os.environ['EVAL_MODEL_DIR'],
    token=os.environ['EVAL_HF_TOKEN'],
)
print(" Model downloaded!")
DLPY
echo " Model files:"
ls -lh "$WORKDIR/model/"

# Step 7: ensure the downloaded config.json carries a rope_theta entry.
echo "[7/8] Checking model config..."
# The config path is passed as an argument (sys.argv[1]) so it always
# follows $WORKDIR; `python3 -` reads the program itself from the heredoc.
python3 - "$WORKDIR/model/config.json" << 'CFGPY'
import json
import sys

config_path = sys.argv[1]
with open(config_path) as f:
    config = json.load(f)

if 'rope_theta' not in config:
    # Same default the script uses when patching model/mistral.py.
    config['rope_theta'] = 1000000.0
    with open(config_path, 'w') as f:
        json.dump(config, f, indent=2)
    print(" Added rope_theta to config")
else:
    print(" Config OK")
CFGPY

# Step 8 (part 1): write the evaluation driver to $WORKDIR/run_eval.py.
# The heredoc delimiter is quoted, so the Python below is written verbatim.
echo "[8/8] Creating eval script and running..."
cat > "$WORKDIR/run_eval.py" << 'PYEOF'
"""Run the Polish benchmark tasks on the QuIP# model and summarise scores.

Options:
  --limit N        forwarded to lm-eval as `limit` (omit for the full tasks)
  --batch_size N   batch size for the HFLM wrapper and simple_evaluate
  --model_path P   directory containing the quantized checkpoint
"""
import sys, os, json, time, torch, argparse

# lib.utils lives in the quip-sharp checkout, which is not an installed package.
sys.path.insert(0, "/workspace/quip-sharp")
torch.set_grad_enabled(False)

from transformers import AutoTokenizer
from lm_eval import evaluator
from lm_eval.models.huggingface import HFLM
from lib.utils.unsafe_import import model_from_hf_path

MC_TASKS = [
    "polemo2_in_multiple_choice", "polemo2_out_multiple_choice",
    "polish_8tags_multiple_choice", "polish_belebele_mc",
    "polish_dyk_multiple_choice", "polish_ppc_multiple_choice",
    "polish_psc_multiple_choice", "polish_cbd_multiple_choice",
    "polish_klej_ner_multiple_choice", "polish_polqa_reranking_multiple_choice",
]
PPL_TASKS = ["polish_poleval2018_task3_test_10k"]

# Per-task baseline scores used to rescale raw scores (see the loop below).
BASELINES = {
    "polemo2_in_multiple_choice": 0.416, "polemo2_out_multiple_choice": 0.368,
    "polish_8tags_multiple_choice": 0.143, "polish_belebele_mc": 0.279,
    "polish_dyk_multiple_choice": 0.289, "polish_ppc_multiple_choice": 0.419,
    "polish_psc_multiple_choice": 0.466, "polish_cbd_multiple_choice": 0.149,
    "polish_klej_ner_multiple_choice": 0.343, "polish_polqa_reranking_multiple_choice": 0.534,
}

parser = argparse.ArgumentParser()
parser.add_argument("--limit", type=int, default=None)
parser.add_argument("--batch_size", type=int, default=1)
parser.add_argument("--model_path", type=str, default="/workspace/model")
args = parser.parse_args()

ALL_TASKS = MC_TASKS + PPL_TASKS
start = time.time()
lstr = str(args.limit) if args.limit else "FULL"
print("=" * 70)
print("Open PL LLM Leaderboard - QuIP# E8P12 2-bit Instruct")
print("Batch: %d | Limit: %s" % (args.batch_size, lstr))
print("GPU: %s" % torch.cuda.get_device_name(0))
print("=" * 70)

print("Loading model...")
model, model_str = model_from_hf_path(args.model_path, use_cuda_graph=False, use_flash_attn=False)
tokenizer = AutoTokenizer.from_pretrained(model_str)
tokenizer.pad_token = tokenizer.eos_token
lm = HFLM(pretrained=model, tokenizer=tokenizer, backend="causal", batch_size=args.batch_size, max_length=4096, trust_remote_code=True)

ekw = dict(model=lm, tasks=ALL_TASKS, num_fewshot=5, batch_size=args.batch_size, log_samples=False)
if args.limit:
    ekw["limit"] = args.limit

print("Running eval...")
results = evaluator.simple_evaluate(**ekw)

elapsed = time.time() - start
print("\n" + "=" * 70)
print("RESULTS (5-shot, limit=%s)" % lstr)
print("=" * 70)
scores = {}
nscores = {}
for t in ALL_TASKS:
    if t not in results.get("results", {}):
        print(" %-45s MISSING" % t)
        continue
    tr = results["results"][t]
    # Use the first metric that the task actually reported.
    score = None
    metric = "?"
    for mk in ["acc,none", "f1,none", "word_perplexity,none"]:
        if mk in tr:
            score = tr[mk]
            metric = mk.split(",")[0]
            break
    if score is None:
        continue
    bl = BASELINES.get(t, 0)
    is_ppl = t in PPL_TASKS
    if is_ppl:
        # Perplexity is reported but kept out of the normalized average.
        norm = None
    elif 0 < bl < 1.0:
        # Rescale so the baseline maps to 0 and a perfect score to 1.
        norm = max(0, (score - bl) / (1.0 - bl))
    else:
        norm = max(0, score)
    scores[t] = score
    if norm is not None:
        nscores[t] = norm
    ns = "norm=%.4f" % norm if norm is not None else ""
    print(" %-45s %s=%.4f %s" % (t, metric, score, ns))

print("-" * 70)
avg = sum(nscores.values()) / len(nscores) if nscores else 0
print(" %-45s %.4f (%.2f%%)" % ("Avg MC (normalized)", avg, avg * 100))
print("=" * 70)
print("Time: %.1f min" % (elapsed / 60))
print("\nComparison:")
# Plain strings (no % operator), so a single percent sign is the correct form.
print(" SpeakLeash IQ2_XXS = 61.34%")
print(" FP16 baseline = 65.71%")
print(" QuIP# E8P12 2-bit = %.2f%%" % (avg * 100))

os.makedirs("/workspace/eval_results", exist_ok=True)
fn = "/workspace/eval_results/results_limit%s.json" % (str(args.limit) if args.limit else "full")
summary = {
    "avg_mc": float(avg),
    "scores": {k: float(v) for k, v in scores.items()},
    "normalized": {k: float(v) for k, v in nscores.items()},
    "full": results.get("results", {}),
}
# Context manager: the file is flushed and closed before the path is announced.
with open(fn, "w") as out:
    json.dump(summary, out, indent=2, default=str)
print("Saved to %s" % fn)
PYEOF
echo " Eval script created"

# Step 8 (part 2): run the generated driver from inside the quip-sharp
# checkout. The model path is passed explicitly so it always matches the
# directory the model was downloaded into.
echo "Running evaluation with limit=$LIMIT..."
echo "========================================"
cd "$WORKDIR/quip-sharp"
python3 "$WORKDIR/run_eval.py" --limit "$LIMIT" --model_path "$WORKDIR/model"

echo ""
echo "========================================"
echo " ALL DONE! $(date)"
echo "========================================"