Jakubrd4 committed on
Commit
bff9d12
·
verified ·
1 Parent(s): 3b2f342

Upload run_eval.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. run_eval.py +99 -0
run_eval.py ADDED
@@ -0,0 +1,99 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
"""Run the Open PL LLM Leaderboard tasks (5-shot) against a QuIP# model.

The script loads a model through quip-sharp's ``model_from_hf_path``, wraps it
in lm-eval's ``HFLM``, evaluates every task in ``ALL_TASKS``, prints a
per-task table with baseline-normalized scores, and writes a JSON summary to
``RESULTS_DIR``.

Command-line flags:
    --limit       optional integer forwarded to lm-eval as ``limit``
    --batch_size  batch size used for both ``HFLM`` and ``simple_evaluate``
    --model_path  path handed to ``model_from_hf_path``
"""
import argparse
import json
import os
import sys
import time

# Location of the quip-sharp checkout that provides ``lib.utils.unsafe_import``.
QUIP_SHARP_ROOT = "/workspace/quip-sharp"
# Directory that receives the JSON summary.
RESULTS_DIR = "/workspace/eval_results"

MC_TASKS = [
    "polemo2_in_multiple_choice", "polemo2_out_multiple_choice",
    "polish_8tags_multiple_choice", "polish_belebele_mc",
    "polish_dyk_multiple_choice", "polish_ppc_multiple_choice",
    "polish_psc_multiple_choice", "polish_cbd_multiple_choice",
    "polish_klej_ner_multiple_choice", "polish_polqa_reranking_multiple_choice",
]
PPL_TASKS = ["polish_poleval2018_task3_test_10k"]
# Per-task baseline scores used to rescale raw scores in normalize_score().
BASELINES = {
    "polemo2_in_multiple_choice": 0.416, "polemo2_out_multiple_choice": 0.368,
    "polish_8tags_multiple_choice": 0.143, "polish_belebele_mc": 0.279,
    "polish_dyk_multiple_choice": 0.289, "polish_ppc_multiple_choice": 0.419,
    "polish_psc_multiple_choice": 0.466, "polish_cbd_multiple_choice": 0.149,
    "polish_klej_ner_multiple_choice": 0.343, "polish_polqa_reranking_multiple_choice": 0.534,
}
ALL_TASKS = MC_TASKS + PPL_TASKS

# Result keys probed for each task, in priority order; the first one present wins.
METRIC_KEYS = ("acc,none", "f1,none", "word_perplexity,none")


def parse_args(argv=None):
    """Parse command-line flags; ``argv=None`` falls back to ``sys.argv[1:]``."""
    parser = argparse.ArgumentParser()
    parser.add_argument("--limit", type=int, default=None)
    parser.add_argument("--batch_size", type=int, default=1)
    parser.add_argument("--model_path", type=str, default="/workspace/model")
    return parser.parse_args(argv)


def pick_metric(task_result):
    """Return ``(metric_name, score)`` for the first METRIC_KEYS entry present.

    ``metric_name`` is the part of the key before the comma. Returns
    ``(None, None)`` when the task result contains none of the known keys.
    """
    for key in METRIC_KEYS:
        if key in task_result:
            return key.split(",")[0], task_result[key]
    return None, None


def normalize_score(task, score):
    """Rescale ``score`` against the task's baseline.

    Returns ``None`` for perplexity tasks (they are excluded from the
    normalized average). For a task with a baseline strictly between 0 and 1
    the result is ``(score - baseline) / (1 - baseline)`` clamped at 0;
    otherwise the raw score clamped at 0.
    """
    if task in PPL_TASKS:
        return None
    baseline = BASELINES.get(task, 0)
    if 0 < baseline < 1.0:
        return max(0, (score - baseline) / (1.0 - baseline))
    return max(0, score)


def summarize(task_results):
    """Build the per-task report from lm-eval's ``results["results"]`` mapping.

    Returns ``(scores, normalized, lines)``: raw scores by task, normalized
    scores by task (perplexity tasks omitted), and one printable line per
    entry of ``ALL_TASKS``.
    """
    scores = {}
    normalized = {}
    lines = []
    for task in ALL_TASKS:
        if task not in task_results:
            lines.append(" %-45s MISSING" % task)
            continue
        metric, score = pick_metric(task_results[task])
        if score is None:
            # Previously skipped without a trace; surface it so a task that
            # reports an unexpected metric does not silently vanish.
            lines.append(" %-45s NO METRIC" % task)
            continue
        norm = normalize_score(task, score)
        scores[task] = score
        if norm is not None:
            normalized[task] = norm
        suffix = "norm=%.4f" % norm if norm is not None else ""
        lines.append(" %-45s %s=%.4f %s" % (task, metric, score, suffix))
    return scores, normalized, lines


def average_normalized(normalized):
    """Return the mean of the normalized scores, or 0 when there are none."""
    return sum(normalized.values()) / len(normalized) if normalized else 0


def comparison_lines(avg):
    """Return the comparison footer against the hard-coded reference numbers.

    The first two lines are plain literals, so they use a single ``%``; only
    the last line goes through ``%`` formatting and needs the ``%%`` escape.
    """
    return [
        " SpeakLeash IQ2_XXS = 61.34%",
        " FP16 baseline = 65.71%",
        " QuIP# E8P12 2-bit = %.2f%%" % (avg * 100),
    ]


def main(argv=None):
    """Load the model, run the evaluation, print the report, save the JSON."""
    args = parse_args(argv)

    # The GPU stack is imported here rather than at module level so the pure
    # helpers above stay importable (and argument errors surface) without it.
    sys.path.insert(0, QUIP_SHARP_ROOT)
    import torch
    torch.set_grad_enabled(False)
    from transformers import AutoTokenizer
    from lm_eval import evaluator
    from lm_eval.models.huggingface import HFLM
    from lib.utils.unsafe_import import model_from_hf_path

    start = time.time()
    limit_label = str(args.limit) if args.limit else "FULL"
    # Avoid crashing in the banner when no CUDA device is visible.
    gpu_name = torch.cuda.get_device_name(0) if torch.cuda.is_available() else "unavailable"
    print("=" * 70)
    print("Open PL LLM Leaderboard - QuIP# E8P12 2-bit Instruct")
    print("Batch: %d | Limit: %s" % (args.batch_size, limit_label))
    print("GPU: %s" % gpu_name)
    print("=" * 70)

    print("Loading model...")
    model, model_str = model_from_hf_path(args.model_path, use_cuda_graph=False, use_flash_attn=False)
    tokenizer = AutoTokenizer.from_pretrained(model_str)
    tokenizer.pad_token = tokenizer.eos_token
    lm = HFLM(pretrained=model, tokenizer=tokenizer, backend="causal", batch_size=args.batch_size, max_length=4096, trust_remote_code=True)

    eval_kwargs = dict(model=lm, tasks=ALL_TASKS, num_fewshot=5, batch_size=args.batch_size, log_samples=False)
    if args.limit:
        eval_kwargs["limit"] = args.limit

    print("Running eval...")
    results = evaluator.simple_evaluate(**eval_kwargs)

    elapsed = time.time() - start
    task_results = results.get("results", {})
    print("\n" + "=" * 70)
    print("RESULTS (5-shot, limit=%s)" % limit_label)
    print("=" * 70)
    scores, normalized, lines = summarize(task_results)
    for line in lines:
        print(line)

    print("-" * 70)
    avg = average_normalized(normalized)
    print(" %-45s %.4f (%.2f%%)" % ("Avg MC (normalized)", avg, avg * 100))
    print("=" * 70)
    print("Time: %.1f min" % (elapsed / 60))
    print("\nComparison:")
    for line in comparison_lines(avg):
        print(line)

    os.makedirs(RESULTS_DIR, exist_ok=True)
    fn = "/workspace/eval_results/results_limit%s.json" % (str(args.limit) if args.limit else "full")
    payload = {
        "avg_mc": float(avg),
        "scores": {k: float(v) for k, v in scores.items()},
        "normalized": {k: float(v) for k, v in normalized.items()},
        "full": task_results,
    }
    # Context manager guarantees the file is flushed and closed.
    with open(fn, "w") as fh:
        json.dump(payload, fh, indent=2, default=str)
    print("Saved to %s" % fn)


if __name__ == "__main__":
    main()