"""
Evaluate the Chat model.
All the generic code lives here, and all the evaluation-specific
code lives in the nanochat directory and is imported from here.
Example runs:
python -m scripts.chat_eval -i sft -a ARC-Easy
torchrun --nproc_per_node=8 -m scripts.chat_eval -- -i sft -a ARC-Easy
"""
import argparse
from functools import partial
from contextlib import nullcontext
import torch
import torch.distributed as dist
from nanochat.common import compute_init, compute_cleanup, get_dist_info, print0, autodetect_device_type
from nanochat.checkpoint_manager import load_model
from nanochat.engine import Engine
from tasks.humaneval import HumanEval
from tasks.mmlu import MMLU
from tasks.arc import ARC
from tasks.gsm8k import GSM8K
from tasks.spellingbee import SpellingBee
# -----------------------------------------------------------------------------
# Generative evaluation loop (we go one problem at a time, sample, evaluate)
def run_generative_eval(task_object, tokenizer, model, engine, num_samples, max_new_tokens, temperature, top_k, max_problems=None):
ddp, ddp_rank, ddp_local_rank, ddp_world_size = get_dist_info()
device = model.get_device()
num_problems = len(task_object) if max_problems is None else min(len(task_object), max_problems)
# Run the evaluation
num_passed, total = 0, 0
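# each rank takes a strided slice of the problems: rank r handles problems r, r + world_size, r + 2*world_size, ...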
for i in range(ddp_rank, num_problems, ddp_world_size):
conversation = task_object[i]
# Tokenize the prompt
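# render_for_completion returns the prompt token ids; the generated tokens below extend this prefix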
encoded_prompt = tokenizer.render_for_completion(conversation)
# Get the completions
results, _ = engine.generate_batch(
encoded_prompt,
num_samples=num_samples,
max_tokens=max_new_tokens,
temperature=temperature,
top_k=top_k,
)
# Decode the completions as text
prefix_length = len(encoded_prompt)
completions = [tokenizer.decode(result_tokens[prefix_length:]) for result_tokens in results]
# Evaluate success criteria
outcomes = [task_object.evaluate(conversation, completion) for completion in completions]
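# the problem counts as solved if any of the num_samples completions is judged correct (pass@k-style scoring)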
passed = any(outcomes)
# Keep stats
total += 1
num_passed += int(passed)
# Logging (overwrite the same line in the console)
print(f"\r\033[KRank {ddp_rank} | {num_passed}/{total} ({100*num_passed/total:.2f}%)", end='', flush=True)
# Finish the in-place progress line with a newline before final summary
print()
# Aggregate results across all ranks
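# (sum the per-rank counters with all_reduce so every rank ends up with the same global counts)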
if ddp:
num_passed_tensor = torch.tensor([num_passed], dtype=torch.long, device=device)
total_tensor = torch.tensor([total], dtype=torch.long, device=device)
dist.all_reduce(num_passed_tensor, op=dist.ReduceOp.SUM)
dist.all_reduce(total_tensor, op=dist.ReduceOp.SUM)
num_passed = num_passed_tensor.item()
total = total_tensor.item()
print0("=" * 50)
print0(f"Final: {num_passed}/{total} ({100*num_passed/total:.2f}%)")
# Return the accuracy
return num_passed/total
# -----------------------------------------------------------------------------
# Categorical evaluation loop
# This is a lot easier because we don't have to sample, so we can process whole
# batches at a time and just check the logits at the available answer choices.
def run_categorical_eval(task_object, tokenizer, model, batch_size, max_problems=None):
ddp, ddp_rank, ddp_local_rank, ddp_world_size = get_dist_info()
device = model.get_device()
bos = tokenizer.get_bos_token_id() # using BOS as the pad token is fine; these positions are ignored
# We'll process batches of independent problems at a time because there is no sampling needed
num_problems = len(task_object) if max_problems is None else min(len(task_object), max_problems)
ceil_div = lambda x, y: -(-x // y)
num_batches = ceil_div(num_problems, batch_size)
# Run the evaluation
letter_to_id_cache = {} # many letters will repeat often, let's save the tokenizer some work
num_passed, total = 0, 0
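# stride over batches across ranks: rank r processes batches r, r + world_size, r + 2*world_size, ...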
for i in range(ddp_rank, num_batches, ddp_world_size):
i0, i1 = i * batch_size, min((i + 1) * batch_size, num_problems)
# Prepare the batch of problems. They might all be of different length, so we pad/collate them.
conversations = [task_object[ii] for ii in range(i0, i1)]
prompt_ids = [tokenizer.render_for_completion(conversation) for conversation in conversations] # TODO: remake the way this works
max_length = max(len(ids) for ids in prompt_ids)
answer_time_positions = [len(ids) - 1 for ids in prompt_ids] # index of the last prompt token, whose logits predict the answer
padded_prompt_ids = [ids + [bos] * (max_length - len(ids)) for ids in prompt_ids]
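# padding is appended on the right, so the answer positions recorded above remain valid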
prompt_ids = torch.tensor(padded_prompt_ids, dtype=torch.long, device=device)
# Get the logits for the whole batch of conversations in parallel (efficiency win here)
with torch.no_grad():
logits = model(prompt_ids) # (B, T, V)
# Restrict the evaluation to the logits of the letters corresponding to the available answer choices.
# Note that this helps the evaluation a lot because it narrows the focus to only the available letters.
# The much harder alternative would be to generate from the Assistant and check whether it responded with the correct
# letter (e.g. A, B, C, D), but evaluations typically make the task easier in this way.
for idx, conversation in enumerate(conversations):
# get the token ids of all the available letters of this problem
letters = conversation['letters']
letter_ids = []
for letter in letters:
if not letter in letter_to_id_cache:
encoded_letter = tokenizer.encode(letter)
assert len(encoded_letter) == 1, "Each letter must be a single token"
letter_to_id_cache[letter] = encoded_letter[0]
letter_ids.append(letter_to_id_cache[letter])
# focus logits just down to the answer position and the available letters of the answer
answer_pos = answer_time_positions[idx]
focus_logits = logits[idx, answer_pos, letter_ids]
# the argmax over the restricted logits is an index into `letters`, i.e. the predicted answer
argmax_letter_id = focus_logits.argmax(dim=-1).item()
predicted_letter = letters[argmax_letter_id]
# evaluate the outcome
outcome = task_object.evaluate(conversation, predicted_letter)
num_passed += int(outcome)
total += 1
# Aggregate results across all ranks
if ddp:
num_passed_tensor = torch.tensor([num_passed], dtype=torch.long, device=device)
total_tensor = torch.tensor([total], dtype=torch.long, device=device)
dist.all_reduce(num_passed_tensor, op=dist.ReduceOp.SUM)
dist.all_reduce(total_tensor, op=dist.ReduceOp.SUM)
num_passed = num_passed_tensor.item()
total = total_tensor.item()
average = num_passed/total
print0(f"Final: {num_passed}/{total} ({100*average:.2f}%)")
return average
# -----------------------------------------------------------------------------
def run_chat_eval(task_name, model, tokenizer, engine,
batch_size=1, num_samples=1, max_new_tokens=512, temperature=0.0, top_k=50,
max_problems=None):
# Create the evaluation object
task_module = {
'HumanEval': HumanEval,
'MMLU': partial(MMLU, subset="all", split="test"),
'ARC-Easy': partial(ARC, subset="ARC-Easy", split="test"),
'ARC-Challenge': partial(ARC, subset="ARC-Challenge", split="test"),
'GSM8K': partial(GSM8K, subset="main", split="test"),
'SpellingBee': partial(SpellingBee, size=256, split="test"),
}[task_name]
task_object = task_module()
# Run the evaluation
if task_object.eval_type == 'generative':
acc = run_generative_eval(task_object, tokenizer, model, engine, num_samples, max_new_tokens, temperature, top_k, max_problems=max_problems)
elif task_object.eval_type == 'categorical':
acc = run_categorical_eval(task_object, tokenizer, model, batch_size, max_problems=max_problems)
else:
raise ValueError(f"Unsupported task evaluation type: {task_object.eval_type}")
return acc
# -----------------------------------------------------------------------------
if __name__ == "__main__":
# Parse command-line arguments
parser = argparse.ArgumentParser()
parser.add_argument('-i', '--source', type=str, required=True, help="Source of the model: sft|mid|rl")
parser.add_argument('-a', '--task-name', type=str, default=None, help="Task name. Default = all tasks. Use | to split multiple tasks.")
parser.add_argument('-d', '--dtype', type=str, default='bfloat16', choices=['float32', 'bfloat16'])
parser.add_argument('-t', '--temperature', type=float, default=0.0)
parser.add_argument('-m', '--max-new-tokens', type=int, default=512)
parser.add_argument('-n', '--num-samples', type=int, default=1)
parser.add_argument('-k', '--top-k', type=int, default=50)
parser.add_argument('-b', '--batch-size', type=int, default=8, help='Batch size for categorical evaluation')
parser.add_argument('-g', '--model-tag', type=str, default=None, help='Model tag to load')
parser.add_argument('-s', '--step', type=int, default=None, help='Step to load')
parser.add_argument('-x', '--max-problems', type=int, default=None, help='Max problems to evaluate')
parser.add_argument('--device-type', type=str, default='', choices=['cuda', 'cpu', 'mps'], help='Device type for evaluation: cuda|cpu|mps. empty => autodetect')
args = parser.parse_args()
device_type = autodetect_device_type() if args.device_type == "" else args.device_type
ddp, ddp_rank, ddp_local_rank, ddp_world_size, device = compute_init(device_type)
ptdtype = torch.float32 if args.dtype == 'float32' else torch.bfloat16
autocast_ctx = torch.amp.autocast(device_type=device_type, dtype=ptdtype) if device_type == "cuda" else nullcontext()
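# autocast is only applied on CUDA; on CPU/MPS the model runs in its native dtype (nullcontext)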
model, tokenizer, meta = load_model(args.source, device, phase="eval", model_tag=args.model_tag, step=args.step)
engine = Engine(model, tokenizer)
# Get the tasks to evaluate on
all_tasks = ['ARC-Easy', 'ARC-Challenge', 'MMLU', 'GSM8K', 'HumanEval', 'SpellingBee']
baseline_accuracies = {
'ARC-Easy': 0.25, # multiple choice 1 of 4 => 25%
'ARC-Challenge': 0.25, # multiple choice 1 of 4 => 25%
'MMLU': 0.25, # multiple choice 1 of 4 => 25%
'GSM8K': 0.0, # open-ended => 0%
'HumanEval': 0.0, # open-ended => 0%
'SpellingBee': 0.0, # open-ended => 0%
}
task_names = all_tasks if args.task_name is None else args.task_name.split('|')
# Run all the task evaluations sequentially
results = {}
for task_name in task_names:
with autocast_ctx:
acc = run_chat_eval(
task_name,
model, tokenizer, engine,
batch_size=args.batch_size,
num_samples=args.num_samples,
max_new_tokens=args.max_new_tokens,
temperature=args.temperature,
top_k=args.top_k,
max_problems=args.max_problems,
)
results[task_name] = acc
print0(f"{task_name} accuracy: {100 * acc:.2f}%")
# Log to report
from nanochat.report import get_report
all_tasks_were_evaluated = all(task_name in results for task_name in all_tasks)
# calculate the ChatCORE metric if we can (similar to CORE, it's the mean centered accuracy)
# this way, ChatCORE ranges from 0 (at random baseline) to 1 (peak performance)
chatcore_metric_dict = {}
if all_tasks_were_evaluated:
centered_mean = 0
for task_name, acc in results.items():
baseline_acc = baseline_accuracies.get(task_name, 0.0)
centered_acc = (acc - baseline_acc) / (1.0 - baseline_acc)
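# e.g. 50% accuracy on a 4-way multiple choice task maps to (0.50 - 0.25) / 0.75 ≈ 0.33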
centered_mean += centered_acc
chatcore_metric = centered_mean / len(results)
chatcore_metric_dict = {"ChatCORE metric": chatcore_metric}
get_report().log(section="Chat evaluation " + args.source, data=[
vars(args), # CLI args
results,
chatcore_metric_dict,
])
compute_cleanup()