| --- |
| language: |
| - en |
| license: apache-2.0 |
| library_name: onnx |
| tags: |
| - text-generation |
| - nanoGPT |
| - onnx |
| model_details: |
| parameters: |
| total: 353550000 |
| datasets: |
| - HuggingFaceFW/fineweb-edu |
| --- |
| |
| Welcome to Apex 1 Instruct 350M, our latest Instruct-Model based on FineWeb-Edu. |
|
|
| **Hey there! If you're interested in our models, try Apex 1.5: https://huggingface.co/LH-Tech-AI/Apex-1.5-Instruct-350M - improved reasoning and logic. It fixes wrong facts and hallucinations by increasing the FineWeb-Edu ratio to 4:1 during finetuning.** |
|
|
| # 1. Model Details |
| - **Parameters:** 353.55M |
| - **Layers:** 24 |
| - **Heads:** 16 |
| - **Embedding Dim:** 1024 |
| - **Context Length:** 1024 |
| - **Format:** ONNX (Opset 18) |
|
|
| # 2. Training code |
| ```python |
| import os |
| import time |
| import math |
| import pickle |
| from contextlib import nullcontext |
| |
| import queue |
| |
| import logging |
| |
| import numpy as np |
| import torch |
| from torch.nn.parallel import DistributedDataParallel as DDP |
| from torch.distributed import init_process_group, destroy_process_group |
| |
| from model import GPTConfig, GPT |
| |
| # ----------------------------------------------------------------------------- |
| # default config values designed to train a gpt2 (124M) on OpenWebText |
| # I/O |
| out_dir = 'out' |
| eval_interval = 2000 |
| log_interval = 1 |
| eval_iters = 200 |
| eval_only = False # if True, script exits right after the first eval |
| always_save_checkpoint = True # if True, always save a checkpoint after each eval |
| init_from = 'scratch' # 'scratch' or 'resume' or 'gpt2*' |
| # wandb logging |
| wandb_log = False # disabled by default |
| wandb_project = 'owt' |
| wandb_run_name = 'gpt2' # 'run' + str(time.time()) |
| # data |
| dataset = 'openwebtext' |
| gradient_accumulation_steps = 5 * 8 # used to simulate larger batch sizes |
| batch_size = 12 # if gradient_accumulation_steps > 1, this is the micro-batch size |
| block_size = 1024 |
| # model |
| n_layer = 12 |
| n_head = 12 |
| n_embd = 768 |
| dropout = 0.0 # for pretraining 0 is good, for finetuning try 0.1+ |
| bias = False # do we use bias inside LayerNorm and Linear layers? |
| # adamw optimizer |
| learning_rate = 6e-4 # max learning rate |
| max_iters = 600000 # total number of training iterations |
| weight_decay = 1e-1 |
| beta1 = 0.9 |
| beta2 = 0.95 |
| grad_clip = 1.0 # clip gradients at this value, or disable if == 0.0 |
| # learning rate decay settings |
| decay_lr = True # whether to decay the learning rate |
| warmup_iters = 2000 # how many steps to warm up for |
| lr_decay_iters = 600000 # should be ~= max_iters per Chinchilla |
| min_lr = 6e-5 # minimum learning rate, should be ~= learning_rate/10 per Chinchilla |
| # DDP settings |
| backend = 'nccl' # 'nccl', 'gloo', etc. |
| # system |
| device = 'cuda' # examples: 'cpu', 'cuda', 'cuda:0', 'cuda:1' etc., or try 'mps' on macbooks |
| dtype = 'bfloat16' if torch.cuda.is_available() and torch.cuda.is_bf16_supported() else 'float16' # 'float32', 'bfloat16', or 'float16', the latter will auto implement a GradScaler |
| compile = True # use PyTorch 2.0 to compile the model to be faster |
| # ----------------------------------------------------------------------------- |
| config_keys = [k for k,v in globals().items() if not k.startswith('_') and isinstance(v, (int, float, bool, str))] |
| exec(open('configurator.py').read()) # overrides from command line or config file |
| config = {k: globals()[k] for k in config_keys} # will be useful for logging |
| # ----------------------------------------------------------------------------- |
| |
| logger = None |
| db_conn = None |
| |
| logging.basicConfig( |
| level=logging.INFO, |
| format='%(asctime)s %(levelname)s: %(message)s', |
| handlers=[logging.StreamHandler()] |
| ) |
| logger = logging.getLogger("Train") |
| |
| # various inits, derived attributes, I/O setup |
| ddp = int(os.environ.get('RANK', -1)) != -1 # is this a ddp run? |
| if ddp: |
| init_process_group(backend=backend) |
| ddp_rank = int(os.environ['RANK']) |
| ddp_local_rank = int(os.environ['LOCAL_RANK']) |
| ddp_world_size = int(os.environ['WORLD_SIZE']) |
| device = f'cuda:{ddp_local_rank}' |
| torch.cuda.set_device(device) |
| master_process = ddp_rank == 0 # this process will do logging, checkpointing etc. |
| seed_offset = ddp_rank # each process gets a different seed |
| # world_size number of processes will be training simultaneously, so we can scale |
| # down the desired gradient accumulation iterations per process proportionally |
| assert gradient_accumulation_steps % ddp_world_size == 0 |
| gradient_accumulation_steps //= ddp_world_size |
| else: |
| # if not ddp, we are running on a single gpu, and one process |
| master_process = True |
| seed_offset = 0 |
| ddp_world_size = 1 |
| tokens_per_iter = gradient_accumulation_steps * ddp_world_size * batch_size * block_size |
| logger.info(f"tokens per iteration will be: {tokens_per_iter:,}") |
| |
| |
| if master_process: |
| os.makedirs(out_dir, exist_ok=True) |
| log_dir = "/home/350m_fineweb" |
| os.makedirs(log_dir, exist_ok=True) |
| log_file = os.path.join(log_dir, "training.log") |
| |
| file_handler = logging.FileHandler(log_file) |
| file_handler.setFormatter(logging.Formatter('%(asctime)s %(levelname)s: %(message)s')) |
| logger.addHandler(file_handler) |
| |
| logger.info(f"Logging in Datei gestartet: {log_file}") |
| |
| torch.manual_seed(1337 + seed_offset) |
| torch.backends.cuda.matmul.allow_tf32 = True # allow tf32 on matmul |
| torch.backends.cudnn.allow_tf32 = True # allow tf32 on cudnn |
| device_type = 'cuda' if 'cuda' in device else 'cpu' # for later use in torch.autocast |
| # note: float16 data type will automatically use a GradScaler |
| ptdtype = {'float32': torch.float32, 'bfloat16': torch.bfloat16, 'float16': torch.float16}[dtype] |
| ctx = nullcontext() if device_type == 'cpu' else torch.amp.autocast(device_type=device_type, dtype=ptdtype) |
| |
| # poor man's data loader |
| |
# Map each logical data source name to the directory that holds its
# `train.bin` / `val.bin` token files. Nothing else in this script defines
# `data_sources`, so unless a config file executed by configurator.py already
# created it, fall back to data/<dataset> under the 'fineweb' key that
# get_batch() and the meta.pkl lookup expect.
if 'data_sources' not in globals():
    data_sources = {'fineweb': os.path.join('data', dataset)}

# Memory-map every split of every source once, read-only, as uint16 token ids.
# Layout: data_handles[split][source_name] -> np.memmap
data_handles = {
    split: {
        name: np.memmap(os.path.join(path, f'{split}.bin'), dtype=np.uint16, mode='r')
        for name, path in data_sources.items()
    }
    for split in ['train', 'val']
}
| |
def get_batch(split):
    """Draw one random batch of training windows from the 'fineweb' source.

    Args:
        split: key into `data_handles` ('train' or 'val').

    Returns:
        A pair (x, y) of int64 tensors on `device`, each of shape
        (batch_size, block_size). `y` holds the same windows as `x`
        shifted one token to the right (the next-token targets).
    """
    tokens = data_handles[split]['fineweb']

    def window(first):
        # Copy one block_size-long slice out of the memmap as int64.
        return torch.from_numpy(tokens[first:first + block_size].astype(np.int64))

    # One random start offset per batch row.
    starts = torch.randint(len(tokens) - block_size, (batch_size,))
    x = torch.stack([window(s) for s in starts])
    y = torch.stack([window(s + 1) for s in starts])

    if device_type != 'cuda':
        return x.to(device), y.to(device)

    # Pinned host memory lets the host-to-GPU copy run asynchronously
    # (non_blocking=True).
    x = x.pin_memory().to(device, non_blocking=True)
    y = y.pin_memory().to(device, non_blocking=True)
    return x, y
| |
| # init these up here, can override if init_from='resume' (i.e. from a checkpoint) |
| iter_num = 0 |
| best_val_loss = 1e9 |
| |
| # attempt to derive vocab_size from the dataset |
| meta_path = os.path.join(data_sources['fineweb'], 'meta.pkl') |
| meta_vocab_size = None |
| if os.path.exists(meta_path): |
| with open(meta_path, 'rb') as f: |
| meta = pickle.load(f) |
| meta_vocab_size = meta['vocab_size'] |
| logger.info(f"found vocab_size = {meta_vocab_size} (inside {meta_path})") |
| |
| # model init |
| model_args = dict(n_layer=n_layer, n_head=n_head, n_embd=n_embd, block_size=block_size, |
| bias=bias, vocab_size=None, dropout=dropout) # start with model_args from command line |
| if init_from == 'scratch': |
| # init a new model from scratch |
| logger.info("Initializing a new model from scratch") |
| # determine the vocab size we'll use for from-scratch training |
| if meta_vocab_size is None: |
| logger.info("defaulting to vocab_size of GPT-2 to 50304 (50257 rounded up for efficiency)") |
| model_args['vocab_size'] = meta_vocab_size if meta_vocab_size is not None else 50304 |
| gptconf = GPTConfig(**model_args) |
| model = GPT(gptconf) |
| elif init_from == 'resume': |
| logger.info(f"Resuming training from {out_dir}") |
| # resume training from a checkpoint. |
| ckpt_path = os.path.join(out_dir, sorted( |
| [f for f in os.listdir(out_dir) if f.startswith("ckpt_") and f.endswith(".pt")] |
| )[-1]) |
| checkpoint = torch.load(ckpt_path, map_location=device) |
| checkpoint_model_args = checkpoint['model_args'] |
| # force these config attributes to be equal otherwise we can't even resume training |
| # the rest of the attributes (e.g. dropout) can stay as desired from command line |
| for k in ['n_layer', 'n_head', 'n_embd', 'block_size', 'bias', 'vocab_size']: |
| model_args[k] = checkpoint_model_args[k] |
| # create the model |
| gptconf = GPTConfig(**model_args) |
| model = GPT(gptconf) |
| state_dict = checkpoint['model'] |
| # fix the keys of the state dictionary :( |
| # honestly no idea how checkpoints sometimes get this prefix, have to debug more |
| unwanted_prefix = '_orig_mod.' |
| for k,v in list(state_dict.items()): |
| if k.startswith(unwanted_prefix): |
| state_dict[k[len(unwanted_prefix):]] = state_dict.pop(k) |
| model.load_state_dict(state_dict) |
| iter_num = checkpoint['iter_num'] |
| best_val_loss = checkpoint['best_val_loss'] |
| elif init_from.startswith('gpt2'): |
| logger.info(f"Initializing from OpenAI GPT-2 weights: {init_from}") |
| # initialize from OpenAI GPT-2 weights |
| override_args = dict(dropout=dropout) |
| model = GPT.from_pretrained(init_from, override_args) |
| # read off the created config params, so we can store them into checkpoint correctly |
| for k in ['n_layer', 'n_head', 'n_embd', 'block_size', 'bias', 'vocab_size']: |
| model_args[k] = getattr(model.config, k) |
| # crop down the model block size if desired, using model surgery |
| if block_size < model.config.block_size: |
| model.crop_block_size(block_size) |
| model_args['block_size'] = block_size # so that the checkpoint will have the right value |
| model.to(device) |
| |
| # initialize a GradScaler. If enabled=False scaler is a no-op |
| scaler = torch.cuda.amp.GradScaler(enabled=(dtype == 'float16')) |
| |
| # optimizer |
| optimizer = model.configure_optimizers(weight_decay, learning_rate, (beta1, beta2), device_type) |
| if init_from == 'resume': |
| optimizer.load_state_dict(checkpoint['optimizer']) |
| checkpoint = None # free up memory |
| |
| # compile the model |
| if compile: |
| logger.info("compiling the model... (takes a ~minute)") |
| unoptimized_model = model |
| model = torch.compile(model) # requires PyTorch 2.0 |
| |
| # wrap model into DDP container |
| if ddp: |
| model = DDP(model, device_ids=[ddp_local_rank]) |
| |
| # helps estimate an arbitrarily accurate loss over either split using many batches |
@torch.no_grad()
def estimate_loss():
    """Estimate the mean loss on the train and val splits.

    Switches the model to eval mode, averages the loss of `eval_iters`
    random batches per split, then switches the model back to train mode.

    Returns:
        dict mapping 'train' and 'val' to a scalar tensor with the mean loss.
    """

    def mean_loss(split):
        per_batch = torch.zeros(eval_iters)
        for idx in range(eval_iters):
            inputs, targets = get_batch(split)
            with ctx:
                _, batch_loss = model(inputs, targets)
            per_batch[idx] = batch_loss.item()
        return per_batch.mean()

    model.eval()
    results = {}
    for split in ('train', 'val'):
        results[split] = mean_loss(split)
    model.train()
    return results
| |
| # learning rate decay scheduler (cosine with warmup) |
def get_lr(it):
    """Return the learning rate for iteration `it` (cosine decay with warmup).

    Three phases, driven by the module-level settings `learning_rate`,
    `min_lr`, `warmup_iters` and `lr_decay_iters`:
      * it < warmup_iters: linear ramp towards `learning_rate`
      * it > lr_decay_iters: constant `min_lr`
      * otherwise: cosine interpolation from `learning_rate` down to `min_lr`
    """
    if it < warmup_iters:
        # linear warmup; the +1 terms keep the very first step above zero
        return learning_rate * (it + 1) / (warmup_iters + 1)
    elif it > lr_decay_iters:
        # past the decay horizon: stay at the floor
        return min_lr

    # fraction of the decay phase already completed, in [0, 1]
    progress = (it - warmup_iters) / (lr_decay_iters - warmup_iters)
    assert 0 <= progress <= 1
    cosine_weight = 0.5 * (1.0 + math.cos(math.pi * progress))  # goes 1 -> 0
    return min_lr + cosine_weight * (learning_rate - min_lr)
| |
| # logging |
| if wandb_log and master_process: |
| import wandb |
| wandb.init(project=wandb_project, name=wandb_run_name, config=config) |
| |
# training loop
#
# Each iteration: set the learning rate, periodically evaluate and checkpoint
# (master process only), run `gradient_accumulation_steps` micro-batches,
# clip and step the optimizer, then log. Every `sample_interval` logged
# iterations the master process also logs an ETA and a generated text sample.
X, Y = get_batch('train') # fetch the very first batch
t0 = time.time()
local_iter_num = 0 # number of iterations in the lifetime of this process
raw_model = model.module if ddp else model # unwrap DDP container if needed
running_mfu = -1.0
sample_interval = 100 # log ETA + live sample every this many iterations
sample_prompt = "Artificial Intelligence is "
sample_encoder = None # GPT-2 tiktoken encoder, created lazily on the first live sample
while True:

    # determine and set the learning rate for this iteration
    lr = get_lr(iter_num) if decay_lr else learning_rate
    for param_group in optimizer.param_groups:
        param_group['lr'] = lr

    # evaluate the loss on train/val sets and write checkpoints
    if iter_num % eval_interval == 0 and master_process:
        losses = estimate_loss()
        logger.info(f"step {iter_num}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")
        if wandb_log:
            wandb.log({
                "iter": iter_num,
                "train/loss": losses['train'],
                "val/loss": losses['val'],
                "lr": lr,
                "mfu": running_mfu*100, # convert to percentage
            })
        # only a genuinely lower val loss may replace the recorded best;
        # always_save_checkpoint forces a save but must not overwrite it
        improved = losses['val'] < best_val_loss
        if improved:
            best_val_loss = losses['val']
        if (improved or always_save_checkpoint) and iter_num > 0:
            checkpoint = {
                'model': raw_model.state_dict(),
                'optimizer': optimizer.state_dict(),
                'model_args': model_args,
                'iter_num': iter_num,
                'best_val_loss': best_val_loss,
                'config': config,
            }
            logger.info(f"💾 SAVING CHECKPOINT TO {out_dir}")
            # zero-padded iteration number keeps checkpoint names sortable,
            # which the 'resume' path relies on to pick the latest one
            ckpt_name = f"ckpt_{iter_num:07d}.pt"
            ckpt_path = os.path.join(out_dir, ckpt_name)
            torch.save(checkpoint, ckpt_path)
    if iter_num == 0 and eval_only:
        break

    # forward backward update, with optional gradient accumulation to simulate larger batch size
    # and using the GradScaler if data type is float16
    for micro_step in range(gradient_accumulation_steps):
        if ddp:
            # in DDP training we only need to sync gradients at the last micro step.
            # the official way to do this is with model.no_sync() context manager, but
            # that forces the loop body to be repeated; looking at the source of that
            # context manager, it just toggles this variable
            model.require_backward_grad_sync = (micro_step == gradient_accumulation_steps - 1)
        with ctx:
            logits, loss = model(X, Y)
            loss = loss / gradient_accumulation_steps # scale the loss to account for gradient accumulation
        # immediately async prefetch next batch while model is doing the forward pass on the GPU
        X, Y = get_batch('train')
        # backward pass, with gradient scaling if training in fp16
        scaler.scale(loss).backward()
    # clip the gradient
    if grad_clip != 0.0:
        scaler.unscale_(optimizer)
        torch.nn.utils.clip_grad_norm_(model.parameters(), grad_clip)
    # step the optimizer and scaler if training in fp16
    scaler.step(optimizer)
    scaler.update()
    # flush the gradients as soon as we can, no need for this memory anymore
    optimizer.zero_grad(set_to_none=True)

    # timing and logging
    t1 = time.time()
    dt = t1 - t0
    t0 = t1
    if iter_num % log_interval == 0 and master_process:
        # get loss as float. note: this is a CPU-GPU sync point
        # scale up to undo the division above, approximating the true total loss (exact would have been a sum)
        lossf = loss.item() * gradient_accumulation_steps
        if local_iter_num >= 5: # let the training loop settle a bit
            mfu = raw_model.estimate_mfu(batch_size * gradient_accumulation_steps, dt)
            running_mfu = mfu if running_mfu == -1.0 else 0.9*running_mfu + 0.1*mfu

        logger.info(f"iter {iter_num}: loss {lossf:.4f}, time {dt*1000:.2f}ms, mfu {running_mfu*100:.2f}%")

        if iter_num % sample_interval == 0:
            # ETA extrapolates the duration of the most recent iteration
            remaining_iters = max_iters - iter_num
            est_seconds = remaining_iters * dt
            days = int(est_seconds // 86400)
            hours = int((est_seconds % 86400) // 3600)
            minutes = int((est_seconds % 3600) // 60)

            logger.info(f"⏳ ETA: Resttime ca. {days}d, {hours}h, {minutes}m until iteration {max_iters}")
            logger.info("📝 LIVE-SAMPLE:")

            # build the tokenizer once instead of on every sample
            if sample_encoder is None:
                import tiktoken
                sample_encoder = tiktoken.get_encoding("gpt2")

            model.eval()
            with torch.no_grad():
                start_ids = sample_encoder.encode(sample_prompt)
                context = torch.tensor(start_ids, dtype=torch.long, device=device).unsqueeze(0)
                generated_tokens = raw_model.generate(context, max_new_tokens=200)[0].tolist()

            # the model vocabulary may be padded beyond the tokenizer's, so
            # drop ids the tokenizer cannot decode
            valid_tokens = [t for t in generated_tokens if t < sample_encoder.n_vocab]

            # sampling is diagnostic only: a decode failure must not stop training
            try:
                decoded_text = sample_encoder.decode(valid_tokens, errors='replace')
                logger.info(f"\n{decoded_text}")
            except Exception as e:
                logger.error(f"Sampling-Fehler: {e}")

            model.train()
            logger.info("-" * 50)
    iter_num += 1
    local_iter_num += 1

    # termination conditions
    if iter_num > max_iters:
        break

if ddp:
    destroy_process_group()
| ``` |
|
|
| To use this code, first clone Andrej Karpathy's nanoGPT repository and save the script above as `train.py` inside it (it imports nanoGPT's `model.py` and `configurator.py`). |
|
|
| Then, run: |
|
|
| ```bash |
| python3 train.py \ |
| --dataset=fineweb-edu \ |
| --n_layer=24 \ |
| --n_head=16 \ |
| --n_embd=1024 \ |
| --block_size=1024 \ |
| --batch_size=4 \ |
| --gradient_accumulation_steps=32 \ |
| --learning_rate=6e-4 \ |
| --max_iters=60000 \ |
| --eval_interval=1000 \ |
| --eval_iters=100 \ |
| --log_interval=5 \ |
| --weight_decay=0.1 \ |
| --warmup_iters=2000 \ |
| --lr_decay_iters=60000 \ |
| --min_lr=6e-5 \ |
| --dtype=bfloat16 \ |
| --compile=True \ |
| --always_save_checkpoint=True \ |
| --init_from=scratch \ |
| --out_dir=/home/user/350m_fineweb |
| ``` |
|
|
| # 3. Finetuning |
| To finetune your model to answer your questions, run this code to prepare the finetuning data: |
| ```python |
| import os |
| import numpy as np |
| import tiktoken |
| from datasets import load_dataset |
| from tqdm import tqdm |
| |
| OUTPUT_DIR = "data/alpaca_cleaned_mixed" |
| TOKENIZER_NAME = "gpt2" |
| SEED = 1337 |
| |
| FINEWEB_SAMPLES = 2500 |
| |
| enc = tiktoken.get_encoding(TOKENIZER_NAME) |
| EOS_TOKEN = "<|endoftext|>" |
| |
def format_prompt_with_mask(instruction, input_text, output):
    """
    Tokenize one instruction example and build its loss mask.

    Text layout:
        Instruction: ...
        Input: ... (only when `input_text` is non-blank)
        Response: ... <|endoftext|>

    Returns:
        (full_ids, mask): the prompt token ids followed by the completion
        token ids, and a list of the same length holding 0 for every prompt
        token and 1 for every completion token.
    """
    has_input = bool(input_text and input_text.strip())

    sections = [f"Instruction:\n{instruction}\n\n"]
    if has_input:
        sections.append(f"Input:\n{input_text}\n\n")
    sections.append("Response:\n")
    prompt_text = "".join(sections)

    specials = {'<|endoftext|>'}
    prompt_ids = enc.encode(prompt_text, allowed_special=specials)
    completion_ids = enc.encode(f"{output}{EOS_TOKEN}", allowed_special=specials)

    # 0 = prompt token, 1 = completion token
    mask = [0] * len(prompt_ids)
    mask += [1] * len(completion_ids)

    return prompt_ids + completion_ids, mask
| |
def main():
    """Build the mixed finetuning corpus and write it to OUTPUT_DIR.

    Tokenizes every example of 'yahma/alpaca-cleaned' (prompt tokens masked
    out, response tokens trained on) followed by the first FINEWEB_SAMPLES
    documents streamed from 'HuggingFaceFW/fineweb-edu' (all tokens trained
    on), then writes two parallel flat files:
      * train.bin      - token ids as uint16
      * train_mask.bin - loss mask as uint8 (1 = train on token, 0 = ignore)

    Raises:
        ValueError: if the tokenizer's vocabulary does not fit into uint16.
        RuntimeError: if tokens and mask end up with different lengths.
    """
    from itertools import islice  # only needed here

    np.random.seed(SEED)
    print(f"🚀 Starting Prepare-Script for Apex 1 Instruct 350M...")
    print(f"📚 Tokenizer: {TOKENIZER_NAME}")

    # Token ids are stored as uint16; refuse tokenizers whose ids would not fit.
    if enc.n_vocab > np.iinfo(np.uint16).max + 1:
        raise ValueError(
            f"Tokenizer '{TOKENIZER_NAME}' has {enc.n_vocab} tokens, which does not fit into uint16."
        )

    os.makedirs(OUTPUT_DIR, exist_ok=True)

    print("📥 Loading 'yahma/alpaca-cleaned' (Chat-Instructions)...")
    alpaca = load_dataset("yahma/alpaca-cleaned", split='train')

    print(f"📥 Loading 'HuggingFaceFW/fineweb-edu' (Sample-10BT) for {FINEWEB_SAMPLES} Samples...")
    fineweb = load_dataset("HuggingFaceFW/fineweb-edu", name="sample-10BT", split='train', streaming=True)

    all_tokens = []
    all_masks = []

    print("⚙️ Processing Alpaca...")
    for ex in tqdm(alpaca, desc="Alpaca"):
        ids, mask = format_prompt_with_mask(ex['instruction'], ex['input'], ex['output'])
        all_tokens.extend(ids)
        all_masks.extend(mask)

    alpaca_len = len(all_tokens)
    print(f" -> Alpaca Tokens: {alpaca_len:,}")

    print("⚙️ Processing FineWeb (Anti-Forgetting)...")
    fw_count = 0
    fw_tokens_count = 0

    # islice stops after FINEWEB_SAMPLES documents, or earlier if the stream ends
    fw_docs = islice(fineweb, FINEWEB_SAMPLES)
    for ex in tqdm(fw_docs, total=FINEWEB_SAMPLES, desc="FineWeb"):
        ids = enc.encode(ex['text'] + EOS_TOKEN, allowed_special={EOS_TOKEN})

        all_tokens.extend(ids)
        # plain-text documents are trained on in full
        all_masks.extend([1] * len(ids))

        fw_tokens_count += len(ids)
        fw_count += 1

    print(f" -> FineWeb Tokens: {fw_tokens_count:,} (from {fw_count} documents)")

    # Verify the invariant BEFORE anything is written, so a broken corpus
    # never reaches disk.
    if len(all_tokens) != len(all_masks):
        raise RuntimeError(
            "Token and Mask Array have different lengths! Something has gone wrong!"
        )

    total_tokens = len(all_tokens)
    print(f"\n💾 Saving {total_tokens:,} Tokens in '{OUTPUT_DIR}'...")

    token_arr = np.array(all_tokens, dtype=np.uint16)
    token_arr.tofile(os.path.join(OUTPUT_DIR, "train.bin"))

    mask_arr = np.array(all_masks, dtype=np.uint8)
    mask_arr.tofile(os.path.join(OUTPUT_DIR, "train_mask.bin"))

    check_len = 100
    print("\n🔍 --- SANITY CHECK ---")
    print(f"Decoding the first {check_len} tokens to check that everything is okay.")
    print("Green (TRAIN) = The things the model learns. Grey (IGNORE) = The things the model only reads.")

    # ANSI colours: 92 = bright green (trained on), 90 = grey (ignored)
    decoded_parts = []
    for t_id, m_val in zip(all_tokens[:check_len], all_masks[:check_len]):
        colour = "\033[92m" if m_val == 1 else "\033[90m"
        decoded_parts.append(f"{colour}{enc.decode([t_id])}\033[0m")

    print("".join(decoded_parts))
    print("\n(Legend: \033[90mGrey=Prompt/Ignored\033[0m, \033[92mGreen=Response/Learned\033[0m)")

    print("\n✅ Everything seems to be fine. The arrays are synchronized. You can now start the training.")
| |
| if __name__ == "__main__": |
| main() |
| ``` |
|
|
| Finally, run this to start the finetuning based on your prepared finetuning data: |
| ```python |
| import os |
| import time |
| import math |
| import torch |
| from model import GPTConfig, GPT |
| |
| import numpy as np |
| |
| out_dir = '/home/user/350m_Apex_Final' |
| init_from = '/home/user/350m_fineweb' |
| dataset = 'alpaca_cleaned_mixed' |
| |
| batch_size = 4 |
| gradient_accumulation_steps = 32 |
| block_size = 1024 |
| learning_rate = 2e-5 |
| max_iters = 1500 |
| weight_decay = 0.1 |
| dropout = 0.1 |
| warmup_iters = 0 |
| min_lr = 3e-6 |
| beta1, beta2 = 0.9, 0.95 |
| device = 'cuda' |
| dtype = 'bfloat16' |
| compile = True |
| save_interval = 500 |
| |
| os.makedirs(out_dir, exist_ok=True) |
| torch.manual_seed(1337) |
| device_type = 'cuda' if 'cuda' in device else 'cpu' |
| ptdtype = {'float32': torch.float32, 'bfloat16': torch.bfloat16, 'float16': torch.float16}[dtype] |
| ctx = torch.amp.autocast(device_type=device_type, dtype=ptdtype) |
| |
| data_dir = os.path.join('data', dataset) |
| train_data = np.memmap(os.path.join(data_dir, 'train.bin'), dtype=np.uint16, mode='r') |
| train_mask = np.memmap(os.path.join(data_dir, 'train_mask.bin'), dtype=np.uint8, mode='r') |
| |
def get_batch():
    """Draw one random finetuning batch with masked targets.

    Returns:
        A pair (x, y) of int64 tensors on `device`, each of shape
        (batch_size, block_size). `y` is `x` shifted one token to the right;
        every target whose entry in `train_mask` is 0 is replaced by -100.
    """
    starts = torch.randint(len(train_data) - block_size, (batch_size,))

    def window(source, first):
        # Copy one block_size-long slice out of a memmap as int64.
        return torch.from_numpy(source[first:first + block_size].astype(np.int64))

    x = torch.stack([window(train_data, s) for s in starts])
    y = torch.stack([window(train_data, s + 1) for s in starts])
    # the mask is read at the target positions, so it lines up with y
    keep = torch.stack([window(train_mask, s + 1) for s in starts])

    # NOTE(review): -100 only excludes these positions if the loss in model.py
    # ignores index -100; stock nanoGPT passes ignore_index=-1 to
    # F.cross_entropy - confirm model.py was adapted accordingly.
    y = y.masked_fill(keep == 0, -100)

    return x.to(device), y.to(device)
| |
# Load the most recent pretraining checkpoint from `init_from` and rebuild
# the model from it.
print(f"📥 Loading Pretraining-Checkpoint from {init_from}...")
# Sorted by name, so the last entry is taken as the newest checkpoint.
ckpt_files = sorted(f for f in os.listdir(init_from) if f.endswith('.pt'))
if not ckpt_files:
    raise FileNotFoundError("No checkpoint found in init_from directory!")

ckpt_path = os.path.join(init_from, ckpt_files[-1])
checkpoint = torch.load(ckpt_path, map_location=device)

# The architecture must match the checkpoint, but dropout is a training-time
# setting: apply the value configured at the top of this script instead of
# silently inheriting the pretraining value. Work on a copy so that
# checkpoint['model_args'] stays untouched for the checkpoints saved later.
model_args = dict(checkpoint['model_args'])
model_args['dropout'] = dropout
gptconf = GPTConfig(**model_args)
model = GPT(gptconf)
state_dict = checkpoint['model']

# Strip the '_orig_mod.' key prefix, if present, so the keys match the
# freshly built model.
unwanted_prefix = '_orig_mod.'
for k in list(state_dict):
    if k.startswith(unwanted_prefix):
        state_dict[k[len(unwanted_prefix):]] = state_dict.pop(k)

model.load_state_dict(state_dict)
model.to(device)
| |
| if compile: |
| print("🚀 Compiling Model...") |
| model = torch.compile(model) |
| |
| optimizer = model.configure_optimizers(weight_decay, learning_rate, (beta1, beta2), device_type) |
| scaler = torch.cuda.amp.GradScaler(enabled=(dtype == 'float16')) |
| |
def get_lr(it):
    """Return the learning rate for iteration `it` (cosine decay with warmup).

    Uses the module-level settings `learning_rate`, `min_lr`, `warmup_iters`
    and `max_iters`:
      * it < warmup_iters: linear ramp towards `learning_rate`; the +1 terms
        keep the first step above zero, matching the pretraining schedule
      * it >= max_iters: constant `min_lr`
      * otherwise: cosine interpolation from `learning_rate` down to `min_lr`

    With warmup_iters == 0 the result is the same as before this change.
    """
    if it < warmup_iters:
        return learning_rate * (it + 1) / (warmup_iters + 1)
    # `>=` instead of `>`: the cosine term is exactly 0 at it == max_iters, so
    # the value is unchanged, and this avoids dividing by zero below when
    # max_iters == warmup_iters.
    if it >= max_iters:
        return min_lr
    decay_ratio = (it - warmup_iters) / (max_iters - warmup_iters)
    coeff = 0.5 * (1.0 + math.cos(math.pi * decay_ratio))  # goes 1 -> 0
    return min_lr + coeff * (learning_rate - min_lr)
| |
| print(f"🛠️ Starting Finetuning...") |
| model.train() |
| t0 = time.time() |
| |
| for iter_num in range(max_iters + 1): |
| lr = get_lr(iter_num) |
| for param_group in optimizer.param_groups: |
| param_group['lr'] = lr |
| |
| for micro_step in range(gradient_accumulation_steps): |
| X, Y = get_batch() |
| with ctx: |
| logits, loss = model(X, Y) |
| loss = loss / gradient_accumulation_steps |
| scaler.scale(loss).backward() |
| |
| scaler.unscale_(optimizer) |
| torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0) |
| scaler.step(optimizer) |
| scaler.update() |
| optimizer.zero_grad(set_to_none=True) |
| |
| if iter_num % 10 == 0: |
| dt = time.time() - t0 |
| print(f"Iter {iter_num}: Loss {loss.item()*gradient_accumulation_steps:.4f}, Time {dt*1000:.2f}ms, LR {lr:.2e}") |
| t0 = time.time() |
| |
| if iter_num > 0 and iter_num % save_interval == 0: |
| checkpoint_name = f'Apex_350M_iter_{iter_num}.pt' |
| save_path = os.path.join(out_dir, checkpoint_name) |
| print(f"💾 Saving checkpoint: {checkpoint_name}") |
| raw_model = model._orig_mod if compile else model |
| checkpoint_data = { |
| 'model': raw_model.state_dict(), |
| 'model_args': checkpoint['model_args'], |
| 'iter_num': iter_num, |
| 'lr': lr, |
| } |
| torch.save(checkpoint_data, save_path) |
| |
| print(f"💾 Finetuning done. Saving Apex 1 Instruct 350M...") |
| final_checkpoint = { |
| 'model': model.state_dict() if not compile else model._orig_mod.state_dict(), |
| 'model_args': checkpoint['model_args'], |
| 'config': checkpoint.get('config', {}), |
| } |
| torch.save(final_checkpoint, os.path.join(out_dir, 'Apex_350m_Final.pt')) |
| print("✅ Apex 1 Instruct 350M saved successfully!") |
| ``` |
|
|
| # 4. Testing Apex 1 Instruct 350M |
| To test the model you trained, you can simply run this Python code: |
| ```python |
| import torch |
| import tiktoken |
| from model import GPTConfig, GPT |
| |
| # --- Config --- |
| ckpt_path = '/home/user/350m_Apex_Final/Apex_350M_iter_1500.pt' |
| device = 'cuda' |
| enc = tiktoken.get_encoding("gpt2") |
| |
| print("Loading Apex 1 Instruct 350M...") |
| checkpoint = torch.load(ckpt_path, map_location=device) |
| gptconf = GPTConfig(**checkpoint['model_args']) |
| model = GPT(gptconf) |
| |
| state_dict = checkpoint['model'] |
| unwanted_prefix = '_orig_mod.' |
| for k,v in list(state_dict.items()): |
| if k.startswith(unwanted_prefix): |
| state_dict[k[len(unwanted_prefix):]] = state_dict.pop(k) |
| |
| model.load_state_dict(state_dict) |
| model.eval() |
| model.to(device) |
| print(f"Model {ckpt_path} ready!\n") |
| |
def run_chat():
    """Run an interactive chat loop against the loaded model.

    Reads one line per turn, wraps it in the Instruction/Response prompt
    format, generates up to 500 new tokens and prints the answer. The answer
    is cut at the first end-of-text marker or at the start of a new
    "Instruction:" block. Typing 'exit' or 'quit', or sending EOF / Ctrl-C,
    ends the loop.
    """
    print("--- Apex 1 Instruct 350M Chatbot (Type 'exit' to quit) ---")

    # autocast needs the device *type*; derive it from the configured device
    device_type = 'cuda' if 'cuda' in device else 'cpu'

    while True:
        try:
            user_input = input("You: ")
        except (EOFError, KeyboardInterrupt):
            print()
            break
        if user_input.strip().lower() in ("exit", "quit"):
            break

        prompt = f"Instruction:\n{user_input}\n\nResponse:\n"

        # disallowed_special=() encodes special-token text typed by the user
        # as ordinary text instead of raising ValueError
        prompt_ids = enc.encode(prompt, disallowed_special=())
        x = torch.tensor(prompt_ids, dtype=torch.long, device=device)[None, ...]

        print("Apex 1: ", end="", flush=True)
        with torch.no_grad():
            with torch.amp.autocast(device_type=device_type, dtype=torch.bfloat16):
                y = model.generate(x, max_new_tokens=500, temperature=0.65, top_k=25)

        # Decode only the newly generated tokens. Splitting the full text on
        # "Response:\n" would return the wrong segment whenever the model
        # emits another "Response:" header of its own. Ids the tokenizer does
        # not know (padded model vocabulary) are dropped.
        new_ids = [t for t in y[0, len(prompt_ids):].tolist() if t < enc.n_vocab]
        response = enc.decode(new_ids)

        response = response.split("<|endoftext|>")[0].split("Instruction:")[0].strip()
        print(response + "\n")
| |
| if __name__ == "__main__": |
| run_chat() |
| ``` |
|
|
| # 5. Our training results |
| We ran the pretraining on a single RTX 5060 Ti 16GB for 42,000 iterations, which took ~8 days. |
| Our final `val loss` was **2.8175** and our final `train loss` was **2.8008**. |
|
|
| # 6. Thanks to... |
| 1. Andrej Karpathy for his nanoGPT code and his YouTube videos in the makemore series |
| 2. HuggingFaceFW for the FineWeb-Edu sample-10BT training dataset |
| 3. Yahma for the alpaca-cleaned dataset for the finetuning |
| 4. My dad for his support <3 |
| 5. My GPU for training and running my new model ;-) |
|
|
| --- |
| license: apache-2.0 |
| datasets: |
| - HuggingFaceFW/fineweb-edu |
| language: |
| - en |
| pipeline_tag: question-answering |
| --- |