# Train models using HuggingFace libraries

This notebook takes parameters from a params.json file which is automatically
created by Substratus K8s operator.

The following parameters influence what happens in this notebook:
- `dataset_urls`: A comma separated list of URLs. The URLs should point to
  json files that contain your training dataset. If unset a json or jsonl
  file should be present under the `/content/data/` directory.
- `prompt_template`: The prompt template to use for training
- `push_to_hub`: if this variable is set a repo id, then the trained
  model will get pushed to HuggingFace hub. For example,
  set it to "substratusai/my-model" to publish to substratusai HF org.

In [1]:
import json
from pathlib import Path

params = {}
params_path = Path("/content/params.json")
if params_path.is_file():
    with params_path.open("r", encoding="UTF-8") as params_file:
        params = json.load(params_file)


params

{'dataset_urls': 'https://huggingface.co/datasets/weaviate/WithRetrieval-Random-Train-80/resolve/main/WithRetrieval-Random-Train-80.json',
 'inference_prompt_template': '## Instruction\nYour task is to write GraphQL for the Natural Language Query provided. Use the provided API reference and Schema to generate the GraphQL. The GraphQL should be valid for Weaviate.\n\nOnly use the API reference to understand the syntax of the request.\n\n## Natural Language Query\n{nlcommand}\n\n## Schema\n{schema}\n\n## API reference\n{apiRef}\n\n## Answer\n```graphql\n',
 'logging_steps': 50,
 'modules_to_save': 'embed_tokens, lm_head',
 'num_train_epochs': 3,
 'per_device_eval_batch_size': 1,
 'per_device_train_batch_size': 1,
 'prompt_template': '## Instruction\nYour task is to write GraphQL for the Natural Language Query provided. Use the provided API reference and Schema to generate the GraphQL. The GraphQL should be valid for Weaviate.\n\nOnly use the API reference to understand the syntax of the 

In [2]:
import os 
from datasets import load_dataset

dataset_urls = params.get("dataset_urls")
if dataset_urls:
    urls = [u.strip() for u in dataset_urls.split(",")]
    print(f"Using the following URLs for the dataset: {urls}")
    data = load_dataset("json", data_files=urls)
else:
    data = load_dataset("json", data_files="/content/data/*.json*")
data

Using the following URLs for the dataset: ['https://huggingface.co/datasets/weaviate/WithRetrieval-Random-Train-80/resolve/main/WithRetrieval-Random-Train-80.json']


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/18.2M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input', 'output', 'nlcommand', 'apiRef', 'apiRefPath', 'schema', 'schemaPath'],
        num_rows: 3190
    })
})

In [3]:
import transformers
import torch
import sys
from transformers import AutoTokenizer, AutoModelForCausalLM

model_path = "/content/model/"
trained_model_path = "/content/artifacts"
trained_model_path_lora = "/content/artifacts/lora"

tokenizer = AutoTokenizer.from_pretrained(model_path,
                                          local_files_only=True)
model = AutoModelForCausalLM.from_pretrained(
            model_path, torch_dtype=torch.float16, device_map="auto", trust_remote_code=True)
model

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(32000, 4096)
    (layers): ModuleList(
      (0-31): 32 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (v_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (o_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=4096, out_features=11008, bias=False)
          (up_proj): Linear(in_features=4096, out_features=11008, bias=False)
          (down_proj): Linear(in_features=11008, out_features=4096, bias=False)
          (act_fn): SiLUActivation()
        )
        (input_layernorm): LlamaRMSNorm()
        (post_attention_layernorm): LlamaRMSNorm()
      )
    )
    (norm): LlamaRMSNorm()
  )
  (lm_

In [4]:
model.config

LlamaConfig {
  "_name_or_path": "/content/model/",
  "architectures": [
    "LlamaForCausalLM"
  ],
  "attention_bias": false,
  "bos_token_id": 1,
  "eos_token_id": 2,
  "hidden_act": "silu",
  "hidden_size": 4096,
  "initializer_range": 0.02,
  "intermediate_size": 11008,
  "max_position_embeddings": 4096,
  "model_type": "llama",
  "num_attention_heads": 32,
  "num_hidden_layers": 32,
  "num_key_value_heads": 32,
  "pretraining_tp": 1,
  "rms_norm_eps": 1e-05,
  "rope_scaling": null,
  "rope_theta": 10000.0,
  "tie_word_embeddings": false,
  "torch_dtype": "float16",
  "transformers_version": "4.34.1",
  "use_cache": true,
  "vocab_size": 32000
}

In [5]:
default_prompt = """
Below is an instruction that describes a task. Write a response that appropriately completes the request.
### Instruction:
{prompt}
### Response:
{completion}
"""

prompt = params.get("prompt_template", default_prompt)

eos_token = tokenizer.convert_ids_to_tokens(model.config.eos_token_id)
if prompt[-len(eos_token):] != eos_token:
    prompt = prompt + eos_token

print(prompt)


## Instruction
Your task is to write GraphQL for the Natural Language Query provided. Use the provided API reference and Schema to generate the GraphQL. The GraphQL should be valid for Weaviate.

Only use the API reference to understand the syntax of the request.

## Natural Language Query
{nlcommand}

## Schema
{schema}

## API reference
{apiRef}

## Answer
{output}
</s>


In [6]:
! nvidia-smi

Sun Oct 22 00:37:55 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.105.17   Driver Version: 525.105.17   CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA L4           Off  | 00000000:00:04.0 Off |                    0 |
| N/A   60C    P0    31W /  72W |   3570MiB / 23034MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
|   1  NVIDIA L4           Off  | 00000000:00:05.0 Off |                    0 |
| N/A   58C    P0    30W /  72W |   4096MiB / 23034MiB |      0%      Defaul

|   2  NVIDIA L4           Off  | 00000000:00:06.0 Off |                    0 |
| N/A   56C    P0    30W /  72W |   4096MiB / 23034MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
|   3  NVIDIA L4           Off  | 00000000:00:07.0 Off |                    0 |
| N/A   60C    P0    32W /  72W |   3570MiB / 23034MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Processes:                                                                  |
|  GPU   GI   CI        PID   Type   Process name                  GPU Memory |
|        ID   ID            

Prompt before fine tuning

In [7]:
from typing import Dict
# source: https://github.com/artidoro/qlora
DEFAULT_PAD_TOKEN = params.get("pad_token", "[PAD]")

def smart_tokenizer_and_embedding_resize(
    special_tokens_dict: Dict,
    tokenizer: transformers.PreTrainedTokenizer,
    model: transformers.PreTrainedModel,
):
    """Resize tokenizer and embedding.

    Note: This is the unoptimized version that may make your embedding size not be divisible by 64.
    """
    num_new_tokens = tokenizer.add_special_tokens(special_tokens_dict)
    model.resize_token_embeddings(len(tokenizer))
    if num_new_tokens > 0:
        input_embeddings_data = model.get_input_embeddings().weight.data
        output_embeddings_data = model.get_output_embeddings().weight.data

        input_embeddings_avg = input_embeddings_data[:-num_new_tokens].mean(dim=0, keepdim=True)
        output_embeddings_avg = output_embeddings_data[:-num_new_tokens].mean(dim=0, keepdim=True)

        input_embeddings_data[-num_new_tokens:] = input_embeddings_avg
        output_embeddings_data[-num_new_tokens:] = output_embeddings_avg

if tokenizer._pad_token is None:
    smart_tokenizer_and_embedding_resize(
        special_tokens_dict=dict(pad_token=DEFAULT_PAD_TOKEN),
        tokenizer=tokenizer,
        model=model,
    )

if isinstance(tokenizer, transformers.LlamaTokenizer):
    # LLaMA tokenizer may not have correct special tokens set.
    # Check and add them if missing to prevent them from being parsed into different tokens.
    # Note that these are present in the vocabulary.
    # Note also that `model.config.pad_token_id` is 0 which corresponds to `<unk>` token.
    print('Adding special tokens.')
    tokenizer.add_special_tokens({
            "eos_token": tokenizer.convert_ids_to_tokens(model.config.eos_token_id),
            "bos_token": tokenizer.convert_ids_to_tokens(model.config.bos_token_id),
            "unk_token": tokenizer.convert_ids_to_tokens(
                model.config.pad_token_id if model.config.pad_token_id != -1 else tokenizer.pad_token_id
            ),
    })

tokenizer

LlamaTokenizerFast(name_or_path='/content/model/', vocab_size=32000, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>', 'pad_token': '[PAD]'}, clean_up_tokenization_spaces=False),  added_tokens_decoder={
	0: AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	1: AddedToken("<s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	2: AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	32000: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}

In [8]:
from typing import Dict

data = data.map(lambda x: tokenizer(prompt.format_map(x)))

print("After tokenizing:", data)

Map:   0%|          | 0/3190 [00:00<?, ? examples/s]

After tokenizing: DatasetDict({
    train: Dataset({
        features: ['input', 'output', 'nlcommand', 'apiRef', 'apiRefPath', 'schema', 'schemaPath', 'input_ids', 'attention_mask'],
        num_rows: 3190
    })
})


In [9]:
from peft import get_peft_model, LoraConfig, prepare_model_for_kbit_training

target_modules = params.get("target_modules")
if target_modules:
    target_modules = [mod.strip() for mod in target_modules.split(",")]

modules_to_save = params.get("modules_to_save")
if modules_to_save:
    modules_to_save = [mod.strip() for mod in modules_to_save.split(",")]

lora_config2 = LoraConfig(
    r=16,
    lora_alpha=16,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=target_modules,
    modules_to_save = modules_to_save
)
print(lora_config2)

model = prepare_model_for_kbit_training(model)

# add LoRA adaptor
model = get_peft_model(model, lora_config2)
model.print_trainable_parameters()

LoraConfig(peft_type=<PeftType.LORA: 'LORA'>, auto_mapping=None, base_model_name_or_path=None, revision=None, task_type='CAUSAL_LM', inference_mode=False, r=16, target_modules=['q_proj', 'up_proj', 'o_proj', 'k_proj', 'down_proj', 'gate_proj', 'v_proj'], lora_alpha=16, lora_dropout=0.05, fan_in_fan_out=False, bias='none', modules_to_save=['embed_tokens', 'lm_head'], init_lora_weights=True, layers_to_transform=None, layers_pattern=None)


trainable params: 564,281,344 || all params: 7,040,552,960 || trainable%: 8.01473047935144


In [10]:
from utils import parse_training_args

training_args = parse_training_args(params)
training_args

TrainingArguments(
_n_gpu=4,
adafactor=False,
adam_beta1=0.9,
adam_beta2=0.999,
adam_epsilon=1e-08,
auto_find_batch_size=False,
bf16=False,
bf16_full_eval=False,
data_seed=None,
dataloader_drop_last=False,
dataloader_num_workers=0,
dataloader_pin_memory=True,
ddp_backend=None,
ddp_broadcast_buffers=None,
ddp_bucket_cap_mb=None,
ddp_find_unused_parameters=None,
ddp_timeout=1800,
debug=[],
deepspeed=None,
disable_tqdm=False,
dispatch_batches=None,
do_eval=False,
do_predict=False,
do_train=False,
eval_accumulation_steps=None,
eval_delay=0,
eval_steps=None,
evaluation_strategy=no,
fp16=True,
fp16_backend=auto,
fp16_full_eval=False,
fp16_opt_level=O1,
fsdp=[],
fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_grad_ckpt': False},
fsdp_min_num_params=0,
fsdp_transformer_layer_cls_to_wrap=None,
full_determinism=False,
gradient_accumulation_steps=4,
gradient_checkpointing=False,
greater_is_better=None,
group_by_length=False,
half_precision_backend=auto,
hub_always_push=False,
hub_model_

In [11]:
# data = data["train"].train_test_split(test_size=0.1)
# data


In [12]:
! mkdir -p {trained_model_path_lora}

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [13]:
trainer = transformers.Trainer(
    model=model,
    train_dataset=data["train"],
#    eval_dataset=data["test"],
    args=training_args,
    data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False),
)
model.config.use_cache = False  # silence the warnings. Please re-enable for inference!

checkpoint_path = Path("/content/artifacts/checkpoints")

# Only set resume_from_checkpoint True when directory exists and contains files
resume_from_checkpoint = checkpoint_path.is_dir() and any(checkpoint_path.iterdir())
if resume_from_checkpoint:
    print("Resuming from checkpoint:", list(checkpoint_path.rglob("")))
trainer.train(resume_from_checkpoint=resume_from_checkpoint)

You're using a LlamaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss
50,1.0693
100,0.5153
150,0.2737
200,0.1733
250,0.1188
300,0.0842
350,0.0658
400,0.0545
450,0.0484
500,0.0442


TrainOutput(global_step=2391, training_loss=0.07075354540412868, metrics={'train_runtime': 15162.9574, 'train_samples_per_second': 0.631, 'train_steps_per_second': 0.158, 'total_flos': 3.0420974601928704e+17, 'train_loss': 0.07075354540412868, 'epoch': 3.0})

In [14]:
model.save_pretrained(trained_model_path_lora)
model

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): LlamaForCausalLM(
      (model): LlamaModel(
        (embed_tokens): ModulesToSaveWrapper(
          (original_module): Embedding(32001, 4096)
          (modules_to_save): ModuleDict(
            (default): Embedding(32001, 4096)
          )
        )
        (layers): ModuleList(
          (0-31): 32 x LlamaDecoderLayer(
            (self_attn): LlamaAttention(
              (q_proj): Linear(
                in_features=4096, out_features=4096, bias=False
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.05, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=4096, out_features=16, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=16, out_features=4096, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_emb

In [15]:
! ls -lash {trained_model_path_lora}

total 1.2G
 512 -rw-r--r-- 1 root 3003   88 Oct 22 04:52 README.md
1.0K -rw-r--r-- 1 root 3003  550 Oct 22 04:53 adapter_config.json
1.2G -rw-r--r-- 1 root 3003 1.2G Oct 22 04:52 adapter_model.bin


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [16]:
model = model.merge_and_unload().half()
model

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(32001, 4096)
    (layers): ModuleList(
      (0-31): 32 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (v_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (o_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=4096, out_features=11008, bias=False)
          (up_proj): Linear(in_features=4096, out_features=11008, bias=False)
          (down_proj): Linear(in_features=11008, out_features=4096, bias=False)
          (act_fn): SiLUActivation()
        )
        (input_layernorm): LlamaRMSNorm()
        (post_attention_layernorm): LlamaRMSNorm()
      )
    )
    (norm): LlamaRMSNorm()
  )
  (lm_

In [17]:
! ls -l {trained_model_path}

total 0
drwxr-xr-x 1 root 3003 0 Oct 22 00:39 checkpoints
drwxr-xr-x 1 root 3003 0 Oct 22 00:39 lora
drwxr-xr-x 1 root 3003 0 Oct 22 00:33 src


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [18]:
model.save_pretrained(trained_model_path)
tokenizer.save_pretrained(trained_model_path)

('/content/artifacts/tokenizer_config.json',
 '/content/artifacts/special_tokens_map.json',
 '/content/artifacts/tokenizer.model',
 '/content/artifacts/added_tokens.json',
 '/content/artifacts/tokenizer.json')

In [19]:
! ls -lash {trained_model_path}

total 13G
 512 -rw-r--r-- 1 root 3003   21 Oct 22 04:56 added_tokens.json
   0 drwxr-xr-x 1 root 3003    0 Oct 22 00:39 checkpoints
1.0K -rw-r--r-- 1 root 3003  648 Oct 22 04:54 config.json
 512 -rw-r--r-- 1 root 3003  183 Oct 22 04:54 generation_config.json
   0 drwxr-xr-x 1 root 3003    0 Oct 22 00:39 lora
9.3G -rw-r--r-- 1 root 3003 9.3G Oct 22 04:54 pytorch_model-00001-of-00002.bin
3.3G -rw-r--r-- 1 root 3003 3.3G Oct 22 04:56 pytorch_model-00002-of-00002.bin
 24K -rw-r--r-- 1 root 3003  24K Oct 22 04:56 pytorch_model.bin.index.json
1.0K -rw-r--r-- 1 root 3003  552 Oct 22 04:56 special_tokens_map.json
   0 drwxr-xr-x 1 root 3003    0 Oct 22 00:33 src
1.8M -rw-r--r-- 1 root 3003 1.8M Oct 22 04:56 tokenizer.json
489K -rw-r--r-- 1 root 3003 489K Oct 22 04:56 tokenizer.model
1.5K -rw-r--r-- 1 root 3003 1.1K Oct 22 04:56 tokenizer_config.json


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [20]:
from huggingface_hub import HfApi
import shutil

tokenizer_model_path_base = Path(model_path) / "tokenizer.model"
tokenizer_model_path_trained = Path(trained_model_path) / "tokenizer.model"
if tokenizer_model_path_base.exists() and not tokenizer_model_path_trained.exists():
    shutil.copy(tokenizer_model_path_base, tokenizer_model_path_trained)

repo_id = params.get("push_to_hub")
if repo_id:
    model.push_to_hub(repo_id)
    tokenizer.push_to_hub(repo_id)
    hf_api = HfApi()
    # Upload tokenizer.model if it was in base model
    if tokenizer_model_path_base.exists():
        hf_api.upload_file(
            path_or_fileobj=tokenizer_model_path_base,
            path_in_repo=tokenizer_model_path_base.name,
            repo_id=repo_id,
        )
    logs_path = Path("/content/artifacts/src/train.ipynb")
    if logs_path.exists():
        hf_api.upload_file(
            path_or_fileobj=logs_path,
            path_in_repo=logs_path.name,
            repo_id=repo_id,
        )


Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

pytorch_model-00001-of-00002.bin:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

pytorch_model-00002-of-00002.bin:   0%|          | 0.00/3.50G [00:00<?, ?B/s]