---
library_name: peft
---

## Training procedure

The following `bitsandbytes` quantization config was used during training:

- quant_method: bitsandbytes
- load_in_8bit: False
- load_in_4bit: True
- llm_int8_threshold: 6.0
- llm_int8_skip_modules: None
- llm_int8_enable_fp32_cpu_offload: False
- llm_int8_has_fp16_weight: False
- bnb_4bit_quant_type: nf4
- bnb_4bit_use_double_quant: False
- bnb_4bit_compute_dtype: float16
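
If you want to reproduce this setup, the list above maps onto a `transformers` `BitsAndBytesConfig` roughly as follows (a sketch; the field names mirror the config entries above):

```python
import torch
from transformers import BitsAndBytesConfig

# 4-bit NF4 quantization with fp16 compute, as used during training
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=False,
    bnb_4bit_compute_dtype=torch.float16,
)
```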

### Framework versions

- PEFT 0.5.0
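
To match the training environment, pin the adapter library, e.g. `pip install peft==0.5.0`; the code below additionally assumes `transformers`, `datasets`, `accelerate`, `bitsandbytes`, and `huggingface_hub` are installed (an assumption, since the card pins only PEFT).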

## Getting started
```python
import torch
from datasets import Dataset
from huggingface_hub import login
from peft import PeftModel, PeftConfig
from transformers import AutoModelForCausalLM, AutoTokenizer, AddedToken

# Log in with a token that has access to the gated Llama 2 weights,
# then load the adapter config and the base model
login("[YOUR HF TOKEN HERE FOR USING LLAMA]")
config = PeftConfig.from_pretrained("ChangeIsKey/llama-7b-lexical-substitution")
base_model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-hf", device_map='auto')

# The adapter relies on extra special tokens that mark the answer span,
# so the tokenizer and the embedding matrix must be extended to match
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf", use_fast=False, trust_remote_code=True)
tokenizer.add_special_tokens({"additional_special_tokens": [AddedToken("<|s|>"), AddedToken("<|answer|>"), AddedToken("<|end|>")]})
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})
tokenizer.padding_side = 'left'  # left-padding for decoder-only generation
base_model.resize_token_embeddings(len(tokenizer))

# Attach the PEFT adapter and switch to inference mode
model = PeftModel.from_pretrained(base_model, "ChangeIsKey/llama-7b-lexical-substitution")
model.eval()

# Format a batch: wrap the target span in **...** and append the <|answer|> token
def formatting_func(records):
    text_batch = []

    for i in range(len(records['example'])):
        example = records[i]['example']
        start, end = records[i]['start'], records[i]['end']

        target = f'**{example[start:end]}**'
        input_text = f'{example[:start]} {target} {example[end:]}'
        text_batch.append(f"{input_text}<|answer|>")

    return text_batch

def tokenization(dataset):
    return tokenizer(formatting_func(dataset),
                     truncation=True,
                     max_length=512,
                     padding=True,
                     return_tensors="pt").to("cuda")

# A toy example: `start` and `end` are character offsets of the target word
examples = [{'example': 'The traffic jam on the highway made everyone late for work.', 'start': 12, 'end': 15},
            {'example': 'I spread a generous layer of strawberry jam on my toast this morning', 'start': 40, 'end': 43}]
dataset = Dataset.from_list(examples)

batch_size = 32
output = list()

with torch.no_grad():
    for i in range(0, len(dataset), batch_size):
        model_input = tokenization(dataset.select(range(i, min(dataset.num_rows, i + batch_size))))

        output_ids = model.generate(**model_input,
                                    do_sample=True,
                                    num_return_sequences=1,
                                    max_new_tokens=30,
                                    temperature=0.00001,
                                    repetition_penalty=1/0.85,
                                    top_k=40,
                                    top_p=0.1)

        # Keep special tokens so the output can be split on <|answer|> and <|end|>
        answers = tokenizer.batch_decode(output_ids, skip_special_tokens=False)

        for answer in answers:
            answer = " ".join(answer.split('<|answer|>')[1:])
            substitutes = [s.strip() for s in answer.split('<|end|>')[:-1] if s.strip() != ""]
            output.append(", ".join(substitutes))

# Print each target word with its generated substitutes
dataset = dataset.add_column('substitutes', output)
for row in dataset:
    target = row['example'][row['start']:row['end']]
    print(f"Target: {target}\nExample: {row['example']}\nSubstitutes: {row['substitutes']}\n")
```
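
The snippet above loads the base model in full precision. If GPU memory is tight, a possible variation (a sketch, not part of the original recipe) is to load the base weights with the same 4-bit config the adapter was trained with and then attach the adapter exactly as before:

```python
import torch
from transformers import AutoModelForCausalLM, BitsAndBytesConfig

# Hypothetical memory-saving variant: 4-bit base weights, same config as training
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
)
base_model = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Llama-2-7b-hf",
    quantization_config=bnb_config,
    device_map="auto",
)
# ...then extend the tokenizer, resize the embeddings, and call
# PeftModel.from_pretrained as in the snippet above
```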