ChangeIsKey
/

llama-7b-lexical-substitution

PEFT

Safetensors

Model card Files Files and versions

xet

Community

FrancescoPeriti committed on Aug 30, 2024

Commit

9615074

verified ·

1 Parent(s): 517ddd6

Update README.md

Browse files

Files changed (1) hide show

README.md +61 -1

README.md CHANGED Viewed

@@ -22,10 +22,13 @@ The following `bitsandbytes` quantization config was used during training:
 ## Get it started
 ```python
-from peft import PeftModel, PeftConfig
 from huggingface_hub import login
 from transformers import AutoModelForCausalLM, AutoTokenizer, AddedToken
 login("[YOUR HF TOKEN HERE FOR USING LLAMA]")
 config = PeftConfig.from_pretrained("ChangeIsKey/llama-7b-lexical-substitution")
 base_model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-hf", device_map='auto')
@@ -39,4 +42,61 @@ base_model.resize_token_embeddings(len(tokenizer))
 model = PeftModel.from_pretrained(base_model, "ChangeIsKey/llama-7b-lexical-substitution")
 model.eval()
 ```

 ## Get it started
 ```python
+import torch
+from datasets import Dataset
 from huggingface_hub import login
+from peft import PeftModel, PeftConfig
 from transformers import AutoModelForCausalLM, AutoTokenizer, AddedToken
+# load model and tokenizer
 login("[YOUR HF TOKEN HERE FOR USING LLAMA]")
 config = PeftConfig.from_pretrained("ChangeIsKey/llama-7b-lexical-substitution")
 base_model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-hf", device_map='auto')
 model = PeftModel.from_pretrained(base_model, "ChangeIsKey/llama-7b-lexical-substitution")
 model.eval()
+# let's use this model
+def formatting_func(records):  # build one prompt string per record, with the target span wrapped in ** **
+    text_batch = []
+    for i in range(len(records['example'])):  # the length of the 'example' column is used as the record count
+        example = records[i]['example']  # integer row access; assumes a datasets.Dataset-like object (a plain dict of lists would fail here)
+        start, end = records[i]['start'], records[i]['end']  # slice offsets of the target inside example
+        target = f'**{example[start:end]}**'
+        input_text = f'{example[:start]} {target} {example[end:]}'  # NOTE(review): always adds a space on each side of the target, even when example already has one; presumably the prompt format the adapter expects, confirm
+        text_batch.append(f"{input_text}<|answer|>")  # every prompt ends with the <|answer|> marker
+    return text_batch
+def tokenization(dataset):  # format the records as prompts and tokenize them into one padded batch of PyTorch tensors
+    return tokenizer(formatting_func(dataset),  # `tokenizer` is not defined in the lines shown here; presumably it comes from the part of the snippet the diff elides
+                       truncation=True,
+                       max_length=512,  # together with truncation=True, limits each prompt to 512 tokens
+                       padding=True,
+                       return_tensors="pt").to("cuda")  # device is hard-coded; NOTE(review): the model is loaded with device_map='auto', confirm the inputs land on the matching device
+# a toy example
+examples = [{'example': 'The traffic jam on the highway made everyone late for work.', 'start': 12, 'end': 15},  # example[start:end] is the target word ("jam")
+            {'example': 'I spread a generous layer of strawberry jam on my toast this morning', 'start': 40, 'end': 43}]
+dataset = Dataset.from_list(examples)
+batch_size = 32
+output = list()  # collects one comma-separated substitutes string per input row, in dataset order
+with torch.no_grad():
+    for i in range(0, len(dataset), batch_size):
+        model_input = tokenization(dataset.select(range(i, min(dataset.num_rows, i + batch_size))))  # rows i .. i+batch_size-1, clipped at the end of the dataset
+        output_ids = model.generate(**model_input,
+                                    do_sample=True,
+                                    num_return_sequences=1,  # one generation per prompt, so answers line up with dataset rows
+                                    max_new_tokens=30,
+                                    temperature=0.00001,  # NOTE(review): near-zero temperature with sampling enabled; presumably meant to be effectively greedy, confirm
+                                    repetition_penalty=1/0.85,
+                                    top_k=40,
+                                    top_p=0.1)
+        answers = tokenizer.batch_decode(output_ids, skip_special_tokens=False)  # markers are kept in the decoded text because the parsing below splits on them (assuming they are registered as special/added tokens)
+        for answer in answers:
+            answer = " ".join(answer.split('<|answer|>')[1:])  # drop the prompt: keep what follows the first <|answer|>; any later markers become spaces
+            substitutes = [s.strip() for s in answer.split('<|end|>')[:-1] if s.strip() != ""]  # non-empty pieces terminated by <|end|>; text after the last <|end|> is discarded
+            output.append(", ".join(substitutes))
+# output
+dataset = dataset.add_column('substitutes', output)  # requires len(output) == number of rows
+for row in dataset:
+    target = row['example'][row['start']:row['end']]
+    print(f"Target: {target}\nExample: {row['example']}\nSubstitutes: {row['substitutes']}\n")
 ```