import torch
import json
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
from datasets import load_dataset
from tqdm import tqdm
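
# Load the fine-tuned LLaMIPA adapter in half precision; with peft installed,
# transformers resolves the base model named in adapter_config.json automatically.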
| device_map = "auto" | |
| model = AutoModelForCausalLM.from_pretrained( | |
| "/path/to/llamipa/adapter", | |
| return_dict=True, | |
| torch_dtype=torch.float16, | |
| device_map=device_map) | |

tokenizer = AutoTokenizer.from_pretrained("/path/to/meta-llama3-8b/", add_eos_token=True)
# Use a pad id distinct from eos so padding is never mistaken for end-of-sequence.
tokenizer.pad_token_id = tokenizer.eos_token_id + 1
tokenizer.padding_side = "right"

pipe = pipeline(
    task="text-generation",
    model=model,
    tokenizer=tokenizer,
    pad_token_id=tokenizer.pad_token_id,
    max_new_tokens=100,
)
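
# Note: pipe(...) returns the prompt together with the continuation; passing
# return_full_text=False in the call would keep only the newly generated text.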

test_dataset = load_dataset("json", data_files={"test": "/path/to/parser_test_15_gold.jsonl"})["test"]
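
# Each test record is expected to expose a "sample" field holding the excerpt
# to be parsed, which formatting_prompts_func wraps in the instruction prompt.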
def formatting_prompts_func(example):
    output_texts = []
    for i in range(len(example["sample"])):
        text = f"<|begin_of_text|>Identify the discourse structure (DS) for the new turn in the following excerpt :\n {example['sample'][i]}\n ### DS:"
        output_texts.append(text)
    return output_texts

test_texts = formatting_prompts_func(test_dataset)
print("Test Length:", len(test_texts))

# Generate a DS prediction for each prompt and log the full output to disk.
with open("/path/to/test-output-file.txt", "w") as f:
    for text in tqdm(test_texts):
        print(text)
        print(pipe(text)[0]["generated_text"], file=f)
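
# A minimal post-processing sketch (an assumption, not part of the original
# script): recover just the predicted structure by splitting each generation
# on the "### DS:" marker that closes every prompt.
def extract_ds(generated_text):
    # Everything after the marker is the model's DS prediction.
    return generated_text.split("### DS:", 1)[-1].strip()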