from unsloth import FastLanguageModel
import os
import sys

import torch

# The model to evaluate is passed on the command line,
# e.g. a Hugging Face repo id or a local path.
model_name_input = sys.argv[1]

max_seq_length = 4096
dtype = None         # None lets Unsloth auto-detect (bfloat16 on Ampere+, else float16)
load_in_4bit = True  # load quantized weights to reduce VRAM usage
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = model_name_input,
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
)
alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{}

### Input:
{}

### Response:
{}"""
EOS_TOKEN = tokenizer.eos_token  # must be appended so the model learns when to stop

def formatting_prompts_func(examples):
    instructions = examples["instruction"]
    inputs = examples["input"]
    outputs = examples["output"]
    texts = []
    for instruction, input_text, output in zip(instructions, inputs, outputs):
        text = alpaca_prompt.format(instruction, input_text, output) + EOS_TOKEN
        texts.append(text)
    return {"text": texts}
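# Sanity-check sketch (hypothetical toy batch, not taken from data.json):
#   formatting_prompts_func({"instruction": ["Summarise this function"],
#                            "input": ["int add(int a, int b) { return a + b; }"],
#                            "output": ["Adds two integers."]})
# returns {"text": ["Below is an instruction ...\n### Response:\nAdds two integers.<eos>"]}
# (the trailing token is whatever tokenizer.eos_token is for the loaded model)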
# Load the dataset and add the formatted "text" column.
from datasets import load_dataset

dataset = load_dataset("json", data_files="data.json", split="train")
dataset = dataset.map(formatting_prompts_func, batched=True)
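# data.json is assumed, from the fields accessed above, to hold Alpaca-style
# records, e.g.:
#   [
#     {"instruction": "...", "input": "...", "output": "..."},
#     ...
#   ]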
# Put the model into Unsloth's faster inference mode before generating.
FastLanguageModel.for_inference(model)

# Capture baseline generations for the first few dataset rows, before fine-tuning.
samples = []
sample_size = 10
for x in range(sample_size):
    instruction = dataset[x]["instruction"]
    input_text = dataset[x]["input"]
    output = ""  # leave the response slot empty so the model completes it
    text = alpaca_prompt.format(instruction, input_text, output)
    sample = tokenizer([text], return_tensors="pt").to("cuda")
    out = model.generate(**sample, max_new_tokens=4096, use_cache=True)
    out = tokenizer.batch_decode(out)
    samples.append(out[0])
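# Note: for decoder-only models, generate() returns the prompt tokens followed by
# the new tokens, so each decoded sample above still contains the full prompt.
# A sketch (not part of the original flow) for keeping only the completion:
#   prompt_len = sample["input_ids"].shape[1]
#   completion = tokenizer.batch_decode(out[:, prompt_len:], skip_special_tokens=True)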
# A fixed decompiler-style snippet (Hex-Rays-like output) used as one extra probe
# that is not drawn from the dataset.
code = '''int __fastcall sub_75C80(int a1, int a2)
{
  int result; // r0
  _DWORD *i; // r3

  result = a2 - *(_DWORD *)(a1 + 12);
  for ( i = *(_DWORD **)(a1 + 48); i; i = (_DWORD *)*i )
  {
    if ( i[2] < result )
      result = i[2];
  }
  return result;
}'''
# Reuse the instruction from the last dataset sample for the fixed-snippet probe.
instruction = dataset[sample_size - 1]["instruction"]
output = ""
text = alpaca_prompt.format(instruction, code, output)
sample = tokenizer([text], return_tensors="pt").to("cuda")
out = model.generate(**sample, max_new_tokens=4096, use_cache=True)
out = tokenizer.batch_decode(out)
samples.append(out[0])

print('Capturing pre-training generation samples')
os.makedirs("results", exist_ok=True)  # open() below fails if results/ is missing
with open(f'results/eval_log_{model_name_input.replace("/", "_")}', 'w') as log:
    for r in samples:
        log.write(r + "\n\n")  # blank line between samples so entries stay separable
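# Usage sketch (the script filename is illustrative; the model id is any
# Hugging Face repo or local path Unsloth can load):
#   python capture_baselines.py unsloth/llama-3-8b-bnb-4bit
# The decoded samples land in results/eval_log_unsloth_llama-3-8b-bnb-4bit.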