| import torch |
| |
| import os |
| import yaml |
| from peft import LoraConfig, get_peft_model_state_dict |
| from torch.utils.data import DataLoader |
| import time |
|
|
| from typing import List, Tuple |
|
|
| import json |
| import re |
| import string |
| import copy |
| from dataclasses import field, dataclass, asdict |
| from typing import Sequence, Literal, Dict |
|
|
| import transformers |
| from transformers import AutoModelForCausalLM, AutoConfig, AutoTokenizer |
| from transformers import Trainer |
| from transformers.modeling_utils import * |
| from transformers.trainer import _is_peft_model |
| from transformers.models.auto.modeling_auto import MODEL_FOR_CAUSAL_LM_MAPPING_NAMES |
| from transformers.data.data_collator import DataCollator |
|
|
| from transformers.training_args import TrainingArguments |
| from transformers.tokenization_utils_base import PreTrainedTokenizerBase |
| from transformers.trainer_callback import TrainerCallback |
| from transformers.trainer_utils import EvalPrediction |
| from torch.utils.data import Dataset, IterableDataset |
| from datasets import load_dataset |
| |
| |
| |
| |
| from smpeft import PeftModel |
| from .config import MainConfig, convert_to_trainer_args |
| import draccus |
| import argparse |
| |
| import numpy as np |
| import random |
| import transformers |
|
|
| import argparse |
| from vllm import LLM, SamplingParams |
| from datetime import datetime |
|
|
| from .utils import set_seed_all |
|
|
| from multiprocessing import Process, Queue |
| import gc |
|
|
# Presumably the label id masked out of the training loss; it is not
# referenced anywhere in the visible code — TODO confirm it is still needed.
IGNORE_INDEX = -100
# Special-token strings; not referenced in the visible code.
# NOTE(review): BOS and UNK are the same string as EOS ("</s>") — confirm this
# is intended and not a copy/paste or markup-stripping artifact.
DEFAULT_PAD_TOKEN = "[PAD]"
DEFAULT_EOS_TOKEN = "</s>"
DEFAULT_BOS_TOKEN = "</s>"
DEFAULT_UNK_TOKEN = "</s>"


# Cap on generated tokens per prompt; passed as `max_tokens` to SamplingParams
# in main().
MAX_NEW_TOKENS = 50
# Prompt skeleton filled in by format_test_prompt(): `instruction` is the task
# text and `input_section` is either an "### Input:" section or "".
PROMPT_TEMPLATE = (
    "Below is an instruction that describes a task. "
    "Write a response that appropriately completes the request.\n\n"
    "### Instruction:\n{instruction}\n\n{input_section}\n"
    "### Response:\n"
)
|
|
|
|
def format_test_prompt(examples):
    """
    Render one prompt per example of a batched, column-oriented mapping.

    'examples' is a dictionary of lists: 'instruction' is required, 'input'
    is optional and treated as all-empty when the key is missing.

    Returns a dictionary with a single 'prompt' key holding the rendered
    prompts, in the same order as the instructions.
    """
    instructions = examples['instruction']
    inputs = examples.get('input', [""] * len(instructions))

    def _render(instr, inp):
        # The "### Input:" section is emitted only for a non-blank input.
        input_section = (
            f"### Input:\n{inp}\n\n" if inp and str(inp).strip() else ""
        )
        return PROMPT_TEMPLATE.format(
            instruction=instr,
            input_section=input_section,
        )

    return {
        "prompt": [_render(instr, inp) for instr, inp in zip(instructions, inputs)]
    }
|
|
|
|
| |
| |
| |
| |
| |
| |
| |
| |
| |
|
|
# Explicit "the correct answer is <label>" phrasing; tried first for every task.
_EXPLICIT_ANSWER_RE = re.compile(
    r"the correct answer is\s+(answer\d+|solution\d+|option\d+|ending\d+|true|false)"
)

# Fallback label patterns per task name; 'default' covers every other task.
_TASK_ANSWER_RES = {
    'boolq': re.compile(r'true|false'),
    'piqa': re.compile(r'solution1|solution2'),
    'hellaswag': re.compile(r'ending1|ending2|ending3|ending4'),
    'winogrande': re.compile(r'option1|option2'),
    'default': re.compile(r'answer1|answer2|answer3|answer4|answer5'),
}


def extract_answer(test_target, sentence: str) -> str:
    """
    Pull the predicted answer label out of a generated sentence.

    The sentence is lower-cased and stripped first. If it contains
    "the correct answer is <label>", that label is returned regardless of
    the task. Otherwise the first occurrence of a label valid for
    `test_target` is returned.

    Args:
        test_target: Task name ('boolq', 'piqa', 'hellaswag', 'winogrande');
            any other value falls back to the 'answer1'..'answer5' labels.
        sentence: Raw generated text.

    Returns:
        The lower-case label, or "" when no label is found.
    """
    normalized = sentence.lower().strip()

    explicit = _EXPLICIT_ANSWER_RE.search(normalized)
    if explicit:
        return explicit.group(1)

    task_re = _TASK_ANSWER_RES.get(test_target, _TASK_ANSWER_RES['default'])
    fallback = task_re.search(normalized)
    return fallback.group(0) if fallback else ""
|
|
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
def score_outputs(outputs, test_target_name, ground_truths, out_json):
    """
    Score generations by exact match and write a per-task JSON report.

    Args:
        outputs: Generation results; each item must expose
            `.outputs[0].text` (main() passes the return value of
            `llm.generate`).
        test_target_name: Task name; selects the answer pattern and names
            the report file.
        ground_truths: Gold labels, indexed in the same order as `outputs`.
        out_json: Directory for the report; created if missing.

    Returns:
        Accuracy in percent over `len(ground_truths)`, or -1 when
        `ground_truths` is empty.
    """
    results = []
    total_em = 0
    total_samples = len(ground_truths)

    print("Calculating scores...")
    for i, output in enumerate(outputs):
        prediction = output.outputs[0].text.strip()
        extracted_pred = extract_answer(test_target_name, prediction)
        if extracted_pred in ("unknown", ""):
            print(f'Please check, task: {test_target_name}, idx {i}, pred {prediction}')
        # str() keeps non-string labels (e.g. JSON booleans loaded as bool)
        # from failing on .lower(); True/False become "true"/"false".
        gt = str(ground_truths[i]).lower().strip()

        is_correct = (extracted_pred == gt)
        if is_correct:
            total_em += 1
        results.append({
            "id": i,
            "prediction": prediction,
            "extracted_pred": extracted_pred,
            "ground_truths": gt,
            "is_correct": is_correct,
        })

    avg_acc = 100.0 * total_em / total_samples if total_samples > 0 else -1

    print("\n" + "="*40)
    print(f"FINAL RESULTS (vLLM) {test_target_name}")
    print("="*40)
    print(f"Total Samples: {total_samples}")
    print(f"Exact Match (EM): {avg_acc:.2f}%")
    print("="*40)

    os.makedirs(out_json, exist_ok=True)
    save_file = os.path.join(out_json, f'{test_target_name}.json')
    with open(save_file, "w", encoding="utf-8") as f:
        json.dump({
            "metrics": {"EM": avg_acc},
            "details": results
        }, f, indent=2, ensure_ascii=False)
    return avg_acc
| |
def merge_process(queue, mainCfg: MainConfig, force_to_merge: bool = False):
    """
    Merge the adapter into the base model on CPU and report where it was saved.

    Intended to run as a `multiprocessing.Process` target. The adapter is
    read from `<merge_adapter_path>/ft2` when set, otherwise from
    `<adapter_path>/ft2`. The merged model is written to `<base>/merge`,
    where `<base>` is `merge_output_path` when set, otherwise `adapter_path`.
    Merging is skipped when that directory already holds `.bin` or
    `.safetensors` files, unless `force_to_merge` is true.

    Args:
        queue: Receives exactly one item: `(output_path, out_json)` on
            success, or `None` on failure (the traceback is printed).
        mainCfg: Run configuration; only `mainCfg.model` is read here.
        force_to_merge: Re-merge even if merged weights already exist.
    """
    try:
        model_cfg = mainCfg.model
        model_name = model_cfg.model_name

        if model_cfg.merge_adapter_path is not None:
            adapter = model_cfg.merge_adapter_path + "/ft2"
            print(f'Merging... from mainCfg.model.merge_adapter_path {adapter}')
        elif model_cfg.adapter_path is not None:
            adapter = model_cfg.adapter_path + "/ft2"
            print(f'From mainCfg.model.adapter_path {adapter}')
        else:
            raise KeyError('No adapter path')

        if model_cfg.merge_output_path is not None:
            out_json = model_cfg.merge_output_path
        else:
            out_json = model_cfg.adapter_path
        if out_json is None:
            # Reached when only merge_adapter_path is configured; fail with a
            # clear message instead of a TypeError from None + str.
            raise KeyError('No output path: set model.merge_output_path or model.adapter_path')
        output_path = out_json + "/merge"

        has_weights = os.path.exists(output_path) and any(
            name.endswith((".bin", ".safetensors"))
            for name in os.listdir(output_path)
        )

        if not has_weights or force_to_merge:
            model = AutoModelForCausalLM.from_pretrained(model_name, device_map="cpu", low_cpu_mem_usage=True)
            tokenizer = AutoTokenizer.from_pretrained(model_name, device_map='cpu')

            model = PeftModel.from_pretrained(model, adapter)
            model = model.merge_and_unload()

            model.save_pretrained(output_path, safe_serialization=True, max_shard_size="10GB")
            tokenizer.save_pretrained(output_path)
            del model
            del tokenizer
            gc.collect()
            torch.cuda.empty_cache()

            print(f'The end of merging, from {adapter},\n \t \t to {output_path}')
        else:
            print("No need to merge")
        queue.put((output_path, out_json))
    except Exception as e:
        import traceback
        print(traceback.format_exc())
        print(f"Error in merge_process: {e}")
        # main() treats a None payload as "merging failed"; a traceback string
        # would instead break its `model_path, out_json = result` unpacking.
        queue.put(None)
|
|
def _wait_for_merge_result(proc, result_queue, poll_seconds=5.0):
    """Return the worker's queue payload, or None if it exited without sending one."""
    from queue import Empty  # raised by multiprocessing.Queue.get on timeout

    while True:
        try:
            return result_queue.get(timeout=poll_seconds)
        except Empty:
            if proc.is_alive():
                continue
            # The worker is gone (e.g. killed); take one last look in case it
            # sent its payload just before exiting, then give up.
            try:
                return result_queue.get(timeout=1.0)
            except Empty:
                return None


@draccus.wrap()
def main(mainCfg: MainConfig):
    """
    Merge the adapter, generate with vLLM on each test set and save accuracies.

    Steps:
      1. Run `merge_process` in a child process and wait for its result
         (presumably so the merge's memory is released before vLLM starts).
      2. Load the merged model with vLLM.
      3. For every name in `mainCfg.infer.datasets`, read
         `./dataset/<name>/test.json`, build prompts, generate and score.
      4. Write `<base>/results/FINAL.json` with per-task accuracies and
         their mean under 'average_score' (-1 when no task finished).

    Raises:
        RuntimeError: The merge worker failed or exited without a result.
        FileNotFoundError: The merged model directory does not exist.
    """
    print('='*120)
    set_seed_all(mainCfg.seed)

    result_queue = Queue()
    merge_worker = Process(target=merge_process, args=(result_queue, mainCfg, False))
    merge_worker.start()
    # Read the queue before join(); polling avoids hanging forever when the
    # worker dies without ever putting a result.
    result = _wait_for_merge_result(merge_worker, result_queue)
    merge_worker.join()

    if not isinstance(result, tuple):
        # None, or a traceback string sent by the worker on failure.
        detail = f"\n{result}" if result else ""
        raise RuntimeError(f"Model merging failed.{detail}")
    model_path, out_json = result

    if not os.path.exists(model_path):
        raise FileNotFoundError(f"Model directory does not exist: {model_path}")
    print(f"Verified model path: {os.path.abspath(model_path)}")
    out_json = out_json + "/results"
    print('output json path: ', out_json)

    llm = LLM(
        model=model_path,
        dtype="bfloat16",
        gpu_memory_utilization=0.90,
        max_model_len=mainCfg.infer.infer_max_seq_length
    )
    # Generation is cut at the first newline or section marker.
    stop_tokens = ["Instruction:", "Response:", "\n", "###", "Passage:", "Question:"]
    sampling_params = SamplingParams(temperature=0, top_p=1, max_tokens=MAX_NEW_TOKENS, stop=stop_tokens)

    start_time0 = datetime.now()
    final_res = {}
    all_task_acc = []
    try:
        for test_target_name in mainCfg.infer.datasets:
            print("Loading dataset...", test_target_name)

            if mainCfg.infer.is_json:
                data_files = f'./dataset/{test_target_name}/test.json'
                if not os.path.exists(data_files):
                    raise FileNotFoundError(f"can not find dataset file : {data_files}")
                test_dataset = load_dataset("json", data_files=data_files, split='train')
            else:
                raise KeyError('Not implemented yet')

            ground_truths = test_dataset['answer']
            test_dataset = test_dataset.map(
                format_test_prompt,
                batched=True,
                batch_size=10000,
                num_proc=8,
                desc="Formatting prompts"
            )
            prompts = test_dataset['prompt']

            print(f"Generating for {len(prompts)} samples...")

            start_time = datetime.now()
            outputs = llm.generate(prompts, sampling_params)
            end_time = datetime.now()
            print('end time: ', end_time.strftime("%Y-%m-%d %H:%M:%S"), '| task, ', test_target_name, ' duration: ', end_time - start_time)

            avg_acc = score_outputs(outputs=outputs, test_target_name=test_target_name,
                                    out_json=out_json, ground_truths=ground_truths)

            final_res[test_target_name] = avg_acc
            all_task_acc.append(avg_acc)

            del prompts
            del outputs
            del test_dataset
            gc.collect()
    except Exception as e:
        # Best effort: keep the results of the tasks that finished, but show
        # the full traceback so the failure is diagnosable.
        import traceback
        traceback.print_exc()
        print(f"Error in the for loop over datasets: {e}")

    print('all_task_acc', all_task_acc)
    # No finished task: report -1 (the same "nothing scored" sentinel that
    # score_outputs uses) instead of dividing by zero.
    avg_score = sum(all_task_acc) / len(all_task_acc) if all_task_acc else -1
    final_res['average_score'] = avg_score
    # The results directory is otherwise only created by score_outputs, which
    # may never have run.
    os.makedirs(out_json, exist_ok=True)
    save_file = os.path.join(out_json, 'FINAL.json')
    with open(save_file, "w", encoding="utf-8") as f:
        json.dump(final_res, f, indent=2, ensure_ascii=False)
    print(f"Results saved to {save_file}, overall score: {avg_score}")

    end_time0 = datetime.now()
    print('end time: ', end_time0.strftime("%Y-%m-%d %H:%M:%S"), '| duration: ', end_time0 - start_time0)
| |
if __name__ == "__main__":
    # Script entry point. main() is called without arguments because it is
    # wrapped by @draccus.wrap(), which is expected to supply `mainCfg`
    # (presumably parsed from the command line — confirm against draccus docs).
    main()
| |