| import torch |
| |
| import os |
| import yaml |
| from peft import LoraConfig, get_peft_model_state_dict |
| from torch.utils.data import DataLoader |
| import time |
|
|
| from typing import List, Tuple |
|
|
| import json |
| import re |
| import string |
| import copy |
| from dataclasses import field, dataclass, asdict |
| from typing import Sequence, Literal, Dict |
|
|
| import transformers |
| from transformers import AutoModelForCausalLM, AutoConfig, AutoTokenizer |
| from transformers import Trainer |
| from transformers.modeling_utils import * |
| from transformers.trainer import _is_peft_model |
| from transformers.models.auto.modeling_auto import MODEL_FOR_CAUSAL_LM_MAPPING_NAMES |
| from transformers.data.data_collator import DataCollator |
|
|
| from transformers.training_args import TrainingArguments |
| from transformers.tokenization_utils_base import PreTrainedTokenizerBase |
| from transformers.trainer_callback import TrainerCallback |
| from transformers.trainer_utils import EvalPrediction |
| from torch.utils.data import Dataset, IterableDataset |
| from datasets import load_dataset |
| |
| |
| |
| |
| from smpeft import PeftModel |
| from .config import MainConfig, convert_to_trainer_args |
| import draccus |
| import argparse |
| |
| import numpy as np |
| import random |
| import transformers |
|
|
| import argparse |
| from vllm import LLM, SamplingParams |
| from datetime import datetime |
|
|
| from .utils import set_seed_all |
|
|
| from multiprocessing import Process, Queue |
|
|
# Special-token and label constants. None of them is referenced elsewhere in
# this script; presumably they mirror the training code -- confirm before
# removing.
IGNORE_INDEX = -100
DEFAULT_PAD_TOKEN = "[PAD]"
DEFAULT_EOS_TOKEN = DEFAULT_BOS_TOKEN = DEFAULT_UNK_TOKEN = "</s>"

# Upper bound on the number of tokens generated per prompt.
MAX_NEW_TOKENS = 50

# Prompt wrapped around every "Passage/Question" instruction.
# NOTE(review): the wording (including its typos) is kept verbatim --
# presumably it has to match the prompt the adapter was fine-tuned with;
# confirm before editing the text.
PROMPT_TEMPLATE = "\n\n".join((
    "Below is an passage followed by a coresponding question that describes a task "
    "Write a response that appropriately completes the request with your answer.",
    "### Instruction:\n{instruction}",
    "### Response:",
))
|
|
| |
| |
| |
| |
| |
| |
| |
|
|
def set_deterministic_seed(seed=42):
    """Apply one seed to every random number generator used by this script.

    The seed is handed, in order, to Python's ``random``, NumPy, PyTorch
    (default generator and all CUDA devices) and ``transformers.set_seed``.

    Args:
        seed: Value passed unchanged to each seeding function. Defaults to 42.
    """
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
        transformers.set_seed,
    )
    for apply_seed in seeders:
        apply_seed(seed)
| |
| |
|
|
def normalize_answer(s):
    """Normalize an answer string so that superficial differences are ignored.

    The text is lower-cased, every character in ``string.punctuation`` is
    deleted, the standalone words "a", "an" and "the" are replaced by a
    space, and finally whitespace runs are collapsed to single spaces with
    leading/trailing whitespace removed.

    Args:
        s: Raw answer text.

    Returns:
        The normalized string.
    """
    lowered = s.lower()
    # Punctuation is stripped before the article pass, so "the," is still
    # recognised as the article "the".
    without_punct = lowered.translate(str.maketrans('', '', string.punctuation))
    without_articles = re.sub(r'\b(a|an|the)\b', ' ', without_punct)
    return ' '.join(without_articles.split())
|
|
def f1_score(prediction, ground_truth):
    """Return the token-level F1 between a prediction and one reference answer.

    Both strings are passed through ``normalize_answer`` and split on
    whitespace. The overlap is the multiset intersection of the two token
    lists; precision and recall are that overlap divided by the number of
    prediction and reference tokens respectively.

    Args:
        prediction: Generated answer text.
        ground_truth: One reference answer text.

    Returns:
        The harmonic mean of precision and recall, or ``0`` when the two
        token lists share no token (which includes either list being empty
        after normalization).
    """
    # Imported explicitly: the file's import block never imports
    # ``collections`` by name, so the previous ``collections.Counter`` lookup
    # could at best resolve through a wildcard import leaking the module.
    from collections import Counter

    pred_tokens = normalize_answer(prediction).split()
    truth_tokens = normalize_answer(ground_truth).split()

    overlap = Counter(pred_tokens) & Counter(truth_tokens)
    num_same = sum(overlap.values())
    if num_same == 0:
        return 0

    precision = num_same / len(pred_tokens)
    recall = num_same / len(truth_tokens)
    return (2 * precision * recall) / (precision + recall)
|
|
def exact_match_score(prediction, ground_truth):
    """Tell whether a prediction equals a reference once both are normalized.

    Args:
        prediction: Generated answer text.
        ground_truth: One reference answer text.

    Returns:
        ``True`` when ``normalize_answer`` yields the same string for both
        inputs, ``False`` otherwise.
    """
    normalized_prediction = normalize_answer(prediction)
    normalized_truth = normalize_answer(ground_truth)
    return normalized_prediction == normalized_truth
|
|
def metric_max_over_ground_truths(metric_fn, prediction, ground_truths):
    """Return the best score a prediction achieves over all reference answers.

    Args:
        metric_fn: Callable invoked as ``metric_fn(prediction, ground_truth)``
            that returns a comparable score.
        prediction: Generated answer text.
        ground_truths: Iterable of acceptable reference answers.

    Returns:
        The maximum of ``metric_fn(prediction, gt)`` over ``ground_truths``,
        or ``0`` when ``ground_truths`` is empty (a sample without any
        reference simply scores nothing instead of aborting the evaluation
        with ``ValueError`` from ``max``).
    """
    return max((metric_fn(prediction, gt) for gt in ground_truths), default=0)
|
|
def score_outputs(outputs, test_dataset, ids, all_ground_truths):
    """Score generated answers with exact match and token F1 and print a summary.

    Args:
        outputs: Generation results, one per sample and in the same order as
            ``ids`` / ``all_ground_truths``. The answer text of each item is
            read from ``output.outputs[0].text`` and stripped.
        test_dataset: Sized collection; only its length is used, as the
            denominator of both averages and as the reported sample count.
        ids: Per-sample identifiers, indexed like ``outputs``.
        all_ground_truths: Per-sample collections of reference answers,
            indexed like ``outputs``.

    Returns:
        A tuple ``(results, avg_em, avg_f1)`` where ``results`` is a list of
        per-sample dicts (``id``, ``prediction``, ``ground_truths``, ``em``,
        ``f1``) and the two averages are percentages. Both averages are
        ``0.0`` when ``test_dataset`` is empty.
    """
    results = []
    total_em = 0
    total_f1 = 0

    print("Calculating scores...")
    for i, output in enumerate(outputs):
        prediction = output.outputs[0].text.strip()
        ground_truths = all_ground_truths[i]

        # Each sample is credited with its best score over all references.
        em = metric_max_over_ground_truths(exact_match_score, prediction, ground_truths)
        f1 = metric_max_over_ground_truths(f1_score, prediction, ground_truths)

        total_em += em
        total_f1 += f1

        results.append({
            "id": ids[i],
            "prediction": prediction,
            "ground_truths": ground_truths,
            "em": em,
            "f1": f1
        })

    num_samples = len(test_dataset)
    if num_samples:
        avg_em = 100.0 * total_em / num_samples
        avg_f1 = 100.0 * total_f1 / num_samples
    else:
        # Nothing to average: report zeros instead of dividing by zero.
        avg_em = avg_f1 = 0.0

    print("\n" + "="*40)
    print("FINAL RESULTS (vLLM)")
    print("="*40)
    print(f"Total Samples: {num_samples}")
    print(f"Exact Match (EM): {avg_em:.2f}%")
    print(f"F1 Score : {avg_f1:.2f}%")
    print("="*40)
    return results, avg_em, avg_f1
| |
def merge_process(queue, mainCfg: MainConfig, force_to_merge: bool = False):
    """Merge a PEFT adapter into its base model and report the outcome on ``queue``.

    Used by ``main`` as the target of a ``multiprocessing.Process``. Exactly
    one item is put on ``queue``:

    * ``(output_path, out_json)`` on success, where ``output_path`` is the
      directory holding the merged model and ``out_json`` is its parent
      directory (the configured output/adapter path);
    * ``None`` on any failure (the traceback is printed), which is the value
      ``main`` checks for to detect a failed merge.

    Path resolution:

    * adapter: ``merge_adapter_path + "/ft2"`` when set, otherwise
      ``adapter_path + "/ft2"``; with neither set the merge fails.
    * output: ``merge_output_path + "/merge"`` when set, otherwise
      ``adapter_path + "/merge"``.

    The merge is skipped when the output directory already contains a
    ``.bin`` or ``.safetensors`` file, unless ``force_to_merge`` is true.

    Args:
        queue: Queue-like object with a ``put`` method that receives the result.
        mainCfg: Run configuration; only ``mainCfg.model`` is read here.
        force_to_merge: Re-run the merge even if merged weights already exist.
    """
    # ``gc`` is imported explicitly: the file's import block never imports it
    # by name, so it could at best resolve through a wildcard import.
    import gc
    import traceback

    try:
        model_name = mainCfg.model.model_name
        if mainCfg.model.merge_adapter_path is not None:
            adapter = mainCfg.model.merge_adapter_path + "/ft2"
            print(f'Merging... from mainCfg.model.merge_adapter_path {adapter}')
        elif mainCfg.model.adapter_path is not None:
            adapter = mainCfg.model.adapter_path + "/ft2"
            print(f'From mainCfg.model.adapter_path {adapter}')
        else:
            raise KeyError('No adapter path')

        if mainCfg.model.merge_output_path is not None:
            output_path = mainCfg.model.merge_output_path + "/merge"
            out_json = mainCfg.model.merge_output_path
        else:
            # NOTE(review): if adapter_path is None here (only
            # merge_adapter_path configured) this concatenation raises
            # TypeError, which the handler below reports as a failed merge.
            output_path = mainCfg.model.adapter_path + "/merge"
            out_json = mainCfg.model.adapter_path

        # Merged weights are considered present as soon as any weight file
        # exists in the output directory.
        has_weights = os.path.exists(output_path) and any(
            f.endswith((".bin", ".safetensors")) for f in os.listdir(output_path)
        )

        if not has_weights or force_to_merge:
            model = AutoModelForCausalLM.from_pretrained(model_name, device_map="cpu", low_cpu_mem_usage=True)
            tokenizer = AutoTokenizer.from_pretrained(model_name, device_map='cpu')

            model = PeftModel.from_pretrained(model, adapter)
            model = model.merge_and_unload()
            model.save_pretrained(output_path, safe_serialization=False, max_shard_size="10GB")
            tokenizer.save_pretrained(output_path)

            # Drop the references and reclaim memory before reporting back.
            del model
            del tokenizer
            gc.collect()
            gc.collect()
            torch.cuda.empty_cache()

            print(f'The end of merging, from {adapter},\n \t \t to {output_path}')
        else:
            print("No need to merge")
        queue.put((output_path, out_json))
    except Exception as e:
        error_msg = traceback.format_exc()
        print(error_msg)
        print(f"Error in merge_process: {e}")
        # Signal failure with None: the consumer unpacks any other payload as
        # (output_path, out_json), so a traceback string must not be sent.
        queue.put(None)
|
|
@draccus.wrap()
def main(mainCfg: MainConfig):
    """Merge the adapter, generate answers with vLLM, score them and save a report.

    Steps:

    1. Seed the RNGs with ``mainCfg.seed``.
    2. Run ``merge_process`` in a child process and wait for the merged
       model path. (Presumably a separate process is used so that memory
       taken by the merge is released before vLLM starts -- confirm.)
    3. Load the ``validation`` split of ``mainCfg.data.path``, keep the first
       ``mainCfg.data.total_test_samples`` rows and build one prompt per row
       from its ``passage`` and ``question`` columns.
    4. Generate greedily (temperature 0) with vLLM, score the outputs with
       ``score_outputs`` and write ``drop_vllm_results.json`` next to the
       merged model.

    Args:
        mainCfg: Run configuration, parsed by ``draccus.wrap``.

    Raises:
        RuntimeError: If the merge process fails or exits without a result.
        FileNotFoundError: If the reported merged-model directory is missing.
    """
    # multiprocessing.Queue.get raises queue.Empty when its timeout expires.
    from queue import Empty

    print('='*120)
    set_seed_all(mainCfg.seed)

    result_queue = Queue()
    merger = Process(target=merge_process, args=(result_queue, mainCfg, False))
    merger.start()

    # Poll instead of blocking forever: if the child is killed before it can
    # report (e.g. out of memory), a plain get() would never return.
    result = None
    while True:
        try:
            result = result_queue.get(timeout=5)
            break
        except Empty:
            if merger.is_alive():
                continue
            # The child has exited; give a result it may have queued right
            # before exiting one last chance to arrive.
            try:
                result = result_queue.get(timeout=1)
            except Empty:
                pass
            break
    # Join only after draining the queue, as the original code did.
    merger.join()

    # Anything but an (output_path, out_json) pair means the merge failed;
    # a string payload is a traceback and is surfaced in the error.
    if not isinstance(result, (tuple, list)) or len(result) != 2:
        detail = f"\n{result}" if isinstance(result, str) else ""
        raise RuntimeError(f"Model merging failed.{detail}")
    model_path, out_json = result

    if not os.path.exists(model_path):
        raise FileNotFoundError(f"Model directory does not exist: {model_path}")
    print(f"Verified model path: {os.path.abspath(model_path)}")
    print("Loading dataset...")
    test_dataset = load_dataset(mainCfg.data.path, split="validation").select(range(mainCfg.data.total_test_samples))

    def prepare_test_data(batch):
        """Build the prompt and collect the reference spans for a batch of rows."""
        prompts = [
            # ``input_section`` has no matching field in PROMPT_TEMPLATE and
            # is ignored by str.format; it is passed through unchanged.
            PROMPT_TEMPLATE.format(
                instruction=f"Passage: {passage}\nQuestion: {question}",
                input_section="",
            )
            for passage, question in zip(batch['passage'], batch['question'])
        ]
        target_spans = [ans['spans'] for ans in batch['answers_spans']]
        return {
            "prompt": prompts,
            "target_spans": target_spans
        }

    test_dataset = test_dataset.map(prepare_test_data,
                                    batched=True, batch_size=2000, num_proc=8)
    prompts = test_dataset['prompt']
    ids = test_dataset['query_id']
    all_ground_truths = test_dataset['target_spans']

    print('out', model_path)

    llm = LLM(
        model=model_path,
        dtype="bfloat16",
        gpu_memory_utilization=0.90,
        max_model_len=mainCfg.model.model_max_seq_length
    )
    # Generation is cut at the first newline or at any marker that would
    # start a new prompt section.
    stop_tokens = ["Instruction:", "Response:", "\n", "###", "Passage:", "Question:"]
    sampling_params = SamplingParams(temperature=0, top_p=1, max_tokens=MAX_NEW_TOKENS, stop=stop_tokens)
    print(f"Generating for {len(prompts)} samples...")

    start_time = datetime.now()
    outputs = llm.generate(prompts, sampling_params)
    end_time = datetime.now()
    print('end time: ', end_time.strftime("%Y-%m-%d %H:%M:%S"), '| duration: ', end_time - start_time)

    results, avg_em, avg_f1 = score_outputs(outputs=outputs,
                                            test_dataset=test_dataset, ids=ids, all_ground_truths=all_ground_truths)

    save_file = out_json + '/drop_vllm_results.json'
    with open(save_file, "w", encoding="utf-8") as f:
        json.dump({
            "metrics": {"EM": avg_em, "F1": avg_f1},
            "details": results
        }, f, indent=2, ensure_ascii=False)
    print("Results saved to drop_vllm_results.json")
|
|
# Script entry point: run the evaluation only when executed directly, not
# when the module is imported.
if __name__ == "__main__":
    main()