|
|
from transformers import AutoModelForCausalLM, AutoTokenizer, Trainer, TrainingArguments |
|
|
from datasets import load_dataset |
|
|
import yaml |
|
|
|
|
|
def load_config(config_path):
    """Read a YAML configuration file and return its parsed contents.

    Args:
        config_path: Path of the YAML file to read.

    Returns:
        The object produced by ``yaml.safe_load`` for the file's contents.

    Raises:
        OSError: If the file cannot be opened.
    """
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's locale encoding. ``yaml`` is imported at module level.
    with open(config_path, "r", encoding="utf-8") as f:
        return yaml.safe_load(f)
|
|
|
|
|
def main(config_path="configs/train_config.yaml", test_dataset_path="data/processed/test.json"):
    """Evaluate a saved causal language model on a JSON test set and print the metrics.

    The model and tokenizer are loaded from the directory stored under the
    ``"output_dir"`` key of the YAML config. The test file is loaded with the
    ``datasets`` JSON loader, tokenized from its ``"text"`` column, and passed
    to ``Trainer.evaluate``.

    Args:
        config_path: YAML config file containing an ``"output_dir"`` entry.
        test_dataset_path: JSON file with the test examples.

    Raises:
        KeyError: If the config has no ``"output_dir"`` entry.
    """
    # Label value excluded from the loss; padding positions are set to it below.
    ignore_index = -100

    config = load_config(config_path)
    model_dir = config["output_dir"]

    tokenizer = AutoTokenizer.from_pretrained(model_dir)
    model = AutoModelForCausalLM.from_pretrained(model_dir)

    # padding="max_length" needs a pad token; fall back to EOS when the saved
    # tokenizer does not define one, instead of failing during tokenization.
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    dataset = load_dataset("json", data_files={"test": test_dataset_path})

    def tokenize_function(examples):
        """Tokenize a batch of examples and attach labels with padding masked out."""
        tokenized = tokenizer(
            examples["text"],
            truncation=True,
            padding="max_length",
            max_length=512,
            return_attention_mask=True,
        )
        # Labels mirror the input ids, except where the attention mask is 0
        # (padding): those positions must not contribute to the loss. The mask
        # is used rather than the pad id because the pad token may equal EOS.
        tokenized["labels"] = [
            [token_id if keep else ignore_index for token_id, keep in zip(ids, mask)]
            for ids, mask in zip(tokenized["input_ids"], tokenized["attention_mask"])
        ]
        return tokenized

    tokenized_dataset = dataset.map(tokenize_function, batched=True)

    training_args = TrainingArguments(
        output_dir="./eval_output",
        per_device_eval_batch_size=8,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        eval_dataset=tokenized_dataset["test"],
        tokenizer=tokenizer,
    )

    eval_result = trainer.evaluate()
    print(f"Evaluation results: {eval_result}")
|
|
|
|
|
# Run the evaluation only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()