Spaces:
Running
Running
| #!/usr/bin/python3 | |
| # -*- coding: utf-8 -*- | |
| from dataclasses import dataclass, field | |
| import os | |
| from pathlib import Path | |
| import platform | |
| import re | |
| from typing import Dict, List, Optional, Union | |
| if platform.system() == "Windows": | |
| from project_settings import project_path | |
| else: | |
| project_path = os.path.abspath("./") | |
| project_path = Path(project_path) | |
| hf_hub_cache = (project_path / "cache/huggingface/hub").as_posix() | |
| os.environ["HUGGINGFACE_HUB_CACHE"] = hf_hub_cache | |
| from datasets import load_dataset | |
| import huggingface_hub | |
| import torch | |
| import torch.multiprocessing as mp | |
| from transformers import HfArgumentParser | |
| from transformers.data.data_collator import DataCollatorForLanguageModeling | |
| from transformers.models.auto import AutoModelForCausalLM, AutoTokenizer | |
| from transformers.models.gpt2.modeling_gpt2 import GPT2LMHeadModel | |
| from transformers.trainer import Trainer | |
| from transformers.trainer_callback import EarlyStoppingCallback | |
| from transformers.training_args import TrainingArguments | |
| class ScriptArguments: | |
| # dataset | |
| dataset_path: str = field(default="qgyd2021/lip_service_4chan") | |
| dataset_name: str = field(default=None) | |
| dataset_split: str = field(default=None) | |
| dataset_cache_dir: str = field(default=(project_path / "hub_datasets").as_posix()) | |
| dataset_streaming: bool = field(default=False) | |
| num_workers: int = field(default=None if platform.system() == "Windows" else os.cpu_count() // 2) | |
| # model | |
| pretrained_model_name_or_path: str = field( | |
| default="uer/gpt2-chinese-cluecorpussmall" | |
| ) | |
| # pretrained_model_name_or_path: str = field( | |
| # default=(project_path / "pretrained_models/gpt2-chinese-cluecorpussmall").as_posix() | |
| # ) | |
| hf_token: str = field(default=None) | |
| def get_args(): | |
| parser = HfArgumentParser(ScriptArguments) | |
| args = parser.parse_args_into_dataclasses(return_remaining_strings=True)[0] | |
| return args | |
| def train_model(local_rank, world_size, args): | |
| os.environ["RANK"] = f"{local_rank}" | |
| os.environ["LOCAL_RANK"] = f"{local_rank}" | |
| os.environ["WORLD_SIZE"] = f"{world_size}" | |
| os.environ["MASTER_ADDR"] = "localhost" | |
| os.environ["MASTER_PORT"] = "12355" | |
| huggingface_hub.login(token=args.hf_token) | |
| # dataset | |
| dataset_dict = load_dataset( | |
| path=args.dataset_path, | |
| name=args.dataset_name, | |
| split=args.dataset_split, | |
| cache_dir=args.dataset_cache_dir, | |
| # num_proc=args.num_workers if not args.dataset_streaming else None, | |
| streaming=args.dataset_streaming, | |
| ) | |
| print(dataset_dict) | |
| dataset = dataset_dict["train"] | |
| if args.dataset_streaming: | |
| valid_dataset = dataset.take(args.valid_dataset_size) | |
| train_dataset = dataset.skip(args.valid_dataset_size) | |
| train_dataset = train_dataset.shuffle(buffer_size=args.shuffle_buffer_size, seed=None) | |
| else: | |
| dataset = dataset.train_test_split(test_size=4000, seed=None) | |
| train_dataset = dataset["train"] | |
| valid_dataset = dataset["test"] | |
| # pretrained model | |
| model: GPT2LMHeadModel = AutoModelForCausalLM.from_pretrained(args.pretrained_model_name_or_path) | |
| tokenizer = AutoTokenizer.from_pretrained(args.pretrained_model_name_or_path) | |
| # map | |
| def encode(examples: dict): | |
| questions_ = examples.pop("question") | |
| answers_ = examples.pop("answer") | |
| utterances = list() | |
| for question, answer in zip(questions_, answers_): | |
| if not isinstance(question, str): | |
| continue | |
| if not isinstance(answer, str): | |
| continue | |
| utterance = question + tokenizer.sep_token + answer | |
| utterances.append(utterance) | |
| utterances = tokenizer.__call__( | |
| text=utterances, | |
| truncation=True, | |
| padding="longest", | |
| max_length=512, | |
| return_special_tokens_mask=True, | |
| ) | |
| return utterances | |
| train_dataset = train_dataset.map( | |
| encode, | |
| batched=True, | |
| drop_last_batch=True, | |
| batch_size=10, | |
| num_proc=None, | |
| cache_file_name="train.cache" | |
| ) | |
| valid_dataset = valid_dataset.map( | |
| encode, | |
| batched=True, | |
| drop_last_batch=True, | |
| batch_size=10, | |
| num_proc=None, | |
| cache_file_name="valid.cache" | |
| ) | |
| dataset_info = f""" | |
| train dataset: {len(train_dataset)} | |
| valid dataset: {len(valid_dataset)} | |
| """ | |
| dataset_info = re.sub(r"[\u0020]{4,}", "", dataset_info) | |
| print(dataset_info) | |
| # for k, v in model.named_parameters(): | |
| # if k.__contains__(".bias"): | |
| # v.requires_grad = True | |
| # else: | |
| # v.requires_grad = False | |
| # for k, v in model.named_parameters(): | |
| # if v.requires_grad is True: | |
| # print(k) | |
| data_collator = DataCollatorForLanguageModeling( | |
| tokenizer=tokenizer, mlm=False | |
| ) | |
| # training_args | |
| training_args = TrainingArguments( | |
| output_dir="output_dir", | |
| evaluation_strategy="steps", | |
| per_device_train_batch_size=8, | |
| gradient_accumulation_steps=4, | |
| learning_rate=2e-4, | |
| weight_decay=0, | |
| max_grad_norm=1.0, | |
| num_train_epochs=1.0, | |
| warmup_steps=1000, | |
| logging_steps=100, | |
| save_strategy="steps", | |
| save_steps=100, | |
| save_total_limit=2, | |
| no_cuda=False, | |
| fp16=True if torch.cuda.is_available() else False, | |
| local_rank=local_rank, | |
| ddp_backend="nccl", | |
| remove_unused_columns=True, | |
| load_best_model_at_end=True, | |
| metric_for_best_model="loss", | |
| greater_is_better=False, | |
| report_to="tensorboard", | |
| push_to_hub=True, | |
| hub_model_id="lib_service_4chan", | |
| hub_strategy="every_save", | |
| gradient_checkpointing=True, | |
| ) | |
| partial_state_str = f""" | |
| distributed_type: {training_args.distributed_state.distributed_type} | |
| local_process_index: {training_args.distributed_state.local_process_index} | |
| num_processes: {training_args.distributed_state.num_processes} | |
| process_index: {training_args.distributed_state.process_index} | |
| device: {training_args.distributed_state.device} | |
| """ | |
| partial_state_str = re.sub(r"[\u0020]{4,}", "", partial_state_str) | |
| print(partial_state_str) | |
| environ = f""" | |
| RANK: {os.environ.get("RANK", -1)} | |
| WORLD_SIZE: {os.environ.get("WORLD_SIZE", -1)} | |
| LOCAL_RANK: {os.environ.get("LOCAL_RANK", -1)} | |
| """ | |
| environ = re.sub(r"[\u0020]{4,}", "", environ) | |
| print(environ) | |
| callbacks = [ | |
| EarlyStoppingCallback(early_stopping_patience=5) | |
| ] | |
| trainer = Trainer( | |
| model=model, | |
| args=training_args, | |
| data_collator=data_collator, | |
| train_dataset=train_dataset, | |
| eval_dataset=valid_dataset, | |
| tokenizer=tokenizer, | |
| callbacks=callbacks | |
| ) | |
| train_result = trainer.train() | |
| # 保存最好的 checkpoint | |
| final_save_path = os.path.join(training_args.output_dir, "final") | |
| trainer.save_model(final_save_path) # Saves the tokenizer too | |
| # 保存训练指标 | |
| metrics = train_result.metrics | |
| trainer.log_metrics("train", metrics) | |
| trainer.save_metrics("train", metrics) | |
| trainer.save_state() | |
| tokenizer.save_pretrained(final_save_path) | |
| return | |
| def train_on_cpu(): | |
| args = get_args() | |
| train_model(0, 1, args) | |
| return | |
| def train_on_kaggle_notebook(): | |
| """ | |
| train on kaggle notebook with GPU T4 x2 | |
| from shutil import copyfile | |
| copyfile(src = "../input/tempdataset/step_2_train_model.py", dst = "../working/step_2_train_model.py") | |
| import step_2_train_model | |
| step_2_train_model.train_on_kaggle_notebook() | |
| """ | |
| args = get_args() | |
| world_size = torch.cuda.device_count() | |
| print("world_size: {}".format(world_size)) | |
| mp.spawn(train_model, | |
| args=(world_size, args), | |
| nprocs=world_size, | |
| join=True) | |
| return | |
| if __name__ == '__main__': | |
| train_on_cpu() | |