from transformers import AutoTokenizer, Trainer, TrainingArguments, DataCollatorWithPadding, DataCollatorForSeq2Seq
from datasets import load_dataset
from model_load import model
from tokenizer import tokenizer, tokenize
from data_load import preprocess
from utils.utils import inspect_tokenized_dataset, print_label_lengths, print_field_lengths
from loss.trainer import MyTrainer
import torch

# ✅ Check the padding token
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})
    print(f"[Debug] pad_token added → {tokenizer.pad_token}")

# ✅ Resize the model embeddings to the tokenizer size
model.resize_token_embeddings(len(tokenizer))
print(f"[Debug] model embedding resize complete → {len(tokenizer)}")

# ✅ Load the data
dataset_path = "yahma/alpaca-cleaned"  # or "./my_dataset.json"
if dataset_path.endswith(".json") or dataset_path.endswith(".jsonl"):
    raw_data = load_dataset("json", data_files=dataset_path, split="train")
else:
    raw_data = load_dataset(dataset_path, split="train")
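
# Sanity check (added sketch; assumes the alpaca-cleaned schema): the
# yahma/alpaca-cleaned dataset exposes "instruction", "input", and "output"
# columns, which preprocess() is expected to consume.
print(f"[Debug] raw columns → {raw_data.column_names}")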

# ✅ Preprocess the data
processed_dataset = preprocess(raw_data)
print_field_lengths(processed_dataset, stage="after preprocessing")

# ✅ Tokenize
tokenized_dataset = processed_dataset.map(tokenize, batched=True, remove_columns=processed_dataset.column_names)
print_field_lengths(tokenized_dataset, stage="after tokenizing")

# ✅ Convert to torch tensor format
tokenized_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])
print("[Debug] tensor format set")
print_field_lengths(tokenized_dataset, stage="after tensor format")

# ✅ Check the type of each field
assert isinstance(tokenized_dataset[0]["input_ids"], torch.Tensor), "input_ids is not a Tensor"
assert isinstance(tokenized_dataset[0]["labels"], torch.Tensor), "labels is not a Tensor"

# ✅ Run inspect_tokenized_dataset
# inspect_tokenized_dataset(tokenized_dataset)
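
# Optional collator (a sketch, not part of the original script): with no
# data_collator, Trainer falls back to DataCollatorWithPadding, which pads
# input_ids/attention_mask but leaves labels ragged. If tokenize() does not pad
# every field to a fixed length, the already-imported DataCollatorForSeq2Seq can
# pad labels too (with -100, so padded positions are ignored by the loss):
# data_collator = DataCollatorForSeq2Seq(tokenizer, model=model, label_pad_token_id=-100)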

# ✅ Configure TrainingArguments
training_args = TrainingArguments(
    output_dir="./output",
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,
    num_train_epochs=3,
    logging_dir="./logs",
    report_to="none",
    deepspeed="ds_config.json",
    save_total_limit=1,
    save_strategy="epoch",
    fp16=True,
)
print("[Debug] TrainingArguments configured")

# ✅ Configure the Trainer
trainer = MyTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    tokenizer=tokenizer,
    # data_collator=data_collator,  # enable if needed (see the sketch above)
)
print("[Debug] Trainer instance created")

# ✅ Start training
print("[Debug] starting training")
trainer.train()