import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"

import time

import torch
import torch.nn.functional as F
from torch.optim import AdamW
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
from transformers import get_linear_schedule_with_warmup
from rich import print

from models import get_model
from dataset import MyDataset
from utils import save_checkpoint, AverageMeter, ProgressMeter


if __name__ == '__main__':
    # Check whether we are in distributed training mode (e.g., launched via torchrun).
    if "RANK" in os.environ and "WORLD_SIZE" in os.environ:
        torch.distributed.init_process_group(backend="nccl")
        local_rank = int(os.environ["LOCAL_RANK"])
        torch.cuda.set_device(local_rank)
        device = torch.device("cuda", local_rank)
        print(f"[Distributed] Rank {os.environ['RANK']} using device {local_rank}")
    else:
        # Single-machine, single-GPU (or CPU) mode.
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        print(f"[Single] Using device: {device}")

    # AMP scaler, using the newer torch.amp API.
    scaler = torch.amp.GradScaler(device='cuda' if torch.cuda.is_available() else 'cpu')

local_rank = int(os.environ.get("LOCAL_RANK", 0))

# Only rank 0 prints, to avoid duplicated logs in distributed runs.
print_raw = print
def print(*info):
    if local_rank == 0:
        print_raw(*info)


def crossentropy(y_true, y_pred):
    return F.cross_entropy(y_pred, y_true, label_smoothing=0.2)


def evaluate(predictions, labels):
    nb_all = len(predictions)
    acc = sum(int(p == l) for p, l in zip(predictions, labels)) / (nb_all + 1e-8)
    return {'acc': acc}


def train_epoch(model, optimizer, epoch, dataloader, sampler, tokenizer, scheduler):
    print("\n\n=> train")
    data_time = AverageMeter('- data', ':4.3f')
    batch_time = AverageMeter('- batch', ':6.3f')
    losses = AverageMeter('- loss', ':.4e')
    acces = AverageMeter('- acc', ':.4f')
    progress = ProgressMeter(
        len(dataloader), data_time, batch_time, losses, acces,
        prefix=f"Epoch: [{epoch}]")
    end = time.time()

    model.train()
    if hasattr(sampler, "set_epoch"):
        sampler.set_epoch(epoch)

    predictions, labels = [], []
    for batch_index, data_batch in enumerate(dataloader):
        optimizer.zero_grad()
        context_str_batch, target_batch = data_batch

        # Tokenize the raw text batch.
        context_token_batch = tokenizer(context_str_batch, padding=True, truncation=True,
                                        max_length=500, return_tensors='pt')

        # Move inputs and targets to the training device.
        context_token_batch = {k: v.to(device) for k, v in context_token_batch.items()}
        target_batch = target_batch.to(device)

        # Forward pass; F.cross_entropy already averages over the batch.
        output_batch = model(**context_token_batch)
        pred_batch = output_batch.softmax(dim=-1)
        loss = crossentropy(target_batch, output_batch)

        loss.backward()
        optimizer.step()
        if scheduler is not None:
            scheduler.step()

        loss_value = loss.item()
        losses.update(loss_value, len(target_batch))

        pred = torch.argmax(pred_batch, dim=-1)
        predictions.extend(pred.cpu().numpy())
        labels.extend(target_batch.cpu().numpy())
        acc_batch = (target_batch == pred).sum().cpu().numpy() / (len(target_batch) + 1e-8)
        acces.update(acc_batch, len(target_batch))

        batch_time.update(time.time() - end)
        end = time.time()
        if batch_index % 50 == 0:
            progress.print(batch_index)

    results = evaluate(predictions, labels)
    print(results)
    return results
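
# The GradScaler created at startup is never used by train_epoch above. A
# minimal mixed-precision variant of the forward/backward step would look
# like the sketch below. It is a hypothetical helper, not wired into the
# training loop, and assumes a CUDA device plus the same tokenized
# inputs/targets that train_epoch builds.
def train_step_amp(model, optimizer, scaler, inputs, targets):
    optimizer.zero_grad()
    with torch.autocast(device_type='cuda', dtype=torch.float16):
        logits = model(**inputs)
        loss = F.cross_entropy(logits, targets, label_smoothing=0.2)
    scaler.scale(loss).backward()  # scale the loss to avoid fp16 gradient underflow
    scaler.step(optimizer)         # unscales gradients, then takes the optimizer step
    scaler.update()                # adjust the scale factor for the next step
    return loss.item()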

# Unlike the original (which called model.train() and optimizer.zero_grad()
# during validation), this runs the model in eval mode with gradients disabled.
@torch.no_grad()
def val_epoch(model, epoch, dataloader, sampler, tokenizer):
    print("\n\n=> val")
    data_time = AverageMeter('- data', ':4.3f')
    batch_time = AverageMeter('- batch', ':6.3f')
    losses = AverageMeter('- loss', ':.4e')
    acces = AverageMeter('- acc', ':.4f')
    progress = ProgressMeter(
        len(dataloader), data_time, batch_time, losses, acces,
        prefix=f"Epoch: [{epoch}]")
    end = time.time()

    model.eval()
    if hasattr(sampler, "set_epoch"):
        sampler.set_epoch(epoch)

    predictions, labels = [], []
    for batch_index, data_batch in enumerate(dataloader):
        context_str_batch, target_batch = data_batch

        # Tokenize and move to the device, as in train_epoch.
        context_token_batch = tokenizer(context_str_batch, padding=True, truncation=True,
                                        max_length=500, return_tensors='pt')
        context_token_batch = {k: v.to(device) for k, v in context_token_batch.items()}
        target_batch = target_batch.to(device)

        output_batch = model(**context_token_batch)
        pred_batch = output_batch.softmax(dim=-1)
        loss = crossentropy(target_batch, output_batch)

        loss_value = loss.item()
        losses.update(loss_value, len(target_batch))

        pred = torch.argmax(pred_batch, dim=-1)
        predictions.extend(pred.cpu().numpy())
        labels.extend(target_batch.cpu().numpy())
        acc_batch = (target_batch == pred).sum().cpu().numpy() / (len(target_batch) + 1e-8)
        acces.update(acc_batch, len(target_batch))

        batch_time.update(time.time() - end)
        end = time.time()
        if batch_index % 50 == 0:
            progress.print(batch_index)

    results = evaluate(predictions, labels)
    print(results)
    return results


def gogogo():
    output_dir = '/home/elaine/Desktop/macbert_code/checkpoints_travel'
    ann_file_tra = '/home/elaine/Desktop/macbert_code/dataset/travel_train_9000.csv'
    ann_file_val = '/home/elaine/Desktop/macbert_code/dataset/travel_val_9000.csv'
    checkpoint_file = None
    batch_size = 4
    epochs = 20
    cache_dir = '/home/elaine/Desktop/macbert_code/cache'  # original had a stray leading space

    model_cfg = {
        "pretrained_transformers": "hfl/chinese-macbert-base",
        "cache_dir": cache_dir
    }

    # Model and tokenizer.
    model_dict = get_model(model_cfg, mode='base')
    model = model_dict['model']
    tokenizer = model_dict['tokenizer']
    print(model)

    # Optimizer parameter groups: no weight decay on biases and LayerNorm weights.
    no_decay = ['bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in model.named_parameters()
                    if not any(nd in n for nd in no_decay)],
         'weight_decay': 0.01},
        {'params': [p for n, p in model.named_parameters()
                    if any(nd in n for nd in no_decay)],
         'weight_decay': 0.0}
    ]
    # The original passed model.parameters() here, which silently ignored the
    # parameter groups built above.
    optimizer = AdamW(optimizer_grouped_parameters, lr=1e-5, eps=1e-8)
    scheduler = None  # enable if needed

    # Dataset and DataLoader (single GPU: no DistributedSampler).
    data_loader_cfg = {}
    tra_dataset = MyDataset(ann_file_tra, data_loader_cfg, mode='tra')
    val_dataset = MyDataset(ann_file_val, {}, mode='val')

    # Samplers (RandomSampler / SequentialSampler for single GPU).
    sampler_tra = RandomSampler(tra_dataset)
    sampler_val = SequentialSampler(val_dataset)

    tra_loader = DataLoader(tra_dataset, batch_size=batch_size, num_workers=8,
                            pin_memory=True, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=batch_size, num_workers=8,
                            pin_memory=True, shuffle=False)
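    # NOTE: the samplers above are handed to the epoch functions only for the
    # hasattr-guarded `set_epoch` hook (a no-op for these sampler types; it
    # matters for DistributedSampler). The loaders use `shuffle` directly,
    # since DataLoader rejects `sampler` combined with shuffle=True.
    # If a warmup schedule is wanted, one option is the sketch below, using
    # the imported get_linear_schedule_with_warmup (the 10% warmup ratio is
    # an assumption, not taken from the original):
    # total_steps = len(tra_loader) * epochs
    # scheduler = get_linear_schedule_with_warmup(
    #     optimizer, num_warmup_steps=int(0.1 * total_steps),
    #     num_training_steps=total_steps)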

    # Resume from a checkpoint if one was given.
    if checkpoint_file is not None and os.path.exists(checkpoint_file):
        checkpoint = torch.load(checkpoint_file, map_location='cpu')
        init_epoch = checkpoint['epoch'] + 1
        model.load_state_dict({k.replace('module.', ''): v
                               for k, v in checkpoint['state_dict'].items()})
        optimizer.load_state_dict(checkpoint['optimizer'])
        if torch.cuda.is_available():
            # The state was loaded onto the CPU; move it back to the GPU.
            for state in optimizer.state.values():
                for k, v in state.items():
                    if torch.is_tensor(v):
                        state[k] = v.cuda()
        print(f"=> Resume: loaded checkpoint {checkpoint_file} (epoch {checkpoint['epoch']})")
    else:
        init_epoch = 1
        print("=> No checkpoint.")

    # Move the model to the device chosen at startup. (The original re-derived
    # a local `device` here, which could disagree with the module-level one
    # used by the epoch functions under a distributed launch.)
    model = model.to(device)

    # Training loop: save a checkpoint whenever validation accuracy improves.
    acc = 0.
    for epoch in range(init_epoch, epochs + 1):
        results_tra = train_epoch(model, optimizer, epoch, tra_loader, sampler_tra,
                                  tokenizer, scheduler)
        results_val = val_epoch(model, epoch, val_loader, sampler_val, tokenizer)
        acc_val = results_val['acc']
        if acc_val >= acc:
            acc = acc_val
            save_checkpoint({
                'epoch': epoch,
                'state_dict': model.state_dict(),
                'best_acc': acc,
                'optimizer': optimizer.state_dict(),
            }, outname=f'{output_dir}/checkpoint_epoch{epoch:03d}_acc{acc:.4f}.pth.tar',
                local_rank=0)


if __name__ == '__main__':
    gogogo()
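
# Usage sketch (assuming this file is saved as train.py; the filename is an
# assumption, not taken from the repo):
#   single GPU:   python train.py
#   distributed:  torchrun --nproc_per_node=2 train.py
# Note that the distributed branch above only initializes the process group
# and picks a device: the model is never wrapped in DistributedDataParallel
# and the loaders use no DistributedSampler, so a multi-process launch would
# train independent, unsynchronized replicas.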