# NOTE: the original scrape carried a Hugging Face Spaces status banner here
# ("Spaces: Sleeping") — it is not part of the source and is kept only as a comment.
import logging
import os
import time

import pandas as pd
import torch
import yaml
from torch import nn, optim
from torch.utils.data import DataLoader
from tqdm import tqdm

from .bert import Bert
from .classifier import Classifier
from .dataset import TextDataset
def setup_logging(config):
    """Configure root logging from *config* and return a module logger.

    Expects ``config['logging']`` to provide ``log_dir``, ``level`` and
    ``format``.  The log file is truncated on every run (``filemode='w'``).

    Args:
        config: parsed configuration mapping (see config.yaml).

    Returns:
        A ``logging.Logger`` named after this module.
    """
    log_dir = config['logging']['log_dir']
    # basicConfig raises FileNotFoundError if the directory is missing,
    # so create it up front.
    os.makedirs(log_dir, exist_ok=True)
    logging.basicConfig(
        filename=os.path.join(log_dir, "log.log"),
        filemode='w',
        level=config['logging']['level'],
        format=config['logging']['format']
    )
    return logging.getLogger(__name__)
def evaluate(model, dataloader, criterion, device=None):
    """Run one evaluation pass and return ``(mean_loss, accuracy)``.

    Args:
        model: module mapping a batch of inputs to per-sample scores in [0, 1].
        dataloader: yields ``(texts, labels)`` batches.
        criterion: loss taking ``(outputs, labels)``, e.g. ``nn.BCELoss``.
        device: target device for the labels.  When ``None`` it is inferred
            from the model's parameters (CPU fallback for parameter-free
            models).  Previously this read an unbound global ``device``,
            which raised ``NameError`` outside the training script.

    Returns:
        Tuple of (average loss per sample, fraction of correct predictions
        at a 0.5 threshold).  ``(0.0, 0.0)`` for an empty dataloader.
    """
    if device is None:
        first_param = next(model.parameters(), None)
        device = first_param.device if first_param is not None else torch.device("cpu")
    model.eval()
    total_loss = 0.0
    correct = 0
    total = 0
    with torch.no_grad():
        for texts, labels in dataloader:
            labels = labels.float().to(device)
            # squeeze(-1) drops only the trailing feature dim; a bare
            # squeeze() would also collapse a batch of size 1 to 0-d,
            # breaking the comparison against the (1,)-shaped labels.
            outputs = model(texts).squeeze(-1)
            loss = criterion(outputs, labels)
            total_loss += loss.item() * labels.size(0)
            correct += ((outputs >= 0.5).float() == labels).sum().item()
            total += labels.size(0)
    if total == 0:
        return 0.0, 0.0
    return total_loss / total, correct / total
if __name__ == "__main__":
    # Load the run configuration; the context manager closes the handle
    # (the original left the file object open).
    with open("config.yaml") as f:
        config = yaml.safe_load(f)
    logger = setup_logging(config)
    logger.info("Starting training process")
    logger.info(f"Configuration: {config}")
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Model initialization
    model = Classifier(Bert(config['model']['bert_name'])).to(device)
    logger.info("Model initialized")

    # Data loading: pre-serialized dataset objects.
    # NOTE(review): torch.load unpickles arbitrary objects — only point it
    # at trusted files.
    train_dataset = torch.load(config['data']['train_path'])
    test_dataset = torch.load(config['data']['test_path'])
    batch_size = int(config['data']['batch_size'])
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)
    logger.info(f"Train samples: {len(train_dataset)}, Test samples: {len(test_dataset)}")

    # Optimizer and loss
    optimizer = optim.Adam(
        model.parameters(),
        lr=float(config['training']['learning_rate'])
    )
    criterion = nn.BCELoss()

    # Checkpointing fails if the directory is missing — create it once.
    save_dir = config['training']['save_dir']
    os.makedirs(save_dir, exist_ok=True)

    # Per-epoch metrics, written to CSV at the end.
    results = []
    for epoch in range(int(config['training']['epochs'])):
        start_time = time.time()

        # Training pass
        model.train()
        train_loss, train_correct = 0.0, 0
        for texts, labels in tqdm(train_loader, desc=f"Epoch {epoch+1}", ncols=100):
            labels = labels.float().to(device)
            optimizer.zero_grad()
            # squeeze(-1) keeps the batch dim even when the last batch
            # has a single sample (bare squeeze() would make it 0-d).
            outputs = model(texts).squeeze(-1)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            train_loss += loss.item() * labels.size(0)
            train_correct += ((outputs >= 0.5).float() == labels).sum().item()

        # Evaluation
        train_loss /= len(train_dataset)
        train_acc = train_correct / len(train_dataset)
        test_loss, test_acc = evaluate(model, test_loader, criterion)

        # Record metrics
        results.append({
            "epoch": epoch + 1,
            "train_loss": train_loss,
            "test_loss": test_loss,
            "train_acc": train_acc,
            "test_acc": test_acc
        })

        # Logging
        epoch_time = time.time() - start_time
        logger.info(f"Epoch {epoch+1} [{epoch_time:.1f}s]")
        logger.info(f"Train Loss: {train_loss:.4f} | Acc: {train_acc:.4f}")
        logger.info(f"Test Loss: {test_loss:.4f} | Acc: {test_acc:.4f}")

        # Checkpoint after every epoch
        torch.save(model.state_dict(),
                   os.path.join(save_dir, f"model_{epoch+1}.pth"))

    # Persist training results to CSV
    results_df = pd.DataFrame(results)
    results_df.to_csv(os.path.join(config['logging']['log_dir'], "training_results.csv"), index=False)
    logger.info("Training completed")