# Standard library
import copy
import os
import time
import warnings
from datetime import timedelta

# Third-party
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision
import torchvision.transforms as transforms
from codecarbon import EmissionsTracker
from sklearn.metrics import f1_score, precision_score, recall_score
from thop import profile
from torch.utils.data import DataLoader
from torchvision.datasets import ImageFolder
from tqdm import tqdm
|
|
|
|
|
| MODEL_NAME = "vgg16"
|
| DATASET_NAME = "CustomImageNet300"
|
| DATA_PATH = r'C:\Users\shanm\Dataset Download\custom image net'
|
| BATCH_SIZE = 32
|
| EPOCHS = 50
|
| DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
|
|
| SAVE_DIR = "saved_models"
|
| os.makedirs(SAVE_DIR, exist_ok=True)
|
| CSV_FILENAME = f"{MODEL_NAME}_{DATASET_NAME}_stats.csv"
|
|
|
| warnings.filterwarnings("ignore")
|
| os.environ["CODECARBON_LOG_LEVEL"] = "error"
|
|
|
| def main():
|
|
|
| transform = transforms.Compose([
|
| transforms.Resize(256),
|
| transforms.CenterCrop(224),
|
| transforms.RandomHorizontalFlip(),
|
| transforms.ToTensor(),
|
| transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
|
| ])
|
|
|
| if not os.path.exists(DATA_PATH):
|
| print(f"[ERROR] Dataset path not found: {DATA_PATH}")
|
| return
|
|
|
| trainset = ImageFolder(root=DATA_PATH, transform=transform)
|
| trainloader = DataLoader(trainset, batch_size=BATCH_SIZE, shuffle=True, num_workers=2, pin_memory=True)
|
|
|
|
|
| model = torchvision.models.vgg16(weights=None)
|
| model.classifier[6] = nn.Linear(4096, 300)
|
| model.to(DEVICE)
|
|
|
| dummy_input = torch.randn(1, 3, 224, 224).to(DEVICE)
|
| flops, params = profile(model, inputs=(dummy_input, ), verbose=False)
|
|
|
| criterion = nn.CrossEntropyLoss()
|
| optimizer = optim.SGD(model.parameters(), lr=0.01, momentum=0.9, weight_decay=5e-4)
|
| scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=EPOCHS)
|
|
|
| results = []
|
| cumulative_total_energy = 0
|
| total_start_time = time.time()
|
| best_acc = 0.0
|
|
|
| print(f"\n[MODEL INFO] FLOPs: {flops/1e9:.2f} G | Parameters: {params/1e6:.2f} M")
|
| print("="*125)
|
| print(f"TRAINING {MODEL_NAME.upper()} ON {DATASET_NAME}")
|
| print("-" * 125)
|
|
|
| try:
|
| for epoch in range(1, EPOCHS + 1):
|
|
|
| tracker = EmissionsTracker(measure_power_secs=1, save_to_file=False, log_level='error')
|
| tracker.start()
|
|
|
| model.train()
|
| epoch_start_time = time.time()
|
| running_loss, all_preds, all_labels, grad_norms = 0.0, [], [], []
|
|
|
| pbar = tqdm(enumerate(trainloader), total=len(trainloader), desc=f"Epoch {epoch}/{EPOCHS}")
|
|
|
| for i, (inputs, labels) in pbar:
|
| inputs, labels = inputs.to(DEVICE), labels.to(DEVICE)
|
|
|
| optimizer.zero_grad()
|
| outputs = model(inputs)
|
| loss = criterion(outputs, labels)
|
| loss.backward()
|
|
|
| grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=100)
|
| grad_norms.append(grad_norm.item())
|
| optimizer.step()
|
|
|
| running_loss += loss.item()
|
| _, predicted = torch.max(outputs.data, 1)
|
|
|
| pbar.set_postfix({'loss': f'{running_loss/(i+1):.4f}'})
|
|
|
| all_preds.extend(predicted.cpu().numpy())
|
| all_labels.extend(labels.cpu().numpy())
|
|
|
| scheduler.step()
|
| duration = time.time() - epoch_start_time
|
|
|
|
|
| emissions_kg = tracker.stop()
|
|
|
|
|
| e_gpu = tracker.final_emissions_data.gpu_energy * 3600000
|
| e_cpu = tracker.final_emissions_data.cpu_energy * 3600000
|
| e_ram = tracker.final_emissions_data.ram_energy * 3600000
|
| total_energy = e_gpu + e_cpu + e_ram
|
| cumulative_total_energy += total_energy
|
|
|
| acc = (np.array(all_preds) == np.array(all_labels)).mean()
|
| f1 = f1_score(all_labels, all_preds, average='macro')
|
| vram_peak = torch.cuda.max_memory_allocated(DEVICE) / (1024**3) if torch.cuda.is_available() else 0
|
| elapsed_total = time.time() - total_start_time
|
| avg_per_epoch = elapsed_total / epoch
|
| eta = str(timedelta(seconds=int(avg_per_epoch * (EPOCHS - epoch))))
|
|
|
|
|
| epoch_stats = {
|
| "epoch": epoch,
|
| "loss": running_loss / len(trainloader),
|
| "accuracy": acc,
|
| "f1_score": f1,
|
| "precision": precision_score(all_labels, all_preds, average='macro', zero_division=0),
|
| "recall": recall_score(all_labels, all_preds, average='macro', zero_division=0),
|
| "epoch_energy_gpu_j": e_gpu,
|
| "epoch_energy_cpu_j": e_cpu,
|
| "epoch_energy_ram_j": e_ram,
|
| "epoch_total_energy_j": total_energy,
|
| "cumulative_total_energy_j": cumulative_total_energy,
|
| "carbon_emissions_kg": emissions_kg,
|
| "vram_peak_gb": vram_peak,
|
| "avg_power_gpu_w": tracker.final_emissions_data.gpu_power,
|
| "avg_power_cpu_w": tracker.final_emissions_data.cpu_power,
|
| "avg_power_ram_w": tracker.final_emissions_data.ram_power,
|
| "latency_ms": (duration / len(trainloader)) * 1000,
|
| "avg_grad_norm": np.mean(grad_norms),
|
| "eag_metric": acc / (total_energy / 1000) if total_energy > 0 else 0,
|
| "it_per_sec": len(trainloader) / duration,
|
| "total_iterations": len(trainloader),
|
| "epoch_duration_sec": duration,
|
| "cumulative_time_sec": elapsed_total,
|
| "model_flops": flops,
|
| "model_parameters": params
|
| }
|
| results.append(epoch_stats)
|
| pd.DataFrame(results).to_csv(CSV_FILENAME, index=False)
|
|
|
| if acc > best_acc:
|
| best_acc = acc
|
| torch.save(model.state_dict(), os.path.join(SAVE_DIR, f"BEST_{MODEL_NAME}_{DATASET_NAME}.pth"))
|
| best_msg = " (Best Saved!)"
|
| else:
|
| best_msg = ""
|
|
|
| print(f"\nEpoch {epoch:02d} Summary: Loss: {epoch_stats['loss']:.4f} | Acc: {acc:.2%} | Energy: {total_energy:.2f}J | VRAM: {vram_peak:.2f}GB | ETA: {eta}{best_msg}\n")
|
| print("-" * 125)
|
|
|
| except Exception as e:
|
| print(f"\n[CRASH] Error: {e}")
|
| import traceback
|
| traceback.print_exc()
|
| finally:
|
| print(f"\n[SUCCESS] Training Complete. Results saved to {CSV_FILENAME}")
|
|
|
| if __name__ == '__main__':
|
| main() |