# Source: EDEN-Core-Scripts / test3 / eden_AlexNet_CIFAR100.py
# Uploaded by Shanmuk4622 with huggingface_hub
# ("Upload test3/eden_AlexNet_CIFAR100.py with huggingface_hub").
# Commit: 5ff8c0d (verified)
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision
import torchvision.transforms as transforms
from torch.utils.data import DataLoader, TensorDataset
from sklearn.metrics import f1_score, precision_score, recall_score
from codecarbon import EmissionsTracker
from thop import profile
import time
import pandas as pd
import numpy as np
import os
import warnings
import copy
from datetime import timedelta
# --- Configuration ---
# Identifiers used to build the CSV and checkpoint file names.
MODEL_NAME = "alexnet_EDEN"
DATASET_NAME = "CIFAR100"
# Dataset root. The default is the original author's local directory; set the
# CIFAR100_DATA_PATH environment variable to run on another machine without
# editing this file.
DATA_PATH = os.environ.get("CIFAR100_DATA_PATH", r'C:\Users\shanm\Dataset Download\CIFAR100')
BATCH_SIZE = 128
ACCUMULATION_STEPS = 4  # Effective Batch Size = BATCH_SIZE * ACCUMULATION_STEPS = 512
EPOCHS = 15
E_UNFREEZE = 10  # Epoch (1-based) at which every parameter is made trainable
LAMBDA_L1 = 1e-5  # Weight of the L1 penalty added to the classification loss
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Directory for the best-model checkpoint; created at import time.
SAVE_DIR = "saved_models"
os.makedirs(SAVE_DIR, exist_ok=True)
CSV_FILENAME = f"{MODEL_NAME}_{DATASET_NAME}_stats.csv"
# Silence all Python warnings and CodeCarbon's own logging for cleaner console output.
warnings.filterwarnings("ignore")
os.environ["CODECARBON_LOG_LEVEL"] = "error"
def main():
    """Fine-tune an ImageNet-pretrained AlexNet on CIFAR-100 and log per-epoch metrics.

    Pipeline:
      1. Load the CIFAR-100 training split, apply the resize/normalize transform
         once, and cache the resulting tensors in RAM.
      2. Profile FLOPs/parameters with ``thop`` on a throw-away copy of the model.
      3. Train for ``EPOCHS`` epochs with the convolutional backbone frozen until
         epoch ``E_UNFREEZE``, at which point all parameters are unfrozen and the
         learning rate is lowered to 1e-5. Training uses mixed precision (on
         CUDA), gradient accumulation over ``ACCUMULATION_STEPS`` batches,
         gradient clipping and an L1 penalty on the trainable parameters.
      4. After every epoch, rewrite ``CSV_FILENAME`` with all statistics so far
         and save the weights whenever training accuracy improves.

    All metrics (accuracy, F1, precision, recall) are computed on the training
    data seen during the epoch; no validation split is used.
    """
    # AMP, pinned memory and CUDA memory statistics only make sense on a GPU.
    use_cuda = DEVICE.type == "cuda"

    # --- Phase 1: Zero-Overhead Initialization (RAM Caching) ---
    transform = transforms.Compose([
        transforms.Resize(224),  # AlexNet pre-trained expects 224x224
        transforms.ToTensor(),
        transforms.Normalize((0.5071, 0.4867, 0.4408), (0.2673, 0.2564, 0.2762)),
    ])
    print(f"[*] Caching {DATASET_NAME} to System RAM for zero-I/O overhead...")
    try:
        full_dataset = torchvision.datasets.CIFAR100(root=DATA_PATH, train=True, download=False, transform=transform)
    except (RuntimeError, OSError):
        # Dataset not found under DATA_PATH itself: retry with its parent directory as root.
        full_dataset = torchvision.datasets.CIFAR100(root=os.path.dirname(DATA_PATH), train=True, download=False, transform=transform)

    # Fill one preallocated tensor instead of stacking a Python list afterwards:
    # the resized float images are large (about 0.6 MB each), and list + stack
    # would hold two full copies of the dataset in RAM at the same time.
    num_images = len(full_dataset)
    cached_images = None
    all_targets = []
    for i, (img, target) in enumerate(full_dataset):
        if cached_images is None:
            cached_images = torch.empty((num_images, *img.shape), dtype=img.dtype)
        cached_images[i] = img
        all_targets.append(target)
        if i % 10000 == 0:
            print(f" Loaded {i}/{num_images} images...")
    cached_trainset = TensorDataset(cached_images, torch.tensor(all_targets))
    trainloader = DataLoader(cached_trainset, batch_size=BATCH_SIZE, shuffle=True, pin_memory=use_cuda)
    num_batches = len(trainloader)

    # --- Model Setup ---
    model = torchvision.models.alexnet(weights='IMAGENET1K_V1')
    model.classifier[6] = nn.Linear(4096, 100)  # 100 classes for CIFAR-100

    # 1. Calculate FLOPs on a temporary clone to avoid hook attribute errors
    print("[*] Calculating hardware metrics (FLOPs/Params)...")
    model_for_profile = copy.deepcopy(model).to(DEVICE)
    dummy_input = torch.randn(1, 3, 224, 224).to(DEVICE)
    flops, params = profile(model_for_profile, inputs=(dummy_input, ), verbose=False)
    del model_for_profile  # Clean up clone

    # 2. Freeze backbone for EDEN Phase 2 (Initial State)
    for param in model.features.parameters():
        param.requires_grad = False
    model.to(DEVICE)

    criterion = nn.CrossEntropyLoss()
    # Every parameter (frozen ones included) is registered up front, so the
    # later unfreeze only has to flip requires_grad and adjust the learning rate.
    optimizer = optim.AdamW(model.parameters(), lr=1e-3)
    # For Automated Mixed Precision (AMP); a disabled scaler is a pass-through on CPU.
    scaler = torch.cuda.amp.GradScaler(enabled=use_cuda)

    results = []
    cumulative_total_energy = 0
    best_acc = 0.0
    tracker = EmissionsTracker(measure_power_secs=1, save_to_file=False, log_level='error')

    print(f"\n[MODEL INFO] FLOPs: {flops/1e9:.2f} G | Parameters: {params/1e6:.2f} M | Batch Size: {BATCH_SIZE}")
    print(f"{'='*140}")
    print(f"{'Epoch':<6} | {'Loss':<7} | {'Acc':<7} | {'Total(J)':<9} | {'VRAM(GB)':<9} | {'EAG':<8} | {'Status'}")
    print(f"{'-'*140}")

    for epoch in range(1, EPOCHS + 1):
        # --- EDEN Progressive Unfreezing ---
        if epoch == E_UNFREEZE:
            for param in model.parameters():
                param.requires_grad = True
            # Drop the learning rate for full-network fine-tuning.
            for param_group in optimizer.param_groups:
                param_group['lr'] = 1e-5
            status_msg = "UNFROZEN"
        else:
            status_msg = "FROZEN" if epoch < E_UNFREEZE else "FINE-TUNING"

        model.train()
        # NOTE(review): the same tracker object is started and stopped every
        # epoch. Confirm that final_emissions_data reports energy for the last
        # start/stop window only; if it is cumulative since construction,
        # total_energy and cumulative_total_energy_j below over-count.
        tracker.start()
        epoch_start_time = time.time()
        running_loss, all_preds, all_labels, grad_norms = 0.0, [], [], []

        optimizer.zero_grad()
        for i, (inputs, labels) in enumerate(trainloader):
            inputs, labels = inputs.to(DEVICE), labels.to(DEVICE)

            with torch.cuda.amp.autocast(enabled=use_cuda):
                outputs = model(inputs)
                cls_loss = criterion(outputs, labels)
                # EDEN Sparse Training Penalty (L1), over currently trainable parameters only
                l1_penalty = sum(p.abs().sum() for p in model.parameters() if p.requires_grad)
                loss = (cls_loss + LAMBDA_L1 * l1_penalty) / ACCUMULATION_STEPS

            scaler.scale(loss).backward()

            # Gradient Accumulation Logic.
            # Also step on the final batch of the epoch: when the number of
            # batches is not a multiple of ACCUMULATION_STEPS, the trailing
            # gradients would otherwise be wiped by the next epoch's zero_grad()
            # without ever being applied. (That last update still divides by
            # ACCUMULATION_STEPS, so it is slightly down-weighted.)
            is_last_batch = (i + 1) == num_batches
            if (i + 1) % ACCUMULATION_STEPS == 0 or is_last_batch:
                # Unscale before clipping so max_norm applies to the true gradients.
                scaler.unscale_(optimizer)
                grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
                grad_norms.append(grad_norm.item())

                scaler.step(optimizer)
                scaler.update()
                optimizer.zero_grad()

            # Report the plain classification loss (without L1 term or accumulation scaling).
            running_loss += cls_loss.item()
            _, predicted = torch.max(outputs.detach(), 1)
            all_preds.extend(predicted.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

        emissions_kg = tracker.stop()
        duration = time.time() - epoch_start_time

        # Energy Metrics (kWh -> Joules)
        e_gpu = tracker.final_emissions_data.gpu_energy * 3600000
        e_cpu = tracker.final_emissions_data.cpu_energy * 3600000
        e_ram = tracker.final_emissions_data.ram_energy * 3600000
        total_energy = e_gpu + e_cpu + e_ram
        cumulative_total_energy += total_energy

        acc = (np.array(all_preds) == np.array(all_labels)).mean()
        f1 = f1_score(all_labels, all_preds, average='macro')
        # CUDA memory statistics are unavailable on CPU-only runs; report 0 there.
        # The peak counter is never reset, so this is the peak since process start.
        vram_peak = torch.cuda.max_memory_allocated(DEVICE) / (1024**3) if use_cuda else 0.0
        # Accuracy per kilojoule consumed in this epoch.
        eag = acc / (total_energy / 1000) if total_energy > 0 else 0

        # CSV Logging
        epoch_stats = {
            "epoch": epoch, "status": status_msg, "loss": running_loss / num_batches,
            "accuracy": acc, "f1_score": f1,
            "precision": precision_score(all_labels, all_preds, average='macro', zero_division=0),
            "recall": recall_score(all_labels, all_preds, average='macro', zero_division=0),
            "energy_gpu_j": e_gpu, "energy_cpu_j": e_cpu, "energy_ram_j": e_ram,
            "total_energy_j": total_energy, "cumulative_total_energy_j": cumulative_total_energy,
            "carbon_kg": emissions_kg, "vram_gb": vram_peak,
            "latency_ms": (duration / num_batches) * 1000,
            "eag_metric": eag, "grad_norm": np.mean(grad_norms) if grad_norms else 0,
            "model_flops": flops, "model_params": params,
            "batch_size": BATCH_SIZE, "accumulation_steps": ACCUMULATION_STEPS
        }
        results.append(epoch_stats)
        # Rewrite the full CSV every epoch so partial results survive an interrupted run.
        pd.DataFrame(results).to_csv(CSV_FILENAME, index=False)

        if acc > best_acc:
            best_acc = acc
            torch.save(model.state_dict(), os.path.join(SAVE_DIR, f"BEST_{MODEL_NAME}_{DATASET_NAME}.pth"))
            best_tag = "*"
        else:
            best_tag = ""

        # Show progress against the configured epoch count (was hard-coded as 50).
        print(f"{epoch:02d}/{EPOCHS} | {epoch_stats['loss']:.4f} | {acc:.2%} | {total_energy:<9.2f} | {vram_peak:<9.3f} | {eag:<8.4f} | {status_msg}{best_tag}")

    print(f"{'='*140}\n[FINISH] AlexNet on CIFAR-100 saved to {CSV_FILENAME}")
# Script entry point: run the experiment only when this file is executed
# directly, not when it is imported as a module.
if "__main__" == __name__:
    main()