Upload eden_hf_upload.py with huggingface_hub

54418ce verified 19 days ago

18.7 kB

	"""
	Project EDEN - Hugging Face Upload Master Script
	Applies all 6 refinements:
	1. Hardware transparency (1080 Ti / Xeon W-2125)
	2. E2AM Phase mapping per model
	3. Phase 1 Zero-Overhead Initialization highlight
	4. Standardized Green Delta table in every README
	5. YAML tags with co2_eq_emissions + dataset_size
	6. Citation section in Main Repo
	"""

	import os
	import json
	import glob
	import math
	from huggingface_hub import HfApi, create_repo, upload_file

	# ─── CONFIG ──────────────────────────────────────────────────────────────────
	# Access token comes from the environment; an empty string is used when unset.
	HF_TOKEN = os.getenv("HF_TOKEN", "")
	# No organisation is used: repositories are created under the user namespace.
	HF_USER = "Shanmuk4622"
	HF_ORG = HF_USER
	# Directory containing this script; every relative path is resolved against it.
	BASE_DIR = os.path.dirname(os.path.abspath(__file__))
	# False -> live upload. True -> README files are only written locally.
	DRY_RUN = False

	api = HfApi(token=HF_TOKEN)

	# ─── HARDWARE PROFILE ────────────────────────────────────────────────────────
	# Profiling machine description; the values are interpolated into the
	# "Profiling Environment" tables of the generated READMEs.
	HARDWARE = dict(
	    gpu="NVIDIA GeForce GTX 1080 Ti (11 GB VRAM, 250 W TDP)",
	    cpu="Intel Xeon W-2125 (4 cores / 8 threads @ 4.00 GHz)",
	    ram="63.66 GB System RAM",
	    os="Windows 10",
	)

	# ─── E2AM PHASE MAP ──────────────────────────────────────────────────────────
	# Experiment folder name -> one-line technique label shown in the README
	# "Applied Technique" line.
	PHASE_MAP = dict(
	    test1="Phase 2 – Progressive Unfreezing + AMP (E2AM SOTA)",
	    test2="Baseline – Standard Full Training (Reference Study)",
	    test3="Phase 2 – EDEN Classic Energy-Aware Sparse Training",
	)

	# Experiment folder name -> multi-paragraph Markdown description of the
	# technique, inserted under "E2AM Algorithm — Applied Phases" in each README.
	# Paragraphs are separated by a blank line ("\n\n").
	PHASE_DETAIL = {
	    "test1": (
	        "Phase 1 – Zero-Overhead Initialization: Dataset pre-loaded into pinned "
	        "System RAM to eliminate disk I/O power spikes.\n\n"
	        "Phase 2 – Progressive Unfreezing: Backbone frozen for the first "
	        "`E_unfreeze` epochs (only the classification head trains). At `E_unfreeze`, "
	        "all layers are unfrozen and the learning rate is decayed. "
	        "Gradient accumulation over N micro-batches simulates large batch sizes "
	        "without proportional VRAM cost, slashing power-draw spikes.\n\n"
	        "AMP (Automated Mixed Precision): `torch.cuda.amp.autocast()` halves "
	        "GPU memory bandwidth, reducing energy per backward pass.\n\n"
	        # Plain "|" here: "\|" is an invalid string escape and would put literal
	        # backslashes inside the Markdown code span.
	        "Sparse Regularisation: L1 penalty `λ·Σ|W|` applied to trainable "
	        "weights, driving dead neurons to zero and enabling future pruning."
	    ),
	    "test2": (
	        "Standard full fine-tuning used as the Brute-Force Baseline for "
	        "energy comparison. All layers trained from epoch 1 with a fixed learning "
	        "rate and no gradient accumulation. Included for transparent EAG benchmarking."
	    ),
	    "test3": (
	        "Phase 1 – Zero-Overhead Initialization: Dataset cached in System RAM.\n\n"
	        "Phase 2 – EDEN Classic: Energy-aware training loop on classic CNN "
	        "architectures. Applies the same EAG early-exit criterion "
	        "(`EAG < γ_EAG` for 3 consecutive epochs → terminate), L1 sparsity "
	        "penalty, and AMP to architectures like ResNet, VGG, AlexNet, DenseNet, "
	        "InceptionV3, and UNet."
	    ),
	}

	# ─── DATASET META ────────────────────────────────────────────────────────────
	# Display name -> human-readable size and the dataset id written to the
	# README front matter. "unknown" is the fallback entry.
	DATASET_META = {
	    display_name: {"size": size_text, "hf_name": hub_id}
	    for display_name, size_text, hub_id in (
	        ("CIFAR-10", "60,000 images – 10 classes (32×32 px)", "cifar10"),
	        ("CIFAR-100", "60,000 images – 100 classes (32×32 px)", "cifar100"),
	        ("Custom-ImageNet300", "~450,000 images – 300 classes (224 px)", "imagenet"),
	        ("unknown", "N/A", "unknown"),
	    )
	}

	# CO2: 0.475 kg CO2e per kWh (global average grid factor).
	# The previous value (0.000000475, annotated "per Joule") matched neither the
	# constant's name nor a per-Joule conversion (0.475 / 3.6e6 ≈ 1.32e-7).
	KG_CO2_PER_KWH = 0.475

	# ─── HELPERS ─────────────────────────────────────────────────────────────────
	def parse_name(filename):
	    """Infer ``(arch, dataset)`` from a file name or path.

	    The input is lower-cased and backslashes are turned into forward slashes;
	    each rule table is then scanned in order and the first substring hit wins.
	    Either element is ``"unknown"`` when no rule matches.
	    """
	    haystack = filename.lower().replace("\\", "/")

	    # Order matters: "cifar100" must be tested before its prefix "cifar10".
	    dataset_rules = (
	        ("cifar100", "CIFAR-100"),
	        ("cifar10", "CIFAR-10"),
	        ("imagenet", "Custom-ImageNet300"),
	    )
	    arch_rules = (
	        ("efficientnet", "EfficientNetV2"),
	        ("convnext", "ConvNeXtV2"),
	        ("mobilevit", "MobileViTv3"),
	        ("resnet50", "ResNet50"),
	        ("resnet18", "ResNet18"),
	        ("vgg16", "VGG16"),
	        ("alexnet", "AlexNet"),
	        ("inception", "InceptionV3"),
	        ("densenet", "DenseNet121"),
	        ("unet", "UNet"),
	    )

	    def first_label(rules):
	        for needle, label in rules:
	            if needle in haystack:
	                return label
	        return "unknown"

	    return first_label(arch_rules), first_label(dataset_rules)

	def joules_to_co2(joules, grid_factor=0.475):
	    """Convert an energy reading in Joules to kilograms of CO2-equivalent.

	    Args:
	        joules: Energy in Joules.
	        grid_factor: Grid emission factor in kg CO2e per kWh. Defaults to
	            0.475, the value this function previously had hard-coded.

	    Returns:
	        Estimated emissions in kg CO2e.
	    """
	    joules_per_kwh = 3_600_000  # 1 kWh = 3.6 MJ
	    kwh = joules / joules_per_kwh
	    return kwh * grid_factor

	def folder_to_phase_label(folder):
	    """Return the short display label for an experiment folder.

	    Folder names without a known label are returned unchanged.
	    """
	    labels = {
	        "test1": "SOTA Optimized",
	        "test2": "Baseline",
	        "test3": "EDEN Classic",
	    }
	    try:
	        return labels[folder]
	    except KeyError:
	        return folder

	# ─── LOAD STATS ──────────────────────────────────────────────────────────────
	# results_summary.json is read as a list of records; the keys used below are
	# "file", "folder" and "energy". Explicit UTF-8 keeps decoding independent of
	# the platform's default locale encoding.
	with open(os.path.join(BASE_DIR, "results_summary.json"), encoding="utf-8") as f:
	    results = json.load(f)

	# One record per (folder, arch, dataset). A record with measured energy
	# replaces an earlier record for the same key whose energy is zero.
	stats_map = {}
	for r in results:
	    arch, dataset = parse_name(r["file"])
	    folder = r["folder"]
	    key = f"{folder}_{arch}_{dataset}"
	    if key not in stats_map or (r["energy"] > 0 and stats_map[key]["energy"] == 0):
	        stats_map[key] = r

	# Build baseline map: one "test2" record per dataset, preferring ResNet50.
	# The folder is read from the record itself rather than re-split from the key.
	baselines = {}
	for v in stats_map.values():
	    if v["folder"] != "test2":
	        continue
	    baseline_arch, ds = parse_name(v["file"])
	    if ds not in baselines or baseline_arch == "ResNet50":
	        baselines[ds] = v

	# ─── COLLECT ALL MODELS ──────────────────────────────────────────────────────
	# "**" combined with recursive=True walks every sub-directory. The former
	# pattern "*/.pth" only matched files literally named ".pth" one level down.
	# Sorting makes the later "highest accuracy wins" tie-break deterministic.
	pth_files = sorted(glob.glob(os.path.join(BASE_DIR, "**", "*.pth"), recursive=True))
	models = []
	for pth in pth_files:
	    rel = os.path.relpath(pth, BASE_DIR)
	    # First path component is the experiment folder (e.g. "test1").
	    folder = rel.split(os.sep)[0]
	    arch, dataset = parse_name(rel)
	    key = f"{folder}_{arch}_{dataset}"
	    # Checkpoints without a matching stats record fall back to zeros / "N/A".
	    stat = stats_map.get(key, {})
	    models.append({
	        "pth": rel,
	        "arch": arch,
	        "dataset": dataset,
	        "folder": folder,
	        "accuracy": stat.get("accuracy", 0),
	        "energy": stat.get("energy", 0),
	        "time": stat.get("time", 0),
	        "csv": stat.get("file", "N/A"),
	    })

	# ─── README GENERATOR ────────────────────────────────────────────────────────
	def build_readme(model):
	    """Render the model-card README (YAML front matter + Markdown) for one model.

	    The text is assembled from explicit line lists so that it does not depend
	    on how this source file is indented.

	    Args:
	        model: dict with the keys "arch", "dataset", "folder", "accuracy",
	            "energy", "time" and "csv".

	    Returns:
	        The complete README as a single string ending with a newline.
	    """
	    arch = model["arch"]
	    dataset = model["dataset"]
	    folder = model["folder"]
	    acc = model["accuracy"]
	    energy = model["energy"]
	    t = model["time"]
	    phase = folder_to_phase_label(folder)
	    ds_meta = DATASET_META.get(dataset, DATASET_META["unknown"])
	    co2 = joules_to_co2(energy) if energy else 0

	    baseline = baselines.get(dataset, {})
	    b_acc = baseline.get("accuracy", 0)
	    b_energy = baseline.get("energy", 0)
	    # Without a baseline record the column is just "Baseline" (it used to
	    # render as "Baseline Baseline").
	    if baseline:
	        baseline_col = f"{parse_name(baseline.get('file', ''))[0]} Baseline"
	    else:
	        baseline_col = "Baseline"

	    # Green Delta: only computed when both energy readings are non-zero.
	    eag_str = savings_str = acc_delta = "N/A"
	    if b_energy and energy:
	        d_acc = acc - b_acc
	        d_j = energy - b_energy
	        savings_str = f"{(b_energy - energy) / b_energy * 100:.2f}%"
	        acc_delta = f"{d_acc*100:+.2f}%"
	        # Identical energy gives an undefined gradient: keep "N/A", not "nan".
	        if d_j != 0:
	            eag_str = f"{d_acc / d_j:.4e}"

	    # YAML front matter
	    arch_tag = arch.lower().replace(" ", "")
	    # The Hub's co2_eq_emissions.emissions field is defined in grams of CO2,
	    # so the kg estimate is converted; the README body keeps reporting kg.
	    yaml_co2 = f"{co2 * 1000:.2f}" if co2 else "0"
	    dataset_size = ds_meta["size"]
	    front_matter = [
	        "---",
	        "language: en",
	        "license: apache-2.0",
	        "tags:",
	        "- image-classification",
	        "- green-ai",
	        "- energy-efficiency",
	        "- computer-vision",
	        f"- {arch_tag}",
	        "- eden-framework",
	        "- e2am",
	        "- sustainable-ai",
	        "datasets:",
	        f"- {ds_meta['hf_name']}",
	        "metrics:",
	        "- accuracy",
	        "co2_eq_emissions:",
	        # Nested keys must be indented to belong to the mapping above.
	        f"  emissions: {yaml_co2}",
	        "  source: Estimated via CodeCarbon (grid factor 0.475 kg CO2e/kWh)",
	        "  hardware_used: NVIDIA GeForce GTX 1080 Ti",
	        "dataset_info:",
	        f"  dataset_size: \"{dataset_size}\"",
	        "---",
	    ]

	    # Technique section
	    technique = PHASE_DETAIL.get(folder, "Standard training.")

	    # Green Delta Table (plain "|" separators so Markdown renders a table)
	    green_table = [
	        f"| Metric | {baseline_col} | {arch} (EDEN) | Δ |",
	        "|---|---|---|---|",
	        f"| Accuracy | {b_acc:.4f} | {acc:.4f} | `{acc_delta}` |",
	        f"| Total Energy (J) | {b_energy:,.0f} | {energy:,.0f} | `{savings_str} saved` |",
	        f"| CO₂ Emissions (kg) | {joules_to_co2(b_energy):.4f} | {co2:.4f} | — |",
	        f"| EAG Score | — | {eag_str} | ΔAcc/ΔJoules |",
	    ]

	    cite = [
	        "## Cite This Research",
	        "If you use this model, please cite the EDEN / E2AM Framework:",
	        "",
	        "```bibtex",
	        "@misc{eden2025,",
	        "title = {Project EDEN: Energy-Driven Evolution of Networks},",
	        "author = {EDEN Research Team},",
	        "year = {2025},",
	        "note = {Hugging Face Organization: ProjectEDEN},",
	        f"url = {{https://huggingface.co/{HF_ORG}}}",
	        "}",
	        "```",
	    ]

	    body = [
	        "",
	        f"# EDEN-{arch}-{dataset} — {phase}",
	        "",
	        f"> Primary KPI: EAG (Energy-to-Accuracy Gradient) = `{eag_str}` ΔAcc/ΔJoules",
	        "",
	        "## Abstract",
	        "This model is part of Project EDEN (Energy-Driven Evolution of Networks), implementing the E2AM (Energy Efficient Advanced Model) Framework. The goal is to shift AI benchmarking from pure accuracy to Green SOTA — maximizing predictive power per Joule consumed.",
	        "",
	        f"Applied Technique: {PHASE_MAP.get(folder, phase)}",
	        "",
	        "## Profiling Environment",
	        "| Component | Specification |",
	        "|---|---|",
	        f"| GPU | {HARDWARE['gpu']} |",
	        f"| CPU | {HARDWARE['cpu']} |",
	        f"| RAM | {HARDWARE['ram']} |",
	        f"| OS | {HARDWARE['os']} |",
	        f"| Dataset | {dataset} — {dataset_size} |",
	        "",
	        "## 🟢 Green Delta Table",
	        "Comparing this model against the reference baseline (ResNet-50 equivalent)",
	        "",
	        *green_table,
	        "",
	        "> A positive EAG means this model learns more per Joule than the baseline.",
	        "> A negative EAG indicates a trade-off where higher accuracy required more energy investment.",
	        "",
	        "## E2AM Algorithm — Applied Phases",
	        "",
	        technique,
	        "",
	        "## Training Statistics",
	        "| Metric | Value |",
	        "|---|---|",
	        f"| Final Accuracy | {acc:.4f} ({acc*100:.2f}%) |",
	        f"| Total Energy Consumed | {energy:,.0f} J ({energy/3_600_000:.4f} kWh) |",
	        f"| Training Time | {t:,.0f} s ({t/3600:.2f} hrs) |",
	        f"| Estimated CO₂ | {co2:.4f} kg CO₂e |",
	        f"| Training Log | `{model['csv']}` |",
	        "",
	        *cite,
	    ]

	    return "\n".join(front_matter + body) + "\n"

	# ─── MAIN FRAMEWORK README ───────────────────────────────────────────────────
	def build_main_repo_readme():
	    """Render the README for the EDEN-Core-Scripts repository.

	    Lists every ``.py`` file under ``BASE_DIR`` whose relative path contains
	    "Algo_", "eden_" or "mobilevit_model". The text is assembled from an
	    explicit line list so it does not depend on this file's indentation.

	    Returns:
	        The complete README as a single string ending with a newline.
	    """
	    keywords = ("Algo_", "eden_", "mobilevit_model")
	    # "**" + recursive=True walks all sub-directories; the former "*/.py"
	    # pattern only matched files literally named ".py".
	    found = glob.glob(os.path.join(BASE_DIR, "**", "*.py"), recursive=True)
	    # Filter on the path relative to BASE_DIR (as the uploader does) so that a
	    # keyword appearing in a parent directory name cannot match everything.
	    rel_paths = (os.path.relpath(p, BASE_DIR) for p in found)
	    py_scripts = sorted(rel for rel in rel_paths if any(k in rel for k in keywords))

	    scripts_md = "\n".join(f"- `{s}`" for s in py_scripts)

	    lines = [
	        "---",
	        "language: en",
	        "license: apache-2.0",
	        "tags:",
	        "- green-ai",
	        "- energy-efficiency",
	        "- e2am",
	        "- eden-framework",
	        "- sustainable-ai",
	        "- image-classification",
	        "---",
	        "",
	        "# EDEN-Core-Scripts — E2AM Framework Repository",
	        "",
	        "> Project EDEN (Energy-Driven Evolution of Networks) — The complete algorithmic",
	        "> toolkit for Green SOTA image classification research.",
	        "",
	        "## Why EDEN?",
	        "As deep learning models scale exponentially, the carbon footprint of training has",
	        "reached unsustainable levels. Project EDEN introduces the **EAG",
	        "(Energy-to-Accuracy Gradient)** as the primary KPI — shifting the paradigm from",
	        "chasing raw accuracy to optimising Green SOTA.",
	        "",
	        "## Profiling Environment",
	        # Plain "|" separators so Markdown renders a table.
	        "| Component | Specification |",
	        "|---|---|",
	        f"| GPU | {HARDWARE['gpu']} |",
	        f"| CPU | {HARDWARE['cpu']} |",
	        f"| RAM | {HARDWARE['ram']} |",
	        f"| OS | {HARDWARE['os']} |",
	        "",
	        "## The E2AM Algorithm — All Three Phases",
	        "",
	        "### Phase 1 — Zero-Overhead Initialization",
	        "Dataset pre-loaded into pinned System RAM before training begins.",
	        "This eliminates disk I/O power spikes that would otherwise inflate energy readings",
	        "and distort EAG comparisons between architectures.",
	        "",
	        "### Phase 2 — Two-Stage Energy-Aware Training",
	        "1. Frozen Head Training — Only the classification head trains for the first",
	        "`E_unfreeze` epochs. The backbone consumes no backward-pass energy.",
	        "2. Progressive Unfreezing — At epoch `E_unfreeze`, all layers unlock.",
	        "Learning rate is decayed (`LR × 0.1`) for stable fine-tuning.",
	        "3. Gradient Accumulation — Gradients accumulated over N micro-batches,",
	        "simulating large batch sizes without VRAM spikes.",
	        "4. AMP (Automated Mixed Precision) — `torch.cuda.amp.autocast()` halves",
	        "bandwidth per backward pass.",
	        "5. Sparse L1 Penalty — `L_total = CrossEntropy + λ·Σ|W_trainable|`",
	        "6. EAG Early-Exit — Training terminates if `EAG < γ_EAG` for 3 consecutive",
	        "epochs, preventing wasted compute.",
	        "",
	        "### Phase 3 — Hardware-Aware Deployment (Post-Training)",
	        "- Saliency-Energy Pruning — Filters with lowest `∂Accuracy/∂W ÷ Energy_cost`",
	        "are pruned.",
	        "- INT8 Quantization — Weights converted for edge-deployment readiness.",
	        "- Dynamic Depth Routing — Simple images bypass the middle 50 % of layers",
	        "via residual skip connections, slashing inference energy.",
	        "",
	        "## EAG — The Expert KPI",
	        "```",
	        "EAG = ΔAccuracy / ΔJoules",
	        "```",
	        "EAG allows apples-to-apples comparison of any two models regardless of",
	        "architecture family. A higher EAG = more learning per unit of carbon footprint.",
	        "",
	        "## Scripts in This Repository",
	        scripts_md,
	        "",
	        "## Cite This Research",
	        "```bibtex",
	        "@misc{eden2025,",
	        "title = {Project EDEN: Energy-Driven Evolution of Networks},",
	        "author = {EDEN Research Team},",
	        "year = {2025},",
	        "note = {Hugging Face Organization: ProjectEDEN},",
	        f"url = {{https://huggingface.co/{HF_ORG}}}",
	        "}",
	        "```",
	    ]
	    return "\n".join(lines) + "\n"

	# ─── OUTPUT / UPLOAD ─────────────────────────────────────────────────────────
	# All generated READMEs are staged in <BASE_DIR>/hf_readmes before any upload.
	OUT_DIR = os.path.join(BASE_DIR, "hf_readmes")
	os.makedirs(OUT_DIR, exist_ok=True)

	# 1. Main repo README
	main_readme = build_main_repo_readme()
	main_readme_path = os.path.join(OUT_DIR, "EDEN-Core-Scripts_README.md")
	with open(main_readme_path, "w", encoding="utf-8") as f:
	    f.write(main_readme)
	print("✓ Main repo README written.")

	# 2. Per-model READMEs (deduplicated by repo name)
	generated_repos = set()
	repo_model_map = {}  # repo_name -> (model, readme_text)

	for m in models:
	    # Checkpoints whose architecture or dataset could not be inferred are skipped.
	    if "unknown" in (m["arch"], m["dataset"]):
	        continue
	    repo_name = "EDEN-{}-{}".format(m["arch"], m["dataset"].replace(" ", "-"))
	    incumbent = repo_model_map.get(repo_name)
	    # Keep the incumbent unless this model is strictly more accurate.
	    if incumbent is not None and not (m["accuracy"] > incumbent[0]["accuracy"]):
	        continue
	    repo_model_map[repo_name] = (m, build_readme(m))

	for repo_name, (m, readme_text) in repo_model_map.items():
	    path = os.path.join(OUT_DIR, f"{repo_name}_README.md")
	    with open(path, "w", encoding="utf-8") as f:
	        f.write(readme_text)
	    print(f"✓ {repo_name} README written.")

	separator = "=" * 60
	print(f"\n{separator}")
	print(f"Generated {len(repo_model_map)+1} README files in: {OUT_DIR}")

	def _upload_model_file(repo_id, local_path, path_in_repo):
	    """Upload one local file to ``path_in_repo`` inside a Hub model repo."""
	    upload_file(path_or_fileobj=local_path,
	                path_in_repo=path_in_repo,
	                repo_id=repo_id,
	                token=HF_TOKEN, repo_type="model")


	if not DRY_RUN:
	    print("\nStarting HF upload...")

	    # Upload Main Repo README
	    core_repo_id = f"{HF_ORG}/EDEN-Core-Scripts"
	    try:
	        create_repo(repo_id=core_repo_id, token=HF_TOKEN,
	                    repo_type="model", exist_ok=True, private=False)
	        _upload_model_file(core_repo_id, main_readme_path, "README.md")
	        # Upload all matching .py scripts. "**" + recursive=True walks every
	        # sub-directory; the former "*/.py" pattern only matched files
	        # literally named ".py", so no script was ever uploaded.
	        for py in sorted(glob.glob(os.path.join(BASE_DIR, "**", "*.py"), recursive=True)):
	            rel = os.path.relpath(py, BASE_DIR)
	            if any(k in rel for k in ["Algo_", "eden_", "mobilevit_model"]):
	                # Repo paths always use forward slashes, also on Windows.
	                _upload_model_file(core_repo_id, py, rel.replace("\\", "/"))
	        print("✓ Uploaded EDEN-Core-Scripts")
	    except Exception as e:
	        # Best effort: report the failure and continue with the model repos.
	        print(f"✗ Core-Scripts error: {e}")

	    # Upload per-model repos
	    for repo_name, (m, readme_text) in repo_model_map.items():
	        repo_id = f"{HF_ORG}/{repo_name}"
	        try:
	            create_repo(repo_id=repo_id, token=HF_TOKEN,
	                        repo_type="model", exist_ok=True, private=False)
	            readme_path = os.path.join(OUT_DIR, f"{repo_name}_README.md")
	            _upload_model_file(repo_id, readme_path, "README.md")
	            # Upload weights
	            pth_abs = os.path.join(BASE_DIR, m["pth"])
	            if os.path.exists(pth_abs):
	                _upload_model_file(repo_id, pth_abs, os.path.basename(m["pth"]))
	            # Upload CSV log ("N/A" marks a model without a stats record)
	            if m["csv"] != "N/A":
	                csv_abs = os.path.join(BASE_DIR, m["csv"])
	                if os.path.exists(csv_abs):
	                    _upload_model_file(repo_id, csv_abs, os.path.basename(m["csv"]))
	            print(f"✓ Uploaded {repo_name}")
	        except Exception as e:
	            # Best effort: one failing repo must not stop the remaining uploads.
	            print(f"✗ {repo_name} error: {e}")

	    print("\nAll uploads complete.")
	else:
	    print("\n[DRY RUN] Set DRY_RUN=False to execute HF uploads.")