Spaces:

marcellorusso
/

orchid-ncd

Running

App Files Files Community

orchid-ncd / backend /scripts /hf_tools /upload_models.py

marcellorusso

Sync from GitHub: 69fcfd3

c38595c verified 8 days ago

raw

history blame contribute delete

13.3 kB

	"""Upload only essential model files to HuggingFace (best fold + metadata).

	Usage: python upload_models.py [--cleanup] [--exp all|exp6_ce|exp7_supcon|exp8_sphor|exp9_ncd|exp10_mgh_ncd]
	"""
	import json
	import os
	import sys
	from pathlib import Path

	from dotenv import load_dotenv
	from huggingface_hub import HfApi

	# Load HF_TOKEN from a .env file, if one exists two directories above this
	# script's folder (NOTE(review): presumably the project root — confirm).
	_env = Path(__file__).resolve().parent.parent.parent / ".env"
	if _env.exists():
	    load_dotenv(_env)

	# Hugging Face model repository that receives the uploads.
	HF_REPO = "marcellorusso/orchid-ncd-models"

	# Absolute path of the local experiments directory
	# (<this script's folder>/../../experiments).
	EXP_BASE = os.path.abspath(
	    os.path.join(os.path.dirname(__file__), "..", "..", "experiments")
	)

	# Model variants; each experiment directory is named "<prefix>_<model>".
	MODELS = [
	    "resnet18",
	    "resnet50",
	    "convnext_tiny",
	    "convnext_small",
	    "dinov2_small",
	    "dinov2_base",
	]

	# Human-readable names used for the README results tables.
	VARIANT_DISPLAY = {
	    "resnet18": "ResNet-18",
	    "resnet50": "ResNet-50",
	    "convnext_tiny": "ConvNeXt-Tiny",
	    "convnext_small": "ConvNeXt-Small",
	    "dinov2_small": "DINOv2-Small",
	    "dinov2_base": "DINOv2-Base",
	}

	# One entry per experiment:
	#   prefix - prefix of the local experiment directories under EXP_BASE
	#   exp    - top-level folder used inside the HF repository
	#   label  - display label printed while uploading
	EXPERIMENTS = [
	    {"prefix": "exp6_clean_split", "exp": "exp6_ce", "label": "Exp 6 (CE)"},
	    {"prefix": "exp7_supcon", "exp": "exp7_supcon", "label": "Exp 7 (SupCon)"},
	    {"prefix": "exp8_sphor", "exp": "exp8_sphor", "label": "Exp 8 (SpHOR)"},
	    {"prefix": "exp9_ncd", "exp": "exp9_ncd", "label": "Exp 9 (NCD)"},
	    {"prefix": "exp10_mgh_ncd", "exp": "exp10_mgh_ncd", "label": "Exp 10 (MGH-NCD)"},
	]


	def upload_model(api, model, exp, exp_prefix):
	    """Upload the essential artifacts of one trained model to the HF repo.

	    The local experiment directory is ``EXP_BASE/<exp_prefix>_<model>`` and
	    the destination is ``<exp>/<exp_prefix>_<model>`` inside ``HF_REPO``.
	    Uploaded items, each only if present locally: ``results.json``,
	    ``config.json``, ``best_fold_<N>.pt``, the ``best_fold_<N>_adapter``
	    folder and the ``embeddings`` folder, where ``N`` is the ``best_fold``
	    value read from ``results.json``.

	    Args:
	        api: Client exposing ``upload_file`` and ``upload_folder``
	            (an ``HfApi`` instance in this script).
	        model: Model key, e.g. ``"resnet18"``.
	        exp: Name of the experiment folder inside the HF repository.
	        exp_prefix: Prefix of the local experiment directory name.

	    Returns:
	        ``True`` when the model was uploaded; ``False`` when it was skipped
	        because ``results.json``, its ``best_fold`` entry, or the best-fold
	        weights file is missing.
	    """
	    exp_name = f"{exp_prefix}_{model}"
	    local_dir = os.path.join(EXP_BASE, exp_name)
	    remote_dir = f"{exp}/{exp_name}"

	    results_path = os.path.join(local_dir, "results.json")
	    if not os.path.exists(results_path):
	        print(f" SKIP {exp_name}: no results.json")
	        return False

	    # Context manager so the handle is closed even if the JSON is malformed.
	    with open(results_path, encoding="utf-8") as fh:
	        results = json.load(fh)

	    best_fold = results.get("best_fold")
	    if best_fold is None:
	        print(f" SKIP {exp_name}: no best_fold in results")
	        return False

	    best_pt = f"best_fold_{best_fold}.pt"
	    if not os.path.exists(os.path.join(local_dir, best_pt)):
	        print(f" SKIP {exp_name}: {best_pt} not found")
	        return False

	    mean_f1 = results.get("mean_f1", 0)
	    # "or {}" also covers an explicit null "test_metrics" entry.
	    test_f1 = (results.get("test_metrics") or {}).get("f1_macro", 0)
	    print(f"\n Uploading {exp_name} (Val F1={mean_f1:.4f}, Test F1={test_f1:.4f})")

	    # Single files keep the same name locally and remotely.
	    for file_name in ("results.json", "config.json", best_pt):
	        local_path = os.path.join(local_dir, file_name)
	        if not os.path.exists(local_path):
	            continue  # config.json is optional
	        size_kb = os.path.getsize(local_path) // 1024
	        print(f" {file_name} ({size_kb}KB)")
	        api.upload_file(
	            repo_id=HF_REPO,
	            path_or_fileobj=local_path,
	            path_in_repo=f"{remote_dir}/{file_name}",
	        )

	    # Optional folders, mirrored under the same name in the repository.
	    for folder_name in (f"best_fold_{best_fold}_adapter", "embeddings"):
	        folder_path = os.path.join(local_dir, folder_name)
	        if not os.path.isdir(folder_path):
	            continue
	        print(f" {folder_name}/")
	        api.upload_folder(
	            repo_id=HF_REPO,
	            folder_path=folder_path,
	            path_in_repo=f"{remote_dir}/{folder_name}",
	        )

	    print(f" Done: {exp_name}")
	    return True


	def _is_stale_artifact(fname, best_fold):
	    """Tell whether a repo file (path relative to its model folder) is stale.

	    Kept: ``results.json``, ``config.json``, the best-fold weights and
	    anything whose path starts with the best-fold adapter folder name.
	    Stale: any other ``best_fold_*.pt``, ``fold_*.json`` and ``REPORT_*``.
	    Every other file is left alone (returns ``False``).
	    """
	    best_pt = f"best_fold_{best_fold}.pt"
	    best_adapter = f"best_fold_{best_fold}_adapter"
	    if fname in ("results.json", "config.json", best_pt):
	        return False
	    if fname.startswith(best_adapter):
	        return False
	    return (
	        (fname.startswith("best_fold_") and fname.endswith(".pt"))
	        or (fname.startswith("fold_") and fname.endswith(".json"))
	        or fname.startswith("REPORT_")
	    )


	def cleanup_old_files(api):
	    """Remove old fold weights and non-best fold files from HF.

	    For every experiment/model pair that has a local ``results.json`` with a
	    ``best_fold`` entry, delete the stale files found under
	    ``<exp>/<prefix>_<model>/`` in ``HF_REPO``. Pairs without local results
	    are skipped, so their remote files are never touched.

	    Args:
	        api: Client exposing ``list_repo_files`` and ``delete_file``
	            (an ``HfApi`` instance in this script).
	    """
	    print("\n Cleaning up old files from HF...")
	    all_files = api.list_repo_files(HF_REPO)
	    removed = 0

	    for exp_config in EXPERIMENTS:
	        exp = exp_config["exp"]
	        exp_prefix = exp_config["prefix"]

	        for model in MODELS:
	            exp_name = f"{exp_prefix}_{model}"
	            local_results = os.path.join(EXP_BASE, exp_name, "results.json")
	            if not os.path.exists(local_results):
	                continue

	            # Context manager so the handle is not leaked on every iteration.
	            with open(local_results, encoding="utf-8") as fh:
	                best_fold = json.load(fh).get("best_fold")
	            if best_fold is None:
	                continue

	            prefix = f"{exp}/{exp_name}/"
	            for f in all_files:
	                if not f.startswith(prefix):
	                    continue
	                if _is_stale_artifact(f[len(prefix):], best_fold):
	                    print(f" DELETE {f}")
	                    api.delete_file(repo_id=HF_REPO, path_in_repo=f)
	                    removed += 1

	    print(f" Cleaned up {removed} old files")


	def _build_results_table(exp, exp_prefix):
	    """Build the Markdown table rows for one experiment.

	    Reads ``EXP_BASE/<exp_prefix>_<model>/results.json`` for every model in
	    ``MODELS`` and formats one row per model that has results. Missing
	    metrics default to 0, ``best_fold`` to ``"?"`` and ``num_folds`` to 5.

	    Args:
	        exp: Experiment folder name. Not used by this function; kept so
	            existing callers keep working.
	        exp_prefix: Prefix of the local experiment directory names.

	    Returns:
	        List of row strings with the columns: model, validation F1
	        (mean ± std), test F1, test accuracy, best fold, number of folds.
	    """
	    rows = []
	    for model in MODELS:
	        results_path = os.path.join(EXP_BASE, f"{exp_prefix}_{model}", "results.json")
	        if not os.path.exists(results_path):
	            continue
	        with open(results_path, encoding="utf-8") as fh:
	            r = json.load(fh)

	        mean_f1 = r.get("mean_f1", 0)
	        std_f1 = r.get("std_f1", 0)
	        tm = r.get("test_metrics") or {}  # tolerate an explicit null
	        test_acc = tm.get("accuracy", 0)
	        test_f1 = tm.get("f1_macro", 0)
	        best_fold = r.get("best_fold", "?")
	        num_folds = r.get("num_folds", 5)
	        display = VARIANT_DISPLAY.get(model, model)
	        # Plain pipes: a backslash-escaped pipe is rendered as literal text
	        # by Markdown and would stop the rows from forming a table.
	        rows.append(
	            f"| {display} | {mean_f1:.4f} ± {std_f1:.4f} | {test_f1:.4f} "
	            f"| {test_acc:.4f} | {best_fold} | {num_folds} |"
	        )
	    return rows


	def _split_table_row(row):
	    """Split a Markdown table row into stripped cell strings.

	    Accepts rows written with plain pipes as well as rows whose pipes are
	    backslash-escaped, so both row formats parse identically. Index 0 is the
	    (empty) text before the first pipe; the first real cell is at index 1.
	    """
	    return [cell.strip() for cell in row.replace("\\|", "|").split("|")]


	def _build_results_yaml(exp6_rows, exp7_rows, exp8_rows, exp9_rows=None, exp10_rows=None):
	    """Generate structured YAML for model evaluation results.

	    Builds a ``model-index:`` mapping with one entry per table row. Each
	    entry records the model name (first table column) and the mean
	    validation F1 (the number before ``±`` in the second column), with the
	    experiment label as the result source.

	    Args:
	        exp6_rows, exp7_rows, exp8_rows: Table rows as produced by
	            ``_build_results_table``.
	        exp9_rows, exp10_rows: Same; ``None`` is treated as no rows.

	    Returns:
	        The YAML text without a trailing newline, or ``""`` when there are
	        no rows at all.
	    """
	    labelled_rows = [
	        ("Exp 6 (CE)", exp6_rows),
	        ("Exp 7 (SupCon)", exp7_rows),
	        ("Exp 8 (SpHOR)", exp8_rows),
	        ("Exp 9 (NCD)", exp9_rows or []),
	        ("Exp 10 (Ultimate)", exp10_rows or []),
	    ]

	    entries = []
	    for exp_label, rows in labelled_rows:
	        for row in rows:
	            cells = _split_table_row(row)
	            model_name = cells[1]
	            f1_val = cells[2].split(" ")[0]  # "0.9000 ± 0.0100" -> "0.9000"
	            entries.append((exp_label, model_name, f1_val))

	    if not entries:
	        return ""

	    # The leading spaces encode the YAML nesting and must stay exact.
	    lines = ["model-index:"]
	    for exp_label, model_name, f1_val in entries:
	        lines.extend([
	            f"  - name: {model_name}",
	            "    results:",
	            "      - task:",
	            "          type: image-classification",
	            "        dataset:",
	            "          name: orchid-ncd-dataset",
	            "          type: marcellorusso/orchid-ncd-dataset",
	            "        metrics:",
	            "          - name: Macro F1 Score",
	            "            type: f1_macro",
	            f"            value: {f1_val}",
	            "        source:",
	            f"          name: {exp_label}",
	            "          url: https://huggingface.co/datasets/marcellorusso/orchid-ncd-dataset",
	        ])

	    return "\n".join(lines)


	def _format_section(title, paragraphs, header, rows):
	    """Render one experiment section: heading, paragraphs, results table.

	    Rows may use plain or backslash-escaped pipes; escaped pipes are turned
	    into plain ones so the rows line up with ``header`` as a Markdown table.
	    """
	    table_lines = [header] + [row.replace("\\|", "|") for row in rows]
	    return "\n\n".join([f"## {title}", *paragraphs, "\n".join(table_lines)])


	def _render_readme(sections, results_yaml):
	    """Assemble the README text: YAML front matter followed by the body.

	    Args:
	        sections: Rendered experiment sections, in display order.
	        results_yaml: ``model-index`` YAML to embed in the front matter;
	            omitted when empty.

	    Returns:
	        The complete README as a string ending with a newline.
	    """
	    front_matter = [
	        "---",
	        "library_name: pytorch",
	        "tags:",
	        "- computer-vision",
	        "- image-classification",
	        "- fine-grained-classification",
	        "- ophrys-orchids",
	        "- resnet",
	        "- convnext",
	        "- dinov2",
	        "datasets:",
	        "- marcellorusso/orchid-ncd-dataset",
	        "license: mit",
	        "pipeline_tag: image-classification",
	    ]
	    if results_yaml:
	        front_matter.append(results_yaml)
	    front_matter.append("---")

	    body = [
	        "",
	        "# OrchID-NCD Models",
	        "",
	        "Trained model weights for the [OrchID-NCD](https://huggingface.co/spaces/marcellorusso/orchid-ncd) project — ultra-fine-grained visual classification of Ophrys orchids.",
	        "",
	        "Fine-grained classification of six cryptic Ophrys species (O. exaltata, O. garganica, O. incubacea, O. majellensis, O. sphegodes, O. sphegodes Palena) using ResNet-18/50, ConvNeXt-Tiny/Small, and DINOv2-Small/Base.",
	        "",
	        # Blank line between sections keeps each table separate from the
	        # next heading.
	        "\n\n".join(sections),
	        "",
	        "## Structure",
	        "",
	        "```",
	        "exp6_ce/ — Exp 6 (Cross-Entropy)",
	        "  exp6_clean_split_resnet18/",
	        "    results.json — aggregated metrics + test results",
	        "    config.json — training configuration",
	        "    best_fold_N.pt — weights of the best fold",
	        "    best_fold_N_adapter/ — LoRA adapter (DINOv2 only)",
	        "exp7_supcon/ — Exp 7 (SupCon + CE fine-tune)",
	        "  exp7_supcon_resnet18/",
	        "    ...",
	        "exp8_sphor/ — Exp 8 (SpHOR)",
	        "  exp8_sphor_resnet18/",
	        "    ...",
	        "exp9_ncd/ — Exp 9 (NCD)",
	        "  exp9_ncd_resnet18/",
	        "    ...",
	        "exp10_mgh_ncd/ — Exp 10 (MGH-NCD)",
	        "  exp10_mgh_ncd_resnet18/",
	        "    ...",
	        "```",
	        "",
	        "## Usage",
	        "",
	        "The classifier in the [OrchID-NCD Space](https://huggingface.co/spaces/marcellorusso/orchid-ncd) downloads these weights at startup and uses them for inference.",
	        "",
	        "## Links",
	        "",
	        "- [Dataset](https://huggingface.co/datasets/marcellorusso/orchid-ncd-dataset)",
	        "- [Live Demo](https://huggingface.co/spaces/marcellorusso/orchid-ncd)",
	        "- [GitHub](https://github.com/squidslab/OrchID)",
	    ]
	    return "\n".join(front_matter + body) + "\n"


	def update_readme(api):
	    """Update HF model repo README with current results.

	    Builds one results table per experiment from the local ``results.json``
	    files, renders the README (experiments without results are left out) and
	    uploads it to ``HF_REPO`` as ``README.md``.

	    Args:
	        api: Client exposing ``upload_file`` (an ``HfApi`` instance in this
	            script).
	    """
	    header = (
	        "| Model | Val F1 (macro) | Test F1 | Test Acc | Best Fold | Folds |\n"
	        "|---|---|---|---|---|---|"
	    )

	    # (repo folder, local dir prefix, section title, descriptive paragraphs)
	    section_specs = [
	        ("exp6_ce", "exp6_clean_split", "Exp 6: Cross-Entropy", [
	            "5-fold stratified cross-validation on deduplicated clean split (2,232 train, 300 test).",
	            "Training recipe: epochs=100, patience=15, effective batch=32 (gradient accumulation ×4), per-architecture LR from registry.",
	        ]),
	        ("exp7_supcon", "exp7_supcon", "Exp 7: Supervised Contrastive Learning", [
	            "Two-phase training: SupCon pretraining (InfoNCE, τ=0.07) → CE fine-tuning with frozen backbone.",
	            "Recipe: same per-architecture optimizer/LR, projection dim=128, CE Phase LR=0.01, patience=15 on val metrics.",
	        ]),
	        ("exp8_sphor", "exp8_sphor", "Exp 8: Spherical Orthogonal Prototypes (SpHOR)", [
	            "Two-phase training: SupCon + Spherical Orthogonal Prototypes → CE fine-tuning with frozen backbone.",
	            "Recipe: same as Exp 7, with spherical prototype repulsion (repulse=0.01).",
	        ]),
	        ("exp9_ncd", "exp9_ncd", "Exp 9: Novel Class Discovery (NCD)", [
	            "NCD scenario: O. majellensis excluded from training (hidden novel class).",
	            "Recipe: same as Exp 8.",
	        ]),
	        ("exp10_mgh_ncd", "exp10_mgh_ncd", "Exp 10: Ultimate Experiment", [
	            "Final experiment combining multi-granularity features, hard negative mining, deep fine-tuning, and OSR ensemble.",
	            "Recipe: LoRA r=32 + partial unfreeze for DINOv2, full fine-tuning for ConvNeXt/ResNet. Hard negative weight 5.0 on (majellensis, sphegodes) pairs.",
	        ]),
	    ]

	    rows_per_exp = [
	        _build_results_table(exp, prefix) for exp, prefix, _, _ in section_specs
	    ]
	    sections = [
	        _format_section(title, paragraphs, header, rows)
	        for (_, _, title, paragraphs), rows in zip(section_specs, rows_per_exp)
	        if rows
	    ]

	    # Positional order matches the exp6..exp10 parameters.
	    results_yaml = _build_results_yaml(*rows_per_exp)
	    readme = _render_readme(sections, results_yaml)

	    # Upload from memory as UTF-8 bytes: no temporary file is left behind if
	    # the upload fails, and the non-ASCII characters in the text do not
	    # depend on the platform's default encoding.
	    api.upload_file(
	        repo_id=HF_REPO,
	        path_or_fileobj=readme.encode("utf-8"),
	        path_in_repo="README.md",
	    )
	    print(" README.md updated")


	def _parse_cli(argv):
	    """Extract the script options from ``argv`` (program name excluded).

	    Recognised options: ``--cleanup``, ``--exp=<name>`` and ``--exp <name>``.
	    When ``--exp`` is given several times the last value wins; a trailing
	    ``--exp`` with no value is ignored. Unknown arguments are ignored.

	    Returns:
	        ``(do_cleanup, exp_filter)`` where ``exp_filter`` defaults to
	        ``"all"``.
	    """
	    do_cleanup = "--cleanup" in argv
	    exp_filter = "all"
	    for pos, arg in enumerate(argv):
	        if arg.startswith("--exp="):
	            # partition keeps everything after the first "=".
	            exp_filter = arg.partition("=")[2]
	        elif arg == "--exp" and pos + 1 < len(argv):
	            # Use this occurrence's own position; list.index() would always
	            # return the first "--exp".
	            exp_filter = argv[pos + 1]
	    return do_cleanup, exp_filter


	def main():
	    """Upload the selected experiments, optionally clean up, refresh the README.

	    Authenticates with the ``HF_TOKEN`` environment variable, uploads every
	    model of each experiment whose repo folder name contains the ``--exp``
	    filter (substring match; ``all`` selects everything), removes stale
	    remote files when ``--cleanup`` is given, and always regenerates the
	    repository README. A failure on one model is reported and does not stop
	    the remaining uploads.
	    """
	    token = os.environ.get("HF_TOKEN")
	    api = HfApi(token=token)
	    api.whoami()  # fail early if the credentials are not valid
	    print("Authenticated OK")

	    do_cleanup, exp_filter = _parse_cli(sys.argv[1:])

	    uploaded = 0
	    total = 0
	    for exp_config in EXPERIMENTS:
	        exp = exp_config["exp"]
	        exp_prefix = exp_config["prefix"]
	        label = exp_config["label"]

	        if exp_filter != "all" and exp_filter not in exp:
	            continue

	        print(f"\n=== Uploading {label} ({exp}) ===")
	        for model in MODELS:
	            total += 1
	            try:
	                if upload_model(api, model, exp, exp_prefix):
	                    uploaded += 1
	            except Exception as e:
	                # Best effort: report and continue with the next model.
	                print(f" ERROR {model}: {e}")

	    if total == 0:
	        # Make a mistyped filter visible instead of silently doing nothing.
	        print(f"\nWARNING: --exp {exp_filter!r} matched no experiment")

	    print(f"\nUploaded {uploaded}/{total} models")

	    if do_cleanup:
	        cleanup_old_files(api)

	    update_readme(api)

	    print("\nDone!")


	if __name__ == "__main__":
	    main()