orchid-ncd / backend /scripts /hf_tools /upload_models.py
marcellorusso's picture
Sync from GitHub: 69fcfd3
c38595c verified
"""Upload only essential model files to HuggingFace (best fold + metadata).
Usage: python upload_models.py [--cleanup] [--exp all|exp6_ce|exp7_supcon|exp8_sphor|exp9_ncd|exp10_mgh_ncd]
"""
import json
import os
import sys
from pathlib import Path
from dotenv import load_dotenv
from huggingface_hub import HfApi
# Load HF_TOKEN from the .env file (project root: resnet18/.env).
# The file is looked up three directory levels above this script and is
# only loaded when it actually exists.
_env = Path(__file__).resolve().parent.parent.parent.joinpath(".env")
if _env.exists():
    load_dotenv(_env)
# HuggingFace repository that receives the uploads.
HF_REPO = "marcellorusso/orchid-ncd-models"

# Absolute path of the `experiments` directory, two levels above this script.
EXP_BASE = os.path.abspath(
    os.path.join(os.path.dirname(__file__), os.pardir, os.pardir, "experiments")
)

# Architecture identifiers; each one is appended to an experiment prefix to
# form the local/remote run directory name.
MODELS = [
    "resnet18",
    "resnet50",
    "convnext_tiny",
    "convnext_small",
    "dinov2_small",
    "dinov2_base",
]

# Human-readable architecture names used in the README tables.
VARIANT_DISPLAY = dict(
    resnet18="ResNet-18",
    resnet50="ResNet-50",
    convnext_tiny="ConvNeXt-Tiny",
    convnext_small="ConvNeXt-Small",
    dinov2_small="DINOv2-Small",
    dinov2_base="DINOv2-Base",
)

# One entry per experiment:
#   prefix - prefix of the run directory name (`{prefix}_{model}`)
#   exp    - top-level folder inside the HF repo
#   label  - name printed while uploading
EXPERIMENTS = [
    {"prefix": run_prefix, "exp": repo_folder, "label": shown_as}
    for run_prefix, repo_folder, shown_as in (
        ("exp6_clean_split", "exp6_ce", "Exp 6 (CE)"),
        ("exp7_supcon", "exp7_supcon", "Exp 7 (SupCon)"),
        ("exp8_sphor", "exp8_sphor", "Exp 8 (SpHOR)"),
        ("exp9_ncd", "exp9_ncd", "Exp 9 (NCD)"),
        ("exp10_mgh_ncd", "exp10_mgh_ncd", "Exp 10 (MGH-NCD)"),
    )
]
def upload_model(api, model, exp, exp_prefix):
    """Upload the best-fold artifacts of one trained model to the HF repo.

    The run directory is ``EXP_BASE/{exp_prefix}_{model}`` and its files are
    uploaded under ``{exp}/{exp_prefix}_{model}/`` in ``HF_REPO``. Uploaded
    items: ``results.json``, ``config.json`` (when present), the best-fold
    checkpoint, the best-fold adapter directory (when present) and the
    ``embeddings`` directory (when present).

    Args:
        api: Object exposing ``upload_file`` and ``upload_folder``
            (``main`` passes an ``HfApi`` instance).
        model: Architecture identifier, e.g. an entry of ``MODELS``.
        exp: Top-level folder name inside the HF repo.
        exp_prefix: Prefix of the local run directory name.

    Returns:
        True when the model was uploaded, False when it was skipped because
        ``results.json``, its ``best_fold`` entry, or the best-fold
        checkpoint is missing.

    Errors raised by the API calls or by JSON parsing are not caught here.
    """
    exp_name = f"{exp_prefix}_{model}"
    local_dir = os.path.join(EXP_BASE, exp_name)
    remote_dir = f"{exp}/{exp_name}"

    results_path = os.path.join(local_dir, "results.json")
    if not os.path.exists(results_path):
        print(f" SKIP {exp_name}: no results.json")
        return False

    # Use a context manager so the file handle is closed deterministically.
    with open(results_path, encoding="utf-8") as fh:
        results = json.load(fh)

    best_fold = results.get("best_fold")
    if best_fold is None:
        print(f" SKIP {exp_name}: no best_fold in results")
        return False

    best_pt = f"best_fold_{best_fold}.pt"
    if not os.path.exists(os.path.join(local_dir, best_pt)):
        print(f" SKIP {exp_name}: {best_pt} not found")
        return False

    # `or` fallbacks also cover keys that are present but null, which would
    # otherwise break the float formatting below.
    mean_f1 = results.get("mean_f1") or 0
    test_f1 = (results.get("test_metrics") or {}).get("f1_macro") or 0
    print(f"\n Uploading {exp_name} (Val F1={mean_f1:.4f}, Test F1={test_f1:.4f})")

    # Single files; any that do not exist locally are silently skipped.
    for file_name in ("results.json", "config.json", best_pt):
        local_path = os.path.join(local_dir, file_name)
        if not os.path.exists(local_path):
            continue
        size_kb = os.path.getsize(local_path) // 1024
        print(f" {file_name} ({size_kb}KB)")
        api.upload_file(
            repo_id=HF_REPO,
            path_or_fileobj=local_path,
            path_in_repo=f"{remote_dir}/{file_name}",
        )

    # Optional adapter directory saved next to the best-fold checkpoint.
    adapter_name = f"best_fold_{best_fold}_adapter"
    adapter_dir = os.path.join(local_dir, adapter_name)
    if os.path.isdir(adapter_dir):
        print(f" {adapter_name}/")
        api.upload_folder(
            repo_id=HF_REPO,
            folder_path=adapter_dir,
            path_in_repo=f"{remote_dir}/{adapter_name}",
        )

    # Optional embeddings directory.
    emb_dir = os.path.join(local_dir, "embeddings")
    if os.path.isdir(emb_dir):
        print(" embeddings/")
        api.upload_folder(
            repo_id=HF_REPO,
            folder_path=emb_dir,
            path_in_repo=f"{remote_dir}/embeddings",
        )

    print(f" Done: {exp_name}")
    return True
def cleanup_old_files(api):
    """Remove old fold weights and non-best fold files from HF.

    For every experiment/model pair that has a local ``results.json`` with a
    ``best_fold`` entry, the remote files under
    ``{exp}/{exp_prefix}_{model}/`` are scanned. ``results.json``,
    ``config.json``, the best-fold checkpoint and anything whose name starts
    with the best-fold adapter directory are always kept. Among the rest,
    only these are deleted:

    * ``best_fold_*.pt`` (checkpoints of other folds),
    * ``fold_*.json``,
    * files whose name starts with ``REPORT_``.

    Args:
        api: Object exposing ``list_repo_files`` and ``delete_file``
            (``main`` passes an ``HfApi`` instance).
    """
    print("\n Cleaning up old files from HF...")
    all_files = api.list_repo_files(HF_REPO)
    removed = 0

    for exp_config in EXPERIMENTS:
        exp = exp_config["exp"]
        exp_prefix = exp_config["prefix"]

        for model in MODELS:
            exp_name = f"{exp_prefix}_{model}"
            prefix = f"{exp}/{exp_name}/"

            # The local results file decides which fold is "best"; without
            # it nothing is deleted for this model.
            local_results = os.path.join(EXP_BASE, exp_name, "results.json")
            if not os.path.exists(local_results):
                continue
            with open(local_results, encoding="utf-8") as fh:
                best_fold = json.load(fh).get("best_fold")
            if best_fold is None:
                continue

            keep = ("results.json", "config.json", f"best_fold_{best_fold}.pt")
            best_adapter = f"best_fold_{best_fold}_adapter"

            for remote_path in all_files:
                if not remote_path.startswith(prefix):
                    continue
                fname = remote_path[len(prefix):]
                if fname in keep or fname.startswith(best_adapter):
                    continue

                stale = (
                    (fname.startswith("best_fold_") and fname.endswith(".pt"))
                    or (fname.startswith("fold_") and fname.endswith(".json"))
                    or fname.startswith("REPORT_")
                )
                if stale:
                    print(f" DELETE {remote_path}")
                    api.delete_file(repo_id=HF_REPO, path_in_repo=remote_path)
                    removed += 1

    print(f" Cleaned up {removed} old files")
def _build_results_table(exp, exp_prefix):
    """Build the Markdown table rows for one experiment.

    For each entry of ``MODELS`` the file
    ``EXP_BASE/{exp_prefix}_{model}/results.json`` is read; models without
    that file are skipped. Missing or null metrics are rendered as 0, a
    missing ``best_fold`` as ``?`` and a missing ``num_folds`` as 5.

    Args:
        exp: Unused; kept so existing callers keep working.
        exp_prefix: Prefix of the local run directory name.

    Returns:
        A list of Markdown table rows with the columns
        model | val F1 ± std | test F1 | test accuracy | best fold | folds.
    """
    rows = []
    for model in MODELS:
        local_results = os.path.join(EXP_BASE, f"{exp_prefix}_{model}", "results.json")
        if not os.path.exists(local_results):
            continue

        # Context manager closes the handle instead of leaking it.
        with open(local_results, encoding="utf-8") as fh:
            r = json.load(fh)

        # `or` fallbacks also cover keys that are present but null, which
        # would otherwise break the float formatting below.
        mean_f1 = r.get("mean_f1") or 0
        std_f1 = r.get("std_f1") or 0
        tm = r.get("test_metrics") or {}
        test_acc = tm.get("accuracy") or 0
        test_f1 = tm.get("f1_macro") or 0
        best_fold = r.get("best_fold", "?")
        num_folds = r.get("num_folds", 5)
        display = VARIANT_DISPLAY.get(model, model)

        rows.append(
            f"| {display} | {mean_f1:.4f} ± {std_f1:.4f} | {test_f1:.4f} | {test_acc:.4f} | {best_fold} | {num_folds} |"
        )
    return rows
def _build_results_yaml(exp6_rows, exp7_rows, exp8_rows, exp9_rows=None, exp10_rows=None):
    """Generate structured YAML for model evaluation results.

    Each argument is a list of Markdown table rows shaped like
    ``| <model> | <f1> ... | ...``. The model name is taken from the first
    cell and the F1 value from the first space-separated token of the
    second cell. One ``model-index`` entry is emitted per row, in the order
    Exp 6, 7, 8, 9, 10.

    Args:
        exp6_rows: Rows of experiment 6.
        exp7_rows: Rows of experiment 7.
        exp8_rows: Rows of experiment 8.
        exp9_rows: Rows of experiment 9 (optional).
        exp10_rows: Rows of experiment 10 (optional).

    Returns:
        The YAML text starting with ``model-index:``, or an empty string
        when no rows were supplied.

    Raises:
        IndexError: If a row has fewer than two ``|``-separated cells.
    """
    labelled_rows = (
        ("Exp 6 (CE)", exp6_rows),
        ("Exp 7 (SupCon)", exp7_rows),
        ("Exp 8 (SpHOR)", exp8_rows),
        ("Exp 9 (NCD)", exp9_rows),
        ("Exp 10 (Ultimate)", exp10_rows),
    )

    entries = []
    for exp_label, rows in labelled_rows:
        for row in rows or []:
            cells = row.split("|")
            model_name = cells[1].strip()
            # The cell looks like "<mean> ± <std>"; keep only the mean.
            f1_val = cells[2].strip().split(" ")[0]
            entries.append((exp_label, model_name, f1_val))

    if not entries:
        return ""

    # The nesting is expressed through indentation: without it the mapping
    # keys would all sit at the same level and the front matter would not
    # describe a valid model-index structure.
    lines = ["model-index:"]
    for exp_label, model_name, f1_val in entries:
        lines.extend([
            f"  - name: {model_name}",
            "    results:",
            "      - task:",
            "          type: image-classification",
            "        dataset:",
            "          name: orchid-ncd-dataset",
            "          type: marcellorusso/orchid-ncd-dataset",
            "        metrics:",
            "          - name: Macro F1 Score",
            "            type: f1_macro",
            f"            value: {f1_val}",
            "        source:",
            f"          name: {exp_label}",
            "          url: https://huggingface.co/datasets/marcellorusso/orchid-ncd-dataset",
        ])
    return "\n".join(lines)
def update_readme(api):
    """Update HF model repo README with current results.

    Builds one Markdown section per experiment that has at least one local
    result row, embeds the structured ``model-index`` YAML in the front
    matter, and uploads the text as ``README.md`` to ``HF_REPO``.

    The README is uploaded as UTF-8 bytes rather than through a temporary
    file: the text contains non-ASCII characters, so writing it with the
    platform default encoding could fail or corrupt it, and no temporary
    file can be left behind if the upload raises.

    Args:
        api: Object exposing ``upload_file`` (``main`` passes an ``HfApi``
            instance).
    """
    header = "| Model | Val F1 (macro) | Test F1 | Test Acc | Best Fold | Folds |\n|---|---|---|---|---|---|"
    newline = "\n"

    exp6_rows = _build_results_table("exp6_ce", "exp6_clean_split")
    exp7_rows = _build_results_table("exp7_supcon", "exp7_supcon")
    exp8_rows = _build_results_table("exp8_sphor", "exp8_sphor")
    exp9_rows = _build_results_table("exp9_ncd", "exp9_ncd")
    exp10_rows = _build_results_table("exp10_mgh_ncd", "exp10_mgh_ncd")

    # Only experiments with at least one result row get a section.
    sections = []
    if exp6_rows:
        sections.append(f"""## Exp 6: Cross-Entropy
5-fold stratified cross-validation on deduplicated clean split (2,232 train, 300 test).
**Training recipe:** epochs=100, patience=15, effective batch=32 (gradient accumulation ×4), per-architecture LR from registry.
{header}
{newline.join(exp6_rows)}""")
    if exp7_rows:
        sections.append(f"""## Exp 7: Supervised Contrastive Learning
Two-phase training: SupCon pretraining (InfoNCE, τ=0.07) → CE fine-tuning with frozen backbone.
**Recipe:** same per-architecture optimizer/LR, projection dim=128, CE Phase LR=0.01, patience=15 on val metrics.
{header}
{newline.join(exp7_rows)}""")
    if exp8_rows:
        sections.append(f"""## Exp 8: Spherical Orthogonal Prototypes (SpHOR)
Two-phase training: SupCon + Spherical Orthogonal Prototypes → CE fine-tuning with frozen backbone.
**Recipe:** same as Exp 7, with spherical prototype repulsion (repulse=0.01).
{header}
{newline.join(exp8_rows)}""")
    if exp9_rows:
        sections.append(f"""## Exp 9: Novel Class Discovery (NCD)
NCD scenario: O. majellensis excluded from training (hidden novel class).
**Recipe:** same as Exp 8.
{header}
{newline.join(exp9_rows)}""")
    if exp10_rows:
        sections.append(f"""## Exp 10: Ultimate Experiment
Final experiment combining multi-granularity features, hard negative mining, deep fine-tuning, and OSR ensemble.
**Recipe:** LoRA r=32 + partial unfreeze for DINOv2, full fine-tuning for ConvNeXt/ResNet. Hard negative weight 5.0 on (majellensis, sphegodes) pairs.
{header}
{newline.join(exp10_rows)}""")

    results_yaml = _build_results_yaml(exp6_rows, exp7_rows, exp8_rows, exp9_rows, exp10_rows)

    readme = f"""---
library_name: pytorch
tags:
- computer-vision
- image-classification
- fine-grained-classification
- ophrys-orchids
- resnet
- convnext
- dinov2
datasets:
- marcellorusso/orchid-ncd-dataset
license: mit
pipeline_tag: image-classification
{results_yaml}
---
# OrchID-NCD Models
Trained model weights for the [OrchID-NCD](https://huggingface.co/spaces/marcellorusso/orchid-ncd) project — ultra-fine-grained visual classification of *Ophrys* orchids.
Fine-grained classification of six cryptic *Ophrys* species (*O. exaltata, O. garganica, O. incubacea, O. majellensis, O. sphegodes, O. sphegodes* Palena) using ResNet-18/50, ConvNeXt-Tiny/Small, and DINOv2-Small/Base.
{newline.join(sections)}
## Structure
```
exp6_ce/ — Exp 6 (Cross-Entropy)
exp6_clean_split_resnet18/
results.json — aggregated metrics + test results
config.json — training configuration
best_fold_N.pt — weights of the best fold
best_fold_N_adapter/ — LoRA adapter (DINOv2 only)
exp7_supcon/ — Exp 7 (SupCon + CE fine-tune)
exp7_supcon_resnet18/
...
exp8_sphor/ — Exp 8 (SpHOR)
exp8_sphor_resnet18/
...
exp9_ncd/ — Exp 9 (NCD)
exp9_ncd_resnet18/
...
exp10_mgh_ncd/ — Exp 10 (MGH-NCD)
exp10_mgh_ncd_resnet18/
...
```
## Usage
The classifier in the [OrchID-NCD Space](https://huggingface.co/spaces/marcellorusso/orchid-ncd) downloads these weights at startup and uses them for inference.
## Links
- [Dataset](https://huggingface.co/datasets/marcellorusso/orchid-ncd-dataset)
- [Live Demo](https://huggingface.co/spaces/marcellorusso/orchid-ncd)
- [GitHub](https://github.com/squidslab/OrchID)
"""

    # upload_file accepts raw bytes, so the README is encoded explicitly and
    # sent without touching the local filesystem.
    api.upload_file(
        repo_id=HF_REPO,
        path_or_fileobj=readme.encode("utf-8"),
        path_in_repo="README.md",
    )
    print(" README.md updated")
def main():
    """Command-line entry point: upload models, optionally clean up, refresh README.

    Recognised arguments (read directly from ``sys.argv``):

    * ``--cleanup``: also run ``cleanup_old_files`` after uploading.
    * ``--exp NAME`` or ``--exp=NAME``: restrict the upload to experiments
      whose repo folder name contains ``NAME``; ``all`` (the default)
      selects every experiment.

    A failure while uploading one model is printed and does not stop the
    remaining uploads.
    """
    api = HfApi(token=os.environ.get("HF_TOKEN"))
    # whoami() is called before any upload is attempted.
    api.whoami()
    print("Authenticated OK")

    do_cleanup = "--cleanup" in sys.argv

    # Determine which experiments to upload
    exp_filter = "all"
    for token in sys.argv:
        if token.startswith("--exp="):
            exp_filter = token.split("=")[1]
            continue
        if token != "--exp":
            continue
        # The value is the argument right after the first bare `--exp`.
        value_pos = sys.argv.index(token) + 1
        if value_pos < len(sys.argv):
            exp_filter = sys.argv[value_pos]

    uploaded = 0
    total = 0
    for exp_config in EXPERIMENTS:
        exp, exp_prefix, label = (
            exp_config["exp"],
            exp_config["prefix"],
            exp_config["label"],
        )
        # Substring match: e.g. "exp6" selects "exp6_ce".
        selected = exp_filter == "all" or exp_filter in exp
        if not selected:
            continue

        print(f"\n=== Uploading {label} ({exp}) ===")
        for model in MODELS:
            total += 1
            try:
                succeeded = upload_model(api, model, exp, exp_prefix)
            except Exception as e:
                # Best effort: report and move on to the next model.
                print(f" ERROR {model}: {e}")
            else:
                if succeeded:
                    uploaded += 1

    print(f"\nUploaded {uploaded}/{total} models")

    if do_cleanup:
        cleanup_old_files(api)
    update_readme(api)
    print("\nDone!")


if __name__ == "__main__":
    main()