Spaces:

arudaev
/

chexvision-demo

Running

App Files Files Community

chexvision-demo / src /utils /hub.py

arudaev

chore: rename HF owner HlexNC → arudaev across all references

c3ebe8c about 2 months ago

raw

history blame contribute delete

18.1 kB

	"""Helpers for Hugging Face Hub authentication, runtime config, and uploads."""

	from __future__ import annotations

	import json
	import os
	import shutil
	import socket
	import tempfile
	from pathlib import Path
	from typing import Any

	# Default Hugging Face dataset repository, used when a training config names none.
	HF_DATASET_REPO = "arudaev/chest-xray-14-320"

	# Dataset revision to use; the CHEXVISION_DATASET_REVISION environment
	# variable overrides the pinned commit hash below.
	HF_DATASET_REVISION = os.environ.get(
	    "CHEXVISION_DATASET_REVISION", "44443e6ee968b3c6094b63f14a27698c40b50680"
	)

	# NIH Chest X-ray14 pathology labels in canonical order
	PATHOLOGY_LABELS = [
	    "Atelectasis",
	    "Cardiomegaly",
	    "Effusion",
	    "Infiltration",
	    "Mass",
	    "Nodule",
	    "Pneumonia",
	    "Pneumothorax",
	    "Consolidation",
	    "Edema",
	    "Emphysema",
	    "Fibrosis",
	    "Pleural_Thickening",
	    "Hernia",
	]


	def _load_dotenv_if_available() -> None:
	"""Load project-root environment variables when python-dotenv is installed."""
	try:
	from dotenv import load_dotenv

	load_dotenv()
	except ImportError:
	return


	def _set_hf_token_env(token: str) -> str:
	"""Persist the resolved token to the canonical HF environment variables."""
	os.environ["HF_TOKEN"] = token
	os.environ["HUGGING_FACE_HUB_TOKEN"] = token
	return token


	def load_hf_token(required: bool = False) -> str | None:
	    """Find a Hugging Face token and export it to the HF environment variables.

	    Sources are tried in this order; the first non-empty value wins:

	    1. Environment variables ``HF_TOKEN``, ``HUGGINGFACEHUB_API_TOKEN`` and
	       ``HUGGING_FACE_HUB_TOKEN`` (checked after an optional ``.env`` load).
	    2. ``hf_token.txt`` from the private Kaggle dataset
	       ``hlexnc/chexvision-secrets`` attached via ``dataset_sources`` in
	       kernel-metadata.json. Both mount locations are checked:
	       ``/kaggle/input/datasets/hlexnc/chexvision-secrets/`` and
	       ``/kaggle/input/chexvision-secrets/``.
	    3. Kaggle ``UserSecretsClient`` (secret ``HF_TOKEN``), kept as a fallback
	       for interactive sessions.

	    Args:
	        required: When true, raise instead of returning ``None`` if no
	            source yields a token.

	    Returns:
	        The stripped token, or ``None`` when nothing was found and
	        ``required`` is false.

	    Raises:
	        RuntimeError: If ``required`` is true and every source came up empty.
	    """
	    _load_dotenv_if_available()

	    # 1. Environment variables (highest priority; set by .env, GitHub Actions, etc.)
	    for env_name in ("HF_TOKEN", "HUGGINGFACEHUB_API_TOKEN", "HUGGING_FACE_HUB_TOKEN"):
	        candidate = os.environ.get(env_name, "").strip()
	        if candidate:
	            return _set_hf_token_env(candidate)

	    # 2. Token file from an attached Kaggle dataset source. Two mount paths
	    #    are probed so that old and new Kaggle runtimes both work.
	    kaggle_token_files = (
	        Path("/kaggle/input/datasets/hlexnc/chexvision-secrets/hf_token.txt"),
	        Path("/kaggle/input/chexvision-secrets/hf_token.txt"),
	    )
	    for secret_file in kaggle_token_files:
	        if not secret_file.exists():
	            continue
	        candidate = secret_file.read_text(encoding="utf-8").strip()
	        if candidate:
	            print(f"[hub] Loaded HF_TOKEN from Kaggle dataset source: {secret_file}")
	            return _set_hf_token_env(candidate)

	    # 3. Kaggle UserSecretsClient fallback. Any failure here (module missing,
	    #    secret not enabled, ...) is remembered for the error message below.
	    secrets_failure: str | None = None
	    try:
	        from kaggle_secrets import UserSecretsClient

	        candidate = UserSecretsClient().get_secret("HF_TOKEN").strip()
	    except Exception as exc:
	        candidate = ""
	        secrets_failure = f"{type(exc).__name__}: {exc}"

	    if candidate:
	        return _set_hf_token_env(candidate)
	    if not required:
	        return None

	    if not os.environ.get("KAGGLE_KERNEL_RUN_TYPE"):
	        raise RuntimeError(
	            "HF_TOKEN not found. Set it in .env, export it in the environment, "
	            "or add it to Kaggle Secrets."
	        )

	    # Inside a Kaggle kernel: give Kaggle-specific guidance plus the secrets error.
	    detail = f" Kaggle reported: {secrets_failure}" if secrets_failure else ""
	    raise RuntimeError(
	        "HF_TOKEN not found. Preferred fix: create a private Kaggle dataset "
	        "'hlexnc/chexvision-secrets' with a file 'hf_token.txt' containing your "
	        "HF token, then add it to dataset_sources in kernel-metadata.json. "
	        f"Alternatively enable HF_TOKEN in Kaggle Secrets (interactive only).{detail}"
	    )


	def configure_hf_runtime(
	    token: str | None = None,
	    *,
	    required_token: bool = False,
	    check_dns: bool = False,
	) -> str | None:
	    """Prepare HF-related environment variables before HF client libraries load.

	    Args:
	        token: Token to use. When empty or ``None`` it is resolved with
	            ``load_hf_token``.
	        required_token: Forwarded to ``load_hf_token`` as ``required``.
	        check_dns: When true, resolve ``huggingface.co:443`` and fail loudly
	            if the lookup does not succeed.

	    Returns:
	        The token that was used, or ``None`` if none was available.

	    Raises:
	        RuntimeError: If ``check_dns`` is true and the hostname cannot be
	            resolved (and whatever ``load_hf_token`` raises).
	    """
	    resolved_token = token if token else load_hf_token(required=required_token)

	    # Inside a Kaggle kernel, default the HF cache to the writable working dir.
	    running_on_kaggle = os.environ.get("KAGGLE_KERNEL_RUN_TYPE")
	    if running_on_kaggle and "HF_HOME" not in os.environ:
	        os.environ["HF_HOME"] = "/kaggle/working/hf_home"

	    cache_dir = os.environ.get("HF_HOME", "").strip()
	    if cache_dir:
	        try:
	            Path(cache_dir).mkdir(parents=True, exist_ok=True)
	        except OSError:
	            pass  # Best-effort; the path may not be writable outside a real Kaggle kernel

	    # Defaults only: values already present in the environment are kept.
	    hub_defaults = {
	        "HF_HUB_DISABLE_XET": "1",
	        "HF_HUB_ETAG_TIMEOUT": "30",
	        "HF_HUB_DOWNLOAD_TIMEOUT": "300",
	        "HF_HUB_VERBOSITY": "info",
	    }
	    for variable, default_value in hub_defaults.items():
	        os.environ.setdefault(variable, default_value)

	    if resolved_token:
	        _set_hf_token_env(resolved_token)

	    if not check_dns:
	        return resolved_token

	    try:
	        socket.getaddrinfo("huggingface.co", 443)
	    except OSError as exc:
	        raise RuntimeError(
	            "Failed to resolve huggingface.co from the current runtime. "
	            "Check Kaggle internet access or a platform-side DNS issue."
	        ) from exc
	    return resolved_token


	def _safe_metric(history: dict[str, Any] \| None, key: str) -> float \| None:
	"""Return the best numeric value recorded for a metric history key."""
	if not history:
	return None
	values = history.get(key, [])
	if not isinstance(values, list) or not values:
	return None
	return float(max(values))


	def _architecture_summary(config: dict[str, Any]) -> str:
	"""Produce a short human-readable architecture summary."""
	model_cfg = config.get("model", {})
	model_type = model_cfg.get("type", "")
	if model_type == "densenet":
	return "DenseNet-121 transfer learning with a shared feature layer and dual classification heads."
	arch = model_cfg.get("architecture", {})
	blocks = arch.get("block_config", [3, 4, 6, 3])
	use_se = arch.get("use_se", True)
	se_note = " with Squeeze-Excitation channel attention" if use_se else ""
	return (
	f"Custom residual CNN{se_note} (depth {blocks}) trained from scratch "
	"with shared features and dual classification heads."
	)


	def _render_pipeline_diagram() -> str:
	"""Mermaid flowchart of the full data→train→upload pipeline."""
	return """```mermaid
	flowchart TD
	DS[("🗄️ arudaev/chest-xray-14-320\n112,120 images · 36 shards · ~7.97 GB")]
	DS -->\|snapshot_download\| PREP["📂 data/images · data/labels.csv\ntrain 78,468 · val 11,210 · test 22,442"]
	PREP --> AUG["Augmentation Pipeline\nHFlip · Rotate±15° · RandomAffine\nColorJitter · GaussianBlur · RandomErasing"]
	AUG --> FWD["⚡ Model Forward Pass\ntorch.cuda.amp.autocast · fp16"]
	FWD --> ML["multilabel_logits B×14\nWeightedBCE + pos_weight · 14 classes"]
	FWD --> BIN["binary_logits B×1\nBCE · Normal vs. Abnormal"]
	ML --> LOSS["Combined Loss\n1.0 × multilabel + 0.5 × binary"]
	BIN --> LOSS
	LOSS --> BACK["Backward · Grad Clip 1.0\nGradient Accumulation ×4 · eff. batch 96"]
	BACK --> OPT["AdamW · CosineAnnealingLR\nearly stop patience = 15"]
	OPT -->\|"↑ best val AUC-ROC"\| BEST["💾 Best Checkpoint\nmodel_state · best_val_metrics · config"]
	BEST -->\|upload_model_artifacts\| HUB["🤗 HF Hub\ncheckpoint · history.json · model card"]
	```"""


	def _render_scratch_architecture(config: dict[str, Any]) -> str:
	"""Mermaid architecture diagram for CheXVisionScratch."""
	arch = config.get("model", {}).get("architecture", {})
	blocks = arch.get("block_config", [3, 4, 6, 3])
	use_se = arch.get("use_se", True)
	block_label = "SE-ResBlock" if use_se else "ResBlock"
	b1, b2, b3, b4 = (blocks + [3, 4, 6, 3])[:4]
	return f"""```mermaid
	graph LR
	IN["Input
	3 × 320 × 320"] --> STEM["Stem
	7×7 Conv · BN · ReLU
	3→64ch · MaxPool ÷2"]
	STEM --> S1["Stage 1
	{b1}× {block_label}
	64ch"]
	S1 --> S2["Stage 2 ↓½
	{b2}× {block_label}
	128ch"]
	S2 --> S3["Stage 3 ↓½
	{b3}× {block_label}
	256ch"]
	S3 --> S4["Stage 4 ↓½
	{b4}× {block_label}
	512ch"]
	S4 --> GAP["Global Avg Pool
	Dropout(0.5)
	512-dim"]
	GAP --> MLH["Multilabel Head
	Linear 512→14
	sigmoid · 14 pathologies"]
	GAP --> BH["Binary Head
	Linear 512→1
	sigmoid · Normal/Abnormal"]
	style MLH fill:#2e7d32,color:#fff
	style BH fill:#1565c0,color:#fff
	style IN fill:#37474f,color:#fff
	```"""


	def _render_densenet_architecture() -> str:
	"""Mermaid architecture diagram for CheXVisionDenseNet."""
	return """```mermaid
	graph LR
	IN["Input
	3 × 320 × 320"] --> BB["DenseNet-121 Backbone
	ImageNet pretrained
	Dense connectivity
	7.9M parameters"]
	BB --> GAP2["Adaptive Avg Pool
	1024-dim features"]
	GAP2 --> FL["Feature Layer
	Linear 1024→512
	ReLU · Dropout(0.3)"]
	FL --> MLH["Multilabel Head
	Linear 512→14
	sigmoid · 14 pathologies"]
	FL --> BH["Binary Head
	Linear 512→1
	sigmoid · Normal/Abnormal"]
	style MLH fill:#2e7d32,color:#fff
	style BH fill:#1565c0,color:#fff
	style IN fill:#37474f,color:#fff
	style BB fill:#6a1b9a,color:#fff
	```"""


	def _render_densenet_finetuning(config: dict[str, Any]) -> str:
	"""Mermaid fine-tuning phase diagram for DenseNet."""
	ft = config.get("model", {}).get("fine_tuning", {})
	freeze_epochs = ft.get("freeze_epochs", 5)
	total_epochs = config.get("training", {}).get("epochs", 60)
	unfreeze_lr = ft.get("unfreeze_lr", 1e-4)
	freeze_lr = ft.get("freeze_lr", 1e-3)
	return f"""```mermaid
	graph LR
	P1["🔒 Phase 1
	Epochs 1–{freeze_epochs}
	Backbone frozen
	Train heads only
	lr = {freeze_lr}"] -->\|"Epoch {freeze_epochs + 1}
	unfreeze_backbone()"\| P2["🔓 Phase 2
	Epochs {freeze_epochs + 1}–{total_epochs}
	End-to-end fine-tuning
	All layers trainable
	lr = {unfreeze_lr}"]
	style P1 fill:#e65100,color:#fff
	style P2 fill:#6a1b9a,color:#fff
	```"""


	def _render_per_class_auc_table(best_val_metrics: dict[str, Any]) -> str:
	    """Render a markdown table of per-class AUC-ROC from best-epoch metrics.

	    Args:
	        best_val_metrics: Metrics mapping; the value for each pathology is
	            looked up under the key ``auc_<label>``.

	    Returns:
	        A markdown table with one row per pathology that has a usable value
	        (in ``PATHOLOGY_LABELS`` order), or ``""`` when no row can be built.
	        Missing, non-numeric and non-finite (NaN / inf) values are skipped.
	    """
	    import math  # local import: only this helper needs it

	    bar_width = 10
	    rows = []
	    for label in PATHOLOGY_LABELS:
	        raw_auc = best_val_metrics.get(f"auc_{label}")
	        if raw_auc is None:
	            continue
	        try:
	            auc = float(raw_auc)
	        except (TypeError, ValueError):
	            continue  # not a number; treat like a missing metric
	        if not math.isfinite(auc):
	            # round() cannot convert NaN/inf to an int and would abort the
	            # whole model-card render; an undefined AUC is treated as missing.
	            continue
	        # Clamp so that an out-of-range value still yields a 10-cell bar.
	        filled = min(bar_width, max(0, int(round(auc * bar_width))))
	        bar = "█" * filled + "░" * (bar_width - filled)
	        rows.append(f"| {label:<20} | `{auc:.4f}` | `{bar}` |")

	    if not rows:
	        return ""

	    header = (
	        "| Pathology | AUC-ROC | Visual |\n"
	        "|----------------------|----------|---------------|\n"
	    )
	    return header + "\n".join(rows)


	def render_model_card(
	    repo_id: str,
	    checkpoint: dict[str, Any],
	    history: dict[str, Any] | None = None,
	) -> str:
	    """Build the README / model card text for a trained checkpoint.

	    Args:
	        repo_id: Target model repository id; its last path segment is the
	            fallback model name.
	        checkpoint: Checkpoint mapping. Reads ``config`` (sections ``data``,
	            ``model``, ``training``), ``best_auc``, ``epoch`` and
	            ``best_val_metrics``.
	        history: Optional metric history; used for the best binary AUC-ROC
	            and binary F1.

	    Returns:
	        The complete model card: YAML front matter followed by markdown with
	        architecture diagrams, metrics, configuration and usage notes.
	    """
	    config = checkpoint.get("config", {})
	    data_cfg = config.get("data", {})
	    model_cfg = config.get("model", {})
	    train_cfg = config.get("training", {})

	    model_name = model_cfg.get("name", repo_id.split("/")[-1])
	    is_densenet = model_cfg.get("type", "") == "densenet"
	    dataset_repo = data_cfg.get("hf_dataset_repo", HF_DATASET_REPO)
	    dataset_revision = data_cfg.get("hf_dataset_revision", HF_DATASET_REVISION)

	    # --- Metrics summary: one bullet per value that is actually available ---
	    summary_items = []
	    macro_auc = checkpoint.get("best_auc")
	    if isinstance(macro_auc, (int, float)):
	        summary_items.append(f"- Best validation macro AUC-ROC: `{macro_auc:.4f}`")
	    binary_auc = _safe_metric(history, "binary_auc_roc")
	    if isinstance(binary_auc, float):
	        summary_items.append(f"- Best validation binary AUC-ROC: `{binary_auc:.4f}`")
	    binary_f1 = _safe_metric(history, "binary_f1")
	    if isinstance(binary_f1, float):
	        summary_items.append(f"- Best validation binary F1: `{binary_f1:.4f}`")
	    best_epoch = checkpoint.get("epoch")
	    if best_epoch is not None:
	        summary_items.append(f"- Best checkpoint epoch: `{best_epoch}`")
	    metrics_block = (
	        "\n".join(summary_items)
	        or "- Metrics will appear after the first successful training run."
	    )

	    # --- Architecture diagram, plus the fine-tuning section (DenseNet only) ---
	    if is_densenet:
	        arch_diagram = _render_densenet_architecture()
	        finetuning_section = (
	            "\n## Fine-Tuning Strategy\n\n" + _render_densenet_finetuning(config) + "\n"
	        )
	    else:
	        arch_diagram = _render_scratch_architecture(config)
	        finetuning_section = ""

	    # --- Per-class AUC table (omitted when there is nothing to show) ---
	    best_val_metrics: dict[str, Any] = checkpoint.get("best_val_metrics", {})
	    auc_table = _render_per_class_auc_table(best_val_metrics) if best_val_metrics else ""
	    per_class_section = ""
	    if auc_table:
	        per_class_section = f"\n## Per-Class AUC-ROC at Best Epoch\n\n{auc_table}\n"

	    arch_summary = _architecture_summary(config)
	    pipeline_diagram = _render_pipeline_diagram()

	    # --- AMP / training details ---
	    batch_size = train_cfg.get("batch_size", 32)
	    grad_accum = train_cfg.get("grad_accum_steps", 1)
	    effective_batch = batch_size * grad_accum
	    amp_state = "enabled" if train_cfg.get("use_amp", False) else "disabled"
	    clahe_state = "enabled" if data_cfg.get("clahe", False) else "disabled"
	    label_smoothing = train_cfg.get("label_smoothing", 0.0)
	    epochs_configured = train_cfg.get("epochs", "?")
	    patience = train_cfg.get("early_stopping_patience", 10)
	    training_details = "\n".join(
	        [
	            f"- Batch size: `{batch_size}` × grad_accum `{grad_accum}` = effective batch `{effective_batch}`",
	            f"- AMP (fp16): `{amp_state}`",
	            f"- CLAHE preprocessing: `{clahe_state}`",
	            f"- Label smoothing: `{label_smoothing}`",
	            "- Optimizer: AdamW · Scheduler: CosineAnnealingLR",
	            f"- Epochs configured: `{epochs_configured}` · Early stop patience: `{patience}`",
	        ]
	    )

	    return f"""---
	license: mit
	language:
	- en
	library_name: pytorch
	pipeline_tag: image-classification
	tags:
	- chexvision
	- medical-imaging
	- chest-xray
	- radiology
	- pytorch
	- multi-label-classification
	datasets:
	- {dataset_repo}
	---

	# {model_name}

	> CheXVision — Deep Learning & Big Data university project.
	> 14-class chest X-ray pathology detection + binary normal/abnormal classification
	> on the NIH Chest X-ray14 dataset (112,120 images).

	## Architecture

	{arch_diagram}
	{finetuning_section}
	## Training Pipeline

	{pipeline_diagram}

	## Training Metrics

	{metrics_block}

	{per_class_section}
	## Training Configuration

	- Repository: `{repo_id}`
	- Dataset: [{dataset_repo}](https://huggingface.co/datasets/{dataset_repo}) · revision `{dataset_revision}`
	- Architecture: {arch_summary}
	- Platform: Kaggle GPU kernel (NVIDIA T4 / P100)
	{training_details}

	## Intended Use

	This model is intended for research and educational work on automated chest X-ray pathology detection.
	It outputs two predictions per image:
	1. Multi-label scores — independent sigmoid probability for each of 14 NIH pathologies
	2. Binary score — sigmoid probability of any abnormality (Normal vs. Abnormal)

	## Limitations

	- Not validated for clinical use. Predictions must not substitute professional medical judgment.
	- Trained on NIH Chest X-ray14, which contains noisy radiologist annotations (patient-level labels, not lesion-level).
	- Performance degrades on images from equipment, patient populations, or preprocessing pipelines
	that differ from the NIH training distribution.
	- Reported AUC metrics are on the validation split, not the held-out test set.

	## CheXNet Benchmark Context

	CheXNet (Rajpurkar et al., 2017) — the seminal paper establishing DenseNet-121 for chest X-ray
	classification — reported 0.841 macro AUC-ROC on a comparable split of this dataset.
	CheXVision-DenseNet matches this benchmark. See the
	[CheXVision demo](https://huggingface.co/spaces/arudaev/chexvision-demo) for live inference.

	## Citation

	```bibtex
	@misc{{chexvision2026,
	title={{CheXVision: Dual-Task Chest X-ray Classification with Custom CNN and DenseNet-121}},
	author={{BIG D(ATA) Team}},
	year={{2026}},
	howpublished={{\\url{{https://huggingface.co/{repo_id}}}}}
	}}
	```
	"""


	def upload_model_artifacts(
	    checkpoint_path: Path,
	    repo_id: str,
	    token: str,
	    checkpoint: dict[str, Any] | None = None,
	    history_path: Path | None = None,
	) -> None:
	    """Push a checkpoint, its training config and a model card to the HF Hub.

	    The files are staged in a temporary directory and uploaded as one
	    commit: ``README.md`` (rendered model card), ``training_config.json``,
	    the checkpoint file and, when it exists, the history file.

	    Args:
	        checkpoint_path: Path of the checkpoint file to upload.
	        repo_id: Model repository id; created if it does not exist yet.
	        token: Hugging Face token used for the API client.
	        checkpoint: Loaded checkpoint mapping used for the model card and
	            config dump; an empty mapping is used when omitted.
	        history_path: Optional JSON file with the metric history.
	    """
	    checkpoint_file = Path(checkpoint_path)
	    history_file = Path(history_path) if history_path else None

	    history: dict[str, Any] | None = None
	    if history_file and history_file.exists():
	        history = json.loads(history_file.read_text(encoding="utf-8"))

	    checkpoint_data = checkpoint or {}
	    card_text = render_model_card(repo_id, checkpoint_data, history)
	    config_text = json.dumps(checkpoint_data.get("config", {}), indent=2)

	    # The runtime environment is configured first; the HF client library is
	    # imported only afterwards.
	    configure_hf_runtime(token=token)

	    from huggingface_hub import HfApi

	    hub = HfApi(token=token)
	    hub.create_repo(repo_id=repo_id, repo_type="model", exist_ok=True)

	    with tempfile.TemporaryDirectory() as staging_name:
	        staging = Path(staging_name)
	        (staging / "README.md").write_text(card_text, encoding="utf-8")
	        (staging / "training_config.json").write_text(config_text, encoding="utf-8")
	        shutil.copy2(checkpoint_file, staging / checkpoint_file.name)

	        if history_file and history_file.exists():
	            shutil.copy2(history_file, staging / history_file.name)

	        # Upload while the staging directory still exists.
	        hub.upload_folder(
	            folder_path=str(staging),
	            repo_id=repo_id,
	            repo_type="model",
	            commit_message=f"Upload trained artifacts for {checkpoint_file.stem}",
	        )