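# NOTE(review): the next three lines look like hosting-page residue (avatar alt
# text, commit subject, short commit hash) rather than module source; they are
# kept here as comments so the file parses.
#   arudaev's picture
#   chore: rename HF owner HlexNC → arudaev across all references
#   c3ebe8c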
"""Helpers for Hugging Face Hub authentication, runtime config, and uploads."""
from __future__ import annotations

import json
import math
import os
import shutil
import socket
import tempfile
from pathlib import Path
from typing import Any
# Default Hugging Face dataset repo id; the model card falls back to it when
# the training config does not name a dataset repo.
HF_DATASET_REPO = "arudaev/chest-xray-14-320"

# Dataset revision (commit hash) to use; the CHEXVISION_DATASET_REVISION
# environment variable overrides the pinned default below.
HF_DATASET_REVISION = os.environ.get(
    "CHEXVISION_DATASET_REVISION",
    "44443e6ee968b3c6094b63f14a27698c40b50680",
)

# NIH Chest X-ray14 pathology labels in canonical order
PATHOLOGY_LABELS = [
    "Atelectasis",
    "Cardiomegaly",
    "Effusion",
    "Infiltration",
    "Mass",
    "Nodule",
    "Pneumonia",
    "Pneumothorax",
    "Consolidation",
    "Edema",
    "Emphysema",
    "Fibrosis",
    "Pleural_Thickening",
    "Hernia",
]
def _load_dotenv_if_available() -> None:
    """Call ``dotenv.load_dotenv()`` if python-dotenv can be imported.

    An ``ImportError`` (package not installed) is swallowed so that the
    dependency stays optional; nothing is returned either way.
    """
    try:
        from dotenv import load_dotenv as read_dotenv_file

        read_dotenv_file()
    except ImportError:
        # python-dotenv is optional; without it only real env vars are used.
        pass
def _set_hf_token_env(token: str) -> str:
    """Export ``token`` under both HF token variable names and return it.

    Args:
        token: The already-resolved token string.

    Returns:
        The same ``token``, so callers can ``return _set_hf_token_env(...)``.
    """
    for variable in ("HF_TOKEN", "HUGGING_FACE_HUB_TOKEN"):
        os.environ[variable] = token
    return token
def load_hf_token(required: bool = False) -> str | None:
    """Find a Hugging Face token and export it to the HF environment variables.

    Sources are tried in this order; the first non-empty value wins and is
    written back to the environment via ``_set_hf_token_env``:

    1. Environment variables ``HF_TOKEN``, ``HUGGINGFACEHUB_API_TOKEN`` and
       ``HUGGING_FACE_HUB_TOKEN`` (read after an optional ``.env`` load).
    2. An ``hf_token.txt`` file from an attached Kaggle dataset, looked up at
       ``/kaggle/input/datasets/hlexnc/chexvision-secrets/`` and then at
       ``/kaggle/input/chexvision-secrets/``.
    3. ``kaggle_secrets.UserSecretsClient`` with the secret name ``HF_TOKEN``;
       any exception raised on this path is recorded and treated as "no token".

    Args:
        required: When true, raise instead of returning ``None`` if no source
            yields a token.

    Returns:
        The stripped token, or ``None`` when nothing was found and
        ``required`` is false.

    Raises:
        RuntimeError: If ``required`` is true and no token was found. When
            ``KAGGLE_KERNEL_RUN_TYPE`` is set, the message carries Kaggle
            setup hints plus the recorded secrets error, if any.
    """
    _load_dotenv_if_available()

    # 1. Environment variables take priority over every Kaggle-specific source.
    env_values = (
        os.environ.get(variable, "").strip()
        for variable in ("HF_TOKEN", "HUGGINGFACEHUB_API_TOKEN", "HUGGING_FACE_HUB_TOKEN")
    )
    env_token = next((value for value in env_values if value), "")
    if env_token:
        return _set_hf_token_env(env_token)

    # 2. Token file from an attached Kaggle dataset; two mount layouts are checked.
    kaggle_token_files = (
        Path("/kaggle/input/datasets/hlexnc/chexvision-secrets/hf_token.txt"),
        Path("/kaggle/input/chexvision-secrets/hf_token.txt"),
    )
    for candidate in kaggle_token_files:
        if not candidate.exists():
            continue
        file_token = candidate.read_text(encoding="utf-8").strip()
        if not file_token:
            continue
        print(f"[hub] Loaded HF_TOKEN from Kaggle dataset source: {candidate}")
        return _set_hf_token_env(file_token)

    # 3. Kaggle secrets as the last resort. The broad except is deliberate:
    #    whatever goes wrong here only feeds the error message further down.
    secrets_failure: str | None = None
    secret_token = ""
    try:
        from kaggle_secrets import UserSecretsClient

        secret_token = UserSecretsClient().get_secret("HF_TOKEN").strip()
    except Exception as exc:
        secrets_failure = f"{type(exc).__name__}: {exc}"
    if secret_token:
        return _set_hf_token_env(secret_token)

    if not required:
        return None

    if os.environ.get("KAGGLE_KERNEL_RUN_TYPE"):
        detail = f" Kaggle reported: {secrets_failure}" if secrets_failure else ""
        raise RuntimeError(
            "HF_TOKEN not found. Preferred fix: create a private Kaggle dataset "
            "'hlexnc/chexvision-secrets' with a file 'hf_token.txt' containing your "
            "HF token, then add it to dataset_sources in kernel-metadata.json. "
            f"Alternatively enable HF_TOKEN in Kaggle Secrets (interactive only).{detail}"
        )
    raise RuntimeError(
        "HF_TOKEN not found. Set it in .env, export it in the environment, "
        "or add it to Kaggle Secrets."
    )
def configure_hf_runtime(
    token: str | None = None,
    *,
    required_token: bool = False,
    check_dns: bool = False,
) -> str | None:
    """Prepare HF-related environment variables; call before importing HF clients.

    Args:
        token: Token to use as-is. When empty or ``None`` the token is
            resolved through ``load_hf_token``.
        required_token: Forwarded to ``load_hf_token(required=...)``.
        check_dns: When true, resolve ``huggingface.co:443`` and fail fast if
            the lookup does not succeed.

    Returns:
        The token that was used, or ``None`` when none was available.

    Raises:
        RuntimeError: If ``check_dns`` is true and the hostname cannot be
            resolved, or (from ``load_hf_token``) if a required token is missing.
    """
    active_token = token if token else load_hf_token(required=required_token)

    # In a Kaggle kernel, point HF_HOME at the working dir unless the caller set it.
    running_on_kaggle = os.environ.get("KAGGLE_KERNEL_RUN_TYPE")
    if running_on_kaggle and "HF_HOME" not in os.environ:
        os.environ["HF_HOME"] = "/kaggle/working/hf_home"

    cache_root = os.environ.get("HF_HOME", "").strip()
    if cache_root:
        try:
            Path(cache_root).mkdir(parents=True, exist_ok=True)
        except OSError:
            # Best-effort; the path may not be writable outside a real Kaggle kernel
            pass

    # setdefault keeps any value that is already present in the environment.
    hub_defaults = (
        ("HF_HUB_DISABLE_XET", "1"),
        ("HF_HUB_ETAG_TIMEOUT", "30"),
        ("HF_HUB_DOWNLOAD_TIMEOUT", "300"),
        ("HF_HUB_VERBOSITY", "info"),
    )
    for variable, default_value in hub_defaults:
        os.environ.setdefault(variable, default_value)

    if active_token:
        _set_hf_token_env(active_token)

    if check_dns:
        try:
            socket.getaddrinfo("huggingface.co", 443)
        except OSError as exc:
            raise RuntimeError(
                "Failed to resolve huggingface.co from the current runtime. "
                "Check Kaggle internet access or a platform-side DNS issue."
            ) from exc

    return active_token
def _safe_metric(history: dict[str, Any] | None, key: str) -> float | None:
    """Return the largest usable numeric value recorded under ``key``.

    Entries that cannot be converted with ``float()`` (for example ``None``
    coming from JSON ``null``) and NaN values are ignored, so a single bad
    epoch neither crashes the lookup nor turns the result into NaN.

    Args:
        history: Mapping of metric name to a list of per-epoch values, or
            ``None``.
        key: Metric name to look up.

    Returns:
        The maximum of the usable values as ``float``, or ``None`` when the
        history is empty, the key is missing, the stored value is not a list,
        or no entry is usable.
    """
    if not history:
        return None
    recorded = history.get(key, [])
    if not isinstance(recorded, list):
        return None

    usable: list[float] = []
    for entry in recorded:
        try:
            number = float(entry)
        except (TypeError, ValueError):
            continue  # e.g. None or a non-numeric string
        # NaN compares false against everything, which would make max()
        # order-dependent, so it is dropped here.
        if not math.isnan(number):
            usable.append(number)
    return max(usable) if usable else None
def _architecture_summary(config: dict[str, Any]) -> str:
    """Describe the configured model in one sentence for the model card.

    Args:
        config: Training config; only the ``model`` section is read.

    Returns:
        A fixed DenseNet sentence when ``model.type`` is ``"densenet"``,
        otherwise a sentence for the custom residual CNN that mentions the
        block configuration and, if enabled, Squeeze-Excitation.
    """
    model_section = config.get("model", {})
    if model_section.get("type", "") == "densenet":
        return "DenseNet-121 transfer learning with a shared feature layer and dual classification heads."

    arch_section = model_section.get("architecture", {})
    depth = arch_section.get("block_config", [3, 4, 6, 3])
    if arch_section.get("use_se", True):
        attention_note = " with Squeeze-Excitation channel attention"
    else:
        attention_note = ""
    summary = (
        f"Custom residual CNN{attention_note} (depth {depth}) trained from scratch "
        "with shared features and dual classification heads."
    )
    return summary
def _render_pipeline_diagram() -> str:
    """Return the fenced Mermaid flowchart of the data→train→upload pipeline.

    The text is static: it takes no input and always yields the same string.
    """
    # One entry per diagram line; a "\n" inside an entry is a real line break
    # inside a node label.
    diagram_lines = (
        "```mermaid",
        "flowchart TD",
        'DS[("🗄️ arudaev/chest-xray-14-320\n112,120 images · 36 shards · ~7.97 GB")]',
        'DS -->|snapshot_download| PREP["📂 data/images · data/labels.csv\ntrain 78,468 · val 11,210 · test 22,442"]',
        'PREP --> AUG["Augmentation Pipeline\nHFlip · Rotate±15° · RandomAffine\nColorJitter · GaussianBlur · RandomErasing"]',
        'AUG --> FWD["⚡ Model Forward Pass\ntorch.cuda.amp.autocast · fp16"]',
        'FWD --> ML["multilabel_logits B×14\nWeightedBCE + pos_weight · 14 classes"]',
        'FWD --> BIN["binary_logits B×1\nBCE · Normal vs. Abnormal"]',
        'ML --> LOSS["Combined Loss\n1.0 × multilabel + 0.5 × binary"]',
        "BIN --> LOSS",
        'LOSS --> BACK["Backward · Grad Clip 1.0\nGradient Accumulation ×4 · eff. batch 96"]',
        'BACK --> OPT["AdamW · CosineAnnealingLR\nearly stop patience = 15"]',
        'OPT -->|"↑ best val AUC-ROC"| BEST["💾 Best Checkpoint\nmodel_state · best_val_metrics · config"]',
        'BEST -->|upload_model_artifacts| HUB["🤗 HF Hub\ncheckpoint · history.json · model card"]',
        "```",
    )
    return "\n".join(diagram_lines)
def _render_scratch_architecture(config: dict[str, Any]) -> str:
    """Mermaid architecture diagram for CheXVisionScratch.

    Args:
        config: Training config. ``model.architecture.block_config`` supplies
            the number of blocks per stage (list or tuple) and
            ``model.architecture.use_se`` selects the block label.

    Returns:
        A fenced Mermaid ``graph LR`` block with four stages. Only the first
        four block counts are used; a missing or empty ``block_config`` falls
        back to ``[3, 4, 6, 3]``, and a shorter one is completed with the
        default for the *same* stage position.
    """
    arch = config.get("model", {}).get("architecture", {})
    default_blocks = [3, 4, 6, 3]

    # list() lets tuples through (tuple + list would raise), and ``or`` covers
    # an explicit null / empty value in the config.
    configured = list(arch.get("block_config") or default_blocks)[:4]
    # Fill missing trailing stages from the matching default positions, so
    # [2, 2] becomes [2, 2, 6, 3] rather than [2, 2, 3, 4].
    n1, n2, n3, n4 = configured + default_blocks[len(configured):]

    use_se = arch.get("use_se", True)
    block_label = "SE-ResBlock" if use_se else "ResBlock"
    return f"""```mermaid
graph LR
IN["Input
3 × 320 × 320"] --> STEM["Stem
7×7 Conv · BN · ReLU
3→64ch · MaxPool ÷2"]
STEM --> S1["Stage 1
{n1}× {block_label}
64ch"]
S1 --> S2["Stage 2 ↓½
{n2}× {block_label}
128ch"]
S2 --> S3["Stage 3 ↓½
{n3}× {block_label}
256ch"]
S3 --> S4["Stage 4 ↓½
{n4}× {block_label}
512ch"]
S4 --> GAP["Global Avg Pool
Dropout(0.5)
512-dim"]
GAP --> MLH["Multilabel Head
Linear 512→14
sigmoid · 14 pathologies"]
GAP --> BH["Binary Head
Linear 512→1
sigmoid · Normal/Abnormal"]
style MLH fill:#2e7d32,color:#fff
style BH fill:#1565c0,color:#fff
style IN fill:#37474f,color:#fff
```"""
def _render_densenet_architecture() -> str:
    """Return the fenced Mermaid diagram for CheXVisionDenseNet.

    The text is static: it takes no input and always yields the same string.
    """
    # One entry per output line; node labels span several consecutive entries.
    diagram_lines = (
        "```mermaid",
        "graph LR",
        'IN["Input',
        '3 × 320 × 320"] --> BB["DenseNet-121 Backbone',
        "ImageNet pretrained",
        "Dense connectivity",
        '7.9M parameters"]',
        'BB --> GAP2["Adaptive Avg Pool',
        '1024-dim features"]',
        'GAP2 --> FL["Feature Layer',
        "Linear 1024→512",
        'ReLU · Dropout(0.3)"]',
        'FL --> MLH["Multilabel Head',
        "Linear 512→14",
        'sigmoid · 14 pathologies"]',
        'FL --> BH["Binary Head',
        "Linear 512→1",
        'sigmoid · Normal/Abnormal"]',
        "style MLH fill:#2e7d32,color:#fff",
        "style BH fill:#1565c0,color:#fff",
        "style IN fill:#37474f,color:#fff",
        "style BB fill:#6a1b9a,color:#fff",
        "```",
    )
    return "\n".join(diagram_lines)
def _render_densenet_finetuning(config: dict[str, Any]) -> str:
    """Mermaid fine-tuning phase diagram for DenseNet.

    Args:
        config: Training config. Reads ``model.fine_tuning.freeze_epochs``
            (default 5), ``model.fine_tuning.freeze_lr`` (default 1e-3),
            ``model.fine_tuning.unfreeze_lr`` (default 1e-4) and
            ``training.epochs`` (default 60).

    Returns:
        A fenced Mermaid ``graph LR`` block with two phases: epochs
        ``1–freeze_epochs`` (backbone frozen) and
        ``freeze_epochs + 1–epochs`` (end-to-end fine-tuning).
    """
    ft = config.get("model", {}).get("fine_tuning", {})
    freeze_epochs = ft.get("freeze_epochs", 5)
    total_epochs = config.get("training", {}).get("epochs", 60)
    unfreeze_lr = ft.get("unfreeze_lr", 1e-4)
    freeze_lr = ft.get("freeze_lr", 1e-3)
    first_unfrozen_epoch = freeze_epochs + 1
    # The Phase 2 range needs the same "–" separator as Phase 1; without it
    # the two numbers run together (e.g. "Epochs 660").
    return f"""```mermaid
graph LR
P1["🔒 Phase 1
Epochs 1–{freeze_epochs}
Backbone frozen
Train heads only
lr = {freeze_lr}"] -->|"Epoch {first_unfrozen_epoch}
unfreeze_backbone()"| P2["🔓 Phase 2
Epochs {first_unfrozen_epoch}–{total_epochs}
End-to-end fine-tuning
All layers trainable
lr = {unfreeze_lr}"]
style P1 fill:#e65100,color:#fff
style P2 fill:#6a1b9a,color:#fff
```"""
def _render_per_class_auc_table(
    best_val_metrics: dict[str, Any],
    labels: list[str] | None = None,
) -> str:
    """Render a markdown table of per-class AUC-ROC from best epoch metrics.

    Args:
        best_val_metrics: Mapping that may contain ``auc_<label>`` entries.
        labels: Labels to render, in order. Defaults to ``PATHOLOGY_LABELS``.

    Returns:
        A markdown table with one row per label that has a usable AUC, or an
        empty string when no label does. Labels whose value is missing,
        ``None``, not convertible with ``float()``, or not finite (NaN/inf)
        are skipped instead of aborting the whole table.
    """
    if labels is None:
        labels = PATHOLOGY_LABELS

    rows = []
    for label in labels:
        raw = best_val_metrics.get(f"auc_{label}")
        if raw is None:
            continue
        try:
            auc = float(raw)
        except (TypeError, ValueError):
            continue
        # round() cannot turn NaN/inf into an int, so such values get no row.
        if not math.isfinite(auc):
            continue
        # Clamp so the bar is always exactly 10 cells wide.
        bar_filled = min(10, max(0, int(round(auc * 10))))
        bar = "█" * bar_filled + "░" * (10 - bar_filled)
        rows.append(f"| {label:<20} | `{auc:.4f}` | `{bar}` |")
    if not rows:
        return ""

    header = (
        "| Pathology | AUC-ROC | Visual |\n"
        "|----------------------|----------|---------------|\n"
    )
    return header + "\n".join(rows)
def render_model_card(
    repo_id: str,
    checkpoint: dict[str, Any],
    history: dict[str, Any] | None = None,
) -> str:
    """Build the model card (README) text for a trained checkpoint.

    Args:
        repo_id: Target model repo id; its last ``/``-separated part is the
            title fallback when the config has no ``model.name``.
        checkpoint: Checkpoint dictionary. The keys read here are ``config``,
            ``best_auc``, ``epoch`` and ``best_val_metrics``; all are optional.
        history: Optional training history; only the ``binary_auc_roc`` and
            ``binary_f1`` series are consulted.

    Returns:
        The complete card: YAML front matter followed by markdown sections
        with architecture/pipeline diagrams, metrics, configuration, usage
        notes and a citation.
    """
    config = checkpoint.get("config", {})
    data_section = config.get("data", {})
    model_section = config.get("model", {})
    training_section = config.get("training", {})

    card_title = model_section.get("name", repo_id.split("/")[-1])
    is_densenet = model_section.get("type", "") == "densenet"
    dataset_repo = data_section.get("hf_dataset_repo", HF_DATASET_REPO)
    dataset_revision = data_section.get("hf_dataset_revision", HF_DATASET_REVISION)

    # --- Metrics summary: each bullet appears only when its value is usable ---
    macro_auc = checkpoint.get("best_auc")
    best_epoch = checkpoint.get("epoch")
    binary_auc = _safe_metric(history, "binary_auc_roc")
    binary_f1 = _safe_metric(history, "binary_f1")

    bullets: list[str] = []
    if isinstance(macro_auc, (int, float)):
        bullets.append(f"- Best validation macro AUC-ROC: `{macro_auc:.4f}`")
    if isinstance(binary_auc, float):
        bullets.append(f"- Best validation binary AUC-ROC: `{binary_auc:.4f}`")
    if isinstance(binary_f1, float):
        bullets.append(f"- Best validation binary F1: `{binary_f1:.4f}`")
    if best_epoch is not None:
        bullets.append(f"- Best checkpoint epoch: `{best_epoch}`")
    metrics_block = (
        "\n".join(bullets)
        or "- Metrics will appear after the first successful training run."
    )

    # --- Architecture diagram, plus the fine-tuning diagram for DenseNet only ---
    if is_densenet:
        architecture_block = _render_densenet_architecture()
        finetuning_block = (
            f"\n## Fine-Tuning Strategy\n{_render_densenet_finetuning(config)}\n"
        )
    else:
        architecture_block = _render_scratch_architecture(config)
        finetuning_block = ""

    # --- Per-class AUC table (omitted when there is nothing to show) ---
    per_class_metrics: dict[str, Any] = checkpoint.get("best_val_metrics", {})
    auc_table = _render_per_class_auc_table(per_class_metrics) if per_class_metrics else ""
    per_class_block = (
        f"\n## Per-Class AUC-ROC at Best Epoch\n{auc_table}\n" if auc_table else ""
    )

    architecture_line = _architecture_summary(config)
    pipeline_block = _render_pipeline_diagram()

    # --- AMP / training details ---
    batch_size = training_section.get("batch_size", 32)
    grad_accum = training_section.get("grad_accum_steps", 1)
    amp_state = "enabled" if training_section.get("use_amp", False) else "disabled"
    clahe_state = "enabled" if data_section.get("clahe", False) else "disabled"
    smoothing = training_section.get("label_smoothing", 0.0)
    epochs_configured = training_section.get("epochs", "?")
    patience = training_section.get("early_stopping_patience", 10)
    training_block = "\n".join(
        [
            f"- Batch size: `{batch_size}` × grad_accum `{grad_accum}` "
            f"= **effective batch `{batch_size * grad_accum}`**",
            f"- AMP (fp16): `{amp_state}`",
            f"- CLAHE preprocessing: `{clahe_state}`",
            f"- Label smoothing: `{smoothing}`",
            "- Optimizer: AdamW · Scheduler: CosineAnnealingLR",
            f"- Epochs configured: `{epochs_configured}` · "
            f"Early stop patience: `{patience}`",
        ]
    )

    return f"""---
license: mit
language:
- en
library_name: pytorch
pipeline_tag: image-classification
tags:
- chexvision
- medical-imaging
- chest-xray
- radiology
- pytorch
- multi-label-classification
datasets:
- {dataset_repo}
---
# {card_title}
> **CheXVision** — Deep Learning & Big Data university project.
> 14-class chest X-ray pathology detection + binary normal/abnormal classification
> on the NIH Chest X-ray14 dataset (112,120 images).
## Architecture
{architecture_block}
{finetuning_block}
## Training Pipeline
{pipeline_block}
## Training Metrics
{metrics_block}
{per_class_block}
## Training Configuration
- Repository: `{repo_id}`
- Dataset: [{dataset_repo}](https://huggingface.co/datasets/{dataset_repo}) · revision `{dataset_revision}`
- Architecture: {architecture_line}
- Platform: Kaggle GPU kernel (NVIDIA T4 / P100)
{training_block}
## Intended Use
This model is intended for research and educational work on automated chest X-ray pathology detection.
It outputs two predictions per image:
1. **Multi-label scores** — independent sigmoid probability for each of 14 NIH pathologies
2. **Binary score** — sigmoid probability of any abnormality (Normal vs. Abnormal)
## Limitations
- Not validated for clinical use. Predictions must not substitute professional medical judgment.
- Trained on NIH Chest X-ray14, which contains noisy radiologist annotations (patient-level labels, not lesion-level).
- Performance degrades on images from equipment, patient populations, or preprocessing pipelines
that differ from the NIH training distribution.
- Reported AUC metrics are on the validation split, not the held-out test set.
## CheXNet Benchmark Context
CheXNet (Rajpurkar et al., 2017) — the seminal paper establishing DenseNet-121 for chest X-ray
classification — reported **0.841 macro AUC-ROC** on a comparable split of this dataset.
CheXVision-DenseNet matches this benchmark. See the
[CheXVision demo](https://huggingface.co/spaces/arudaev/chexvision-demo) for live inference.
## Citation
```bibtex
@misc{{chexvision2026,
title={{CheXVision: Dual-Task Chest X-ray Classification with Custom CNN and DenseNet-121}},
author={{BIG D(ATA) Team}},
year={{2026}},
howpublished={{\\url{{https://huggingface.co/{repo_id}}}}}
}}
```
"""
def upload_model_artifacts(
    checkpoint_path: Path,
    repo_id: str,
    token: str,
    checkpoint: dict[str, Any] | None = None,
    history_path: Path | None = None,
) -> None:
    """Push a checkpoint, its config, history and model card to a HF model repo.

    The files are staged in a temporary directory and uploaded in a single
    ``upload_folder`` call: ``README.md`` (rendered model card),
    ``training_config.json`` (the checkpoint's ``config``), the checkpoint
    file itself and, when it exists, the history file.

    Args:
        checkpoint_path: Checkpoint file to upload.
        repo_id: Target model repo; created if it does not exist yet.
        token: Hugging Face token used for the runtime setup and the client.
        checkpoint: Already-loaded checkpoint dictionary used for the card and
            config; an empty dict is used when omitted.
        history_path: Optional JSON history file; it is read for the card and
            uploaded only if it exists.
    """
    checkpoint_file = Path(checkpoint_path)
    history_file = Path(history_path) if history_path else None

    history: dict[str, Any] | None = None
    if history_file is not None and history_file.exists():
        history = json.loads(history_file.read_text(encoding="utf-8"))

    checkpoint_data = checkpoint or {}
    card_text = render_model_card(repo_id, checkpoint_data, history)
    config_text = json.dumps(checkpoint_data.get("config", {}), indent=2)

    # The HF environment is configured first, then the client library is imported.
    configure_hf_runtime(token=token)
    from huggingface_hub import HfApi

    hub = HfApi(token=token)
    hub.create_repo(repo_id=repo_id, repo_type="model", exist_ok=True)

    with tempfile.TemporaryDirectory() as staging_name:
        staging_dir = Path(staging_name)
        (staging_dir / "README.md").write_text(card_text, encoding="utf-8")
        (staging_dir / "training_config.json").write_text(config_text, encoding="utf-8")
        shutil.copy2(checkpoint_file, staging_dir / checkpoint_file.name)
        if history_file is not None and history_file.exists():
            shutil.copy2(history_file, staging_dir / history_file.name)

        # Upload while the staging directory still exists.
        hub.upload_folder(
            folder_path=str(staging_dir),
            repo_id=repo_id,
            repo_type="model",
            commit_message=f"Upload trained artifacts for {checkpoint_file.stem}",
        )