github-actions[bot]
🚀 Deploy method comparison app from GH action
2b239c3
Raw
History Blame Contribute Delete
9.63 kB
# Copyright 2026-present the HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Evaluate an existing checkpoint from the image generation method comparison.
Loads a trained PEFT checkpoint on top of the same base model that was used for training and runs the same evaluation
as at the end of a training run (test set DINOv2 similarity, drift), then generates the sample images. The results and
sample images are always stored as temporary results.
Example:
python evaluate.py -v /path/to/checkpoint/
The checkpoint directory must contain the trained PEFT adapter (i.e. an adapter_config.json and the adapter weights).
This can e.g. be the temporary directory reported by run.py when called without the --clean flag or a checkpoint
downloaded from the Hugging Face Hub bucket. The training parameters are taken from default_training_params.json; if
the checkpoint was trained with different parameters, place the corresponding training_params.json into the checkpoint
directory.
"""
import argparse
import datetime as dt
import os
import sys
import time
from collections.abc import Callable
import torch
from run import evaluate, generate_sample_images, measure_drift
from transformers import set_seed
from utils import (
FILE_NAME_TRAIN_PARAMS,
RESULT_PATH_TEST,
SAMPLE_IMAGE_PATH_TEST,
TrainConfig,
TrainResult,
TrainStatus,
get_artifact_stem,
get_base_model_info,
get_dataset_info,
get_dino_encoder,
get_file_size,
get_pipeline,
get_train_config,
init_accelerator,
log_results,
)
from data import get_train_valid_test_datasets
from peft import PeftConfig, PeftModel
from peft.utils import CONFIG_NAME, infer_device
def get_experiment_name(path_checkpoint: str) -> str:
    """Derive the experiment name from the checkpoint directory.

    The name is the last component of the absolute checkpoint path, so trailing separators are ignored and relative
    paths such as ``.`` or ``..`` yield the name of the directory they point to instead of the literal dots.

    Args:
        path_checkpoint: Path to an existing checkpoint directory.

    Returns:
        The name of the checkpoint directory.

    Raises:
        FileNotFoundError: If ``path_checkpoint`` does not exist or is not a directory.
    """
    if not os.path.isdir(path_checkpoint):
        raise FileNotFoundError(f"Path {path_checkpoint} does not exist or is not a directory")
    # abspath normalizes like normpath but additionally anchors the path at the current working directory, so that
    # "." and ".." are turned into real directory names
    return os.path.basename(os.path.abspath(path_checkpoint))
def evaluate_checkpoint(
    *,
    pipeline,
    train_config: TrainConfig,
    print_verbose: Callable[..., None],
) -> TrainResult:
    """Run the test set evaluation and the drift measurement for an already loaded pipeline.

    Args:
        pipeline: Pipeline whose ``transformer`` attribute is the model to evaluate. The transformer is moved to the
            inferred device and put into eval mode.
        train_config: Training configuration. Supplies the DINO encoder settings and is forwarded to the dataset,
            evaluation and drift helpers.
        print_verbose: Print-like callable used for progress output.

    Returns:
        A ``TrainResult`` whose training-related fields are empty/zero. Its status is ``TrainStatus.SUCCESS`` when the
        evaluation ran through; when it was interrupted or raised, the status is ``TrainStatus.CANCELED``, ``metrics``
        is empty and ``error_msg`` holds the reason.
    """
    metrics = []
    device_type = infer_device()
    _, _, test_dataset = get_train_valid_test_datasets(train_config=train_config, print_fn=print_verbose)
    processor, dino_model = get_dino_encoder(train_config.dino_model_id, train_config.dino_image_size)
    # fall back to torch.cuda if torch has no sub-module named after the inferred device
    torch_accelerator_module = getattr(torch, device_type, torch.cuda)

    transformer = pipeline.transformer.to(device_type)
    transformer.eval()

    if hasattr(transformer, "get_nb_trainable_parameters"):
        num_trainable_params, num_params = transformer.get_nb_trainable_parameters()
    else:
        num_params = sum(param.numel() for param in transformer.parameters())
        num_trainable_params = sum(param.numel() for param in transformer.parameters() if param.requires_grad)
    print_verbose(
        f"trainable params: {num_trainable_params:,d} || all params: {num_params:,d} || "
        f"trainable: {100 * num_trainable_params / num_params:.4f}%"
    )

    # NOTE(review): every handled error below is reported as CANCELED, so FAILED is never returned -- confirm that
    # this is what log_results expects
    status = TrainStatus.FAILED
    error_msg = ""
    tic_eval_total = time.perf_counter()

    # some backend modules (e.g. torch.cpu) may not expose empty_cache; skipping the call there avoids an
    # AttributeError before the evaluation even starts
    empty_cache = getattr(torch_accelerator_module, "empty_cache", None)
    if callable(empty_cache):
        empty_cache()

    try:
        print_verbose("Evaluation on test set follows.")
        test_similarity = evaluate(
            pipeline=pipeline,
            ds_eval=test_dataset,
            processor=processor,
            dino_model=dino_model,
            config=train_config,
            num_repeats=3,
        )
        print_verbose("Calculating drift.")
        test_drift = measure_drift(pipeline=pipeline, processor=processor, dino_model=dino_model, config=train_config)
        metrics.append(
            {
                "test dino_similarity": test_similarity,
                "drift": test_drift,
                "eval time": time.perf_counter() - tic_eval_total,
            }
        )
        print_verbose(f"Test DINOv2 similarity: {test_similarity:.4f}")
        print_verbose(f"Test drift: {test_drift:.4f}")
    except KeyboardInterrupt:
        print_verbose("canceled evaluation")
        status = TrainStatus.CANCELED
        error_msg = "manually canceled"
    except torch.OutOfMemoryError as exc:
        print_verbose("out of memory error encountered")
        status = TrainStatus.CANCELED
        error_msg = str(exc)
    except Exception as exc:
        # errors are recorded instead of re-raised so that the caller can still log a result
        print_verbose(f"encountered an error: {exc}")
        status = TrainStatus.CANCELED
        error_msg = str(exc)
    else:
        status = TrainStatus.SUCCESS

    # the train-related attributes are set to empty/zero values, as no training is performed
    eval_result = TrainResult(
        status=status,
        train_time=0.0,
        accelerator_memory_reserved_log=[],
        accelerator_memory_max_train=0,
        losses=[],
        metrics=metrics,
        error_msg=error_msg,
        num_trainable_params=num_trainable_params,
        num_total_params=num_params,
    )
    return eval_result
def main(*, path_checkpoint: str, experiment_name: str) -> None:
    """Evaluate the PEFT checkpoint stored in ``path_checkpoint`` and log the outcome as a temporary result.

    The base model pipeline is created first, the adapter from the checkpoint is loaded onto its transformer and the
    transformer is compiled if the training configuration asks for it. After the evaluation the results are logged to
    ``RESULT_PATH_TEST``; on success, and if sample prompts are configured, sample images are generated as well.

    Progress is reported through the module-level ``print_verbose`` callable, which has to exist before this function
    is called.

    Args:
        path_checkpoint: Directory containing the PEFT config and, optionally, the training parameter file.
        experiment_name: Name under which results and sample images are stored.

    Raises:
        FileNotFoundError: If ``path_checkpoint`` contains no PEFT config file.
    """
    tic_total = time.perf_counter()
    start_date = dt.datetime.now(tz=dt.timezone.utc).replace(microsecond=0).isoformat()
    print_verbose("===== The results of this evaluation run are stored as temporary results ======")

    path_peft_config = os.path.join(path_checkpoint, CONFIG_NAME)
    if not os.path.exists(path_peft_config):
        raise FileNotFoundError(
            f"Could not find a PEFT config at {path_checkpoint}. Note that evaluating full fine-tuning checkpoints is "
            "not supported."
        )
    peft_config = PeftConfig.from_pretrained(path_checkpoint)

    path_train_config = os.path.join(path_checkpoint, FILE_NAME_TRAIN_PARAMS)
    has_custom_train_params = os.path.exists(path_train_config)
    if not has_custom_train_params:
        print_verbose(
            f"Could not find {FILE_NAME_TRAIN_PARAMS} in {path_checkpoint}, using the default training parameters"
        )
    train_config = get_train_config(path_train_config)

    init_accelerator()
    set_seed(train_config.seed)

    model_info = get_base_model_info(train_config.model_id)
    dataset_info = get_dataset_info(train_config.dataset_id)

    # Step 1: plain base model, without any adapter and without compilation
    pipeline = get_pipeline(
        model_id=train_config.model_id,
        dtype=train_config.dtype,
        peft_config=None,
        compile=False,
        use_gc=train_config.use_gc,
        autocast_adapter_dtype=train_config.autocast_adapter_dtype,
    )
    # Step 2: put the trained adapter on top of the base transformer
    base_transformer = pipeline.transformer
    pipeline.transformer = PeftModel.from_pretrained(
        base_transformer,
        path_checkpoint,
        autocast_adapter_dtype=train_config.autocast_adapter_dtype,
        is_trainable=True,  # to report the same number of trainable parameters as during training
    )
    # Step 3: compilation, if enabled, must come last, mirroring the order in get_pipeline
    if train_config.compile:
        adapted_transformer = pipeline.transformer
        pipeline.transformer = torch.compile(adapted_transformer, dynamic=True)
    print_verbose(pipeline.transformer)

    eval_result = evaluate_checkpoint(pipeline=pipeline, train_config=train_config, print_verbose=print_verbose)

    file_size = get_file_size(pipeline.transformer, peft_config=peft_config, clean=True, print_fn=print_verbose)
    time_total = time.perf_counter() - tic_total

    log_results(
        experiment_name=experiment_name,
        train_result=eval_result,
        time_total=time_total,
        file_size=file_size,
        model_info=model_info,
        dataset_info=dataset_info,
        start_date=start_date,
        train_config=train_config,
        peft_config=peft_config,
        print_fn=print_verbose,
        save_dir=RESULT_PATH_TEST,  # results of evaluation-only runs are always treated as temporary results
    )

    # sample images are only produced for successful runs that have prompts configured
    wants_sample_images = (eval_result.status == TrainStatus.SUCCESS) and train_config.sample_image_prompts
    if not wants_sample_images:
        return

    print_verbose("Generating sample images")
    try:
        file_stem = get_artifact_stem(experiment_name, start_date, SAMPLE_IMAGE_PATH_TEST)
        generate_sample_images(
            pipeline=pipeline,
            train_config=train_config,
            sample_image_dir=SAMPLE_IMAGE_PATH_TEST,
            file_stem=file_stem,
        )
        print_verbose(f"Stored sample images in {SAMPLE_IMAGE_PATH_TEST}")
    except Exception as exc:
        # best effort: the evaluation results are already logged at this point
        print_verbose(f"Sample image generation failed: {exc}")
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("-v", "--verbose", action="store_true", help="Enable verbose output")
parser.add_argument(
"path_checkpoint", type=str, help="Path to the directory containing the trained PEFT checkpoint"
)
args = parser.parse_args()
experiment_name = get_experiment_name(args.path_checkpoint)
if args.verbose:
def print_verbose(*args, **kwargs) -> None:
kwargs["file"] = sys.stderr
print(*args, **kwargs)
else:
def print_verbose(*args, **kwargs) -> None:
pass
main(path_checkpoint=args.path_checkpoint, experiment_name=experiment_name)