Spaces:

peft-internal-testing
/

PEFT-method-comparison-embed

Running

PEFT-method-comparison-embed / image-gen /evaluate.py

github-actions[bot]

🚀 Deploy method comparison app from GH action

2b239c3 about 19 hours ago

9.63 kB

	# Copyright 2026-present the HuggingFace Inc. team.
	#
	# Licensed under the Apache License, Version 2.0 (the "License");
	# you may not use this file except in compliance with the License.
	# You may obtain a copy of the License at
	#
	# http://www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing, software
	# distributed under the License is distributed on an "AS IS" BASIS,
	# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	# See the License for the specific language governing permissions and
	# limitations under the License.

	"""Evaluate an existing checkpoint from the image generation method comparison.

	Loads a trained PEFT checkpoint on top of the same base model that was used for training and runs the same evaluation
	as at the end of a training run (test set DINOv2 similarity, drift), then generates the sample images. The results and
	sample images are always stored as temporary results.

	Example:

	python evaluate.py -v /path/to/checkpoint/

	The checkpoint directory must contain the trained PEFT adapter (i.e. an adapter_config.json and the adapter weights).
	This can e.g. be the temporary directory reported by run.py when called without the --clean flag or a checkpoint
	downloaded from the Hugging Face Hub bucket. The training parameters are taken from default_training_params.json; if
	the checkpoint was trained with different parameters, place the corresponding training_params.json into the checkpoint
	directory.
	"""

	import argparse
	import datetime as dt
	import os
	import sys
	import time
	from collections.abc import Callable

	import torch
	from run import evaluate, generate_sample_images, measure_drift
	from transformers import set_seed
	from utils import (
	FILE_NAME_TRAIN_PARAMS,
	RESULT_PATH_TEST,
	SAMPLE_IMAGE_PATH_TEST,
	TrainConfig,
	TrainResult,
	TrainStatus,
	get_artifact_stem,
	get_base_model_info,
	get_dataset_info,
	get_dino_encoder,
	get_file_size,
	get_pipeline,
	get_train_config,
	init_accelerator,
	log_results,
	)

	from data import get_train_valid_test_datasets
	from peft import PeftConfig, PeftModel
	from peft.utils import CONFIG_NAME, infer_device


	def get_experiment_name(path_checkpoint: str) -> str:
	    """Derive the experiment name from the checkpoint directory.

	    The experiment name is the name of the last component of the checkpoint directory. The path is made absolute
	    first so that relative references such as "." or ".." resolve to the real directory name instead of being
	    returned verbatim; trailing path separators are ignored.

	    Args:
	        path_checkpoint: Path to an existing checkpoint directory.

	    Returns:
	        The name of the checkpoint directory.

	    Raises:
	        FileNotFoundError: If ``path_checkpoint`` does not exist or is not a directory.
	    """
	    if not os.path.isdir(path_checkpoint):
	        raise FileNotFoundError(f"Path {path_checkpoint} does not exist or is not a directory")
	    # abspath also normalizes the path, i.e. it drops trailing separators and collapses "." / ".." components
	    return os.path.basename(os.path.abspath(path_checkpoint))


	def evaluate_checkpoint(
	    *,
	    pipeline,
	    train_config: TrainConfig,
	    print_verbose: Callable[..., None],
	) -> TrainResult:
	    """Evaluate the model held by ``pipeline`` on the test set and measure its drift.

	    Args:
	        pipeline: Pipeline whose ``transformer`` attribute is the model to evaluate. The transformer is moved to the
	            inferred accelerator device and switched to eval mode.
	        train_config: Training configuration; it is used to build the datasets and the DINO encoder and is
	            forwarded to the evaluation and drift functions.
	        print_verbose: Print-like callable that receives progress and error messages.

	    Returns:
	        A ``TrainResult`` whose train-related attributes are empty/zero. On success, ``metrics`` contains a single
	        entry with the test DINO similarity, the drift and the evaluation time. If the evaluation is interrupted or
	        raises, ``metrics`` stays empty, the status is ``TrainStatus.CANCELED`` and ``error_msg`` describes the cause.
	    """
	    metrics = []
	    device_type = infer_device()
	    # only the test split is used for an evaluation-only run
	    _, _, test_dataset = get_train_valid_test_datasets(train_config=train_config, print_fn=print_verbose)
	    processor, dino_model = get_dino_encoder(train_config.dino_model_id, train_config.dino_image_size)

	    # fall back to torch.cuda if torch has no attribute named after the inferred device type
	    torch_accelerator_module = getattr(torch, device_type, torch.cuda)
	    transformer = pipeline.transformer.to(device_type)
	    transformer.eval()

	    if hasattr(transformer, "get_nb_trainable_parameters"):
	        num_trainable_params, num_params = transformer.get_nb_trainable_parameters()
	    else:
	        # the model offers no helper for the counts, so count the parameters manually
	        num_params = sum(param.numel() for param in transformer.parameters())
	        num_trainable_params = sum(param.numel() for param in transformer.parameters() if param.requires_grad)
	    print_verbose(
	        f"trainable params: {num_trainable_params:,d} || all params: {num_params:,d} || "
	        f"trainable: {100 * num_trainable_params / num_params:.4f}%"
	    )

	    status = TrainStatus.FAILED
	    error_msg = ""
	    tic_eval_total = time.perf_counter()

	    torch_accelerator_module.empty_cache()
	    try:
	        print_verbose("Evaluation on test set follows.")
	        test_similarity = evaluate(
	            pipeline=pipeline,
	            ds_eval=test_dataset,
	            processor=processor,
	            dino_model=dino_model,
	            config=train_config,
	            num_repeats=3,
	        )
	        print_verbose("Calculating drift.")
	        test_drift = measure_drift(pipeline=pipeline, processor=processor, dino_model=dino_model, config=train_config)
	        metrics.append(
	            {
	                "test dino_similarity": test_similarity,
	                "drift": test_drift,
	                "eval time": time.perf_counter() - tic_eval_total,
	            }
	        )
	        print_verbose(f"Test DINOv2 similarity: {test_similarity:.4f}")
	        print_verbose(f"Test drift: {test_drift:.4f}")

	    except KeyboardInterrupt:
	        # KeyboardInterrupt is not an Exception subclass, hence the dedicated clause
	        print_verbose("canceled evaluation")
	        status = TrainStatus.CANCELED
	        error_msg = "manually canceled"
	    except torch.OutOfMemoryError as exc:
	        print_verbose("out of memory error encountered")
	        status = TrainStatus.CANCELED
	        error_msg = str(exc)
	    except Exception as exc:
	        # errors are reported through the returned result instead of being propagated, so that the caller can
	        # still log the run
	        print_verbose(f"encountered an error: {exc}")
	        status = TrainStatus.CANCELED
	        error_msg = str(exc)

	    # every error path above sets CANCELED; anything else means that the evaluation ran to completion
	    if status != TrainStatus.CANCELED:
	        status = TrainStatus.SUCCESS
	    # the train-related attributes are set to empty/zero values, as no training is performed
	    eval_result = TrainResult(
	        status=status,
	        train_time=0.0,
	        accelerator_memory_reserved_log=[],
	        accelerator_memory_max_train=0,
	        losses=[],
	        metrics=metrics,
	        error_msg=error_msg,
	        num_trainable_params=num_trainable_params,
	        num_total_params=num_params,
	    )
	    return eval_result


	def main(*, path_checkpoint: str, experiment_name: str) -> None:
	    """Evaluate the PEFT checkpoint in ``path_checkpoint`` and log the outcome as a temporary result.

	    The base model pipeline is created without an adapter, the trained adapter is loaded onto its transformer, the
	    checkpoint is evaluated and the results are logged to ``RESULT_PATH_TEST``. If the evaluation succeeded and the
	    training configuration defines sample image prompts, sample images are generated into ``SAMPLE_IMAGE_PATH_TEST``.

	    Args:
	        path_checkpoint: Directory that contains the trained PEFT adapter.
	        experiment_name: Name under which the results and sample images are stored.

	    Raises:
	        FileNotFoundError: If ``path_checkpoint`` contains no PEFT config file.
	    """
	    run_started = time.perf_counter()
	    start_date = dt.datetime.now(tz=dt.timezone.utc).replace(microsecond=0).isoformat()

	    print_verbose("===== The results of this evaluation run are stored as temporary results ======")

	    adapter_config_file = os.path.join(path_checkpoint, CONFIG_NAME)
	    if not os.path.exists(adapter_config_file):
	        raise FileNotFoundError(
	            f"Could not find a PEFT config at {path_checkpoint}. Note that evaluating full fine-tuning checkpoints is "
	            "not supported."
	        )
	    peft_config = PeftConfig.from_pretrained(path_checkpoint)

	    params_file = os.path.join(path_checkpoint, FILE_NAME_TRAIN_PARAMS)
	    if not os.path.exists(params_file):
	        print_verbose(
	            f"Could not find {FILE_NAME_TRAIN_PARAMS} in {path_checkpoint}, using the default training parameters"
	        )
	    # NOTE(review): the path is passed on even if the file is missing; get_train_config is expected to fall back to
	    # the default training parameters in that case -- confirm
	    train_config = get_train_config(params_file)
	    init_accelerator()
	    set_seed(train_config.seed)

	    base_model_info = get_base_model_info(train_config.model_id)
	    data_info = get_dataset_info(train_config.dataset_id)

	    # Step 1: plain base model pipeline, without adapter and without compilation
	    pipeline = get_pipeline(
	        model_id=train_config.model_id,
	        dtype=train_config.dtype,
	        compile=False,
	        peft_config=None,
	        autocast_adapter_dtype=train_config.autocast_adapter_dtype,
	        use_gc=train_config.use_gc,
	    )
	    # Step 2: load the trained adapter; it is marked as trainable so that the reported number of trainable
	    # parameters is the same as during training
	    pipeline.transformer = PeftModel.from_pretrained(
	        pipeline.transformer,
	        path_checkpoint,
	        is_trainable=True,
	        autocast_adapter_dtype=train_config.autocast_adapter_dtype,
	    )
	    # Step 3: compilation, if enabled, has to come last, mirroring the order in get_pipeline
	    if train_config.compile:
	        pipeline.transformer = torch.compile(pipeline.transformer, dynamic=True)
	    print_verbose(pipeline.transformer)

	    eval_result = evaluate_checkpoint(
	        pipeline=pipeline,
	        train_config=train_config,
	        print_verbose=print_verbose,
	    )

	    adapter_file_size = get_file_size(
	        pipeline.transformer,
	        peft_config=peft_config,
	        clean=True,
	        print_fn=print_verbose,
	    )
	    elapsed_total = time.perf_counter() - run_started

	    # results of evaluation-only runs are always treated as temporary results, hence the fixed save_dir
	    log_results(
	        experiment_name=experiment_name,
	        train_result=eval_result,
	        time_total=elapsed_total,
	        file_size=adapter_file_size,
	        model_info=base_model_info,
	        dataset_info=data_info,
	        start_date=start_date,
	        train_config=train_config,
	        peft_config=peft_config,
	        print_fn=print_verbose,
	        save_dir=RESULT_PATH_TEST,
	    )

	    # sample images are only generated for successful runs that define prompts
	    if not (eval_result.status == TrainStatus.SUCCESS and train_config.sample_image_prompts):
	        return

	    print_verbose("Generating sample images")
	    try:
	        image_file_stem = get_artifact_stem(experiment_name, start_date, SAMPLE_IMAGE_PATH_TEST)
	        generate_sample_images(
	            pipeline=pipeline,
	            train_config=train_config,
	            sample_image_dir=SAMPLE_IMAGE_PATH_TEST,
	            file_stem=image_file_stem,
	        )
	        print_verbose(f"Stored sample images in {SAMPLE_IMAGE_PATH_TEST}")
	    except Exception as exc:
	        # best effort: the results are already logged at this point, so a failure is only reported
	        print_verbose(f"Sample image generation failed: {exc}")


	if __name__ == "__main__":
	    # Command line entry point: parse the arguments, set up the module-level print_verbose function that main()
	    # relies on, then run the evaluation.
	    parser = argparse.ArgumentParser()
	    parser.add_argument("-v", "--verbose", action="store_true", help="Enable verbose output")
	    parser.add_argument(
	        "path_checkpoint", type=str, help="Path to the directory containing the trained PEFT checkpoint"
	    )
	    args = parser.parse_args()

	    # raises FileNotFoundError early if the checkpoint path is not an existing directory
	    experiment_name = get_experiment_name(args.path_checkpoint)

	    if args.verbose:

	        def print_verbose(*args, **kwargs) -> None:
	            # same signature as print, but the output always goes to stderr
	            kwargs["file"] = sys.stderr
	            print(*args, **kwargs)

	    else:

	        def print_verbose(*args, **kwargs) -> None:
	            # without --verbose, all messages are dropped
	            pass

	    main(path_checkpoint=args.path_checkpoint, experiment_name=experiment_name)