# Copyright 2026-present the HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Evaluate an existing checkpoint from the image generation method comparison.

Loads a trained PEFT checkpoint on top of the same base model that was used for training and runs the same evaluation
as at the end of a training run (test set DINOv2 similarity, drift), then generates the sample images. The results and
sample images are always stored as temporary results.

Example:

python evaluate.py -v /path/to/checkpoint/

The checkpoint directory must contain the trained PEFT adapter (i.e. an adapter_config.json and the adapter weights).
This can e.g. be the temporary directory reported by run.py when called without the --clean flag or a checkpoint
downloaded from the Hugging Face Hub bucket. The training parameters are taken from default_training_params.json; if
the checkpoint was trained with different parameters, place the corresponding training_params.json into the checkpoint
directory.
"""

import argparse
import datetime as dt
import os
import sys
import time
from collections.abc import Callable

import torch
from run import evaluate, generate_sample_images, measure_drift
from transformers import set_seed
from utils import (
    FILE_NAME_TRAIN_PARAMS,
    RESULT_PATH_TEST,
    SAMPLE_IMAGE_PATH_TEST,
    TrainConfig,
    TrainResult,
    TrainStatus,
    get_artifact_stem,
    get_base_model_info,
    get_dataset_info,
    get_dino_encoder,
    get_file_size,
    get_pipeline,
    get_train_config,
    init_accelerator,
    log_results,
)

from data import get_train_valid_test_datasets
from peft import PeftConfig, PeftModel
from peft.utils import CONFIG_NAME, infer_device


def get_experiment_name(path_checkpoint: str) -> str:
    """Derive the experiment name from the path of the checkpoint directory.

    The experiment name is the final component of the checkpoint directory path. The path is made absolute before the
    name is extracted, so that relative paths like ``.`` or ``..`` resolve to the name of the directory they point to
    instead of yielding ``"."`` or ``".."``.

    Args:
        path_checkpoint: Path to the checkpoint directory, absolute or relative to the current working directory.

    Returns:
        The name of the checkpoint directory.

    Raises:
        FileNotFoundError: If `path_checkpoint` does not exist or is not a directory.
    """
    if not os.path.isdir(path_checkpoint):
        raise FileNotFoundError(f"Path {path_checkpoint} does not exist or is not a directory")
    # abspath normalizes the path (which drops trailing separators) and anchors it at the current working directory
    return os.path.basename(os.path.abspath(path_checkpoint))


def evaluate_checkpoint(
    *,
    pipeline,
    train_config: TrainConfig,
    print_verbose: Callable[..., None],
) -> TrainResult:
    """Run the test set evaluation and the drift measurement for an already assembled pipeline.

    The transformer of the pipeline is moved to the inferred device and put into eval mode. Then the test split is
    scored with `evaluate` and the drift is determined with `measure_drift`. Errors raised during these two steps
    (including a keyboard interrupt) are not propagated; they are recorded in the returned result instead.

    Args:
        pipeline: The pipeline whose `transformer` attribute holds the model to evaluate.
        train_config: The training configuration; provides the dataset and DINO encoder settings.
        print_verbose: Function used for progress and error messages.

    Returns:
        A `TrainResult` whose status is `TrainStatus.SUCCESS` if both measurements finished and
        `TrainStatus.CANCELED` otherwise. On success, `metrics` contains a single entry with the test similarity, the
        drift and the evaluation time. All training-specific fields are empty or zero.
    """
    device_type = infer_device()
    # only the test split is needed here
    _, _, test_dataset = get_train_valid_test_datasets(train_config=train_config, print_fn=print_verbose)
    processor, dino_model = get_dino_encoder(train_config.dino_model_id, train_config.dino_image_size)

    # torch.cuda serves as fallback if torch has no attribute named after the inferred device type
    torch_accelerator_module = getattr(torch, device_type, torch.cuda)
    transformer = pipeline.transformer.to(device_type)
    transformer.eval()

    if hasattr(transformer, "get_nb_trainable_parameters"):
        num_trainable_params, num_params = transformer.get_nb_trainable_parameters()
    else:
        # the model does not provide the counts itself, so tally them in a single pass over the parameters
        num_params = 0
        num_trainable_params = 0
        for param in transformer.parameters():
            param_size = param.numel()
            num_params += param_size
            if param.requires_grad:
                num_trainable_params += param_size
    print_verbose(
        f"trainable params: {num_trainable_params:,d} || all params: {num_params:,d} || "
        f"trainable: {100 * num_trainable_params / num_params:.4f}%"
    )

    metrics = []
    error_msg = ""
    tic_eval_total = time.perf_counter()

    torch_accelerator_module.empty_cache()
    try:
        print_verbose("Evaluation on test set follows.")
        test_similarity = evaluate(
            pipeline=pipeline,
            ds_eval=test_dataset,
            processor=processor,
            dino_model=dino_model,
            config=train_config,
            num_repeats=3,
        )
        print_verbose("Calculating drift.")
        test_drift = measure_drift(pipeline=pipeline, processor=processor, dino_model=dino_model, config=train_config)
        # the reported eval time covers both the similarity and the drift measurement
        eval_metrics = {
            "test dino_similarity": test_similarity,
            "drift": test_drift,
            "eval time": time.perf_counter() - tic_eval_total,
        }
        metrics.append(eval_metrics)
        print_verbose(f"Test DINOv2 similarity: {test_similarity:.4f}")
        print_verbose(f"Test drift:             {test_drift:.4f}")
    except KeyboardInterrupt:
        print_verbose("canceled evaluation")
        status = TrainStatus.CANCELED
        error_msg = "manually canceled"
    except torch.OutOfMemoryError as exc:
        print_verbose("out of memory error encountered")
        status = TrainStatus.CANCELED
        error_msg = str(exc)
    except Exception as exc:
        print_verbose(f"encountered an error: {exc}")
        status = TrainStatus.CANCELED
        error_msg = str(exc)
    else:
        # nothing was raised, so both measurements completed
        status = TrainStatus.SUCCESS

    # no training is performed, hence all train-related attributes are filled with empty/zero values
    return TrainResult(
        status=status,
        train_time=0.0,
        accelerator_memory_reserved_log=[],
        accelerator_memory_max_train=0,
        losses=[],
        metrics=metrics,
        error_msg=error_msg,
        num_trainable_params=num_trainable_params,
        num_total_params=num_params,
    )


def main(*, path_checkpoint: str, experiment_name: str) -> None:
    """Evaluate the PEFT checkpoint stored in `path_checkpoint` and log the outcome as a temporary result.

    The base model pipeline is created without adapter, the trained adapter is loaded from the checkpoint onto its
    transformer, and the model is compiled if the training configuration asks for it. After the evaluation, the
    results are logged to `RESULT_PATH_TEST`. If the evaluation succeeded and sample image prompts are configured,
    sample images are generated into `SAMPLE_IMAGE_PATH_TEST`; a failure of that last step is only reported, not
    raised.

    Messages are emitted through the module-level `print_verbose` function, which is defined when this file is run
    as a script.

    Args:
        path_checkpoint: Directory containing the PEFT config and adapter weights, optionally also the training
            parameters file.
        experiment_name: Name under which the results and sample images are stored.

    Raises:
        FileNotFoundError: If the checkpoint directory contains no PEFT config file.
    """
    tic_total = time.perf_counter()
    utc_now = dt.datetime.now(tz=dt.timezone.utc)
    start_date = utc_now.replace(microsecond=0).isoformat()

    print_verbose("===== The results of this evaluation run are stored as temporary results ======")

    path_peft_config = os.path.join(path_checkpoint, CONFIG_NAME)
    if not os.path.exists(path_peft_config):
        raise FileNotFoundError(
            f"Could not find a PEFT config at {path_checkpoint}. Note that evaluating full fine-tuning checkpoints is "
            "not supported."
        )
    peft_config = PeftConfig.from_pretrained(path_checkpoint)

    path_train_config = os.path.join(path_checkpoint, FILE_NAME_TRAIN_PARAMS)
    has_train_config = os.path.exists(path_train_config)
    if not has_train_config:
        # only a notice; the path is handed to get_train_config regardless
        print_verbose(
            f"Could not find {FILE_NAME_TRAIN_PARAMS} in {path_checkpoint}, using the default training parameters"
        )
    train_config = get_train_config(path_train_config)
    init_accelerator()
    set_seed(train_config.seed)

    model_info = get_base_model_info(train_config.model_id)
    dataset_info = get_dataset_info(train_config.dataset_id)

    # step 1: pipeline with the plain base model, neither adapter nor compilation
    pipeline = get_pipeline(
        model_id=train_config.model_id,
        dtype=train_config.dtype,
        compile=False,
        peft_config=None,
        autocast_adapter_dtype=train_config.autocast_adapter_dtype,
        use_gc=train_config.use_gc,
    )
    # step 2: load the trained adapter on top of the base transformer
    pipeline.transformer = PeftModel.from_pretrained(
        pipeline.transformer,
        path_checkpoint,
        is_trainable=True,  # to report the same number of trainable parameters as during training
        autocast_adapter_dtype=train_config.autocast_adapter_dtype,
    )
    # step 3: compilation, if enabled, must come last, mirroring the order in get_pipeline
    if train_config.compile:
        pipeline.transformer = torch.compile(pipeline.transformer, dynamic=True)
    print_verbose(pipeline.transformer)

    eval_result = evaluate_checkpoint(
        pipeline=pipeline,
        train_config=train_config,
        print_verbose=print_verbose,
    )

    file_size = get_file_size(pipeline.transformer, peft_config=peft_config, clean=True, print_fn=print_verbose)
    time_total = time.perf_counter() - tic_total

    # results of evaluation-only runs are always treated as temporary results, hence the fixed save_dir
    log_results(
        experiment_name=experiment_name,
        train_result=eval_result,
        time_total=time_total,
        file_size=file_size,
        model_info=model_info,
        dataset_info=dataset_info,
        start_date=start_date,
        train_config=train_config,
        peft_config=peft_config,
        print_fn=print_verbose,
        save_dir=RESULT_PATH_TEST,
    )

    # sample images are only generated after a successful evaluation and if prompts are configured
    wants_sample_images = (eval_result.status == TrainStatus.SUCCESS) and train_config.sample_image_prompts
    if not wants_sample_images:
        return

    print_verbose("Generating sample images")
    try:
        file_stem = get_artifact_stem(experiment_name, start_date, SAMPLE_IMAGE_PATH_TEST)
        generate_sample_images(
            pipeline=pipeline,
            train_config=train_config,
            sample_image_dir=SAMPLE_IMAGE_PATH_TEST,
            file_stem=file_stem,
        )
        print_verbose(f"Stored sample images in {SAMPLE_IMAGE_PATH_TEST}")
    except Exception as exc:
        # best effort: the evaluation results are already logged at this point
        print_verbose(f"Sample image generation failed: {exc}")

if __name__ == "__main__":
    # command line interface: the checkpoint directory plus an optional verbosity flag
    parser = argparse.ArgumentParser()
    parser.add_argument("-v", "--verbose", action="store_true", help="Enable verbose output")
    parser.add_argument(
        "path_checkpoint", type=str, help="Path to the directory containing the trained PEFT checkpoint"
    )
    args = parser.parse_args()

    # raises FileNotFoundError if the checkpoint directory does not exist, before main is entered
    experiment_name = get_experiment_name(args.path_checkpoint)
    verbose = args.verbose

    def print_verbose(*values, **kwargs) -> None:
        # module-level helper used by main; it stays silent unless -v/--verbose was passed
        if not verbose:
            return
        # messages always go to stderr, overriding a "file" argument passed by the caller
        print(*values, **{**kwargs, "file": sys.stderr})

    main(path_checkpoint=args.path_checkpoint, experiment_name=experiment_name)