# SPDX-License-Identifier: AGPL-3.0-or-later
# Copyright (C) 2025-2026  Philipp Emanuel Weidmann <pew@worldwidemann.com> + contributors

import logging
import math
import os
import sys
import time
import json
import warnings
from dataclasses import asdict
from importlib.metadata import PackageNotFoundError, version
from os.path import commonprefix
from pathlib import Path
from typing import Any, cast

import huggingface_hub
import numpy as np
import optuna
import questionary
import torch
import torch.nn.functional as F
import transformers
from accelerate.utils import (
    is_mlu_available,
    is_musa_available,
    is_npu_available,
    is_sdaa_available,
    is_xpu_available,
)
from huggingface_hub import ModelCard, ModelCardData
from optuna import Trial, TrialPruned
from optuna.exceptions import ExperimentalWarning
from optuna.samplers import TPESampler
from optuna.storages import JournalStorage
from optuna.storages.journal import JournalFileBackend, JournalFileOpenLock
from optuna.study import StudyDirection
from optuna.trial import TrialState
from pydantic import ValidationError
from questionary import Choice, Style
from rich.table import Table
from rich.traceback import install

from .analyzer import Analyzer
from .config import DirectionMethod, QuantizationMethod, Settings
from .direction import (
    blend_directions,
    compute_benign_subspace_basis,
    compute_direction_candidates,
    orthogonalize_directions,
    project_directions_out_of_subspace,
)
from .evaluator import Evaluator
from .model import AbliterationParameters, Model, get_model_class
from .utils import (
    empty_cache,
    format_duration,
    get_readme_intro,
    get_trial_parameters,
    load_prompts,
    print,
    print_memory_usage,
    prompt_password,
    prompt_path,
    prompt_select,
    prompt_text,
    set_random_seed,
)


def obtain_merge_strategy(settings: Settings) -> str | None:
    """
    Prompts the user for how to proceed with saving the model.

    When the model was loaded with BNB_4BIT quantization, a warning about the
    RAM needed for merging is printed (together with a memory estimate when one
    can be obtained) and the user is asked whether to merge or cancel.
    For every other quantization setting, no prompt is shown.

    Returns "merge", or None when the user selects "Cancel". Any other value
    returned by prompt_select (e.g. None) is passed through unchanged.
    """

    if settings.quantization != QuantizationMethod.BNB_4BIT:
        # Nothing has to be dequantized, so merging needs no confirmation.
        return "merge"

    print()
    print(
        "Model was loaded with quantization. Merging requires reloading the base model."
    )
    print(
        "[yellow]WARNING: CPU merging requires dequantizing the entire model to system RAM.[/]"
    )
    print("[yellow]This can lead to system freezes if you run out of memory.[/]")

    try:
        # Estimate memory requirements by loading the model structure on the "meta" device.
        # This doesn't consume actual RAM but allows us to inspect the parameter count/dtype.
        #
        # Suppress warnings during meta device loading (e.g., "Some weights were not initialized").
        # These are expected and harmless since we're only inspecting model structure, not running inference.
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            meta_model = get_model_class(settings.model).from_pretrained(
                settings.model,
                device_map="meta",
                torch_dtype=torch.bfloat16,
                trust_remote_code=settings.trust_remote_code,
            )
            footprint_bytes = meta_model.get_memory_footprint()
            footprint_gb = footprint_bytes / (1024**3)
            print(
                f"[yellow]Estimated RAM required (excluding overhead): [bold]~{footprint_gb:.2f} GB[/][/]"
            )
    except Exception:
        # Deliberate best-effort: the estimate is purely informational, so any
        # failure of meta loading (e.g. owing to custom model code or bitsandbytes
        # quantization config issues on the meta device) falls back to a rule of thumb.
        print(
            "[yellow]Rule of thumb: You need approximately 3x the parameter count in GB RAM.[/]"
        )
        print(
            "[yellow]Example: A 27B model requires ~80GB RAM. A 70B model requires ~200GB RAM.[/]"
        )
    print()

    strategy = prompt_select(
        "How do you want to proceed?",
        choices=[
            Choice(
                # This code path is only reached for a quantized model,
                # so the RAM hint is always part of the title.
                title="Merge LoRA into full model (requires sufficient RAM)",
                value="merge",
            ),
            Choice(
                title="Cancel",
                value="cancel",
            ),
        ],
    )

    if strategy == "cancel":
        return None

    return strategy


def run():
    # Enable expandable segments to reduce memory fragmentation on multi-GPU setups.
    if (
        "PYTORCH_ALLOC_CONF" not in os.environ
        and "PYTORCH_CUDA_ALLOC_CONF" not in os.environ
    ):
        os.environ["PYTORCH_ALLOC_CONF"] = "expandable_segments:True"

    # Modified "Pagga" font from https://budavariam.github.io/asciiart-text/
    try:
        app_version = version("iconoclast-llm")
    except PackageNotFoundError:
        app_version = "dev"

    print(f"[cyan]ICONOCLAST[/]  v{app_version}")
    print("[grey50]Discriminative representation editing for open-weight models[/]")
    print()

    if (
        # There is at least one argument (argv[0] is the program name).
        len(sys.argv) > 1
        # No model has been explicitly provided.
        and "--model" not in sys.argv
        # The last argument is a parameter value rather than a flag (such as "--help").
        and not sys.argv[-1].startswith("-")
    ):
        # Assume the last argument is the model.
        sys.argv.insert(-1, "--model")

    try:
        # The required argument "model" must be provided by the user,
        # either on the command line or in the configuration file.
        settings = Settings()  # ty:ignore[missing-argument]
    except ValidationError as error:
        print(f"[red]Configuration contains [bold]{error.error_count()}[/] errors:[/]")

        for error in error.errors():
            print(f"[bold]{error['loc'][0]}[/]: [yellow]{error['msg']}[/]")

        print()
        print(
            "Run [bold]iconoclast --help[/] or see [bold]config.default.toml[/] for details about configuration parameters."
        )
        return

    print(f"Using random seed [bold]{settings.seed}[/]")
    set_random_seed(settings.seed)

    # Adapted from https://github.com/huggingface/accelerate/blob/main/src/accelerate/commands/env.py
    if torch.cuda.is_available():
        count = torch.cuda.device_count()
        total_vram = sum(torch.cuda.mem_get_info(i)[1] for i in range(count))
        print(
            f"Detected [bold]{count}[/] CUDA device(s) ({total_vram / (1024**3):.2f} GB total VRAM):"
        )
        for i in range(count):
            vram = torch.cuda.mem_get_info(i)[1] / (1024**3)
            print(
                f"* GPU {i}: [bold]{torch.cuda.get_device_name(i)}[/] ({vram:.2f} GB)"
            )
    elif is_xpu_available():
        count = torch.xpu.device_count()
        print(f"Detected [bold]{count}[/] XPU device(s):")
        for i in range(count):
            print(f"* XPU {i}: [bold]{torch.xpu.get_device_name(i)}[/]")
    elif is_mlu_available():
        count = torch.mlu.device_count()  # ty:ignore[unresolved-attribute]
        print(f"Detected [bold]{count}[/] MLU device(s):")
        for i in range(count):
            print(f"* MLU {i}: [bold]{torch.mlu.get_device_name(i)}[/]")  # ty:ignore[unresolved-attribute]
    elif is_sdaa_available():
        count = torch.sdaa.device_count()  # ty:ignore[unresolved-attribute]
        print(f"Detected [bold]{count}[/] SDAA device(s):")
        for i in range(count):
            print(f"* SDAA {i}: [bold]{torch.sdaa.get_device_name(i)}[/]")  # ty:ignore[unresolved-attribute]
    elif is_musa_available():
        count = torch.musa.device_count()  # ty:ignore[unresolved-attribute]
        print(f"Detected [bold]{count}[/] MUSA device(s):")
        for i in range(count):
            print(f"* MUSA {i}: [bold]{torch.musa.get_device_name(i)}[/]")  # ty:ignore[unresolved-attribute]
    elif is_npu_available():
        print(f"NPU detected (CANN version: [bold]{torch.version.cann}[/])")  # ty:ignore[unresolved-attribute]
    elif torch.backends.mps.is_available():
        print("Detected [bold]1[/] MPS device (Apple Metal)")
    else:
        print(
            "[bold yellow]No GPU or other accelerator detected. Operations will be slow.[/]"
        )

    # We don't need gradients as we only do inference.
    torch.set_grad_enabled(False)

    # While determining the optimal batch size, we will try many different batch sizes,
    # resulting in many computation graphs being compiled. Raising the limit (default = 8)
    # avoids errors from TorchDynamo assuming that something is wrong because we
    # recompile too often.
    torch._dynamo.config.cache_size_limit = 64

    # Silence warning spam from Transformers.
    # In my entire career I've never seen a useful warning from that library.
    transformers.logging.set_verbosity_error()

    # Another library that generates warning spam.
    logging.getLogger("lm_eval").setLevel(logging.ERROR)

    # We do our own trial logging, so we don't need the INFO messages
    # about parameters and results.
    optuna.logging.set_verbosity(optuna.logging.WARNING)

    # Silence the warning about multivariate TPE being experimental.
    warnings.filterwarnings("ignore", category=ExperimentalWarning)

    os.makedirs(settings.study_checkpoint_dir, exist_ok=True)

    study_checkpoint_file = os.path.join(
        settings.study_checkpoint_dir,
        "".join(
            [(c if (c.isalnum() or c in ["_", "-"]) else "--") for c in settings.model]
        )
        + ".jsonl",
    )

    lock_obj = JournalFileOpenLock(study_checkpoint_file)
    backend = JournalFileBackend(study_checkpoint_file, lock_obj=lock_obj)
    storage = JournalStorage(backend)

    try:
        existing_study = storage.get_all_studies()[0]
    except IndexError:
        existing_study = None

    if existing_study is not None and settings.evaluate_model is None:
        if settings.exit_after_optimization:
            print()
            print(
                "[yellow]Existing study detected.[/] Reusing stored settings in batch mode."
            )
            settings = Settings.model_validate_json(existing_study.user_attrs["settings"])
        else:
            choices = []

            if existing_study.user_attrs["finished"]:
                print()
                print(
                    (
                        "[green]You have already processed this model.[/] "
                        "You can show the results from the previous run, allowing you to export models or to run additional trials. "
                        "Alternatively, you can ignore the previous run and start from scratch. "
                        "This will delete the checkpoint file and all results from the previous run."
                    )
                )
                choices.append(
                    Choice(
                        title="Show the results from the previous run",
                        value="continue",
                    )
                )
            else:
                print()
                print(
                    (
                        "[yellow]You have already processed this model, but the run was interrupted.[/] "
                        "You can continue the previous run from where it stopped. This will override any specified settings. "
                        "Alternatively, you can ignore the previous run and start from scratch. "
                        "This will delete the checkpoint file and all results from the previous run."
                    )
                )
                choices.append(
                    Choice(
                        title="Continue the previous run",
                        value="continue",
                    )
                )

            choices.append(
                Choice(
                    title="Ignore the previous run and start from scratch",
                    value="restart",
                )
            )

            choices.append(
                Choice(
                    title="Exit program",
                    value="",
                )
            )

            print()
            choice = prompt_select("How would you like to proceed?", choices)

            if choice == "continue":
                settings = Settings.model_validate_json(
                    existing_study.user_attrs["settings"]
                )
            elif choice == "restart":
                os.unlink(study_checkpoint_file)
                backend = JournalFileBackend(study_checkpoint_file, lock_obj=lock_obj)
                storage = JournalStorage(backend)
            elif choice is None or choice == "":
                return

    model = Model(settings)
    print()
    print_memory_usage()

    print()
    print(f"Loading good prompts from [bold]{settings.good_prompts.dataset}[/]...")
    good_prompts = load_prompts(settings, settings.good_prompts)
    print(f"* [bold]{len(good_prompts)}[/] prompts loaded")

    print()
    print(f"Loading bad prompts from [bold]{settings.bad_prompts.dataset}[/]...")
    bad_prompts = load_prompts(settings, settings.bad_prompts)
    print(f"* [bold]{len(bad_prompts)}[/] prompts loaded")

    if settings.batch_size == 0:
        print()
        print("Determining optimal batch size...")

        batch_size = 1
        best_batch_size = -1
        best_performance = -1

        while batch_size <= settings.max_batch_size:
            print(f"* Trying batch size [bold]{batch_size}[/]... ", end="")

            prompts = good_prompts * math.ceil(batch_size / len(good_prompts))
            prompts = prompts[:batch_size]

            try:
                # Warmup run to build the computation graph so that part isn't benchmarked.
                model.get_responses(prompts)

                start_time = time.perf_counter()
                responses = model.get_responses(prompts)
                end_time = time.perf_counter()
            except Exception as error:
                if batch_size == 1:
                    # Even a batch size of 1 already fails.
                    # We cannot recover from this.
                    raise

                print(f"[red]Failed[/] ({error})")
                break

            response_lengths = [
                len(model.tokenizer.encode(response)) for response in responses
            ]
            performance = sum(response_lengths) / (end_time - start_time)

            print(f"[green]Ok[/] ([bold]{performance:.0f}[/] tokens/s)")

            if performance > best_performance:
                best_batch_size = batch_size
                best_performance = performance

            batch_size *= 2

        settings.batch_size = best_batch_size
        print(f"* Chosen batch size: [bold]{settings.batch_size}[/]")

    print()
    print("Checking for common response prefix...")
    prefix_check_prompts = good_prompts[:100] + bad_prompts[:100]
    responses = model.get_responses_batched(prefix_check_prompts)

    # Despite being located in os.path, commonprefix actually performs
    # a naive string operation without any path-specific logic,
    # which is exactly what we need here. Trailing spaces are removed
    # to avoid issues where multiple different tokens that all start
    # with a space character lead to the common prefix ending with
    # a space, which would result in an uncommon tokenization.
    model.response_prefix = commonprefix(responses).rstrip(" ")

    # Suppress CoT output.
    recheck_prefix = False
    if model.response_prefix:
        # When using any of the predefined prefixes below, we need to check that
        # the prefix is actually complete (e.g. not missing a trailing newline).
        recheck_prefix = True
        if model.response_prefix.startswith("<think>"):
            # Most thinking models.
            model.response_prefix = "<think></think>"
        elif model.response_prefix.startswith("<|channel|>analysis<|message|>"):
            # gpt-oss.
            model.response_prefix = "<|channel|>analysis<|message|><|end|><|start|>assistant<|channel|>final<|message|>"
        elif model.response_prefix.startswith("<thought>"):
            # Unknown, suggested by user.
            model.response_prefix = "<thought></thought>"
        elif model.response_prefix.startswith("[THINK]"):
            # Unknown, suggested by user.
            model.response_prefix = "[THINK][/THINK]"
        else:
            recheck_prefix = False

    if model.response_prefix:
        print(f"* Prefix found: [bold]{model.response_prefix!r}[/]")
    else:
        print("* None found")

    if recheck_prefix:
        print("* Rechecking with prefix...")
        responses = model.get_responses_batched(prefix_check_prompts)
        additional_prefix = commonprefix(responses).rstrip(" ")
        if additional_prefix:
            model.response_prefix += additional_prefix
            print(f"* Extended prefix found: [bold]{model.response_prefix!r}[/]")

    evaluator = Evaluator(settings, model)

    if settings.evaluate_model is not None:
        print()
        print(f"Loading model [bold]{settings.evaluate_model}[/]...")
        settings.model = settings.evaluate_model
        model.reset_model()
        print("* Evaluating...")
        evaluator.get_score()
        return

    print()
    print("Calculating per-layer refusal directions...")
    print("* Obtaining residuals for good prompts...")
    good_residuals = model.get_residuals_batched(good_prompts)
    print("* Obtaining residuals for bad prompts...")
    bad_residuals = model.get_residuals_batched(bad_prompts)

    good_means = good_residuals.mean(dim=0)
    direction_candidates = compute_direction_candidates(
        good_residuals,
        bad_residuals,
        settings.direction_variance_floor,
    )

    if settings.benign_subspace_rank > 0:
        benign_subspace_basis = compute_benign_subspace_basis(
            good_residuals,
            settings.benign_subspace_rank,
        )
        if benign_subspace_basis is not None:
            direction_candidates = {
                method: project_directions_out_of_subspace(
                    candidate,
                    benign_subspace_basis,
                )
                for method, candidate in direction_candidates.items()
            }

    if settings.orthogonalize_direction:
        # Implements https://huggingface.co/blog/grimjim/projected-abliteration
        # for every candidate direction set rather than only the mean-difference one.
        direction_candidates = {
            method: orthogonalize_directions(candidate, good_means)
            for method, candidate in direction_candidates.items()
        }

    analyzer = Analyzer(settings, model, good_residuals, bad_residuals)

    if settings.print_residual_geometry:
        analyzer.print_residual_geometry()

    if settings.plot_residuals:
        analyzer.plot_residuals()

    # We don't need the residuals after computing refusal directions.
    del good_residuals, bad_residuals, analyzer
    empty_cache()

    components = model.get_abliterable_components()
    last_layer_index = len(model.get_layers()) - 1
    trial_index = 0
    start_index = 0
    start_time = time.perf_counter()

    def build_direction_tensor(
        direction_method: DirectionMethod,
        direction_blend: float,
    ) -> torch.Tensor:
        """
        Selects the direction tensor that belongs to the given direction method.

        Every method other than HYBRID is looked up directly in the precomputed
        direction candidates. For HYBRID, the MEAN and VARIANCE candidates are
        passed to blend_directions together with direction_blend.
        """
        if direction_method != DirectionMethod.HYBRID:
            return direction_candidates[direction_method]

        mean_directions = direction_candidates[DirectionMethod.MEAN]
        variance_directions = direction_candidates[DirectionMethod.VARIANCE]
        return blend_directions(
            mean_directions,
            variance_directions,
            direction_blend,
        )

    def get_trial_direction_indices(
        trial: Trial,
    ) -> float | None | dict[str, float | None]:
        """
        Recovers the direction index information recorded for a trial.

        If the trial stored a "component_direction_scopes" dict, a dict with one
        entry per abliterable component is returned, where components whose scope
        is "per layer" map to None. Otherwise a single value is returned: None for
        the "per layer" scope, or else the stored "direction_index". In the latter
        case, trial parameters take precedence over user attributes.
        """
        attrs = trial.user_attrs
        scopes_by_component = attrs.get("component_direction_scopes")

        if not isinstance(scopes_by_component, dict):
            # Single direction shared by all components.
            fallback_scope = attrs.get("direction_scope", "global")
            scope = trial.params.get("direction_scope", fallback_scope)
            if scope == "per layer":
                return None
            fallback_index = attrs.get("direction_index")
            return trial.params.get("direction_index", fallback_index)

        # Component-specific directions.
        indices_by_component = attrs.get("component_direction_indices", {})
        resolved = {}
        for component in components:
            if scopes_by_component.get(component) == "per layer":
                resolved[component] = None
            else:
                resolved[component] = indices_by_component.get(component)
        return resolved

    def get_trial_refusal_directions(
        trial: Trial,
    ) -> torch.Tensor | dict[str, torch.Tensor]:
        """
        Rebuilds the refusal direction tensor(s) that a trial was run with.

        If the trial stored a "component_direction_methods" dict, one tensor per
        abliterable component is built and returned as a dict (a missing blend
        defaults to 0.0). Otherwise a single tensor is built from the trial's
        "direction_method" (default: MEAN) and "direction_blend" (default: 0.0),
        with trial parameters taking precedence over user attributes.
        """
        attrs = trial.user_attrs
        methods_by_component = attrs.get("component_direction_methods")

        if isinstance(methods_by_component, dict):
            blends_by_component = attrs.get("component_direction_blends", {})
            directions = {}
            for component in components:
                method = DirectionMethod(methods_by_component[component])
                blend = float(blends_by_component.get(component, 0.0))
                directions[component] = build_direction_tensor(method, blend)
            return directions

        params = trial.params
        method_value = params.get(
            "direction_method",
            attrs.get("direction_method", DirectionMethod.MEAN.value),
        )
        blend_value = params.get(
            "direction_blend",
            attrs.get("direction_blend", 0.0),
        )
        return build_direction_tensor(
            DirectionMethod(method_value),
            float(blend_value),
        )

    def recompute_objective_score(
        behavior_score: float,
        kl_divergence: float,
    ) -> tuple[float, float]:
        """
        Builds the two-element objective tuple (kld_score, behavior_score).

        At or above the configured KL divergence target, the first element is the
        KL divergence divided by the configured scale. Below the target, it is
        the behavior score multiplied by target / scale instead.
        """
        target = settings.kl_divergence_target
        scale = settings.kl_divergence_scale

        kld_score = (
            kl_divergence / scale
            if kl_divergence >= target
            else behavior_score * target / scale
        )
        return kld_score, behavior_score

    def should_run_merged_validation(refusals: int) -> bool:
        """
        Decides whether the current trial gets an additional merged-model validation.

        Validation is disabled entirely for a non-positive interval. Otherwise it
        always runs when the trial index equals the configured number of trials,
        and for other trials only when the refusal count does not exceed the
        evaluator's base refusals and the trial index is a multiple of the interval.
        """
        interval = settings.merged_validation_interval
        if interval <= 0:
            return False

        if trial_index == settings.n_trials:
            return True

        return (
            not refusals > evaluator.base_refusals
            and trial_index % interval == 0
        )

    def compute_merge_penalty(
        adapter_result: Any,
        merged_result: Any,
    ) -> float:
        """
        Computes the penalty for a merged model behaving worse than the adapter model.

        Increases in refusals, overrefusals and harmful marker hits beyond their
        configured tolerances are each divided by the validation subset size
        (at least 1). A drop in the harmful compliance score beyond its tolerance
        is added unnormalized. The sum is multiplied by the configured
        consistency penalty.
        """
        subset_size = max(settings.merged_validation_subset_size, 1)

        def normalized_excess(merged_count, adapter_count, tolerance):
            # Only increases beyond the tolerance are penalized.
            return max(0, merged_count - adapter_count - tolerance) / subset_size

        refusal_gap = normalized_excess(
            merged_result.refusals,
            adapter_result.refusals,
            settings.merged_validation_refusal_tolerance,
        )
        overrefusal_gap = normalized_excess(
            merged_result.overrefusals,
            adapter_result.overrefusals,
            settings.merged_validation_overrefusal_tolerance,
        )
        disclaimer_gap = normalized_excess(
            merged_result.harmful_marker_hits,
            adapter_result.harmful_marker_hits,
            settings.merged_validation_disclaimer_tolerance,
        )

        # For compliance, a *decrease* from adapter to merged is what gets penalized.
        compliance_drop = (
            adapter_result.harmful_compliance_score
            - merged_result.harmful_compliance_score
            - settings.merged_validation_compliance_tolerance
        )
        compliance_gap = max(0.0, compliance_drop)

        total_gap = refusal_gap + overrefusal_gap + disclaimer_gap + compliance_gap
        return settings.merged_consistency_penalty * total_gap

    def objective(trial: Trial) -> tuple[float, float]:
        """
        Runs a single optimization trial and returns its objective value.

        The trial samples the direction settings (once globally, or once per
        component if settings.component_specific_directions is set) and the
        abliteration parameters for every component. The model is then reset,
        abliterated with the sampled values, and evaluated. If
        should_run_merged_validation approves, a merged-model subset validation
        is run as well and the resulting merge penalty is folded into the score.

        All sampled values and evaluation metrics are stored as user attributes
        on the trial.

        Returns evaluation_result.score, or the result of
        recompute_objective_score when merged validation was performed.
        NOTE(review): evaluation_result.score is presumably a
        (kld_score, behavior_score) tuple like the recomputed score — confirm
        against Evaluator.get_score.
        """
        nonlocal trial_index
        trial_index += 1
        trial.set_user_attr("index", trial_index)
        # Per-component bookkeeping; only populated (and stored on the trial)
        # when component-specific directions are enabled.
        component_direction_scopes = {}
        component_direction_indices = {}
        component_direction_methods = {}
        component_direction_blends = {}

        if settings.component_specific_directions:
            # Every component gets its own scope, index, method and blend.
            refusal_directions = {}
            direction_index: float | None | dict[str, float | None] = {}
            for component in components:
                direction_scope = trial.suggest_categorical(
                    f"{component}.direction_scope",
                    ["global", "per layer"],
                )
                # The index is sampled unconditionally, even though it is discarded
                # below for the "per layer" scope.
                # NOTE(review): presumably this keeps the search space fixed for
                # multivariate TPE — confirm.
                sampled_direction_index = trial.suggest_float(
                    f"{component}.direction_index",
                    0.4 * last_layer_index,
                    0.9 * last_layer_index,
                )
                if direction_scope == "per layer":
                    component_direction_indices[component] = None
                else:
                    component_direction_indices[component] = sampled_direction_index
                direction_method = DirectionMethod(
                    trial.suggest_categorical(
                        f"{component}.direction_method",
                        [method.value for method in DirectionMethod],
                    )
                )
                direction_blend = trial.suggest_float(
                    f"{component}.direction_blend",
                    0.0,
                    1.0,
                )
                refusal_directions[component] = build_direction_tensor(
                    direction_method,
                    direction_blend,
                )
                # The cast only narrows the declared union type for the type checker;
                # direction_index is always a dict in this branch.
                cast(dict[str, float | None], direction_index)[component] = (
                    component_direction_indices[component]
                )
                component_direction_scopes[component] = direction_scope
                component_direction_methods[component] = direction_method.value
                component_direction_blends[component] = direction_blend
            # The trial-level attributes hold placeholder values in this mode;
            # the actual values are stored in the "component_*" attributes below.
            trial.set_user_attr("direction_scope", "mixed")
            trial.set_user_attr("direction_index", None)
            trial.set_user_attr("direction_method", "mixed")
            trial.set_user_attr("direction_blend", 0.0)
            trial.set_user_attr(
                "component_direction_scopes",
                component_direction_scopes,
            )
            trial.set_user_attr(
                "component_direction_indices",
                component_direction_indices,
            )
            trial.set_user_attr(
                "component_direction_methods",
                component_direction_methods,
            )
            trial.set_user_attr(
                "component_direction_blends",
                component_direction_blends,
            )
        else:
            # A single scope, index, method and blend shared by all components.
            direction_scope = trial.suggest_categorical(
                "direction_scope",
                ["global", "per layer"],
            )
            # As above, the index is always sampled and dropped afterwards
            # if the scope turns out to be "per layer".
            direction_index = trial.suggest_float(
                "direction_index",
                0.4 * last_layer_index,
                0.9 * last_layer_index,
            )
            if direction_scope == "per layer":
                direction_index = None
            direction_method = DirectionMethod(
                trial.suggest_categorical(
                    "direction_method",
                    [method.value for method in DirectionMethod],
                )
            )
            direction_blend = trial.suggest_float(
                "direction_blend",
                0.0,
                1.0,
            )
            refusal_directions = build_direction_tensor(direction_method, direction_blend)
            trial.set_user_attr("direction_scope", direction_scope)
            trial.set_user_attr("direction_index", direction_index)
            trial.set_user_attr("direction_method", direction_method.value)
            trial.set_user_attr("direction_blend", direction_blend)

        # Abliteration parameters, keyed by component.
        parameters = {}

        for component in components:
            # The parameter ranges are based on experiments with various models
            # and much wider ranges. They are not set in stone and might have to be
            # adjusted for future models.
            max_weight = trial.suggest_float(
                f"{component}.max_weight",
                0.5,
                2.0,
            )
            max_weight_position = trial.suggest_float(
                f"{component}.max_weight_position",
                0.4 * last_layer_index,
                1.0 * last_layer_index,
            )
            # For sampling purposes, min_weight is expressed as a fraction of max_weight,
            # again because multivariate TPE doesn't support variable-range parameters.
            # The value is transformed into the actual min_weight value below.
            min_weight = trial.suggest_float(
                f"{component}.min_weight",
                0.0,
                1.0,
            )
            min_weight_distance = trial.suggest_float(
                f"{component}.min_weight_distance",
                1.0,
                0.6 * last_layer_index,
            )

            parameters[component] = AbliterationParameters(
                max_weight=max_weight,
                max_weight_position=max_weight_position,
                min_weight=(min_weight * max_weight),
                min_weight_distance=min_weight_distance,
            )

        # Store the transformed parameters (actual min_weight rather than the
        # sampled fraction) as plain dicts on the trial.
        trial.set_user_attr("parameters", {k: asdict(v) for k, v in parameters.items()})

        print()
        print(
            f"Running trial [bold]{trial_index}[/] of [bold]{settings.n_trials}[/]..."
        )
        print("* Parameters:")
        for name, value in get_trial_parameters(trial).items():
            print(f"  * {name} = [bold]{value}[/]")
        print("* Resetting model...")
        model.reset_model()
        print("* Abliterating...")
        model.abliterate(refusal_directions, direction_index, parameters)
        print("* Evaluating...")
        evaluation_result = evaluator.get_score()
        score = evaluation_result.score
        kl_divergence = evaluation_result.kl_divergence
        refusals = evaluation_result.refusals
        overrefusals = evaluation_result.overrefusals
        harmful_marker_hits = evaluation_result.harmful_marker_hits
        harmful_compliance_score = evaluation_result.harmful_compliance_score
        merge_penalty = 0.0

        if should_run_merged_validation(refusals):
            print("* Validating merged-model subset...")
            # NOTE(review): evaluate_merged presumably invokes the callback while
            # the model is in its merged state — confirm in Model.evaluate_merged.
            merged_result = model.evaluate_merged(
                lambda: evaluator.get_subset_score(settings.merged_validation_subset_size)
            )
            merge_penalty = compute_merge_penalty(evaluation_result, merged_result)
            trial.set_user_attr("merged_validated", True)
            trial.set_user_attr("merged_refusals", merged_result.refusals)
            trial.set_user_attr("merged_overrefusals", merged_result.overrefusals)
            trial.set_user_attr(
                "merged_harmful_marker_hits",
                merged_result.harmful_marker_hits,
            )
            trial.set_user_attr(
                "merged_harmful_compliance_score",
                merged_result.harmful_compliance_score,
            )
            trial.set_user_attr("merge_penalty", merge_penalty)
            # The penalty is added to the behavior score, and the objective tuple
            # is rebuilt from the penalized value.
            score = recompute_objective_score(
                evaluation_result.behavior_score + merge_penalty,
                kl_divergence,
            )
        else:
            trial.set_user_attr("merged_validated", False)
            trial.set_user_attr("merge_penalty", 0.0)

        # Estimate the remaining time from the average duration of the trials
        # run since start_index, multiplied by the number of trials left.
        elapsed_time = time.perf_counter() - start_time
        remaining_time = (elapsed_time / (trial_index - start_index)) * (
            settings.n_trials - trial_index
        )
        print()
        print(f"[grey50]Elapsed time: [bold]{format_duration(elapsed_time)}[/][/]")
        if trial_index < settings.n_trials:
            print(
                f"[grey50]Estimated remaining time: [bold]{format_duration(remaining_time)}[/][/]"
            )
        print_memory_usage()

        # These metrics always come from the regular (non-merged) evaluation.
        trial.set_user_attr("kl_divergence", kl_divergence)
        trial.set_user_attr("refusals", refusals)
        trial.set_user_attr("overrefusals", overrefusals)
        trial.set_user_attr("harmful_marker_hits", harmful_marker_hits)
        trial.set_user_attr("harmful_compliance_score", harmful_compliance_score)
        trial.set_user_attr("objective_regime", evaluation_result.objective_regime)
        trial.set_user_attr("harmful_axis_metrics", evaluation_result.harmful_axis_metrics)

        return score

    def objective_wrapper(trial: Trial) -> tuple[float, float]:
        """
        Runs objective() for the given trial and turns Ctrl+C into a graceful stop.

        Returns whatever objective() returns. If a KeyboardInterrupt is raised
        while the objective is running, the study owning the trial is asked to
        stop and the trial is reported as pruned by raising TrialPruned.
        """
        try:
            scores = objective(trial)
        except KeyboardInterrupt:
            # Stop the study gracefully on Ctrl+C instead of propagating the interrupt.
            trial.study.stop()
            raise TrialPruned()
        return scores

    study = optuna.create_study(
        sampler=TPESampler(
            seed=settings.seed,
            n_startup_trials=settings.n_startup_trials,
            n_ei_candidates=128,
            multivariate=True,
        ),
        directions=[StudyDirection.MINIMIZE, StudyDirection.MINIMIZE],
        storage=storage,
        study_name="iconoclast",
        load_if_exists=True,
    )

    study.set_user_attr("settings", settings.model_dump_json())
    study.set_user_attr("finished", False)

    def count_completed_trials() -> int:
        """
        Returns how many trials of the study are in the COMPLETE state.

        The result is used to work out how many trials still have to be run.
        """
        return sum(
            1 for recorded in study.trials if recorded.state == TrialState.COMPLETE
        )

    def get_completed_trials() -> list[Trial]:
        """
        Returns the study's trials whose state is COMPLETE, in the order the
        study reports them.

        NOTE(review): Optuna documents study.trials as a list of FrozenTrial
        objects; the Trial annotation is kept as-is to match the other helpers
        here. Confirm whether the annotation should be tightened.
        """
        wanted_state = TrialState.COMPLETE
        return [
            recorded for recorded in study.trials if recorded.state == wanted_state
        ]

    if settings.warm_start_trials and count_completed_trials() == 0:
        print()
        print(
            f"Queueing [bold]{len(settings.warm_start_trials)}[/] warm-start trial(s)..."
        )
        for warm_start_trial in settings.warm_start_trials:
            study.enqueue_trial(dict(warm_start_trial.params))
            if warm_start_trial.description:
                print(f"* {warm_start_trial.description}")

    def get_pareto_trials(completed_trials: list[Trial]) -> list[Trial]:
        """
        Selects the trials that are presented as the Pareto front.

        The trials are ranked in ascending order of
        (refusals, overrefusals, KL divergence) and then scanned once. A trial
        is kept only if it beats the most recently kept trial, meaning it has
        fewer overrefusals, or the same number of overrefusals and a strictly
        lower KL divergence. Trials without an "overrefusals" attribute count
        as having zero overrefusals.

        Note: this is a lexicographic filter over the ranked list, not a full
        three-objective dominance check, so a trial with more overrefusals is
        dropped even when its KL divergence is lower.

        Returns the kept trials in ranking order (empty input gives an empty list).
        """

        def ranking(candidate):
            # Sort key: refusals first, then overrefusals, then KL divergence.
            attrs = candidate.user_attrs
            return (
                attrs["refusals"],
                attrs.get("overrefusals", 0),
                attrs["kl_divergence"],
            )

        # Values of the most recently kept trial; infinity so the first trial wins.
        lowest_overrefusals = math.inf
        lowest_divergence = math.inf
        front = []

        for candidate in sorted(completed_trials, key=ranking):
            _, overrefusal_count, divergence = ranking(candidate)

            fewer_overrefusals = overrefusal_count < lowest_overrefusals
            tie_with_lower_kl = (
                overrefusal_count == lowest_overrefusals
                and divergence < lowest_divergence
            )
            if not (fewer_overrefusals or tie_with_lower_kl):
                continue

            # Both thresholds move together, even if the KL divergence went up
            # in exchange for fewer overrefusals.
            lowest_overrefusals = overrefusal_count
            lowest_divergence = divergence
            front.append(candidate)

        return front

    def serialize_trial(trial: Trial) -> dict[str, Any]:
        """
        Builds a plain dictionary describing a trial from its user attributes.

        "refusals" and "kl_divergence" must be present in trial.user_attrs
        (a KeyError is raised otherwise). All other fields are optional and
        fall back to None, 0, 0.0 or an empty dict, as spelled out below.
        """
        attrs = trial.user_attrs

        # The direction-related fields all default to None when missing.
        direction_fields = {
            key: attrs.get(key)
            for key in (
                "direction_method",
                "direction_scope",
                "direction_index",
                "direction_blend",
            )
        }

        # Key order is kept stable because it determines the layout of the
        # serialized output.
        return {
            "index": attrs.get("index"),
            "refusals": attrs["refusals"],
            "overrefusals": attrs.get("overrefusals", 0),
            "harmful_marker_hits": attrs.get("harmful_marker_hits", 0),
            "harmful_compliance_score": attrs.get("harmful_compliance_score", 0.0),
            "objective_regime": attrs.get("objective_regime"),
            "merge_penalty": attrs.get("merge_penalty", 0.0),
            "kl_divergence": attrs["kl_divergence"],
            **direction_fields,
            "parameters": attrs.get("parameters", {}),
            "harmful_axis_metrics": attrs.get("harmful_axis_metrics", {}),
        }

    def write_batch_summary(best_trials: list[Trial]) -> Path:
        """
        Writes a JSON summary of the given trials to "batch_summary.json"
        inside settings.study_checkpoint_dir and returns the path of that file.

        The summary contains the model, the checkpoint directory, the
        evaluator's base metrics and one serialize_trial() entry per trial.
        Missing parent directories are created.
        """
        destination = Path(settings.study_checkpoint_dir) / "batch_summary.json"
        # Create the directory up front so that the write below cannot fail
        # because of a missing folder.
        destination.parent.mkdir(parents=True, exist_ok=True)

        base_metrics = {
            "refusals": evaluator.base_refusals,
            "overrefusals": evaluator.base_overrefusals,
            "harmful_marker_hits": evaluator.base_harmful_marker_hits,
            "harmful_compliance_score": evaluator.base_harmful_compliance_score,
            "objective_regime": evaluator.objective_regime.value,
        }
        payload = {
            "model": settings.model,
            "study_checkpoint_dir": settings.study_checkpoint_dir,
            "base_metrics": base_metrics,
            "pareto_trials": list(map(serialize_trial, best_trials)),
        }

        destination.write_text(json.dumps(payload, indent=2))
        return destination

    start_index = trial_index = count_completed_trials()
    if start_index > 0:
        print()
        print("Resuming existing study.")

    try:
        study.optimize(
            objective_wrapper,
            n_trials=settings.n_trials - count_completed_trials(),
        )
    except KeyboardInterrupt:
        # This additional handler takes care of the small chance that KeyboardInterrupt
        # is raised just between trials, which wouldn't be caught by the handler
        # defined in objective_wrapper above.
        pass

    if count_completed_trials() == settings.n_trials:
        study.set_user_attr("finished", True)

    if settings.exit_after_optimization:
        completed_trials = get_completed_trials()
        print()
        print("[bold green]Optimization finished in batch mode.[/]")
        if not completed_trials:
            print("[yellow]No completed trials were recorded.[/]")
            return

        best_trials = get_pareto_trials(completed_trials)
        summary_path = write_batch_summary(best_trials)

        print("Top Pareto trials:")
        for trial in best_trials[:5]:
            print(
                f"* Trial {trial.user_attrs['index']}: "
                f"refusals={trial.user_attrs['refusals']}/{len(evaluator.bad_prompts)}, "
                f"overrefusals={trial.user_attrs.get('overrefusals', 0)}/{len(evaluator.good_prompts)}, "
                f"markers={trial.user_attrs.get('harmful_marker_hits', 0)}, "
                f"compliance={trial.user_attrs.get('harmful_compliance_score', 0.0):.3f}, "
                f"kl={trial.user_attrs['kl_divergence']:.4f}"
            )

        print(f"Batch summary written to [bold]{summary_path}[/].")
        return

    while True:
        # If no trials at all have been evaluated, the study must have been stopped
        # by pressing Ctrl+C while the first trial was running. In this case, we just
        # re-raise the interrupt to invoke the standard handler defined below.
        completed_trials = get_completed_trials()
        if not completed_trials:
            raise KeyboardInterrupt

        # Get the Pareto front of trials. We can't use study.best_trials directly
        # as get_score() doesn't return the pure KL divergence and refusal count.
        # Note: Unlike study.best_trials, this does not handle objective constraints.
        best_trials = get_pareto_trials(completed_trials)

        choices = [
            Choice(
                title=(
                    f"[Trial {trial.user_attrs['index']:>3}] "
                    f"Refusals: {trial.user_attrs['refusals']:>2}/{len(evaluator.bad_prompts)}, "
                    f"Overrefusals: {trial.user_attrs.get('overrefusals', 0):>2}/{len(evaluator.good_prompts)}, "
                    f"Markers: {trial.user_attrs.get('harmful_marker_hits', 0):>3}, "
                    f"Compliance: {trial.user_attrs.get('harmful_compliance_score', 0.0):.2f}, "
                    f"MergePen: {trial.user_attrs.get('merge_penalty', 0.0):.2f}, "
                    f"Method: {trial.user_attrs.get('direction_method', 'mean')}, "
                    f"KL divergence: {trial.user_attrs['kl_divergence']:.4f}"
                ),
                value=trial,
            )
            for trial in best_trials
        ]

        choices.append(
            Choice(
                title="Run additional trials",
                value="continue",
            )
        )

        choices.append(
            Choice(
                title="Exit program",
                value="",
            )
        )

        print()
        print("[bold green]Optimization finished![/]")
        print()
        print(
            (
                "The following trials resulted in Pareto optimal combinations of refusals and KL divergence. "
                "Trials are ordered to prefer lower harmful refusals first, then lower overrefusals, then lower KL divergence. "
                "After selecting a trial, you will be able to save the model, upload it to Hugging Face, "
                "or chat with it to test how well it works. You can return to this menu later to select a different trial. "
                "[yellow]Note that KL divergence values above 1 usually indicate significant damage to the original model's capabilities.[/]"
            )
        )

        while True:
            print()
            trial = prompt_select("Which trial do you want to use?", choices)

            if trial == "continue":
                while True:
                    try:
                        n_additional_trials = prompt_text(
                            "How many additional trials do you want to run?"
                        )
                        if n_additional_trials is None or n_additional_trials == "":
                            n_additional_trials = 0
                            break
                        n_additional_trials = int(n_additional_trials)
                        if n_additional_trials > 0:
                            break
                        print("[red]Please enter a number greater than 0.[/]")
                    except ValueError:
                        print("[red]Please enter a number.[/]")

                if n_additional_trials == 0:
                    continue

                settings.n_trials += n_additional_trials
                study.set_user_attr("settings", settings.model_dump_json())
                study.set_user_attr("finished", False)

                try:
                    study.optimize(
                        objective_wrapper,
                        n_trials=settings.n_trials - count_completed_trials(),
                    )
                except KeyboardInterrupt:
                    pass

                if count_completed_trials() == settings.n_trials:
                    study.set_user_attr("finished", True)

                break

            elif trial is None or trial == "":
                return

            print()
            print(f"Restoring model from trial [bold]{trial.user_attrs['index']}[/]...")
            print("* Parameters:")
            for name, value in get_trial_parameters(trial).items():
                print(f"  * {name} = [bold]{value}[/]")
            print("* Resetting model...")
            model.reset_model()
            print("* Abliterating...")
            model.abliterate(
                get_trial_refusal_directions(trial),
                get_trial_direction_indices(trial),
                {
                    k: AbliterationParameters(**v)
                    for k, v in trial.user_attrs["parameters"].items()
                },
            )

            while True:
                print()
                action = prompt_select(
                    "What do you want to do with the decensored model?",
                    [
                        "Save the model to a local folder",
                        "Upload the model to Hugging Face",
                        "Chat with the model",
                        "Benchmark the model",
                        "Return to the trial selection menu",
                    ],
                )

                if action is None or action == "Return to the trial selection menu":
                    break

                # All actions are wrapped in a try/except block so that if an error occurs,
                # another action can be tried, instead of the program crashing and losing
                # the optimized model.
                try:
                    match action:
                        case "Save the model to a local folder":
                            save_directory = prompt_path("Path to the folder:")
                            if not save_directory:
                                continue

                            strategy = obtain_merge_strategy(settings)
                            if strategy is None:
                                continue

                            if strategy == "adapter":
                                print("Saving LoRA adapter...")
                                model.model.save_pretrained(save_directory)
                            else:
                                print("Saving merged model...")
                                merged_model = model.get_merged_model()
                                merged_model.save_pretrained(save_directory)
                                del merged_model
                                empty_cache()
                                model.tokenizer.save_pretrained(save_directory)

                            print(f"Model saved to [bold]{save_directory}[/].")

                        case "Upload the model to Hugging Face":
                            # We don't use huggingface_hub.login() because that stores the token on disk,
                            # and since this program will often be run on rented or shared GPU servers,
                            # it's better to not persist credentials.
                            token = huggingface_hub.get_token()
                            if not token:
                                token = prompt_password("Hugging Face access token:")
                            if not token:
                                continue

                            user = huggingface_hub.whoami(token)
                            fullname = user.get(
                                "fullname",
                                user.get("name", "unknown user"),
                            )
                            email = user.get("email", "no email found")
                            print(f"Logged in as [bold]{fullname} ({email})[/]")

                            repo_id = prompt_text(
                                "Name of repository:",
                                default=f"{user['name']}/{Path(settings.model).name}-iconoclast",
                            )

                            visibility = prompt_select(
                                "Should the repository be public or private?",
                                [
                                    "Public",
                                    "Private",
                                ],
                            )
                            if visibility is None:
                                continue
                            private = visibility == "Private"

                            strategy = obtain_merge_strategy(settings)
                            if strategy is None:
                                continue

                            if strategy == "adapter":
                                print("Uploading LoRA adapter...")
                                model.model.push_to_hub(
                                    repo_id,
                                    private=private,
                                    token=token,
                                )
                            else:
                                print("Uploading merged model...")
                                merged_model = model.get_merged_model()
                                merged_model.push_to_hub(
                                    repo_id,
                                    private=private,
                                    token=token,
                                )
                                del merged_model
                                empty_cache()
                                model.tokenizer.push_to_hub(
                                    repo_id,
                                    private=private,
                                    token=token,
                                )

                            # If the model path exists locally and includes the
                            # card, use it directly. If the model path doesn't
                            # exist locally, it can be assumed to be a model
                            # hosted on the Hugging Face Hub, in which case
                            # we can retrieve the model card.
                            model_path = Path(settings.model)
                            if model_path.exists():
                                card_path = (
                                    model_path / huggingface_hub.constants.REPOCARD_NAME
                                )
                                if card_path.exists():
                                    card = ModelCard.load(card_path)
                                else:
                                    card = None
                            else:
                                card = ModelCard.load(settings.model)
                            if card is not None:
                                if card.data is None:
                                    card.data = ModelCardData()
                                if card.data.tags is None:
                                    card.data.tags = []
                                card.data.tags.append("iconoclast")
                                card.data.tags.append("uncensored")
                                card.data.tags.append("decensored")
                                card.data.tags.append("abliterated")
                                card.text = (
                                    get_readme_intro(
                                        settings,
                                        trial,
                                        evaluator.base_refusals,
                                        evaluator.base_overrefusals,
                                        evaluator.good_prompts,
                                        evaluator.bad_prompts,
                                    )
                                    + card.text
                                )
                                card.push_to_hub(repo_id, token=token)

                            print(f"Model uploaded to [bold]{repo_id}[/].")

                        case "Chat with the model":
                            print()
                            print(
                                "[cyan]Press Ctrl+C at any time to return to the menu.[/]"
                            )

                            chat = [
                                {"role": "system", "content": settings.system_prompt},
                            ]

                            while True:
                                try:
                                    message = prompt_text(
                                        "User:",
                                        qmark=">",
                                        unsafe=True,
                                    )
                                    if not message:
                                        break
                                    chat.append({"role": "user", "content": message})

                                    print("[bold]Assistant:[/] ", end="")
                                    response = model.stream_chat_response(chat)
                                    chat.append(
                                        {"role": "assistant", "content": response}
                                    )
                                except (KeyboardInterrupt, EOFError):
                                    # Ctrl+C/Ctrl+D
                                    break

                        case "Benchmark the model":
                            import lm_eval
                            from lm_eval.models.huggingface import HFLM

                            benchmarks = questionary.checkbox(
                                "Which benchmarks do you want to run?",
                                [
                                    Choice(
                                        title=f"{benchmark.name}: {benchmark.description}",
                                        value=benchmark,
                                    )
                                    for benchmark in settings.benchmarks
                                ],
                                style=Style([("highlighted", "reverse")]),
                            ).ask()
                            if not benchmarks:
                                continue

                            scope = prompt_select(
                                (
                                    "Do you want to benchmark the original model along with the decensored model? "
                                    "Benchmarking both models allows you to compare the scores, but it takes twice as much time."
                                ),
                                [
                                    "Benchmark only the decensored model",
                                    "Benchmark both models",
                                ],
                            )
                            if scope is None:
                                continue
                            benchmark_original_model = scope == "Benchmark both models"

                            hflm = HFLM(
                                pretrained=model.model,  # ty:ignore[invalid-argument-type]
                                tokenizer=model.tokenizer,  # ty:ignore[invalid-argument-type]
                            )

                            table = Table()
                            table.add_column("Benchmark")
                            table.add_column("Metric")
                            if benchmark_original_model:
                                table.add_column("This model", justify="right")
                                table.add_column("Original model", justify="right")
                            else:
                                table.add_column("Value", justify="right")

                            try:
                                first_benchmark = True

                                for benchmark in benchmarks:
                                    print(
                                        f"Running benchmark [bold]{benchmark.name}[/]..."
                                    )

                                    def get_results() -> dict[str, Any]:
                                        results = lm_eval.simple_evaluate(
                                            model=hflm,
                                            tasks=[benchmark.task],
                                            batch_size="auto",
                                        )
                                        return results["results"][benchmark.task]

                                    results = get_results()
                                    if benchmark_original_model:
                                        with model.model.disable_adapter():  # ty:ignore[call-non-callable]
                                            original_results = get_results()

                                    first_row = True

                                    for metric, value in results.items():
                                        if metric != "alias":
                                            if first_row and not first_benchmark:
                                                if benchmark_original_model:
                                                    table.add_row("", "", "", "")
                                                else:
                                                    table.add_row("", "", "")

                                            def format_value(value: Any) -> str:
                                                if isinstance(
                                                    value,
                                                    (float, np.floating),
                                                ):
                                                    return f"{value:.4f}"
                                                else:
                                                    return f"{value}"

                                            cells = [
                                                benchmark.name if first_row else "",
                                                metric,
                                                format_value(value),
                                            ]
                                            if benchmark_original_model:
                                                cells.append(
                                                    format_value(
                                                        original_results[metric]
                                                    )
                                                )
                                            table.add_row(*cells)

                                            first_row = False
                                            first_benchmark = False
                            except KeyboardInterrupt:
                                pass

                            # The benchmark run might have been cancelled by the user
                            # before any benchmark was completed, so we only print results
                            # if there actually are some.
                            if table.rows:
                                print(table)

                except Exception as error:
                    print(f"[red]Error: {error}[/]")


def main():
    """
    Program entry point: installs the Rich traceback handler and calls run().

    If run() is aborted by a KeyboardInterrupt, a short shutdown message is
    printed instead of a traceback. Every other exception is re-raised.
    """
    # Install Rich traceback handler.
    install()

    try:
        run()
    except BaseException as error:
        # Transformers appears to handle KeyboardInterrupt (or BaseException)
        # internally in some places, which can re-raise a different error in the
        # handler, masking the root cause. We therefore look at both the error
        # itself and its context.
        interrupted = any(
            isinstance(candidate, KeyboardInterrupt)
            for candidate in (error, error.__context__)
        )
        if not interrupted:
            raise

        print()
        print("[red]Shutting down...[/]")