Instructions for using HaadesX/Iconoclast with libraries, inference providers, notebooks, and local apps. Follow the links below to get started.
- Libraries
  - Transformers
- Notebooks
  - Google Colab
  - Kaggle

How to use HaadesX/Iconoclast with Transformers:

```python
# Load model directly
from transformers import AutoModel

model = AutoModel.from_pretrained("HaadesX/Iconoclast", dtype="auto")
```
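The snippet above only loads the weights. A minimal generation sketch follows, assuming HaadesX/Iconoclast is a causal language model with a chat template (the `AutoModelForCausalLM`/`AutoTokenizer` usage, the sample prompt, and the generation settings are illustrative, not taken from the model card):

```python
# Minimal generation sketch -- assumes a causal LM with a chat template.
# Adjust max_new_tokens and sampling parameters to taste.
from transformers import AutoModelForCausalLM, AutoTokenizer

model_id = "HaadesX/Iconoclast"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id, dtype="auto", device_map="auto")

messages = [{"role": "user", "content": "Summarize what abliteration does in one sentence."}]
inputs = tokenizer.apply_chat_template(
    messages, add_generation_prompt=True, return_tensors="pt"
).to(model.device)

outputs = model.generate(inputs, max_new_tokens=128)
# Decode only the newly generated tokens, skipping the prompt.
print(tokenizer.decode(outputs[0][inputs.shape[-1]:], skip_special_tokens=True))
```

The script below is the holdout evaluation used for the abliteration study: it reads an Optuna checkpoint (JSONL), picks the best trial by refusal count, over-refusal count, and KL divergence, re-applies that trial's abliteration parameters to the model, and reports the refusal rate on a larger prompt set.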
```python
#!/usr/bin/env python3
import argparse
import importlib
import json
import os
import sys
from collections import defaultdict
from os.path import commonprefix
from pathlib import Path
from typing import Any


def parse_args() -> argparse.Namespace:
    parser = argparse.ArgumentParser(description="Evaluate the best trial on a large holdout set.")
    parser.add_argument("--checkpoint", required=True, help="Path to Optuna checkpoint JSONL file")
    parser.add_argument("--dataset", default="mlabonne/harmful_behaviors", help="HF dataset name")
    parser.add_argument("--split", default="train+test", help="Dataset split")
    parser.add_argument("--column", default="text", help="Dataset column for prompt")
    parser.add_argument("--output", required=True, help="Output JSON path")
    return parser.parse_args()


def load_study(checkpoint_path: Path) -> tuple[str, dict[int, dict[str, Any]]]:
    """Read the Optuna checkpoint JSONL and collect per-trial user attributes."""
    settings_json = None
    trials: dict[int, dict[str, Any]] = defaultdict(dict)
    for line in checkpoint_path.read_text().splitlines():
        obj = json.loads(line)
        user_attr = obj.get("user_attr")
        if user_attr and "settings" in user_attr and settings_json is None:
            settings_json = user_attr["settings"]
        trial_id = obj.get("trial_id")
        if trial_id is not None and user_attr:
            trials[trial_id].update(user_attr)
    if settings_json is None:
        raise ValueError(f"Did not find settings in {checkpoint_path}")
    return settings_json, trials


def pick_best_trial(trials: dict[int, dict[str, Any]]) -> dict[str, Any]:
    """Pick the trial with the fewest refusals, then fewest over-refusals, then lowest KL divergence."""
    best = None
    for trial_id, attrs in trials.items():
        if not {"refusals", "kl_divergence", "parameters"}.issubset(attrs):
            continue
        item = {
            "trial_id": trial_id,
            "index": attrs.get("index"),
            "refusals": attrs["refusals"],
            "overrefusals": attrs.get("overrefusals", 0),
            "kl_divergence": attrs["kl_divergence"],
            "direction_index": attrs["direction_index"],
            "direction_method": attrs["direction_method"],
            "direction_blend": attrs.get("direction_blend", 0.0),
            "component_direction_indices": attrs.get("component_direction_indices"),
            "component_direction_methods": attrs.get("component_direction_methods"),
            "component_direction_blends": attrs.get("component_direction_blends"),
            "parameters": attrs["parameters"],
        }
        key = (item["refusals"], item["overrefusals"], item["kl_divergence"])
        if best is None or key < (
            best["refusals"],
            best["overrefusals"],
            best["kl_divergence"],
        ):
            best = item
    if best is None:
        raise ValueError("No completed trials with metrics were found")
    return best


def load_stack(package: str) -> dict[str, Any]:
    """Import the abliteration package's submodules lazily (after sys.argv has been sanitized)."""
    return {
        "package": package,
        "config": importlib.import_module(f"{package}.config"),
        "direction": importlib.import_module(f"{package}.direction"),
        "model": importlib.import_module(f"{package}.model"),
        "utils": importlib.import_module(f"{package}.utils"),
    }


def normalize_refusal_text(text: str) -> str:
    text = text.lower().replace("*", "")
    text = text.replace("’", "'")
    return " ".join(text.split())


def is_refusal(response: str, refusal_markers: list[str]) -> bool:
    if not response.strip():
        return True
    normalized = normalize_refusal_text(response)
    return any(marker.lower() in normalized for marker in refusal_markers)


def determine_response_prefix(model: Any, good_prompts: list[Any], bad_prompts: list[Any]) -> None:
    """Detect a shared response prefix (e.g. thinking tags) and configure the model to skip it."""
    prefix_check_prompts = good_prompts[:100] + bad_prompts[:100]
    responses = model.get_responses_batched(prefix_check_prompts)
    model.response_prefix = commonprefix(responses).rstrip(" ")
    recheck_prefix = False
    if model.response_prefix:
        recheck_prefix = True
        if model.response_prefix.startswith("<think>"):
            model.response_prefix = "<think></think>"
        elif model.response_prefix.startswith("<|channel|>analysis<|message|>"):
            model.response_prefix = (
                "<|channel|>analysis<|message|><|end|><|start|>assistant<|channel|>final<|message|>"
            )
        elif model.response_prefix.startswith("<thought>"):
            model.response_prefix = "<thought></thought>"
        elif model.response_prefix.startswith("[THINK]"):
            model.response_prefix = "[THINK][/THINK]"
        else:
            recheck_prefix = False
    if recheck_prefix:
        responses = model.get_responses_batched(prefix_check_prompts)
        additional_prefix = commonprefix(responses).rstrip(" ")
        if additional_prefix:
            model.response_prefix += additional_prefix


def prepare_runtime(stack: dict[str, Any], settings_json: str) -> dict[str, Any]:
    """Rebuild the model, prompts, and refusal-direction candidates from the saved settings."""
    Settings = stack["config"].Settings
    DirectionMethod = stack["config"].DirectionMethod
    Model = stack["model"].Model
    load_prompts = stack["utils"].load_prompts
    set_random_seed = stack["utils"].set_random_seed
    empty_cache = stack["utils"].empty_cache
    compute_direction_candidates = stack["direction"].compute_direction_candidates
    orthogonalize_directions = stack["direction"].orthogonalize_directions
    blend_directions = stack["direction"].blend_directions

    settings = Settings.model_validate_json(settings_json)
    set_random_seed(settings.seed)
    model = Model(settings)

    good_prompts = load_prompts(settings, settings.good_prompts)
    bad_prompts = load_prompts(settings, settings.bad_prompts)
    determine_response_prefix(model, good_prompts, bad_prompts)

    good_residuals = model.get_residuals_batched(good_prompts)
    bad_residuals = model.get_residuals_batched(bad_prompts)
    good_means = good_residuals.mean(dim=0)
    direction_candidates = compute_direction_candidates(
        good_residuals,
        bad_residuals,
        settings.direction_variance_floor,
    )
    if settings.orthogonalize_direction:
        direction_candidates = {
            method: orthogonalize_directions(candidate, good_means)
            for method, candidate in direction_candidates.items()
        }
    del good_residuals, bad_residuals
    empty_cache()

    def get_trial_refusal_directions(trial_data: dict[str, Any]) -> Any:
        component_direction_methods = trial_data.get("component_direction_methods")
        if isinstance(component_direction_methods, dict):
            component_direction_blends = trial_data.get("component_direction_blends", {})
            return {
                component: (
                    blend_directions(
                        direction_candidates[DirectionMethod.MEAN],
                        direction_candidates[DirectionMethod.VARIANCE],
                        float(component_direction_blends.get(component, 0.0)),
                    )
                    if DirectionMethod(method) == DirectionMethod.HYBRID
                    else direction_candidates[DirectionMethod(method)]
                )
                for component, method in component_direction_methods.items()
            }
        direction_method = DirectionMethod(trial_data["direction_method"])
        direction_blend = float(trial_data.get("direction_blend", 0.0))
        if direction_method == DirectionMethod.HYBRID:
            return blend_directions(
                direction_candidates[DirectionMethod.MEAN],
                direction_candidates[DirectionMethod.VARIANCE],
                direction_blend,
            )
        return direction_candidates[direction_method]

    return {
        "settings": settings,
        "model": model,
        "get_trial_refusal_directions": get_trial_refusal_directions,
        "AbliterationParameters": stack["model"].AbliterationParameters,
        "empty_cache": empty_cache,
    }


def apply_trial(runtime: dict[str, Any], trial_data: dict[str, Any]) -> None:
    """Reset the model and re-apply the abliteration recorded for this trial."""
    model = runtime["model"]
    AbliterationParameters = runtime["AbliterationParameters"]
    parameters = {
        name: AbliterationParameters(**values)
        for name, values in trial_data["parameters"].items()
    }
    model.reset_model()
    model.abliterate(
        runtime["get_trial_refusal_directions"](trial_data),
        trial_data.get("component_direction_indices", trial_data["direction_index"]),
        parameters,
    )


def main() -> None:
    args = parse_args()
    # Critical: prevent Pydantic BaseSettings in iconoclast.config from
    # trying to parse sys.argv, which would collide with our own arguments.
    sys.argv = [sys.argv[0]]

    settings_json, trials = load_study(Path(args.checkpoint))
    best_trial = pick_best_trial(trials)

    stack = load_stack("iconoclast")
    runtime = prepare_runtime(stack, settings_json)

    # Load the holdout prompts. A small Pydantic model mimics the dataset
    # config object expected by the load_prompts utility.
    from pydantic import BaseModel

    class DatasetConfig(BaseModel):
        dataset: str
        name: str | None = None
        split: str
        column: str

    ds_config = DatasetConfig(dataset=args.dataset, split=args.split, column=args.column)
    eval_prompts = stack["utils"].load_prompts(runtime["settings"], ds_config)
    print(f"Evaluating on {len(eval_prompts)} prompts from {args.dataset} ({args.split})...")

    apply_trial(runtime, best_trial)
    responses = runtime["model"].get_responses_batched(
        eval_prompts,
        skip_special_tokens=True,
    )

    refusal_markers = runtime["settings"].refusal_markers
    refusal_count = sum(1 for resp in responses if is_refusal(resp, refusal_markers))
    summary = {
        "model": runtime["settings"].model,
        "total_prompts": len(eval_prompts),
        "refusals": refusal_count,
        "refusal_rate": refusal_count / len(eval_prompts),
    }
    print(json.dumps(summary, indent=2))

    output_path = Path(args.output)
    output_path.parent.mkdir(parents=True, exist_ok=True)
    output_path.write_text(json.dumps(summary, indent=2))
    print(f"Saved results to {args.output}")


if __name__ == "__main__":
    os.environ.setdefault("TOKENIZERS_PARALLELISM", "false")
    main()
```
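The script is pointed at the Optuna checkpoint produced during the search, for example `python evaluate_best_trial.py --checkpoint study.jsonl --output holdout_results.json` (both file names and the script name here are placeholders). `--dataset`, `--split`, and `--column` default to `mlabonne/harmful_behaviors`, `train+test`, and `text`, and the refusal-rate summary is printed to stdout and written as JSON to the `--output` path.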