Instructions to use HaadesX/Iconoclast with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- Transformers
How to use HaadesX/Iconoclast with Transformers:
```python
# Load model directly
from transformers import AutoModel

model = AutoModel.from_pretrained("HaadesX/Iconoclast", dtype="auto")
```
- Notebooks
- Google Colab
- Kaggle
| #!/usr/bin/env python3 | |
| import argparse | |
| import importlib | |
| import json | |
| import os | |
| import sys | |
| from collections import defaultdict | |
| from os.path import commonprefix | |
| from pathlib import Path | |
| from typing import Any | |
def parse_args() -> argparse.Namespace:
    """Parse the command-line options of the export/comparison script.

    Returns:
        Namespace with ``iconoclast_checkpoint``, ``heretic_checkpoint`` and
        ``output_dir`` (all mandatory) plus ``sample_count`` (int, default 5).
    """
    cli = argparse.ArgumentParser()
    # The three path-like options share the same shape: mandatory strings.
    for mandatory_flag in (
        "--iconoclast-checkpoint",
        "--heretic-checkpoint",
        "--output-dir",
    ):
        cli.add_argument(mandatory_flag, required=True)
    cli.add_argument("--sample-count", type=int, default=5)
    return cli.parse_args()
def load_study(checkpoint_path: Path) -> tuple[str, dict[int, dict[str, Any]]]:
    """Read a JSON-lines study checkpoint and collect its settings and trials.

    Every non-blank line must be a JSON object. The first ``user_attr``
    object containing a ``"settings"`` key supplies the settings document;
    ``user_attr`` objects on records that carry a ``trial_id`` are merged
    per trial, later records overriding earlier keys.

    Args:
        checkpoint_path: Path of the checkpoint file to read.

    Returns:
        ``(settings_json, trials)`` where ``trials`` maps each trial id to its
        merged user attributes.

    Raises:
        ValueError: If no record carries a ``"settings"`` user attribute.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    settings_json = None
    trials: dict[int, dict[str, Any]] = defaultdict(dict)
    # Iterate the file instead of ``read_text().splitlines()``: file iteration
    # only breaks on real newlines, whereas ``str.splitlines`` also splits on
    # characters such as U+2028 that may legally appear inside JSON strings.
    # The encoding is pinned so the result does not depend on the locale.
    with checkpoint_path.open(encoding="utf-8") as handle:
        for raw_line in handle:
            if not raw_line.strip():
                # Tolerate blank lines (e.g. trailing newlines) instead of
                # letting json.loads fail on them.
                continue
            record = json.loads(raw_line)
            user_attr = record.get("user_attr")
            if not user_attr:
                continue
            if settings_json is None and "settings" in user_attr:
                settings_json = user_attr["settings"]
            trial_id = record.get("trial_id")
            if trial_id is not None:
                trials[trial_id].update(user_attr)
    if settings_json is None:
        raise ValueError(f"Did not find settings in {checkpoint_path}")
    # Hand back a plain dict so lookups by callers cannot silently create
    # empty trial entries.
    return settings_json, dict(trials)
def pick_best_trial(trials: dict[int, dict[str, Any]]) -> dict[str, Any]:
    """Select the best fully recorded trial.

    Trials are ranked by the tuple ``(refusals, overrefusals, kl_divergence)``,
    smallest first; ``overrefusals`` counts as 0 when absent. On ties the
    trial encountered first wins.

    Args:
        trials: Mapping of trial id to that trial's merged user attributes.

    Returns:
        A summary dict of the winning trial (ids, metrics, direction settings
        and ``parameters``). Optional attributes that were not recorded appear
        as ``None`` (``direction_blend`` as ``0.0``).

    Raises:
        ValueError: If no trial carries every required attribute.
    """
    # Every attribute that is indexed unconditionally below must be part of
    # the completeness check; otherwise a partially recorded trial raises
    # KeyError and aborts the whole selection instead of being skipped.
    required = {
        "refusals",
        "kl_divergence",
        "parameters",
        "direction_index",
        "direction_method",
    }
    best = None
    best_key = None
    for trial_id, attrs in trials.items():
        if not required.issubset(attrs):
            continue
        item = {
            "trial_id": trial_id,
            "index": attrs.get("index"),
            "refusals": attrs["refusals"],
            "overrefusals": attrs.get("overrefusals", 0),
            "kl_divergence": attrs["kl_divergence"],
            "direction_index": attrs["direction_index"],
            "direction_method": attrs["direction_method"],
            "direction_blend": attrs.get("direction_blend", 0.0),
            "component_direction_indices": attrs.get("component_direction_indices"),
            "component_direction_methods": attrs.get("component_direction_methods"),
            "component_direction_blends": attrs.get("component_direction_blends"),
            "parameters": attrs["parameters"],
        }
        key = (item["refusals"], item["overrefusals"], item["kl_divergence"])
        # Strict "<" keeps the earliest trial when several share the best key.
        if best_key is None or key < best_key:
            best, best_key = item, key
    if best is None:
        raise ValueError("No completed trials with metrics were found")
    return best
def load_stack(package: str) -> dict[str, Any]:
    """Import the ``config``, ``direction``, ``model`` and ``utils`` submodules of *package*.

    Args:
        package: Importable top-level package name.

    Returns:
        dict holding the package name under ``"package"`` and each imported
        submodule under its own short name.
    """
    stack: dict[str, Any] = {"package": package}
    for submodule in ("config", "direction", "model", "utils"):
        stack[submodule] = importlib.import_module(f"{package}.{submodule}")
    return stack
def normalize_refusal_text(text: str) -> str:
    """Canonicalise *text* for refusal-marker matching.

    Lower-cases the text, removes asterisks, turns typographic apostrophes
    into ASCII ones and collapses every whitespace run into a single space
    (leading and trailing whitespace disappears).
    """
    lowered = text.lower()
    without_asterisks = lowered.replace("*", "")
    ascii_apostrophes = without_asterisks.replace("’", "'")
    words = ascii_apostrophes.split()
    return " ".join(words)
def is_refusal(response: str, refusal_markers: list[str]) -> bool:
    """Report whether *response* counts as a refusal.

    A blank (empty or whitespace-only) response is always a refusal.
    Otherwise the response is normalised with ``normalize_refusal_text`` and
    is a refusal when any lower-cased marker occurs in it as a substring.
    """
    if response.strip() == "":
        return True
    haystack = normalize_refusal_text(response)
    for marker in refusal_markers:
        if marker.lower() in haystack:
            return True
    return False
def determine_response_prefix(model: Any, good_prompts: list[Any], bad_prompts: list[Any]) -> None:
    """Detect the prefix shared by the model's responses and store it on the model.

    Responses for up to 100 good and 100 bad prompts are generated and their
    common leading string (trailing spaces removed) is written to
    ``model.response_prefix``. When that prefix starts with one of the known
    reasoning-block openers it is replaced by the matching already-closed
    block, responses are generated once more, and any prefix common to the
    new responses is appended.

    Args:
        model: Object exposing ``get_responses_batched`` and a writable
            ``response_prefix`` attribute.
        good_prompts: Prompts whose first 100 entries are probed.
        bad_prompts: Prompts whose first 100 entries are probed.
    """
    # NOTE(review): the listing this was taken from had lost its indentation;
    # this assumes the fallback ``else`` pairs with the marker chain, i.e. the
    # second generation pass only happens after a marker matched — confirm.
    closed_reasoning_blocks = (
        ("<think>", "<think></think>"),
        (
            "<|channel|>analysis<|message|>",
            "<|channel|>analysis<|message|><|end|><|start|>assistant<|channel|>final<|message|>",
        ),
        ("<thought>", "<thought></thought>"),
        ("[THINK]", "[THINK][/THINK]"),
    )

    probe_prompts = good_prompts[:100] + bad_prompts[:100]
    first_pass = model.get_responses_batched(probe_prompts)
    # os.path.commonprefix is a plain character-wise comparison.
    model.response_prefix = commonprefix(first_pass).rstrip(" ")

    replacement = next(
        (
            closed
            for opener, closed in closed_reasoning_blocks
            if model.response_prefix.startswith(opener)
        ),
        None,
    )
    if replacement is None:
        # Empty prefix or no reasoning opener: keep what was detected.
        return

    model.response_prefix = replacement
    second_pass = model.get_responses_batched(probe_prompts)
    trailing = commonprefix(second_pass).rstrip(" ")
    if trailing:
        model.response_prefix += trailing
def prepare_runtime(stack: dict[str, Any], settings_json: str) -> dict[str, Any]:
    """Load a model and precompute what is needed to replay a trial on it.

    The settings are validated from JSON, the random seed is set, the model is
    constructed, the four prompt sets are loaded, the response prefix is
    determined, and direction candidates are computed from the residuals of
    the good and bad prompts (optionally orthogonalised against the mean of
    the good residuals). The residual tensors are released afterwards.

    Args:
        stack: Mapping exposing the ``config``, ``direction``, ``model`` and
            ``utils`` modules of one implementation.
        settings_json: JSON document accepted by the stack's ``Settings``.

    Returns:
        dict with ``settings``, ``model``, ``good_eval_prompts``,
        ``bad_eval_prompts``, ``get_trial_refusal_directions`` (callable
        mapping trial data to its refusal directions),
        ``AbliterationParameters`` and ``empty_cache``.
    """
    Settings = stack["config"].Settings
    DirectionMethod = stack["config"].DirectionMethod
    Model = stack["model"].Model
    load_prompts = stack["utils"].load_prompts
    set_random_seed = stack["utils"].set_random_seed
    empty_cache = stack["utils"].empty_cache
    compute_direction_candidates = stack["direction"].compute_direction_candidates
    orthogonalize_directions = stack["direction"].orthogonalize_directions
    blend_directions = stack["direction"].blend_directions

    settings = Settings.model_validate_json(settings_json)
    set_random_seed(settings.seed)
    model = Model(settings)

    good_prompts = load_prompts(settings, settings.good_prompts)
    bad_prompts = load_prompts(settings, settings.bad_prompts)
    good_eval_prompts = load_prompts(settings, settings.good_evaluation_prompts)
    bad_eval_prompts = load_prompts(settings, settings.bad_evaluation_prompts)

    determine_response_prefix(model, good_prompts, bad_prompts)

    good_residuals = model.get_residuals_batched(good_prompts)
    bad_residuals = model.get_residuals_batched(bad_prompts)
    good_means = good_residuals.mean(dim=0)
    direction_candidates = compute_direction_candidates(
        good_residuals,
        bad_residuals,
        settings.direction_variance_floor,
    )
    if settings.orthogonalize_direction:
        direction_candidates = {
            method: orthogonalize_directions(candidate, good_means)
            for method, candidate in direction_candidates.items()
        }

    # Only the derived candidates are needed from here on; drop the residuals
    # before asking the stack to release cached memory.
    del good_residuals, bad_residuals
    empty_cache()

    def get_trial_refusal_directions(trial_data: dict[str, Any]) -> Any:
        """Return the refusal direction(s) selected by *trial_data*.

        With a dict under ``component_direction_methods`` the result is a dict
        of directions per component; otherwise a single direction chosen by
        ``direction_method``. HYBRID blends the MEAN and VARIANCE candidates
        with the recorded blend weight (0.0 when none was recorded).
        """
        component_direction_methods = trial_data.get("component_direction_methods")
        if isinstance(component_direction_methods, dict):
            # ``dict.get(key, {})`` only falls back when the key is absent;
            # trial summaries store an explicit None for unrecorded blends,
            # so coalesce None as well.
            component_direction_blends = (
                trial_data.get("component_direction_blends") or {}
            )
            return {
                component: blend_directions(
                    direction_candidates[DirectionMethod.MEAN],
                    direction_candidates[DirectionMethod.VARIANCE],
                    float(component_direction_blends.get(component) or 0.0),
                )
                if DirectionMethod(method) == DirectionMethod.HYBRID
                else direction_candidates[DirectionMethod(method)]
                for component, method in component_direction_methods.items()
            }

        direction_method = DirectionMethod(trial_data["direction_method"])
        # Same None-coalescing as above for the single-direction blend weight.
        direction_blend = float(trial_data.get("direction_blend") or 0.0)
        if direction_method == DirectionMethod.HYBRID:
            return blend_directions(
                direction_candidates[DirectionMethod.MEAN],
                direction_candidates[DirectionMethod.VARIANCE],
                direction_blend,
            )
        return direction_candidates[direction_method]

    return {
        "settings": settings,
        "model": model,
        "good_eval_prompts": good_eval_prompts,
        "bad_eval_prompts": bad_eval_prompts,
        "get_trial_refusal_directions": get_trial_refusal_directions,
        "AbliterationParameters": stack["model"].AbliterationParameters,
        "empty_cache": empty_cache,
    }
def apply_trial(runtime: dict[str, Any], trial_data: dict[str, Any]) -> None:
    """Reset the runtime's model and abliterate it with one trial's settings.

    Args:
        runtime: Mapping providing ``model``, ``AbliterationParameters`` and
            ``get_trial_refusal_directions``.
        trial_data: Trial summary with ``parameters`` (name -> keyword
            arguments for ``AbliterationParameters``) and either
            ``component_direction_indices`` or ``direction_index``.

    Raises:
        KeyError: If ``parameters`` is missing, or if no component indices
            are recorded and ``direction_index`` is missing too.
    """
    model = runtime["model"]
    AbliterationParameters = runtime["AbliterationParameters"]
    parameters = {
        name: AbliterationParameters(**values)
        for name, values in trial_data["parameters"].items()
    }

    model.reset_model()
    refusal_directions = runtime["get_trial_refusal_directions"](trial_data)

    # Trial summaries always contain the "component_direction_indices" key,
    # set to None when the trial recorded none. ``dict.get(key, default)``
    # would return that None instead of the default, so the fallback to the
    # trial-wide index has to test for None explicitly. Doing it lazily also
    # means "direction_index" is only required when it is actually used.
    direction_index = trial_data.get("component_direction_indices")
    if direction_index is None:
        direction_index = trial_data["direction_index"]

    model.abliterate(refusal_directions, direction_index, parameters)
def export_merged_model(runtime: dict[str, Any], output_dir: Path) -> None:
    """Save the runtime's merged model and tokenizer into *output_dir*.

    The directory (including missing parents) is created first. After both
    ``save_pretrained`` calls the local reference to the merged model is
    dropped and the runtime's ``empty_cache`` callable is invoked.
    """
    output_dir.mkdir(parents=True, exist_ok=True)

    wrapper = runtime["model"]
    merged = wrapper.get_merged_model()
    merged.save_pretrained(output_dir)
    wrapper.tokenizer.save_pretrained(output_dir)

    # Release our reference before the cache is cleared.
    del merged
    release_cache = runtime["empty_cache"]
    release_cache()
def prompt_record(category: str, index: int, prompt: Any) -> dict[str, Any]:
    """Describe one prompt as a plain dict.

    Returns:
        dict with the given ``category`` and ``index`` plus the prompt's
        ``system`` and ``user`` attributes.
    """
    record: dict[str, Any] = dict(category=category, index=index)
    record["system"] = prompt.system
    record["user"] = prompt.user
    return record
def main() -> None:
    """Export the best trial of both studies and compare sample responses.

    Steps:
        1. Parse the CLI and read both checkpoints; pick each study's best trial.
        2. Import the ``iconoclast`` and ``heretic`` stacks and prepare a
           runtime for each.
        3. Take the first ``--sample-count`` harmful and harmless evaluation
           prompts of the iconoclast runtime as the comparison sample.
        4. Generate baseline responses, then for each stack apply its best
           trial, export the merged model and generate responses.
        5. Write ``comparison.json`` and ``summary.json`` into the output
           directory and print both best-trial summaries.
    """
    args = parse_args()
    # Hide this script's own flags from anything that inspects sys.argv later.
    # NOTE(review): presumably the stacks' Settings parse the command line too
    # — confirm.
    sys.argv = [sys.argv[0]]

    output_dir = Path(args.output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    icon_settings_json, icon_trials = load_study(Path(args.iconoclast_checkpoint))
    her_settings_json, her_trials = load_study(Path(args.heretic_checkpoint))
    icon_best = pick_best_trial(icon_trials)
    her_best = pick_best_trial(her_trials)

    icon_stack = load_stack("iconoclast")
    her_stack = load_stack("heretic")
    icon_runtime = prepare_runtime(icon_stack, icon_settings_json)
    her_runtime = prepare_runtime(her_stack, her_settings_json)

    sample_count = args.sample_count
    sample_prompts = [
        prompt_record("harmful", i, prompt)
        for i, prompt in enumerate(icon_runtime["bad_eval_prompts"][:sample_count])
    ] + [
        prompt_record("harmless", i, prompt)
        for i, prompt in enumerate(icon_runtime["good_eval_prompts"][:sample_count])
    ]
    base_prompts = [
        icon_stack["utils"].Prompt(system=item["system"], user=item["user"])
        for item in sample_prompts
    ]

    # The iconoclast runtime has not been abliterated yet, so its model is
    # still the base model prepared from these exact settings. Sampling the
    # baseline here avoids preparing a third, identical runtime (a full extra
    # model load plus residual pass) just for the baseline.
    base_responses = icon_runtime["model"].get_responses_batched(
        base_prompts,
        skip_special_tokens=True,
    )

    apply_trial(icon_runtime, icon_best)
    export_merged_model(icon_runtime, output_dir / "iconoclast-best-merged")
    icon_responses = icon_runtime["model"].get_responses_batched(
        base_prompts,
        skip_special_tokens=True,
    )

    apply_trial(her_runtime, her_best)
    export_merged_model(her_runtime, output_dir / "heretic-best-merged")
    her_responses = her_runtime["model"].get_responses_batched(
        base_prompts,
        skip_special_tokens=True,
    )

    # All three response sets are judged with the iconoclast refusal markers.
    refusal_markers = icon_runtime["settings"].refusal_markers
    comparisons = []
    for item, base_response, icon_response, her_response in zip(
        sample_prompts,
        base_responses,
        icon_responses,
        her_responses,
    ):
        comparisons.append(
            {
                **item,
                "base": {
                    "refusal": is_refusal(base_response, refusal_markers),
                    "response": base_response,
                },
                "iconoclast": {
                    "refusal": is_refusal(icon_response, refusal_markers),
                    "response": icon_response,
                },
                "heretic": {
                    "refusal": is_refusal(her_response, refusal_markers),
                    "response": her_response,
                },
            }
        )

    summary = {
        "base_model": icon_runtime["settings"].model,
        "iconoclast_best": icon_best,
        "heretic_best": her_best,
        "comparison_sample_count_per_split": sample_count,
        "comparisons": comparisons,
    }
    (output_dir / "comparison.json").write_text(json.dumps(summary, indent=2))
    (output_dir / "summary.json").write_text(
        json.dumps(
            {
                "base_model": summary["base_model"],
                "iconoclast_best": icon_best,
                "heretic_best": her_best,
            },
            indent=2,
        )
    )

    print(json.dumps(summary["iconoclast_best"], indent=2))
    print(json.dumps(summary["heretic_best"], indent=2))
    print(f"Wrote exports and comparison to {output_dir}")
if __name__ == "__main__":
    # Script entry point. TOKENIZERS_PARALLELISM is set to "false" only when
    # the caller has not already chosen a value (setdefault never overrides
    # an existing environment setting), then the export/comparison runs.
    os.environ.setdefault("TOKENIZERS_PARALLELISM", "false")
    main()