Spaces:
Running
Running
Commit ·
5b6c556
0
Parent(s):
Deploy static demo
Browse files. This view is limited to 50 files because it contains too many changes. See the raw diff.
- .gitignore +48 -0
- LOGO/Logo.png +0 -0
- README.md +25 -0
- attribution_analysis/attribution_analysis_page.py +1395 -0
- cache/cached_attribution_results.json +0 -0
- circuit_analysis/CLT_IMPROVEMENTS.md +232 -0
- circuit_analysis/WORKFLOW_PER_PROMPT.md +106 -0
- circuit_analysis/attribution_graphs_olmo.py +1931 -0
- circuit_analysis/attribution_graphs_olmo_de.py +1165 -0
- circuit_analysis/attribution_graphs_olmo_offline.py +1922 -0
- circuit_analysis/calculate_cpr_cmd.py +338 -0
- circuit_analysis/circuit_trace_page.py +0 -0
- circuit_analysis/merge_circuit_results.py +57 -0
- circuit_analysis/offline_circuit_metrics.py +194 -0
- circuit_analysis/plot_offline_metrics.py +239 -0
- circuit_analysis/results/attribution_graphs_results.json +0 -0
- circuit_analysis/results/attribution_graphs_results_de.json +0 -0
- circuit_analysis/results/attribution_graphs_results_de_prompt_1.json +0 -0
- circuit_analysis/results/attribution_graphs_results_de_prompt_2.json +0 -0
- circuit_analysis/results/attribution_graphs_results_de_prompt_3.json +0 -0
- circuit_analysis/results/attribution_graphs_results_prompt_1.json +0 -0
- circuit_analysis/results/attribution_graphs_results_prompt_2.json +0 -0
- circuit_analysis/results/attribution_graphs_results_prompt_3.json +0 -0
- circuit_analysis/results/clt_training_stats.json +4508 -0
- circuit_analysis/results/cpr_cmd_results.json +108 -0
- circuit_analysis/results/feature_interpretations_cache/feature_interpretations.json +0 -0
- circuit_analysis/results/offline_circuit_metrics.json +484 -0
- circuit_analysis/train_clt_and_plot.py +353 -0
- function_vectors/data/multilingual_function_categories.py +0 -0
- function_vectors/data/visualizations/de_pca_3d_categories_layer_-1.html +0 -0
- function_vectors/data/visualizations/en_pca_3d_categories_layer_-1.html +0 -0
- function_vectors/function_vectors_page.py +1845 -0
- function_vectors/generate_function_vectors.py +85 -0
- function_vectors/generate_german_vectors.py +90 -0
- function_vectors/generate_page_assets.py +132 -0
- function_vectors/translate_prompts.py +264 -0
- influence_tracer/build_dolma_index.py +207 -0
- locales/de/attribution_analysis_page.json +174 -0
- locales/de/circuit_trace_page.json +215 -0
- locales/de/common.json +167 -0
- locales/de/function_vectors_page.json +171 -0
- locales/de/welcome_page.json +39 -0
- locales/en/attribution_analysis_page.json +174 -0
- locales/en/circuit_trace_page.json +217 -0
- locales/en/common.json +45 -0
- locales/en/function_vectors_page.json +164 -0
- locales/en/welcome_page.json +39 -0
- packages.txt +2 -0
- requirements.txt +20 -0
- run_webapp.py +40 -0
.gitignore
ADDED
|
@@ -0,0 +1,48 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
# Ignore large model directories
|
| 3 |
+
models/
|
| 4 |
+
circuit_analysis/models/
|
| 5 |
+
circuit_analysis/debug_cpr.log
|
| 6 |
+
circuit_analysis/debug_cpr_2.log
|
| 7 |
+
influence_tracer/influence_tracer_data/
|
| 8 |
+
influence_tracer/dolma_dataset_sample_1.6v/
|
| 9 |
+
influence_tracer/dolma
|
| 10 |
+
|
| 11 |
+
# Ignore python cache
|
| 12 |
+
__pycache__/
|
| 13 |
+
*.pyc
|
| 14 |
+
|
| 15 |
+
# Ignore local env files if any
|
| 16 |
+
.env
|
| 17 |
+
.venv
|
| 18 |
+
env/
|
| 19 |
+
|
| 20 |
+
# Ignore system files
|
| 21 |
+
.DS_Store
|
| 22 |
+
|
| 23 |
+
# User Data
|
| 24 |
+
user_study/data/
|
| 25 |
+
user_study/voice_memos/files/
|
| 26 |
+
user_study/voice_memos/merged_files/
|
| 27 |
+
user_study/voice_memos/transcripts/
|
| 28 |
+
process_faithfulness.py
|
| 29 |
+
Faithfulness.csv
|
| 30 |
+
FaithfulnessNew.csv
|
| 31 |
+
|
| 32 |
+
# Writing & Documentation artifacts
|
| 33 |
+
writing/
|
| 34 |
+
ELIA_Demo_Script.md
|
| 35 |
+
texput.log
|
| 36 |
+
|
| 37 |
+
# Binary files causing upload issues
|
| 38 |
+
|
| 39 |
+
circuit_analysis/results/attribution_graph_prompt_1.png
|
| 40 |
+
circuit_analysis/results/attribution_graph_prompt_2.png
|
| 41 |
+
circuit_analysis/results/attribution_graph_prompt_3.png
|
| 42 |
+
circuit_analysis/results/attribution_graph_prompt_de_1.png
|
| 43 |
+
circuit_analysis/results/attribution_graph_prompt_de_2.png
|
| 44 |
+
circuit_analysis/results/attribution_graph_prompt_de_3.png
|
| 45 |
+
circuit_analysis/results/clt_training_loss.png
|
| 46 |
+
circuit_analysis/results/offline_circuit_metrics_combined.png
|
| 47 |
+
function_vectors/data/vectors/de_category_vectors.npz
|
| 48 |
+
function_vectors/data/vectors/en_category_vectors.npz
|
LOGO/Logo.png
ADDED
|
README.md
ADDED
|
@@ -0,0 +1,25 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
title: ELIA Analysis Suite
|
| 3 |
+
colorFrom: blue
|
| 4 |
+
colorTo: indigo
|
| 5 |
+
sdk: streamlit
|
| 6 |
+
sdk_version: 1.38.0
|
| 7 |
+
app_file: web_app.py
|
| 8 |
+
pinned: false
|
| 9 |
+
license: mit
|
| 10 |
+
---
|
| 11 |
+
|
| 12 |
+
# ELIA: Simplifying Outcomes of Language Model Component Analyses
|
| 13 |
+
|
| 14 |
+
ELIA is an interactive analysis suite for exploring the internal mechanisms of the OLMo 7B language model.
|
| 15 |
+
|
| 16 |
+
## Features
|
| 17 |
+
- **Attribution Analysis:** Visualize token importance using Integrated Gradients, Occlusion, and Saliency.
|
| 18 |
+
- **Function Vectors:** Explore how the model represents different tasks in its internal activation space.
|
| 19 |
+
- **Circuit Tracing:** (Static Demo) View pre-computed circuit traces for specific behaviors.
|
| 20 |
+
|
| 21 |
+
## Note on this Demo
|
| 22 |
+
This Space runs in a **static demonstration mode**. Due to storage and compute constraints, the full OLMo-7B model is not loaded. Instead, you can explore pre-computed analyses for a set of example prompts.
|
| 23 |
+
|
| 24 |
+
The AI-powered explanations are fully functional and powered by the Qwen API.
|
| 25 |
+
|
attribution_analysis/attribution_analysis_page.py
ADDED
|
@@ -0,0 +1,1395 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import streamlit as st
|
| 2 |
+
import inseq
|
| 3 |
+
import torch
|
| 4 |
+
import os
|
| 5 |
+
from transformers import AutoTokenizer, AutoModelForCausalLM
|
| 6 |
+
import json
|
| 7 |
+
import requests
|
| 8 |
+
from bs4 import BeautifulSoup
|
| 9 |
+
import pandas as pd
|
| 10 |
+
import numpy as np
|
| 11 |
+
from inseq.models.huggingface_model import HuggingfaceDecoderOnlyModel
|
| 12 |
+
import base64
|
| 13 |
+
from io import BytesIO
|
| 14 |
+
from PIL import Image
|
| 15 |
+
import plotly.graph_objects as go
|
| 16 |
+
import re
|
| 17 |
+
import markdown
|
| 18 |
+
from utilities.localization import tr
|
| 19 |
+
import faiss
|
| 20 |
+
from sentence_transformers import SentenceTransformer, util
|
| 21 |
+
from sentence_splitter import SentenceSplitter
|
| 22 |
+
import html
|
| 23 |
+
from utilities.utils import init_qwen_api
|
| 24 |
+
from utilities.feedback_survey import display_attribution_feedback
|
| 25 |
+
from thefuzz import process, fuzz
|
| 26 |
+
import gc
|
| 27 |
+
import time
|
| 28 |
+
import sys
|
| 29 |
+
from pathlib import Path
|
| 30 |
+
|
| 31 |
+
# A dictionary to map method names to translation keys.
# Keys match the method identifiers used for the inseq attribution models below;
# values are lookup keys for the tr() localization helper.
METHOD_DESC_KEYS = {
    "integrated_gradients": "desc_integrated_gradients",
    "occlusion": "desc_occlusion",
    "saliency": "desc_saliency"
}

# Configuration for the influence tracer.
# Make the project root importable so sibling packages (utilities, ...) resolve.
sys.path.append(str(Path(__file__).resolve().parent.parent))
# On-disk artifacts produced by the index build step (see influence_tracer/).
INDEX_DIR = os.path.join("influence_tracer", "influence_tracer_data")
INDEX_PATH = os.path.join(INDEX_DIR, "dolma_index_multi.faiss")
MAPPING_PATH = os.path.join(INDEX_DIR, "dolma_mapping_multi.json")
# Sentence-transformers model used to embed queries and documents for tracing.
TRACER_MODEL_NAME = 'paraphrase-multilingual-mpnet-base-v2'
| 45 |
+
class CachedAttribution:
    """Lightweight stand-in for inseq's attribution result object.

    Used when serving pre-computed results: it stores rendered HTML and
    exposes the same ``show`` entry point the rest of the page calls.
    """

    def __init__(self, html_content):
        # Pre-rendered attribution heatmap HTML from a previous run.
        self.html_content = html_content

    def show(self, display=False, return_html=True):
        """Mirror inseq's ``show`` signature; simply hand back the stored HTML."""
        return self.html_content
|
| 52 |
+
|
| 53 |
+
def load_all_attribution_models():
    """Load the local OLMo model once and wrap it with one inseq attribution model per method.

    Returns:
        tuple: ``(attribution_models, tokenizer, base_model, device)`` on success, where
        ``attribution_models`` maps method name -> HuggingfaceDecoderOnlyModel;
        ``(None, None, None, None)`` on failure (the error is shown via ``st.error``).
    """
    try:
        # Pick the best available device: Apple MPS, then CUDA, then CPU.
        device = "mps" if torch.backends.mps.is_available() else "cuda" if torch.cuda.is_available() else "cpu"

        # Path to the local model checkpoint.
        model_path = "./models/OLMo-2-1124-7B"
        hf_token = os.environ.get("HF_TOKEN")

        # Load the tokenizer and cap input length to keep attribution tractable.
        tokenizer = AutoTokenizer.from_pretrained(model_path, token=hf_token, trust_remote_code=True)
        tokenizer.model_max_length = 512

        # Load the model in half precision to save memory.
        base_model = AutoModelForCausalLM.from_pretrained(
            model_path,
            token=hf_token,
            torch_dtype=torch.float16,
            low_cpu_mem_usage=True,
            trust_remote_code=True
        )

        # Move the model to the selected device.
        base_model = base_model.to(device)

        # Add a BOS token if the tokenizer lacks one, and resize embeddings to match.
        if tokenizer.bos_token is None:
            tokenizer.add_special_tokens({'bos_token': '<s>'})
            base_model.resize_token_embeddings(len(tokenizer))

        # Patch the model config so generation knows the BOS id.
        if base_model.config.bos_token_id is None:
            base_model.config.bos_token_id = tokenizer.bos_token_id

        # Build one inseq wrapper per attribution method, all sharing the same
        # underlying model/tokenizer. Only integrated gradients takes extra
        # kwargs (a reduced step count keeps it tractable on a 7B model).
        per_method_kwargs = {
            "integrated_gradients": {"attribution_kwargs": {"n_steps": 10}},
            "occlusion": {},
            "saliency": {},
        }
        attribution_models = {
            method: HuggingfaceDecoderOnlyModel(
                model=base_model,
                tokenizer=tokenizer,
                device=device,
                attribution_method=method,
                **extra,
            )
            for method, extra in per_method_kwargs.items()
        }

        return attribution_models, tokenizer, base_model, device

    except Exception as e:
        st.error(f"Error loading models: {str(e)}")
        return None, None, None, None
|
| 122 |
+
|
| 123 |
+
|
| 124 |
+
def load_influence_tracer_data():
    """Load the FAISS index, document mapping, and embedding model for influence tracing.

    Returns:
        tuple: ``(index, mapping, model)``, or ``(None, None, None)`` when the
        pre-built index artifacts are missing on disk.
    """
    # Both artifacts must exist; otherwise the tracer is unavailable in this deployment.
    if not (os.path.exists(INDEX_PATH) and os.path.exists(MAPPING_PATH)):
        return None, None, None

    index = faiss.read_index(INDEX_PATH)
    with open(MAPPING_PATH, 'r', encoding='utf-8') as f:
        mapping = json.load(f)

    # Same device-selection order used elsewhere on this page: MPS > CUDA > CPU.
    if torch.backends.mps.is_available():
        device = "mps"
    elif torch.cuda.is_available():
        device = "cuda"
    else:
        device = "cpu"
    model = SentenceTransformer(TRACER_MODEL_NAME, device=device)
    return index, mapping, model
|
| 136 |
+
|
| 137 |
+
@st.cache_data(persist=True)
def get_influential_docs(text_to_trace: str, lang: str):
    """Find training documents (from the local Dolma sample) most similar to *text_to_trace*.

    Embeds the query, retrieves the top-3 nearest documents from the FAISS index,
    re-reads each full document from its JSONL shard (matched by fuzzy snippet
    comparison), and marks the single most similar sentence for highlighting.

    Args:
        text_to_trace: Text whose influential training documents should be found.
        lang: Language code passed to SentenceSplitter (e.g. "en", "de").

    Returns:
        list[dict]: Per-document dicts with keys 'id', 'file', 'source', 'text',
        'similarity', and 'highlight_sentence'. Empty when the index is unavailable
        or no documents could be resolved.
    """
    faiss_index, doc_mapping, tracer_model = load_influence_tracer_data()
    if not faiss_index:
        return []

    # Get the embedding for the input text (normalized, so inner product ~ cosine).
    doc_embedding = tracer_model.encode([text_to_trace], convert_to_numpy=True, normalize_embeddings=True)

    # Search the FAISS index for the top k documents.
    k = 3
    similarities, indices = faiss_index.search(doc_embedding.astype('float32'), k)

    # Find the most similar sentence in each influential document.
    results = []
    # NOTE(review): this re-encodes the same text as doc_embedding above —
    # looks redundant; confirm before consolidating.
    query_embedding = tracer_model.encode([text_to_trace], normalize_embeddings=True)

    for i in range(k):
        doc_id = str(indices[0][i])
        if doc_id in doc_mapping:
            doc_info = doc_mapping[doc_id]
            file_path = os.path.join("influence_tracer", "dolma_dataset_sample_1.6v", doc_info['file'])
            try:
                full_doc_text = ""
                # Scan the JSONL shard line by line for the record whose text
                # matches the stored snippet; stop at the first strong match.
                with open(file_path, 'r', encoding='utf-8') as f:
                    for line in f:
                        try:
                            line_data = json.loads(line)
                            line_text = line_data.get('text', '')
                            # Use fuzzy matching (>95) to find the text snippet.
                            if fuzz.partial_ratio(doc_info['text_snippet'], line_text) > 95:
                                full_doc_text = line_text
                                break
                        except json.JSONDecodeError:
                            # Skip malformed JSONL lines.
                            continue

                # Skip this document if its text wasn't found in the shard.
                if not full_doc_text:
                    print(f"Warning: Could not find document snippet for doc {doc_id} in {file_path}. Skipping.")
                    continue

                # Find the most similar sentence in the document for highlighting.
                splitter = SentenceSplitter(language=lang)
                sentences = splitter.split(text=full_doc_text)
                if not sentences:
                    sentences = [full_doc_text]

                # Batch the sentence encoding to avoid memory issues on long docs.
                sentence_embeddings = tracer_model.encode(sentences, batch_size=64, show_progress_bar=False, normalize_embeddings=True)
                cos_scores = util.pytorch_cos_sim(query_embedding, sentence_embeddings)[0]
                best_sentence_idx = torch.argmax(cos_scores).item()
                most_similar_sentence = sentences[best_sentence_idx]

                results.append({
                    'id': doc_id,
                    'file': doc_info['file'],
                    'source': doc_info['source'],
                    'text': full_doc_text,
                    'similarity': similarities[0][i],
                    'highlight_sentence': most_similar_sentence
                })
            except (IOError, KeyError) as e:
                # Best-effort: a single unreadable/incomplete record shouldn't
                # abort the whole result list.
                print(f"Could not retrieve full text for doc {doc_id}: {e}")
                continue
    return results
|
| 203 |
+
|
| 204 |
+
# --- Qwen API for Explanations ---
|
| 205 |
+
|
| 206 |
+
@st.cache_data(persist=True)
def _cached_explain_heatmap(api_config, img_base64, csv_text, structured_prompt):
    """Call the Qwen chat-completions endpoint to explain a heatmap (cached, persisted).

    ``csv_text`` is never sent in the request; under ``st.cache_data`` it still
    takes part in the cache key, so new data invalidates the cached explanation.
    Raises ``requests.HTTPError`` on non-2xx responses.
    """
    # Optional vision input: attach the rendered heatmap when one exists.
    message_content = [{"type": "text", "text": structured_prompt}]
    if img_base64:
        message_content.append({
            "type": "image_url",
            "image_url": {
                "url": f"data:image/png;base64,{img_base64}"
            }
        })

    payload = {
        "model": api_config["model"],
        "messages": [
            {
                "role": "user",
                "content": message_content
            }
        ],
        "max_tokens": 1200,
        # Low temperature + fixed seed keep explanations near-deterministic.
        "temperature": 0.2,
        "top_p": 0.95,
        "seed": 42
    }

    response = requests.post(
        f"{api_config['api_endpoint']}/chat/completions",
        headers={
            "Authorization": f"Bearer {api_config['api_key']}",
            "Content-Type": "application/json"
        },
        json=payload,
        timeout=300
    )

    # Surface HTTP-level failures to the caller.
    response.raise_for_status()

    return response.json()["choices"][0]["message"]["content"]
|
| 249 |
+
|
| 250 |
+
@st.cache_data(persist=True)
def generate_all_attribution_analyses(_attribution_models, _tokenizer, _base_model, _device, prompt, max_tokens, force_exact_num_tokens=False):
    """Greedily generate a continuation for *prompt*, then attribute it with every method.

    The leading-underscore parameter names tell ``st.cache_data`` not to hash the
    model objects; only ``prompt``, ``max_tokens`` and the flag form the cache key.

    Returns:
        tuple: ``(generated_text, {method_name: attribution_result})``.
    """
    # Step 1: deterministic (greedy) generation.
    encoded = _tokenizer(prompt, return_tensors="pt").to(_device)

    gen_kwargs = {'max_new_tokens': max_tokens, 'do_sample': False}
    if force_exact_num_tokens:
        # Setting min == max pins the generation to exactly max_tokens tokens.
        gen_kwargs['min_new_tokens'] = max_tokens

    output_ids = _base_model.generate(encoded.input_ids, **gen_kwargs)
    generated_text = _tokenizer.decode(output_ids[0], skip_special_tokens=True)

    # Step 2: run every attribution method over the same (prompt, generation) pair.
    all_attributions = {
        method: _attribution_models[method].attribute(
            input_texts=prompt,
            generated_texts=generated_text,
        )
        for method in ("integrated_gradients", "occlusion", "saliency")
    }

    return generated_text, all_attributions
|
| 281 |
+
|
| 282 |
+
def explain_heatmap_with_csv_data(api_config, image_buffer, csv_data, context_prompt, generated_text, method_name="Attribution"):
    """Build a structured prompt from attribution data and get a Qwen explanation.

    Summarizes the attribution matrix (rows: input tokens, columns: generated
    tokens — inferred from the axis-wise summaries below; confirm against the
    caller) into top-k highlights, embeds those plus localized instructions in
    a prompt, and delegates the (cached) API call to ``_cached_explain_heatmap``.

    Args:
        api_config: Dict with 'api_key', 'api_endpoint', 'model'.
        image_buffer: Optional file-like PNG buffer of the rendered heatmap.
        csv_data: pandas DataFrame of attribution scores.
        context_prompt: The prompt that was analyzed.
        generated_text: The model's full generated text.
        method_name: Attribution method identifier (keys of METHOD_DESC_KEYS).

    Returns:
        str: The explanation text, or a localized fallback message on any error.
    """
    try:
        # Convert the image to base64 for the vision-capable API.
        img_base64 = None
        if image_buffer:
            image_buffer.seek(0)
            image = Image.open(image_buffer)

            buffered = BytesIO()
            image.save(buffered, format="PNG")
            img_base64 = base64.b64encode(buffered.getvalue()).decode()

        # Clean the dataframe: duplicate labels would break unstack/selection below.
        df_clean = csv_data.copy()

        # De-duplicate column labels by appending " (n)" suffixes.
        cols = pd.Series(df_clean.columns)
        if cols.duplicated().any():
            for dup in cols[cols.duplicated()].unique():
                dup_indices = cols[cols == dup].index.values
                new_names = [f"{dup} ({i+1})" for i in range(len(dup_indices))]
                cols[dup_indices] = new_names
            df_clean.columns = cols

        # De-duplicate index labels the same way (only labels that repeat get a suffix).
        if df_clean.index.has_duplicates:
            counts = {}
            new_index = list(df_clean.index)
            duplicated_indices = df_clean.index[df_clean.index.duplicated(keep=False)]
            for i, idx in enumerate(df_clean.index):
                if idx in duplicated_indices:
                    counts[idx] = counts.get(idx, 0) + 1
                    new_index[i] = f"{idx} ({counts[idx]})"
            df_clean.index = new_index

        # --- Rule-Based Analysis ---
        # Flatten to one score per (input, output) pair, labeled "output -> input".
        unstacked = df_clean.unstack()
        unstacked.index = unstacked.index.map('{0[1]} -> {0[0]}'.format)

        # Get the top 5 individual scores (by absolute value).
        top_5_individual = unstacked.abs().nlargest(5).sort_index()
        top_individual_text_lines = ["\n### Top 5 Strongest Individual Connections:"]
        for label in top_5_individual.index:
            score = unstacked[label]
            top_individual_text_lines.append(f"- **{label}**: score {score:.2f}")

        # Get the top 5 input tokens by average influence over the whole generation.
        avg_input_scores = df_clean.mean(axis=1)
        top_5_average = avg_input_scores.abs().nlargest(5).sort_index()
        top_average_text_lines = ["\n### Top 5 Most Influential Input Tokens (on average over the whole generation):"]
        for input_token in top_5_average.index:
            score = avg_input_scores[input_token]
            top_average_text_lines.append(f"- **'{input_token}'**: average score {score:.2f}")

        # Get the most influenced generated tokens and their top sources.
        top_output_text_lines = []
        if not df_clean.empty:
            avg_output_scores = df_clean.mean(axis=0)
            top_3_output = avg_output_scores.abs().nlargest(min(3, len(df_clean.columns))).sort_index()
            if not top_3_output.empty:
                top_output_text_lines.append("\n### Top 3 Most Influenced Generated Tokens:")
                for output_token in top_3_output.index:
                    # Find which input tokens influenced this output token the most.
                    top_sources_for_output = df_clean[output_token].abs().nlargest(min(2, len(df_clean.index))).sort_index().index.tolist()
                    if top_sources_for_output:
                        top_output_text_lines.append(f"- **'{output_token}'** was most influenced by **'{', '.join(top_sources_for_output)}'**.")

        data_text_for_llm = "\n".join(top_individual_text_lines + top_average_text_lines + top_output_text_lines)

        # Get method-specific context from the translation files.
        desc_key = METHOD_DESC_KEYS.get(method_name, "unsupported_method_desc")
        method_context = tr(desc_key)

        # Format the instruction for the LLM (method name prettified for display).
        instruction_p1 = tr('instruction_part_1_desc').format(method_name=method_name.replace('_', ' ').title())

        # Create the prompt for the LLM from localized building blocks.
        structured_prompt = f"""{tr('ai_expert_intro')}

## {tr('analysis_details')}
- **{tr('method_being_used')}** {method_name.replace('_', ' ').title()}
- **{tr('prompt_analyzed')}** "{context_prompt}"
- **{tr('full_generated_text')}** "{generated_text}"

## {tr('method_specific_context')}
{method_context}

## {tr('instructions_for_analysis')}

{tr('instruction_part_1_header')}
{instruction_p1}

{tr('instruction_synthesis_header')}
{tr('instruction_synthesis_desc')}

{tr('instruction_color_coding')}

## {tr('data_section_header')}
{data_text_for_llm}

{tr('begin_analysis_now')}"""

        # Call the cached function to get the explanation.
        explanation = _cached_explain_heatmap(api_config, img_base64, data_text_for_llm, structured_prompt)
        return explanation

    except Exception as e:
        # Catch errors from data prep or the API call and degrade gracefully.
        st.error(f"Error generating AI explanation: {str(e)}")
        return tr("unable_to_generate_explanation")
|
| 391 |
+
|
| 392 |
+
# --- Faithfulness Verification ---
|
| 393 |
+
|
| 394 |
+
@st.cache_data(persist=True)
def _cached_extract_claims_from_explanation(api_config, explanation_text, analysis_method):
    """Extract verifiable claims from an AI explanation via a cached LLM call.

    Args:
        api_config: Dict with 'api_key', 'api_endpoint' and 'model'.
        explanation_text: The explanation text to mine for claims.
        analysis_method: Name of the attribution method, inserted as context.

    Returns:
        A list of claim dicts parsed from the model's JSON output, or [] when
        the response cannot be parsed into a list of claims.

    Raises:
        requests.HTTPError: If the API responds with an error status.
    """
    headers = {"Authorization": f"Bearer {api_config['api_key']}", "Content-Type": "application/json"}

    # Dynamically set claim types based on the analysis method.
    claim_types_details = tr("claim_extraction_prompt_types_details")

    claim_extraction_prompt = f"""{tr('claim_extraction_prompt_header')}

{tr('claim_extraction_prompt_instruction')}

{tr('claim_extraction_prompt_context_header').format(analysis_method=analysis_method)}

{tr('claim_extraction_prompt_types_header')}
{claim_types_details}

{tr('claim_extraction_prompt_example_header')}
{tr('claim_extraction_prompt_example_explanation')}
{tr('claim_extraction_prompt_example_json')}

{tr('claim_extraction_prompt_analyze_header')}
"{explanation_text}"

{tr('claim_extraction_prompt_instruction_footer')}
"""

    data = {
        "model": api_config["model"],
        "messages": [
            {
                "role": "user",
                "content": [{"type": "text", "text": claim_extraction_prompt}]
            }
        ],
        "max_tokens": 1500,
        "temperature": 0.0,  # Set to 0 for deterministic output.
        "seed": 42
    }

    response = requests.post(
        f"{api_config['api_endpoint']}/chat/completions",
        headers=headers,
        json=data,
        timeout=300
    )
    response.raise_for_status()
    claims_text = response.json()["choices"][0]["message"]["content"]

    try:
        # The response might be inside a markdown code block, so we try to extract it.
        if '```json' in claims_text:
            claims_text = re.search(r'```json\n(.*?)\n```', claims_text, re.DOTALL).group(1)

        # Parse the JSON string into a Python list.
        claims = json.loads(claims_text)
        # Fix: downstream verification iterates the result and calls .get()
        # on each item, so a non-list JSON payload (e.g. a bare object)
        # would crash later. Coerce anything that is not a list to [].
        return claims if isinstance(claims, list) else []
    except (AttributeError, json.JSONDecodeError):
        # AttributeError: the ```json fence regex found no match (.group on None).
        return []
@st.cache_data(persist=True)
def _cached_verify_token_justification(api_config, analysis_method, input_prompt, generated_text, token, justification):
    """Ask the LLM whether a justification for a token's importance is sound.

    Args:
        api_config: Dict with 'api_key', 'api_endpoint' and (optionally) 'model'.
        analysis_method: Name of the attribution method the claim refers to.
        input_prompt: The original prompt that was analyzed.
        generated_text: The text the model produced for that prompt.
        token: The token whose importance is being justified.
        justification: The natural-language justification to evaluate.

    Returns:
        Dict with at least 'is_verified' (bool) and 'reasoning' (str); a
        fallback failure dict is returned when the response is unparseable.

    Raises:
        requests.HTTPError: If the API responds with an error status.
    """
    headers = {"Authorization": f"Bearer {api_config['api_key']}", "Content-Type": "application/json"}

    verification_prompt = f"""{tr('justification_verification_prompt_header')}

{tr('justification_verification_prompt_crucial_rule')}

{tr('justification_verification_prompt_token_location')}

{tr('justification_verification_prompt_special_tokens')}

{tr('justification_verification_prompt_evaluating_justifications')}

{tr('justification_verification_prompt_linguistic_context')}

{tr('justification_verification_prompt_collective_reasoning')}

**Analysis Method:** {analysis_method}
**Input Prompt:** "{input_prompt}"
**Generated Text:** "{generated_text}"
**Token in Question:** "{token}"
**Provided Justification:** "{justification}"

{tr('justification_verification_prompt_task_header')}
{tr('justification_verification_prompt_task_instruction')}

{tr('justification_verification_prompt_json_instruction')}

{tr('justification_verification_prompt_footer')}
"""

    data = {
        # Fix: honor the configured model (as the sibling claim-extraction
        # call does) instead of always using a hard-coded one; keep the old
        # value as a fallback for configs without a 'model' entry.
        "model": api_config.get("model", "qwen2.5-vl-72b-instruct"),
        "messages": [{"role": "user", "content": verification_prompt}],
        "max_tokens": 400,
        "temperature": 0.0,  # Deterministic output for cache stability.
        "seed": 42,
        "response_format": {"type": "json_object"}
    }

    response = requests.post(
        f"{api_config['api_endpoint']}/chat/completions",
        headers=headers,
        json=data,
        timeout=300
    )
    response.raise_for_status()

    try:
        result_json = response.json()["choices"][0]["message"]["content"]
        return json.loads(result_json)
    except (json.JSONDecodeError, KeyError):
        return {"is_verified": False, "reasoning": "Could not parse the semantic justification result."}
def verify_claims(claims, analysis_data):
    """Verify LLM-extracted claims against the raw attribution data.

    Args:
        claims: List of claim dicts, each with 'claim_type', 'claim_text' and
            a 'details' dict (as produced by claim extraction).
        analysis_data: Dict holding at least 'scores_df' (a DataFrame of
            input tokens x generated tokens attribution scores); 'method',
            'prompt' and 'generated_text' are read for semantic
            justification checks.

    Returns:
        List of dicts with 'claim_text', 'verified' (bool) and 'evidence' (str).
    """
    verification_results = []

    for claim in claims:
        is_verified = False
        evidence = "Could not be verified."
        details = claim.get('details', {})
        claim_type = claim.get('claim_type')

        try:
            # Clean tokens in the claim's details, as the LLM sometimes includes extra quotes.
            if 'token' in details and isinstance(details['token'], str):
                details['token'] = re.sub(r"^\s*['\"]|['\"]\s*$", '', details['token']).strip()
            if 'tokens' in details and isinstance(details['tokens'], list):
                details['tokens'] = [re.sub(r"^\s*['\"]|['\"]\s*$", '', t).strip() for t in details['tokens']]

            if claim_type == 'attribution_claim':
                tokens_claimed = details.get('tokens', [])
                qualifier = details.get('qualifier', 'significant')  # Default to the lower bar
                score_type = details.get('score_type', 'peak')

                # Calculate the correct scores based on the claim's score_type.
                if score_type == 'average':
                    score_series = analysis_data['scores_df'].abs().mean(axis=1)
                    score_name = "average score"
                else:  # peak
                    score_series = analysis_data['scores_df'].abs().max(axis=1)
                    score_name = "peak score"

                if score_series.empty:
                    evidence = "No attribution data available to verify claim."
                else:
                    all_attributions = sorted(
                        [{'token': token, 'attribution': score} for token, score in score_series.items()],
                        key=lambda x: x['attribution'],
                        reverse=True
                    )
                    max_score = all_attributions[0]['attribution'] if all_attributions else 0

                    # Thresholds are relative to the strongest token's score.
                    if qualifier == 'high':
                        threshold = 0.70 * max_score
                        threshold_name = "high"
                    else:  # 'significant' or default
                        threshold = 0.50 * max_score
                        threshold_name = "significant"

                    token_scores_dict = {item['token'].lower().strip(): item['attribution'] for item in all_attributions}

                    unverified_tokens = []
                    verified_tokens_details = []

                    for token in tokens_claimed:
                        # Robust matching: first check for a direct match for
                        # specific claims like ', (1)'.
                        token_lower = token.lower().strip()
                        if token_lower in token_scores_dict:
                            matching_keys = [token_lower]
                        else:
                            # If no direct match, fall back to a generic search for claims like ','.
                            # This finds all instances: ', (1)', ', (2)', etc.
                            matching_keys = [
                                k for k in token_scores_dict.keys()
                                if re.sub(r'\s\(\d+\)$', '', k).strip() == token_lower
                            ]

                        if not matching_keys:
                            unverified_tokens.append(f"'{token}' (not found in analysis)")
                            continue

                        # Check each matching instance against the threshold.
                        for key in matching_keys:
                            actual_score = token_scores_dict.get(key)

                            if abs(actual_score) < threshold:
                                unverified_tokens.append(f"'{key}' ({score_name}: {abs(actual_score):.2f})")
                            else:
                                verified_tokens_details.append(f"'{key}' ({score_name}: {abs(actual_score):.2f})")

                    is_verified = not unverified_tokens
                    if is_verified:
                        evidence = f"Verified. All claimed tokens passed the {threshold_name} threshold (> {threshold:.2f}). Details: {', '.join(verified_tokens_details)}."
                    else:
                        fail_reason = f"the following did not meet the {threshold_name} threshold (> {threshold:.2f}): {', '.join(unverified_tokens)}"
                        if verified_tokens_details:
                            evidence = f"While some tokens passed ({', '.join(verified_tokens_details)}), {fail_reason}."
                        else:
                            evidence = f"The following did not meet the {threshold_name} threshold (> {threshold:.2f}): {', '.join(unverified_tokens)}."

            elif claim_type in ['token_justification_claim', 'token_begruendung_anspruch']:
                token_val = details.get('token') or details.get('tokens')
                if isinstance(token_val, list):
                    token = ", ".join(map(str, token_val))
                else:
                    token = token_val

                justification = details.get('justification') or details.get('begruendung')
                input_prompt = analysis_data.get('prompt', '')
                generated_text = analysis_data.get('generated_text', '')

                if not all([token, justification, input_prompt, generated_text]):
                    evidence = "Missing data for justification verification (token, justification, or prompt)."
                else:
                    api_config = init_qwen_api()
                    if api_config:
                        verification = _cached_verify_token_justification(api_config, analysis_data['method'], input_prompt, generated_text, token, justification)
                        is_verified = verification.get('is_verified', False)
                        evidence = verification.get('reasoning', "Failed to get semantic reasoning for justification.")
                    else:
                        is_verified = False
                        evidence = "API key not configured for semantic verification."

        except Exception as e:
            evidence = f"An error occurred during verification: {str(e)}"

        verification_results.append({
            'claim_text': claim.get('claim_text', 'N/A'),
            'verified': is_verified,
            'evidence': evidence
        })

    return verification_results

# --- End Faithfulness Verification ---

def create_heatmap_visualization(attributions, method_name="Attribution"):
    """Build a Plotly heatmap, PNG buffer and parsed DataFrame from attributions.

    The attribution object's HTML table output is parsed (via BeautifulSoup)
    into a numeric matrix of input tokens x generated tokens.

    Args:
        attributions: Object exposing .show(display=False, return_html=True)
            that yields an HTML table of attribution scores.
        method_name: Display name of the attribution method.

    Returns:
        (fig, html_content, png_buffer, parsed_df), or (None, None, None, None)
        on any parsing/rendering failure (an st.error is shown in that case).
    """
    try:
        # Get the HTML content from the attributions.
        html_content = attributions.show(display=False, return_html=True)

        if not html_content:
            st.error(tr("error_inseq_no_html").format(method_name=method_name))
            return None, None, None, None

        # Parse the HTML to extract the data table.
        soup = BeautifulSoup(html_content, 'html.parser')
        table = soup.find('table')

        if not table:
            st.error(tr("error_no_table_in_html").format(method_name=method_name))
            return None, None, None, None

        # Prefer a proper <thead>; fall back to the first row's <th> cells.
        # The leading cell is the row-label column, hence the [1:] slice.
        header_row_element = table.find('thead')
        if header_row_element:
            headers = [th.get_text(strip=True) for th in header_row_element.find_all('th')[1:]]
        else:
            first_row = table.find('tr')
            if not first_row:
                st.error(tr("error_table_no_rows").format(method_name=method_name))
                return None, None, None, None
            headers = [th.get_text(strip=True) for th in first_row.find_all('th')[1:]]

        data_rows = []
        row_labels = []

        # Find all <tbody> elements and iterate through their rows.
        table_bodies = table.find_all('tbody')
        if not table_bodies:
            # Fallback if no <tbody> is found: skip the header row.
            all_trs = table.find_all('tr')
            data_trs = all_trs[1:] if len(all_trs) > 1 else []
        else:
            data_trs = []
            for tbody in table_bodies:
                data_trs.extend(tbody.find_all('tr'))

        for tr_element in data_trs:
            all_cells = tr_element.find_all(['th', 'td'])
            if not all_cells or len(all_cells) <= 1:
                continue

            row_labels.append(all_cells[0].get_text(strip=True))

            # Convert text values to float, handling empty strings as 0.
            row_data = []
            for cell in all_cells[1:]:
                text_val = cell.get_text(strip=True)
                # Remove non-breaking spaces.
                clean_text = text_val.replace('\xa0', '').strip()
                if clean_text:
                    try:
                        row_data.append(float(clean_text))
                    except ValueError:
                        # Default to 0 if conversion fails.
                        row_data.append(0.0)
                else:
                    row_data.append(0.0)
            data_rows.append(row_data)

        # Create the dataframe from the parsed data.
        if not data_rows or not data_rows[0]:
            st.error(tr("error_failed_to_parse_rows").format(method_name=method_name))
            return None, None, None, None

        # --- Make token labels unique for duplicates ---
        def make_labels_unique(labels):
            # Append " (n)" to labels that occur more than once so axis
            # labels and DataFrame indices stay distinguishable.
            counts = {}
            new_labels = []
            # Fix: count occurrences in one pass. The previous
            # set() + list.count() approach was O(n^2) in the number of
            # tokens, which hurt on long sequences.
            label_counts = {}
            for label in labels:
                label_counts[label] = label_counts.get(label, 0) + 1

            for label in labels:
                if label_counts[label] > 1:
                    counts[label] = counts.get(label, 0) + 1
                    new_labels.append(f"{label} ({counts[label]})")
                else:
                    new_labels.append(label)
            return new_labels

        unique_row_labels = make_labels_unique(row_labels)
        unique_headers = make_labels_unique(headers)

        parsed_df = pd.DataFrame(data_rows, index=unique_row_labels, columns=unique_headers)
        attribution_scores = parsed_df.values

        # Clean tokens for display.
        clean_headers = parsed_df.columns.tolist()
        clean_row_labels = parsed_df.index.tolist()

        # Use numerical indices for the heatmap to handle duplicate labels.
        x_indices = list(range(len(clean_headers)))
        y_indices = list(range(len(clean_row_labels)))

        # Prepare custom data for hover labels.
        custom_data = np.empty(attribution_scores.shape, dtype=object)
        for i in range(len(clean_row_labels)):
            for j in range(len(clean_headers)):
                custom_data[i, j] = (clean_row_labels[i], clean_headers[j])

        fig = go.Figure(data=go.Heatmap(
            z=attribution_scores,
            x=x_indices,
            y=y_indices,
            customdata=custom_data,
            hovertemplate="Input: %{customdata[0]}<br>Generated: %{customdata[1]}<br>Score: %{z:.4f}<extra></extra>",
            colorscale='Plasma',
            hoverongaps=False,
        ))

        fig.update_layout(
            title=tr('heatmap_title').format(method_name=method_name),
            xaxis_title=tr('heatmap_xaxis'),
            yaxis_title=tr('heatmap_yaxis'),
            xaxis=dict(
                tickmode='array',
                tickvals=x_indices,
                ticktext=clean_headers,
                tickangle=45
            ),
            yaxis=dict(
                tickmode='array',
                tickvals=y_indices,
                ticktext=clean_row_labels,
                autorange='reversed'
            ),
            height=max(400, len(clean_row_labels) * 30),
            width=max(600, len(clean_headers) * 50)
        )

        # Save the plot to a buffer (requires a Plotly image backend, e.g. kaleido).
        buffer = BytesIO()
        fig.write_image(buffer, format='png', scale=2)
        buffer.seek(0)

        return fig, html_content, buffer, parsed_df

    except Exception as e:
        st.error(tr("error_creating_heatmap").format(e=str(e)))
        return None, None, None, None
def start_new_analysis(prompt, max_tokens, enable_explanations):
    """Reset stale analysis state and queue a new analysis request.

    The request itself is stored in st.session_state.run_request and is
    picked up by the page handler on the next rerun.
    """
    # Drop results left over from the previous analysis, if any.
    for stale_key in ('generated_text', 'all_attributions'):
        if stale_key in st.session_state:
            del st.session_state[stale_key]

    # Purge cached influential-document entries from earlier runs.
    doc_keys = [k for k in st.session_state.keys() if k.startswith('influential_docs_')]
    for doc_key in doc_keys:
        del st.session_state[doc_key]

    # Mirror the new prompt into the text-area widget's state.
    st.session_state.attr_prompt = prompt

    # Record the parameters for the upcoming analysis run.
    st.session_state.run_request = {
        "prompt": prompt,
        "max_tokens": max_tokens,
        "enable_explanations": enable_explanations,
    }
def run_analysis(prompt, max_tokens, enable_explanations, force_exact_num_tokens=False):
    """Run the full attribution pipeline for `prompt`, preferring cached results.

    Order of operations:
      1. Serve a fully pre-computed result from cache/cached_attribution_results.json
         when the prompt is present there (static-demo fast path).
      2. Otherwise require the local model directory; if absent, inform the
         user that only pre-cached examples are available and stop.
      3. Load models, generate text + attributions, store them in
         st.session_state, best-effort append the result to the cache file,
         and free model memory.

    All outputs are communicated via st.session_state; returns None.
    """
    if not prompt.strip():
        st.warning(tr('please_enter_prompt_warning'))
        return

    # Check for cached results first
    cache_file = os.path.join("cache", "cached_attribution_results.json")
    if os.path.exists(cache_file):
        # NOTE(review): assumes the cache file is valid JSON — a corrupt file
        # will raise here and surface as an app error.
        with open(cache_file, "r", encoding="utf-8") as f:
            cached_data = json.load(f)
        if prompt in cached_data:
            print("Loading full attribution analysis from cache.")
            cached_result = cached_data[prompt]

            # Populate session state from the comprehensive cache
            st.session_state.generated_text = cached_result["generated_text"]
            st.session_state.prompt = prompt
            st.session_state.enable_explanations = enable_explanations
            st.session_state.qwen_api_config = init_qwen_api() if enable_explanations else None

            # Reconstruct attribution objects and store explanations/faithfulness
            reconstructed_attributions = {}
            for method, data in cached_result["html_contents"].items():
                reconstructed_attributions[method] = CachedAttribution(data)

                # Use a consistent key for caching in session state
                # (the same key scheme the UI uses when storing fresh results).
                cache_key_base = f"{method}_{cached_result['generated_text']}"
                if "explanation" in data:
                    st.session_state[f"explanation_{cache_key_base}"] = data["explanation"]
                if "faithfulness_results" in data:
                    st.session_state[f"faithfulness_check_{cache_key_base}"] = data["faithfulness_results"]

            st.session_state.all_attributions = reconstructed_attributions

            # Store influential docs
            if "influential_docs" in cached_result:
                # Use a key that the UI part can check for
                st.session_state.cached_influential_docs = cached_result["influential_docs"]

            st.success(tr('analysis_complete_success'))
            return

    # If not in cache, check if models exist before trying to load
    model_path = "./models/OLMo-2-1124-7B"
    if not os.path.exists(model_path):
        st.info("This live demo is running in a static environment. Only the pre-cached example prompts are available. Please select an example to view its analysis.")
        return

    # Load the models.
    with st.spinner(tr('loading_models_spinner')):
        attribution_models, tokenizer, base_model, device = load_all_attribution_models()

    if not attribution_models:
        st.error(tr('failed_to_load_models_error'))
        return

    st.session_state.qwen_api_config = init_qwen_api() if enable_explanations else None
    st.session_state.enable_explanations = enable_explanations
    st.session_state.prompt = prompt

    # Generate text and attributions.
    with st.spinner(tr('running_attribution_analysis_spinner')):
        try:
            generated_text, all_attributions = generate_all_attribution_analyses(
                attribution_models,
                tokenizer,
                base_model,
                device,
                prompt,
                max_tokens,
                force_exact_num_tokens=force_exact_num_tokens
            )
        except Exception as e:
            st.error(f"Error in attribution analysis: {str(e)}")
            # Let the rest of the function know it failed.
            generated_text, all_attributions = None, None

    if not generated_text or not all_attributions:
        st.error(tr('failed_to_generate_analysis_error'))
        return

    # Store the results in the session state.
    st.session_state.generated_text = generated_text
    st.session_state.all_attributions = all_attributions

    # --- New: Save the new result back to the cache ---
    # Best-effort: any failure here is logged but does not abort the run.
    try:
        cache_file = os.path.join("cache", "cached_attribution_results.json")
        os.makedirs("cache", exist_ok=True)

        # Load existing cache or create new
        if os.path.exists(cache_file):
            with open(cache_file, "r", encoding="utf-8") as f:
                cached_data = json.load(f)
        else:
            cached_data = {}

        # Add new result (only HTML + text; explanations are cached separately).
        html_contents = {method: attr.show(display=False, return_html=True) for method, attr in all_attributions.items()}
        cached_data[prompt] = {
            "generated_text": generated_text,
            "html_contents": html_contents
        }

        # Write back to file
        with open(cache_file, "w", encoding="utf-8") as f:
            json.dump(cached_data, f, ensure_ascii=False, indent=4)
        print(f"Saved new analysis for '{prompt}' to cache.")

    except Exception as e:
        print(f"Warning: Could not save result to cache file. {e}")
    # --- End new section ---

    # Clean up models to free memory.
    del attribution_models
    del tokenizer
    del base_model
    gc.collect()
    if device == 'mps':
        torch.mps.empty_cache()
    elif device == 'cuda':
        torch.cuda.empty_cache()

    st.success(tr('analysis_complete_success'))
def show_attribution_analysis():
|
| 948 |
+
# Shows the main attribution analysis page.
|
| 949 |
+
# Add some CSS for icons.
|
| 950 |
+
st.markdown('<link rel="stylesheet" href="https://cdn.jsdelivr.net/npm/bootstrap-icons@1.10.5/font/bootstrap-icons.css">', unsafe_allow_html=True)
|
| 951 |
+
|
| 952 |
+
st.markdown(f"<h1>{tr('attr_page_title')}</h1>", unsafe_allow_html=True)
|
| 953 |
+
st.markdown(f"{tr('attr_page_desc')}", unsafe_allow_html=True)
|
| 954 |
+
|
| 955 |
+
# Check if a new analysis has been requested by the user.
|
| 956 |
+
if 'run_request' in st.session_state:
|
| 957 |
+
request = st.session_state.pop('run_request')
|
| 958 |
+
run_analysis(
|
| 959 |
+
prompt=request['prompt'],
|
| 960 |
+
max_tokens=request['max_tokens'],
|
| 961 |
+
enable_explanations=request['enable_explanations']
|
| 962 |
+
)
|
| 963 |
+
|
| 964 |
+
# Set up the main layout.
|
| 965 |
+
col1, col2 = st.columns([1, 1])
|
| 966 |
+
|
| 967 |
+
with col1:
|
| 968 |
+
st.markdown(f"<h2>{tr('input_header')}</h2>", unsafe_allow_html=True)
|
| 969 |
+
|
| 970 |
+
# Get the current language from the session state.
|
| 971 |
+
lang = st.session_state.get('lang', 'en')
|
| 972 |
+
|
| 973 |
+
# Example prompts for English and German.
|
| 974 |
+
example_prompts = {
|
| 975 |
+
'en': [
|
| 976 |
+
"The capital of France is",
|
| 977 |
+
"The first person to walk on the moon was",
|
| 978 |
+
"To be or not to be, that is the",
|
| 979 |
+
"Once upon a time, in a land far, far away,",
|
| 980 |
+
"The chemical formula for water is",
|
| 981 |
+
"A stitch in time saves",
|
| 982 |
+
"The opposite of hot is",
|
| 983 |
+
"The main ingredients of a pizza are",
|
| 984 |
+
"She opened the door and saw"
|
| 985 |
+
],
|
| 986 |
+
'de': [
|
| 987 |
+
"Die Hauptstadt von Frankreich ist",
|
| 988 |
+
"Die erste Person auf dem Mond war",
|
| 989 |
+
"Sein oder Nichtsein, das ist hier die",
|
| 990 |
+
"Es war einmal, in einem weit, weit entfernten Land,",
|
| 991 |
+
"Die chemische Formel für Wasser ist",
|
| 992 |
+
"Was du heute kannst besorgen, das verschiebe nicht auf",
|
| 993 |
+
"Das Gegenteil von heiß ist",
|
| 994 |
+
"Die Hauptzutaten einer Pizza sind",
|
| 995 |
+
"Sie öffnete die Tür und sah"
|
| 996 |
+
]
|
| 997 |
+
}
|
| 998 |
+
|
| 999 |
+
st.markdown('**<i class="bi bi-lightbulb"></i> Example Prompts:**', unsafe_allow_html=True)
|
| 1000 |
+
cols = st.columns(3)
|
| 1001 |
+
for i, example in enumerate(example_prompts[lang][:9]):
|
| 1002 |
+
with cols[i % 3]:
|
| 1003 |
+
st.button(
|
| 1004 |
+
example,
|
| 1005 |
+
key=f"example_{i}",
|
| 1006 |
+
use_container_width=True,
|
| 1007 |
+
on_click=start_new_analysis,
|
| 1008 |
+
args=(example, 10, st.session_state.get('enable_explanations', True))
|
| 1009 |
+
)
|
| 1010 |
+
|
| 1011 |
+
# Text input area for the user's prompt.
|
| 1012 |
+
prompt = st.text_area(
|
| 1013 |
+
tr('enter_prompt'),
|
| 1014 |
+
value=st.session_state.get('attr_prompt', ""),
|
| 1015 |
+
height=100,
|
| 1016 |
+
help=tr('enter_prompt_help'),
|
| 1017 |
+
placeholder="Sadly no GPU available. Please select an example above.",
|
| 1018 |
+
disabled=True
|
| 1019 |
+
)
|
| 1020 |
+
|
| 1021 |
+
# Slider for the number of tokens to generate.
|
| 1022 |
+
max_tokens = st.slider(
|
| 1023 |
+
tr('max_new_tokens_slider'),
|
| 1024 |
+
min_value=1,
|
| 1025 |
+
max_value=50,
|
| 1026 |
+
value=5,
|
| 1027 |
+
help=tr('max_new_tokens_slider_help')
|
| 1028 |
+
)
|
| 1029 |
+
|
| 1030 |
+
# Checkbox to enable or disable AI explanations.
|
| 1031 |
+
enable_explanations = st.checkbox(
|
| 1032 |
+
tr('enable_ai_explanations'),
|
| 1033 |
+
value=True,
|
| 1034 |
+
help=tr('enable_ai_explanations_help')
|
| 1035 |
+
)
|
| 1036 |
+
|
| 1037 |
+
# Button to start the analysis.
|
| 1038 |
+
st.button(
|
| 1039 |
+
tr('generate_and_analyze_button'),
|
| 1040 |
+
type="primary",
|
| 1041 |
+
on_click=start_new_analysis,
|
| 1042 |
+
args=(prompt, max_tokens, enable_explanations)
|
| 1043 |
+
)
|
| 1044 |
+
|
| 1045 |
+
with col2:
|
| 1046 |
+
st.markdown(f"<h2>{tr('output_header')}</h2>", unsafe_allow_html=True)
|
| 1047 |
+
|
| 1048 |
+
if hasattr(st.session_state, 'generated_text'):
|
| 1049 |
+
st.subheader(tr('generated_text_subheader'))
|
| 1050 |
+
|
| 1051 |
+
# Extract the generated part of the text.
|
| 1052 |
+
prompt_part = st.session_state.prompt
|
| 1053 |
+
full_text = st.session_state.generated_text
|
| 1054 |
+
|
| 1055 |
+
generated_part = full_text
|
| 1056 |
+
if full_text.startswith(prompt_part):
|
| 1057 |
+
generated_part = full_text[len(prompt_part):].lstrip()
|
| 1058 |
+
else:
|
| 1059 |
+
# A fallback in case tokenization changes the prompt slightly.
|
| 1060 |
+
generated_part = full_text.replace(prompt_part, "", 1).strip()
|
| 1061 |
+
|
| 1062 |
+
# Clean up the generated text for display.
|
| 1063 |
+
cleaned_generated_part = re.sub(r'\n{2,}', '\n', generated_part).strip()
|
| 1064 |
+
escaped_generated = html.escape(cleaned_generated_part)
|
| 1065 |
+
escaped_prompt = html.escape(prompt_part)
|
| 1066 |
+
|
| 1067 |
+
st.markdown(f"""
|
| 1068 |
+
<div style="background-color: #2b2b2b; color: #ffffff; padding: 1.2rem; border-radius: 10px; margin: 1rem 0; border: 1px solid #444;">
|
| 1069 |
+
<strong>{tr('input_label')}</strong> <span style="color: #60a5fa;">{escaped_prompt}</span><br>
|
| 1070 |
+
<strong>{tr('generated_label')}</strong> <span style="font-weight: bold; color: #fca5a5; white-space: pre-wrap;">{escaped_generated}</span>
|
| 1071 |
+
</div>
|
| 1072 |
+
""", unsafe_allow_html=True)
|
| 1073 |
+
|
| 1074 |
+
# Display the visualizations for each method.
|
| 1075 |
+
if hasattr(st.session_state, 'all_attributions'):
|
| 1076 |
+
st.header(tr('attribution_analysis_results_header'))
|
| 1077 |
+
|
| 1078 |
+
# Create tabs for each analysis method.
|
| 1079 |
+
tab_titles = [
|
| 1080 |
+
tr('saliency_tab'),
|
| 1081 |
+
tr('attr_tab'),
|
| 1082 |
+
tr('occlusion_tab')
|
| 1083 |
+
]
|
| 1084 |
+
tabs = st.tabs(tab_titles)
|
| 1085 |
+
|
| 1086 |
+
# Define the order of the methods in the tabs.
|
| 1087 |
+
methods = {
|
| 1088 |
+
"saliency": {
|
| 1089 |
+
"tab": tabs[0],
|
| 1090 |
+
"title": tr('saliency_title'),
|
| 1091 |
+
"description": tr('saliency_viz_desc')
|
| 1092 |
+
},
|
| 1093 |
+
"integrated_gradients": {
|
| 1094 |
+
"tab": tabs[1],
|
| 1095 |
+
"title": tr('attr_title'),
|
| 1096 |
+
"description": tr('attr_viz_desc')
|
| 1097 |
+
},
|
| 1098 |
+
"occlusion": {
|
| 1099 |
+
"tab": tabs[2],
|
| 1100 |
+
"title": tr('occlusion_title'),
|
| 1101 |
+
"description": tr('occlusion_viz_desc')
|
| 1102 |
+
}
|
| 1103 |
+
}
|
| 1104 |
+
|
| 1105 |
+
# Generate and display the visualization for each method.
|
| 1106 |
+
for method_name, method_info in methods.items():
|
| 1107 |
+
with method_info["tab"]:
|
| 1108 |
+
st.subheader(f"{method_info['title']} Analysis")
|
| 1109 |
+
|
| 1110 |
+
# Generate the heatmap.
|
| 1111 |
+
with st.spinner(tr('creating_viz_spinner').format(method_title=method_info['title'])):
|
| 1112 |
+
heatmap_fig, html_content, heatmap_buffer, scores_df = create_heatmap_visualization(
|
| 1113 |
+
st.session_state.all_attributions[method_name],
|
| 1114 |
+
method_name=method_info['title']
|
| 1115 |
+
)
|
| 1116 |
+
|
| 1117 |
+
if heatmap_fig:
|
| 1118 |
+
st.plotly_chart(heatmap_fig, use_container_width=True)
|
| 1119 |
+
|
| 1120 |
+
# Add an explanation of how to read the heatmap.
|
| 1121 |
+
explanation_html = f"""
|
| 1122 |
+
<div style="background-color: #0E1117; border-radius: 10px; padding: 15px; margin: 10px 0; border: 1px solid #262730;">
|
| 1123 |
+
<h4 style="color: #FAFAFA; margin-bottom: 10px;">{tr('how_to_read_heatmap')}</h4>
|
| 1124 |
+
<ul style="color: #DCDCDC; margin-left: 20px; padding-left: 0;">
|
| 1125 |
+
<li style="margin-bottom: 5px;"><strong>{tr('xaxis_label')}:</strong> {tr('xaxis_desc')}</li>
|
| 1126 |
+
<li style="margin-bottom: 5px;"><strong>{tr('yaxis_label')}:</strong> {tr('yaxis_desc')}</li>
|
| 1127 |
+
<li style="margin-bottom: 5px;"><strong>{tr('color_intensity_label')}:</strong> {tr('color_intensity_desc')}</li>
|
| 1128 |
+
<li style="margin-bottom: 5px;"><strong>{tr('interpretation_label')}:</strong> {tr('interpretation_desc')}</li>
|
| 1129 |
+
<li style="margin-bottom: 5px;"><strong>{tr('special_tokens_label')}:</strong> {tr('special_tokens_desc')}</li>
|
| 1130 |
+
</ul>
|
| 1131 |
+
</div>
|
| 1132 |
+
"""
|
| 1133 |
+
st.markdown(explanation_html, unsafe_allow_html=True)
|
| 1134 |
+
|
| 1135 |
+
# Generate an AI explanation for the heatmap.
|
| 1136 |
+
if (st.session_state.get('enable_explanations') and
|
| 1137 |
+
st.session_state.get('qwen_api_config') and
|
| 1138 |
+
heatmap_buffer is not None and scores_df is not None):
|
| 1139 |
+
|
| 1140 |
+
explanation_cache_key = f"explanation_{method_name}_{st.session_state.generated_text}"
|
| 1141 |
+
|
| 1142 |
+
# Get the explanation from the cache or generate it.
|
| 1143 |
+
if explanation_cache_key not in st.session_state:
|
| 1144 |
+
with st.spinner(tr('generating_ai_explanations_spinner').format(method_title=method_info['title'])):
|
| 1145 |
+
explanation = explain_heatmap_with_csv_data(
|
| 1146 |
+
st.session_state.qwen_api_config,
|
| 1147 |
+
heatmap_buffer,
|
| 1148 |
+
scores_df,
|
| 1149 |
+
st.session_state.prompt,
|
| 1150 |
+
st.session_state.generated_text,
|
| 1151 |
+
method_name
|
| 1152 |
+
)
|
| 1153 |
+
st.session_state[explanation_cache_key] = explanation
|
| 1154 |
+
|
| 1155 |
+
explanation = st.session_state.get(explanation_cache_key)
|
| 1156 |
+
|
| 1157 |
+
if explanation and not explanation.startswith("Error:"):
|
| 1158 |
+
simple_desc = tr(METHOD_DESC_KEYS.get(method_name, "unsupported_method_desc"))
|
| 1159 |
+
st.markdown(f"#### {tr('what_this_method_shows')}")
|
| 1160 |
+
st.markdown(f"""
|
| 1161 |
+
<div style="background-color: #2f3f70; color: #f5f7fb; padding: 1.2rem; border-radius: 12px; margin-bottom: 1rem; box-shadow: 0 12px 24px rgba(47, 63, 112, 0.35);">
|
| 1162 |
+
<p style='font-size: 1.05em; font-weight: 500; margin:0; color: #f5f7fb;'>{simple_desc}</p>
|
| 1163 |
+
</div>
|
| 1164 |
+
""", unsafe_allow_html=True)
|
| 1165 |
+
|
| 1166 |
+
html_explanation = markdown.markdown(explanation)
|
| 1167 |
+
st.markdown(f"#### {tr('ai_generated_analysis')}")
|
| 1168 |
+
st.markdown(f"""
|
| 1169 |
+
<div style="background-color: #2b2b2b; color: #ffffff; padding: 1.2rem; border-radius: 10px; border-left: 4px solid #dcae36; font-size: 0.9rem; margin-bottom: 1rem;">
|
| 1170 |
+
{html_explanation}
|
| 1171 |
+
</div>
|
| 1172 |
+
""", unsafe_allow_html=True)
|
| 1173 |
+
|
| 1174 |
+
# Faithfulness Check Expander
|
| 1175 |
+
with st.expander(tr('faithfulness_check_expander')):
|
| 1176 |
+
st.markdown(tr('faithfulness_check_explanation_html'), unsafe_allow_html=True)
|
| 1177 |
+
with st.spinner(tr('running_faithfulness_check_spinner')):
|
| 1178 |
+
try:
|
| 1179 |
+
# Use a cache key to avoid re-running the check unnecessarily.
|
| 1180 |
+
check_cache_key = f"faithfulness_check_{method_name}_{st.session_state.generated_text}"
|
| 1181 |
+
|
| 1182 |
+
if check_cache_key not in st.session_state:
|
| 1183 |
+
claims = _cached_extract_claims_from_explanation(
|
| 1184 |
+
st.session_state.qwen_api_config,
|
| 1185 |
+
explanation,
|
| 1186 |
+
method_name
|
| 1187 |
+
)
|
| 1188 |
+
if claims:
|
| 1189 |
+
analysis_data = {
|
| 1190 |
+
'scores_df': scores_df,
|
| 1191 |
+
'method': method_name,
|
| 1192 |
+
'prompt': st.session_state.prompt,
|
| 1193 |
+
'generated_text': st.session_state.generated_text
|
| 1194 |
+
}
|
| 1195 |
+
verification_results = verify_claims(claims, analysis_data)
|
| 1196 |
+
st.session_state[check_cache_key] = verification_results
|
| 1197 |
+
else:
|
| 1198 |
+
st.session_state[check_cache_key] = []
|
| 1199 |
+
|
| 1200 |
+
verification_results = st.session_state[check_cache_key]
|
| 1201 |
+
|
| 1202 |
+
if verification_results:
|
| 1203 |
+
st.markdown(f"<h6>{tr('faithfulness_check_results_header')}</h6>", unsafe_allow_html=True)
|
| 1204 |
+
for result in verification_results:
|
| 1205 |
+
status_text = tr('verified_status') if result['verified'] else tr('contradicted_status')
|
| 1206 |
+
|
| 1207 |
+
st.markdown(f"""
|
| 1208 |
+
<div style="margin-bottom: 1rem; padding: 0.8rem; border-radius: 8px; border-left: 5px solid {'#28a745' if result['verified'] else '#dc3545'}; background-color: #1a1a1a;">
|
| 1209 |
+
<p style="margin-bottom: 0.3rem;"><strong>{tr('claim_label')}:</strong> <em>"{result['claim_text']}"</em></p>
|
| 1210 |
+
<p style="margin-bottom: 0.3rem;"><strong>{tr('status_label')}:</strong> {status_text}</p>
|
| 1211 |
+
<p style="margin-bottom: 0;"><strong>{tr('evidence_label')}:</strong> {result['evidence']}</p>
|
| 1212 |
+
</div>
|
| 1213 |
+
""", unsafe_allow_html=True)
|
| 1214 |
+
else:
|
| 1215 |
+
st.info(tr('no_verifiable_claims_info'))
|
| 1216 |
+
|
| 1217 |
+
except Exception as e:
|
| 1218 |
+
st.error(tr('faithfulness_check_error').format(e=str(e)))
|
| 1219 |
+
|
| 1220 |
+
# Add download buttons for the results.
|
| 1221 |
+
st.subheader(tr("download_results_subheader"))
|
| 1222 |
+
col1, col2 = st.columns(2)
|
| 1223 |
+
|
| 1224 |
+
with col1:
|
| 1225 |
+
if html_content:
|
| 1226 |
+
st.download_button(
|
| 1227 |
+
label=tr("download_html_button").format(method_title=method_info['title']),
|
| 1228 |
+
data=html_content,
|
| 1229 |
+
file_name=f"{method_name}_analysis.html",
|
| 1230 |
+
mime="text/html",
|
| 1231 |
+
key=f"html_{method_name}"
|
| 1232 |
+
)
|
| 1233 |
+
if scores_df is not None:
|
| 1234 |
+
st.download_button(
|
| 1235 |
+
label=tr("download_csv_button"),
|
| 1236 |
+
data=scores_df.to_csv().encode('utf-8'),
|
| 1237 |
+
file_name=f"{method_name}_scores.csv",
|
| 1238 |
+
mime="text/csv",
|
| 1239 |
+
key=f"csv_raw_{method_name}"
|
| 1240 |
+
)
|
| 1241 |
+
|
| 1242 |
+
with col2:
|
| 1243 |
+
if heatmap_fig:
|
| 1244 |
+
img_bytes = heatmap_fig.to_image(format="png", scale=2)
|
| 1245 |
+
st.download_button(
|
| 1246 |
+
label=tr("download_png_button").format(method_title=method_info['title']),
|
| 1247 |
+
data=img_bytes,
|
| 1248 |
+
file_name=f"{method_name}_heatmap.png",
|
| 1249 |
+
mime="image/png",
|
| 1250 |
+
key=f"png_{method_name}"
|
| 1251 |
+
)
|
| 1252 |
+
|
| 1253 |
+
# Display the influence tracer section.
|
| 1254 |
+
st.markdown("---")
|
| 1255 |
+
st.markdown(f'<h3><i class="bi bi-compass"></i> {tr("influence_tracer_title")}</h3>', unsafe_allow_html=True)
|
| 1256 |
+
st.markdown(f"<div style='font-size: 1.1rem;'>{tr('influence_tracer_desc')}</div>", unsafe_allow_html=True)
|
| 1257 |
+
|
| 1258 |
+
# Add a visual explanation of cosine similarity.
|
| 1259 |
+
# Get translated text.
|
| 1260 |
+
sentence_a = tr('influence_example_sentence_a')
|
| 1261 |
+
sentence_b = tr('influence_example_sentence_b')
|
| 1262 |
+
|
| 1263 |
+
# Create the SVG for the diagram.
|
| 1264 |
+
svg_code = f"""
|
| 1265 |
+
<svg width="250" height="150" viewBox="0 0 250 150" xmlns="http://www.w3.org/2000/svg">
|
| 1266 |
+
<line x1="10" y1="130" x2="240" y2="130" stroke="#555" stroke-width="2"></line>
|
| 1267 |
+
<line x1="10" y1="130" x2="10" y2="10" stroke="#555" stroke-width="2"></line>
|
| 1268 |
+
<!-- Corrected angle arc and theta position -->
|
| 1269 |
+
<path d="M 49 123 A 40 40 0 0 0 42 107" fill="none" stroke="#FFD700" stroke-width="2"></path>
|
| 1270 |
+
<text x="50" y="115" font-family="monospace" font-size="12" fill="#FFD700">θ</text>
|
| 1271 |
+
<line x1="10" y1="130" x2="150" y2="30" stroke="#87CEEB" stroke-width="3"></line>
|
| 1272 |
+
<text x="155" y="25" font-family="monospace" font-size="12" fill="#87CEEB">Vector A</text>
|
| 1273 |
+
<text x="155" y="40" font-family="monospace" font-size="10" fill="#aaa">{sentence_a}</text>
|
| 1274 |
+
<line x1="10" y1="130" x2="170" y2="100" stroke="#90EE90" stroke-width="3"></line>
|
| 1275 |
+
<text x="175" y="95" font-family="monospace" font-size="12" fill="#90EE90">Vector B</text>
|
| 1276 |
+
<text x="175" y="110" font-family="monospace" font-size="10" fill="#aaa">{sentence_b}</text>
|
| 1277 |
+
</svg>
|
| 1278 |
+
"""
|
| 1279 |
+
|
| 1280 |
+
# Encode the SVG to base64.
|
| 1281 |
+
encoded_svg = base64.b64encode(svg_code.encode("utf-8")).decode("utf-8")
|
| 1282 |
+
image_uri = f"data:image/svg+xml;base64,{encoded_svg}"
|
| 1283 |
+
|
| 1284 |
+
# Display the explanation and diagram.
|
| 1285 |
+
st.markdown(f"""
|
| 1286 |
+
<div style="background-color: #2b2b2b; border-radius: 10px; padding: 1.5rem; margin: 1rem 0; border-left: 4px solid #FFD700;">
|
| 1287 |
+
<h4 style="color: #FFD700; margin-top: 0; margin-bottom: 1rem;">{tr('how_influence_is_found_header')}</h4>
|
| 1288 |
+
<div>
|
| 1289 |
+
<p style="font-size: 1rem;">{tr('how_influence_is_found_desc')}</p>
|
| 1290 |
+
<div style="font-family: 'SF Mono', 'Consolas', 'Menlo', monospace; margin-top: 1.5rem; font-size: 0.95em;">
|
| 1291 |
+
<p>{tr('influence_step_1_title')}: {tr('influence_step_1_desc')}</p>
|
| 1292 |
+
<p>{tr('influence_step_2_title')}: {tr('influence_step_2_desc')}</p>
|
| 1293 |
+
<p>{tr('influence_step_3_title')}: {tr('influence_step_3_desc')}</p>
|
| 1294 |
+
</div>
|
| 1295 |
+
</div>
|
| 1296 |
+
<div style="text-align: center; margin-top: 2rem;">
|
| 1297 |
+
<img src="{image_uri}" alt="Cosine Similarity Diagram" />
|
| 1298 |
+
</div>
|
| 1299 |
+
</div>
|
| 1300 |
+
""", unsafe_allow_html=True)
|
| 1301 |
+
|
| 1302 |
+
st.write("")
|
| 1303 |
+
|
| 1304 |
+
if hasattr(st.session_state, 'generated_text'):
|
| 1305 |
+
# First, check if influential docs are available in the cache from session_state
|
| 1306 |
+
if 'cached_influential_docs' in st.session_state:
|
| 1307 |
+
influential_docs = st.session_state.pop('cached_influential_docs') # Use and remove
|
| 1308 |
+
else:
|
| 1309 |
+
with st.spinner(tr('running_influence_trace_spinner')):
|
| 1310 |
+
lang = st.session_state.get('lang', 'en')
|
| 1311 |
+
influential_docs = get_influential_docs(st.session_state.prompt, lang)
|
| 1312 |
+
|
| 1313 |
+
# Display the results.
|
| 1314 |
+
if influential_docs:
|
| 1315 |
+
st.markdown(f"#### {tr('top_influential_docs_header').format(num_docs=len(influential_docs))}")
|
| 1316 |
+
|
| 1317 |
+
# A nice visualization for the influential documents.
|
| 1318 |
+
for i, doc in enumerate(influential_docs):
|
| 1319 |
+
colors = ["#A78BFA", "#7F9CF5", "#6EE7B7", "#FBBF24", "#F472B6"]
|
| 1320 |
+
card_color = colors[i % len(colors)]
|
| 1321 |
+
|
| 1322 |
+
full_text = doc['text']
|
| 1323 |
+
highlight_sentence = doc.get('highlight_sentence', '')
|
| 1324 |
+
|
| 1325 |
+
highlighted_html = ""
|
| 1326 |
+
lang = st.session_state.get('lang', 'en')
|
| 1327 |
+
|
| 1328 |
+
if highlight_sentence:
|
| 1329 |
+
# Normalize the sentence to be highlighted.
|
| 1330 |
+
normalized_highlight = re.sub(r'\s+', ' ', highlight_sentence).strip()
|
| 1331 |
+
|
| 1332 |
+
# Use fuzzy matching to find the best match in the document.
|
| 1333 |
+
splitter = SentenceSplitter(language=lang)
|
| 1334 |
+
sentences_in_doc = splitter.split(text=full_text)
|
| 1335 |
+
|
| 1336 |
+
if sentences_in_doc:
|
| 1337 |
+
best_match, score = process.extractOne(normalized_highlight, sentences_in_doc)
|
| 1338 |
+
start_index = full_text.find(best_match)
|
| 1339 |
+
|
| 1340 |
+
if start_index != -1:
|
| 1341 |
+
end_index = start_index + len(best_match)
|
| 1342 |
+
|
| 1343 |
+
# Create a context window around the matched sentence.
|
| 1344 |
+
context_window = 500
|
| 1345 |
+
snippet_start = max(0, start_index - context_window)
|
| 1346 |
+
snippet_end = min(len(full_text), end_index + context_window)
|
| 1347 |
+
|
| 1348 |
+
# Reconstruct the HTML with the highlighted sentence.
|
| 1349 |
+
before = html.escape(full_text[snippet_start:start_index])
|
| 1350 |
+
highlight = html.escape(best_match)
|
| 1351 |
+
after = html.escape(full_text[end_index:snippet_end])
|
| 1352 |
+
|
| 1353 |
+
# Add ellipses if we're not showing the full text.
|
| 1354 |
+
start_ellipsis = "... " if snippet_start > 0 else ""
|
| 1355 |
+
end_ellipsis = " ..." if snippet_end < len(full_text) else ""
|
| 1356 |
+
|
| 1357 |
+
highlighted_html = (
|
| 1358 |
+
f"{start_ellipsis}{before}"
|
| 1359 |
+
f'<mark style="background-color: {card_color}77; color: #DCDCDC; padding: 2px 4px; border-radius: 4px; font-weight: bold;">{highlight}</mark>'
|
| 1360 |
+
f"{after}{end_ellipsis}"
|
| 1361 |
+
)
|
| 1362 |
+
|
| 1363 |
+
# If no highlight was applied, just show the full text.
|
| 1364 |
+
if not highlighted_html:
|
| 1365 |
+
highlighted_html = html.escape(full_text)
|
| 1366 |
+
|
| 1367 |
+
st.markdown(f"""
|
| 1368 |
+
<div style="border: 1px solid #262730; border-left: 5px solid {card_color}; border-radius: 10px; padding: 1.5rem; margin-bottom: 1.5rem; background-color: #0E1117; box-shadow: 0 4px 8px rgba(0,0,0,0.2);">
|
| 1369 |
+
<div style="display: flex; justify-content: space-between; align-items: center; margin-bottom: 1rem;">
|
| 1370 |
+
<span style="font-size: 1.1rem; color: #FAFAFA; font-weight: 600;"><i class="bi bi-journal-text"></i> {tr('source_label')}: {doc['source']}</span>
|
| 1371 |
+
<span style="font-size: 1.1rem; color: {card_color}; background-color: {card_color}22; padding: 0.3rem 0.8rem; border-radius: 15px; font-weight: bold;">
|
| 1372 |
+
<i class="bi bi-stars"></i> {tr('similarity_label')}: {doc['similarity']:.3f}
|
| 1373 |
+
</span>
|
| 1374 |
+
</div>
|
| 1375 |
+
<div style="background-color: #1a1a1a; color: #DCDCDC; padding: 1rem; border-radius: 8px; font-family: 'Courier New', Courier, monospace; white-space: pre-wrap; word-wrap: break-word; max-height: 300px; overflow-y: auto;">
|
| 1376 |
+
{highlighted_html.strip()}
|
| 1377 |
+
</div>
|
| 1378 |
+
</div>
|
| 1379 |
+
""", unsafe_allow_html=True)
|
| 1380 |
+
else:
|
| 1381 |
+
# Give a helpful message if the index is missing.
|
| 1382 |
+
if not os.path.exists(INDEX_PATH) or not os.path.exists(MAPPING_PATH):
|
| 1383 |
+
st.warning(tr('influence_index_not_found_warning'))
|
| 1384 |
+
else:
|
| 1385 |
+
st.info(tr('no_influential_docs_found'))
|
| 1386 |
+
else:
|
| 1387 |
+
st.info(tr('run_analysis_for_influence_info'))
|
| 1388 |
+
|
| 1389 |
+
# Show the feedback survey in the sidebar.
|
| 1390 |
+
#if 'all_attributions' in st.session_state:
|
| 1391 |
+
# display_attribution_feedback()
|
| 1392 |
+
|
| 1393 |
+
|
| 1394 |
+
if __name__ == "__main__":
|
| 1395 |
+
show_attribution_analysis()
|
cache/cached_attribution_results.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
circuit_analysis/CLT_IMPROVEMENTS.md
ADDED
|
@@ -0,0 +1,232 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# CLT Improvement Guide
|
| 2 |
+
|
| 3 |
+
## Current Issues & Solutions
|
| 4 |
+
|
| 5 |
+
### 1. **Weak Sparsity Loss** (Critical)
|
| 6 |
+
**Current:** `torch.mean(torch.tanh(sparsity_lambda * features))`
|
| 7 |
+
- `tanh` saturates and doesn't strongly penalize activations
|
| 8 |
+
- Results in dense, non-interpretable features
|
| 9 |
+
|
| 10 |
+
**Better:** Use L1 sparsity (standard in SAEs)
|
| 11 |
+
```python
|
| 12 |
+
# Replace line 931 in train_clt:
|
| 13 |
+
sparsity_loss += torch.mean(torch.abs(features)) # L1 norm
|
| 14 |
+
# Or per-feature L1:
|
| 15 |
+
sparsity_loss += torch.sum(torch.abs(features), dim=-1).mean() # Sum over features, mean over batch/seq
|
| 16 |
+
```
|
| 17 |
+
|
| 18 |
+
**Even Better:** Use top-k sparsity (encourages dead features)
|
| 19 |
+
```python
|
| 20 |
+
# Only penalize top-k active features per position
|
| 21 |
+
k = int(0.1 * self.n_features) # Top 10% most active
|
| 22 |
+
for features in feature_activations:
|
| 23 |
+
# features: [batch, seq, n_features]
|
| 24 |
+
topk_vals, _ = torch.topk(features, k=k, dim=-1)
|
| 25 |
+
sparsity_loss += torch.mean(topk_vals)
|
| 26 |
+
```
|
| 27 |
+
|
| 28 |
+
### 2. **Dead Feature Resampling** (Important)
|
| 29 |
+
**Problem:** Many features never activate → wasted capacity
|
| 30 |
+
|
| 31 |
+
**Solution:** Periodically resample dead features
|
| 32 |
+
```python
|
| 33 |
+
def resample_dead_features(self, feature_activations: List[torch.Tensor],
|
| 34 |
+
dead_threshold: float = 1e-6):
|
| 35 |
+
"""Resample encoder weights for features that never activate."""
|
| 36 |
+
for layer_idx, features in enumerate(feature_activations):
|
| 37 |
+
# features: [batch, seq, n_features]
|
| 38 |
+
avg_activation = features.mean(dim=(0, 1)) # [n_features]
|
| 39 |
+
dead_mask = avg_activation < dead_threshold
|
| 40 |
+
|
| 41 |
+
if dead_mask.any():
|
| 42 |
+
n_dead = dead_mask.sum().item()
|
| 43 |
+
logger.info(f"Layer {layer_idx}: Resampling {n_dead} dead features")
|
| 44 |
+
|
| 45 |
+
# Resample dead encoder weights
|
| 46 |
+
with torch.no_grad():
|
| 47 |
+
# Copy from a random active feature
|
| 48 |
+
active_indices = torch.nonzero(~dead_mask).squeeze(-1)
|
| 49 |
+
if len(active_indices) > 0:
|
| 50 |
+
for dead_idx in torch.nonzero(dead_mask).squeeze(-1):
|
| 51 |
+
source_idx = active_indices[torch.randint(0, len(active_indices), (1,))]
|
| 52 |
+
self.encoders[layer_idx].weight[dead_idx] = \
|
| 53 |
+
self.encoders[layer_idx].weight[source_idx].clone()
|
| 54 |
+
# Add small noise to break symmetry
|
| 55 |
+
self.encoders[layer_idx].weight[dead_idx] += \
|
| 56 |
+
torch.randn_like(self.encoders[layer_idx].weight[dead_idx]) * 0.01
|
| 57 |
+
```
|
| 58 |
+
|
| 59 |
+
Call this every 100-500 training steps.
|
| 60 |
+
|
| 61 |
+
### 3. **Better Weight Initialization**
|
| 62 |
+
**Current:** `nn.init.normal_(weight, mean=0.0, std=0.01)` - too small
|
| 63 |
+
|
| 64 |
+
**Better:** Use geometric mean initialization (Anthropic SAE style)
|
| 65 |
+
```python
|
| 66 |
+
def _init_weights(self):
|
| 67 |
+
for layer_idx, encoder in enumerate(self.encoders):
|
| 68 |
+
# Initialize to have roughly unit norm
|
| 69 |
+
nn.init.xavier_uniform_(encoder.weight, gain=0.1)
|
| 70 |
+
# Or: initialize from pretrained if available
|
| 71 |
+
```
|
| 72 |
+
|
| 73 |
+
### 4. **Learning Rate Scheduling**
|
| 74 |
+
**Current:** Fixed learning rate
|
| 75 |
+
|
| 76 |
+
**Better:** Cosine annealing or warmup
|
| 77 |
+
```python
|
| 78 |
+
from torch.optim.lr_scheduler import CosineAnnealingLR, OneCycleLR
|
| 79 |
+
|
| 80 |
+
# In train_clt:
|
| 81 |
+
optimizer = torch.optim.Adam(self.clt.parameters(), lr=self.config.learning_rate)
|
| 82 |
+
scheduler = CosineAnnealingLR(optimizer, T_max=self.config.training_steps, eta_min=1e-6)
|
| 83 |
+
# Or OneCycleLR for faster convergence:
|
| 84 |
+
# scheduler = OneCycleLR(optimizer, max_lr=self.config.learning_rate,
|
| 85 |
+
# total_steps=self.config.training_steps)
|
| 86 |
+
|
| 87 |
+
# In training loop:
|
| 88 |
+
optimizer.step()
|
| 89 |
+
scheduler.step()
|
| 90 |
+
```
|
| 91 |
+
|
| 92 |
+
### 5. **Gradient Clipping**
|
| 93 |
+
**Problem:** Large gradients can destabilize training
|
| 94 |
+
|
| 95 |
+
**Solution:**
|
| 96 |
+
```python
|
| 97 |
+
# After loss.backward(), before optimizer.step():
|
| 98 |
+
torch.nn.utils.clip_grad_norm_(self.clt.parameters(), max_norm=1.0)
|
| 99 |
+
optimizer.step()
|
| 100 |
+
```
|
| 101 |
+
|
| 102 |
+
### 6. **Better Reconstruction Target**
|
| 103 |
+
**Current:** Reconstructs `hidden_states` (residual stream)
|
| 104 |
+
|
| 105 |
+
**Question:** Should you reconstruct MLP outputs instead?
|
| 106 |
+
- If CLT is meant to approximate MLP computation, target should be MLP outputs
|
| 107 |
+
- Current approach learns residual stream → feature → residual stream mapping
|
| 108 |
+
|
| 109 |
+
**If targeting MLP outputs:**
|
| 110 |
+
```python
|
| 111 |
+
# Get MLP outputs instead of hidden states
|
| 112 |
+
with torch.no_grad():
|
| 113 |
+
outputs = self.model(**inputs, output_hidden_states=True)
|
| 114 |
+
# Extract MLP outputs (hidden_states[layer] - hidden_states[layer-1] for some models)
|
| 115 |
+
# Or use model's intermediate outputs if available
|
| 116 |
+
mlp_outputs = [...] # Extract MLP outputs per layer
|
| 117 |
+
|
| 118 |
+
# Then reconstruct mlp_outputs instead of hidden_states
|
| 119 |
+
recon_loss = sum(F.mse_loss(pred, target)
|
| 120 |
+
for target, pred in zip(mlp_outputs, reconstructed_outputs))
|
| 121 |
+
```
|
| 122 |
+
|
| 123 |
+
### 7. **Feature Density Monitoring**
|
| 124 |
+
**Add metrics to track:**
|
| 125 |
+
```python
|
| 126 |
+
# In training loop, track:
|
| 127 |
+
active_features = (features > 0.01).sum(dim=-1).float().mean() # Avg active per position
|
| 128 |
+
feature_density = (features > 0).float().mean() # Fraction of features active
|
| 129 |
+
max_activation = features.max().item()
|
| 130 |
+
|
| 131 |
+
# Log these to understand sparsity
|
| 132 |
+
if step % 100 == 0:
|
| 133 |
+
logger.info(f"Active features/position: {active_features:.1f}, "
|
| 134 |
+
f"Density: {feature_density:.3f}, Max act: {max_activation:.4f}")
|
| 135 |
+
```
|
| 136 |
+
|
| 137 |
+
### 8. **Orthogonality Regularization** (Optional)
|
| 138 |
+
**Encourages diverse features:**
|
| 139 |
+
```python
|
| 140 |
+
# Add to loss:
|
| 141 |
+
ortho_loss = 0.0
|
| 142 |
+
for encoder in self.encoders:
|
| 143 |
+
W = encoder.weight # [n_features, hidden_size]
|
| 144 |
+
# Encourage orthogonality: W @ W.T should be close to identity
|
| 145 |
+
gram = torch.mm(W, W.T)
|
| 146 |
+
identity = torch.eye(self.n_features, device=W.device)
|
| 147 |
+
ortho_loss += F.mse_loss(gram, identity)
|
| 148 |
+
|
| 149 |
+
loss += 0.01 * ortho_loss # Small weight
|
| 150 |
+
```
|
| 151 |
+
|
| 152 |
+
### 9. **Batch Processing** (Performance)
|
| 153 |
+
**Current:** Processes texts one-by-one in batch
|
| 154 |
+
|
| 155 |
+
**Better:** True batch processing
|
| 156 |
+
```python
|
| 157 |
+
# Tokenize all texts at once
|
| 158 |
+
inputs = self.tokenizer(
|
| 159 |
+
batch_texts, # List of strings
|
| 160 |
+
return_tensors="pt",
|
| 161 |
+
padding=True,
|
| 162 |
+
truncation=True,
|
| 163 |
+
max_length=self.config.max_seq_length
|
| 164 |
+
).to(self.device)
|
| 165 |
+
|
| 166 |
+
# Get activations for entire batch
|
| 167 |
+
with torch.no_grad():
|
| 168 |
+
outputs = self.model(**inputs, output_hidden_states=True)
|
| 169 |
+
hidden_states = outputs.hidden_states[1:]
|
| 170 |
+
|
| 171 |
+
# Forward pass (now handles batch dimension properly)
|
| 172 |
+
feature_activations, reconstructed_outputs = self.clt(hidden_states)
|
| 173 |
+
|
| 174 |
+
# Loss computation (already batched)
|
| 175 |
+
recon_loss = sum(F.mse_loss(pred, target)
|
| 176 |
+
for target, pred in zip(hidden_states, reconstructed_outputs))
|
| 177 |
+
```
|
| 178 |
+
|
| 179 |
+
### 10. **Hyperparameter Tuning**
|
| 180 |
+
**Recommended starting values:**
|
| 181 |
+
```python
|
| 182 |
+
config = AttributionGraphConfig(
|
| 183 |
+
n_features_per_layer=2048, # Increase from 512 (more capacity)
|
| 184 |
+
sparsity_lambda=1e-3, # Start small, increase if too dense
|
| 185 |
+
reconstruction_loss_weight=1.0, # Keep at 1.0
|
| 186 |
+
learning_rate=3e-4, # Standard for Adam
|
| 187 |
+
training_steps=10000, # More steps for better convergence
|
| 188 |
+
batch_size=32, # Larger batches stabilize training
|
| 189 |
+
)
|
| 190 |
+
```
|
| 191 |
+
|
| 192 |
+
## Implementation Priority
|
| 193 |
+
|
| 194 |
+
1. **High Priority:**
|
| 195 |
+
- Change sparsity loss to L1 (easy, big impact)
|
| 196 |
+
- Add gradient clipping
|
| 197 |
+
- Add learning rate scheduling
|
| 198 |
+
- Fix batch processing
|
| 199 |
+
|
| 200 |
+
2. **Medium Priority:**
|
| 201 |
+
- Dead feature resampling
|
| 202 |
+
- Better weight initialization
|
| 203 |
+
- Feature density monitoring
|
| 204 |
+
|
| 205 |
+
3. **Low Priority:**
|
| 206 |
+
- Orthogonality regularization
|
| 207 |
+
- MLP output targeting (if architecture change needed)
|
| 208 |
+
|
| 209 |
+
## Quick Win: Minimal Changes
|
| 210 |
+
|
| 211 |
+
If you want the biggest improvement with minimal code changes:
|
| 212 |
+
|
| 213 |
+
1. Replace sparsity loss (line 931):
|
| 214 |
+
```python
|
| 215 |
+
sparsity_loss += torch.mean(torch.abs(features)) # L1 instead of tanh
|
| 216 |
+
```
|
| 217 |
+
|
| 218 |
+
2. Add gradient clipping (after line 948):
|
| 219 |
+
```python
|
| 220 |
+
torch.nn.utils.clip_grad_norm_(self.clt.parameters(), max_norm=1.0)
|
| 221 |
+
```
|
| 222 |
+
|
| 223 |
+
3. Add learning rate scheduler (after line 894):
|
| 224 |
+
```python
|
| 225 |
+
from torch.optim.lr_scheduler import CosineAnnealingLR
|
| 226 |
+
scheduler = CosineAnnealingLR(optimizer, T_max=self.config.training_steps)
|
| 227 |
+
# Then after optimizer.step():
|
| 228 |
+
scheduler.step()
|
| 229 |
+
```
|
| 230 |
+
|
| 231 |
+
These three changes should significantly improve feature quality and training stability.
|
| 232 |
+
|
circuit_analysis/WORKFLOW_PER_PROMPT.md
ADDED
|
@@ -0,0 +1,106 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Workflow for Per-Prompt Analysis with Feature Interpretations
|
| 2 |
+
|
| 3 |
+
## Overview
|
| 4 |
+
This workflow allows you to run feature interpretations and offline circuit metrics for each prompt separately to stay within API rate limits (200 calls/hour), then merge the results.
|
| 5 |
+
|
| 6 |
+
## Step-by-Step Process
|
| 7 |
+
|
| 8 |
+
### 1. Run Feature Interpretations Per Prompt
|
| 9 |
+
|
| 10 |
+
For each prompt index (0, 1, 2), run the main analysis script (note that the result files listed below use 1-based suffixes, `_prompt_{1,2,3}`):
|
| 11 |
+
|
| 12 |
+
```bash
|
| 13 |
+
# Prompt 0
|
| 14 |
+
python3 circuit_analysis/attribution_graphs_olmo.py --prompt-index 0
|
| 15 |
+
|
| 16 |
+
# Prompt 1
|
| 17 |
+
python3 circuit_analysis/attribution_graphs_olmo.py --prompt-index 1
|
| 18 |
+
|
| 19 |
+
# Prompt 2
|
| 20 |
+
python3 circuit_analysis/attribution_graphs_olmo.py --prompt-index 2
|
| 21 |
+
```
|
| 22 |
+
|
| 23 |
+
This will:
|
| 24 |
+
- Generate feature interpretations using Qwen API (stays within rate limits per prompt)
|
| 25 |
+
- Save results to `circuit_analysis/results/attribution_graphs_results_prompt_{1,2,3}.json`
|
| 26 |
+
- Use consistent config: `n_features_per_layer=512`, `sparsity_lambda=1e-3`, etc.
|
| 27 |
+
|
| 28 |
+
### 2. Run Offline Circuit Metrics Per Prompt
|
| 29 |
+
|
| 30 |
+
For each prompt, run the offline metrics script:
|
| 31 |
+
|
| 32 |
+
```bash
|
| 33 |
+
# Prompt 0
|
| 34 |
+
python3 circuit_analysis/offline_circuit_metrics.py --prompt-index 0
|
| 35 |
+
|
| 36 |
+
# Prompt 1
|
| 37 |
+
python3 circuit_analysis/offline_circuit_metrics.py --prompt-index 1
|
| 38 |
+
|
| 39 |
+
# Prompt 2
|
| 40 |
+
python3 circuit_analysis/offline_circuit_metrics.py --prompt-index 2
|
| 41 |
+
```
|
| 42 |
+
|
| 43 |
+
This will:
|
| 44 |
+
- Calculate **full universe delta p** (ablating all CLT features)
|
| 45 |
+
- Calculate **full circuit delta p** (ablating all graph features)
|
| 46 |
+
- Use the same config numbers as the main analysis
|
| 47 |
+
- Save to `circuit_analysis/results/offline_circuit_metrics_prompt_{1,2,3}.json`
|
| 48 |
+
|
| 49 |
+
### 3. Merge Results
|
| 50 |
+
|
| 51 |
+
Merge all the per-prompt results into a single file:
|
| 52 |
+
|
| 53 |
+
```bash
|
| 54 |
+
# Start with prompt 1
|
| 55 |
+
python3 circuit_analysis/merge_circuit_results.py attribution_graphs_results.json attribution_graphs_results_prompt_1.json
|
| 56 |
+
|
| 57 |
+
# Add prompt 2
|
| 58 |
+
python3 circuit_analysis/merge_circuit_results.py attribution_graphs_results.json attribution_graphs_results_prompt_2.json
|
| 59 |
+
|
| 60 |
+
# Add prompt 3
|
| 61 |
+
python3 circuit_analysis/merge_circuit_results.py attribution_graphs_results.json attribution_graphs_results_prompt_3.json
|
| 62 |
+
```
|
| 63 |
+
|
| 64 |
+
For offline metrics (if you want to merge those too):
|
| 65 |
+
|
| 66 |
+
```bash
|
| 67 |
+
python3 circuit_analysis/merge_circuit_results.py offline_circuit_metrics.json offline_circuit_metrics_prompt_1.json
|
| 68 |
+
python3 circuit_analysis/merge_circuit_results.py offline_circuit_metrics.json offline_circuit_metrics_prompt_2.json
|
| 69 |
+
python3 circuit_analysis/merge_circuit_results.py offline_circuit_metrics.json offline_circuit_metrics_prompt_3.json
|
| 70 |
+
```
|
| 71 |
+
|
| 72 |
+
### 4. Plot Offline Metrics
|
| 73 |
+
|
| 74 |
+
After merging, plot the combined results:
|
| 75 |
+
|
| 76 |
+
```bash
|
| 77 |
+
python3 circuit_analysis/plot_offline_metrics.py
|
| 78 |
+
```
|
| 79 |
+
|
| 80 |
+
## Configuration Consistency
|
| 81 |
+
|
| 82 |
+
All scripts now use the same configuration:
|
| 83 |
+
- `n_features_per_layer=512` (matches trained CLT)
|
| 84 |
+
- `sparsity_lambda=1e-3` (L1 sparsity, matches training)
|
| 85 |
+
- `graph_feature_activation_threshold=0.01`
|
| 86 |
+
- `graph_edge_weight_threshold=0.003`
|
| 87 |
+
- `graph_max_features_per_layer=40`
|
| 88 |
+
- `graph_max_edges_per_node=20`
|
| 89 |
+
|
| 90 |
+
## Output Format
|
| 91 |
+
|
| 92 |
+
Each per-prompt result file contains:
|
| 93 |
+
- `analyses`: Dictionary with `prompt_{N}` keys
|
| 94 |
+
- `config`: Configuration used
|
| 95 |
+
- `timestamp`: When the analysis was run
|
| 96 |
+
|
| 97 |
+
The offline metrics additionally include:
|
| 98 |
+
- `full_universe_ablation`: Delta p when ablating all CLT features
|
| 99 |
+
- `full_circuit_ablation`: Delta p when ablating all graph features
|
| 100 |
+
|
| 101 |
+
## Notes
|
| 102 |
+
|
| 103 |
+
- Feature interpretations are cached, so re-running won't make duplicate API calls
|
| 104 |
+
- Wait at least 1 hour between prompts if you're hitting rate limits
|
| 105 |
+
- The merge script preserves all data and just combines the `analyses` dictionaries
|
| 106 |
+
|
circuit_analysis/attribution_graphs_olmo.py
ADDED
|
@@ -0,0 +1,1931 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
# This script generates attribution graphs for the OLMo2 7B model.
|
| 3 |
+
|
| 4 |
+
import torch
|
| 5 |
+
import torch.nn as nn
|
| 6 |
+
import torch.nn.functional as F
|
| 7 |
+
import numpy as np
|
| 8 |
+
import matplotlib.pyplot as plt
|
| 9 |
+
import seaborn as sns
|
| 10 |
+
from typing import Dict, List, Tuple, Optional, Any, Set
|
| 11 |
+
import json
|
| 12 |
+
import logging
|
| 13 |
+
from pathlib import Path
|
| 14 |
+
from transformers import AutoTokenizer, AutoModelForCausalLM
|
| 15 |
+
from collections import defaultdict
|
| 16 |
+
import networkx as nx
|
| 17 |
+
from dataclasses import dataclass
|
| 18 |
+
from tqdm import tqdm
|
| 19 |
+
import pickle
|
| 20 |
+
import requests
|
| 21 |
+
import time
|
| 22 |
+
import random
|
| 23 |
+
import copy
|
| 24 |
+
import os
|
| 25 |
+
import argparse
|
| 26 |
+
|
| 27 |
+
# --- Add this block to fix the import path ---
|
| 28 |
+
import sys
|
| 29 |
+
from pathlib import Path
|
| 30 |
+
sys.path.append(str(Path(__file__).resolve().parent.parent))
|
| 31 |
+
# ---------------------------------------------
|
| 32 |
+
|
| 33 |
+
from utilities.utils import init_qwen_api, set_seed
|
| 34 |
+
|
| 35 |
+
# --- Constants ---
# Output paths are relative to the repository root.
RESULTS_DIR = "circuit_analysis/results"
CLT_SAVE_PATH = "circuit_analysis/models/clt_model.pth"

# Configure logging.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# Set the device for training.
# Preference order: Apple MPS, then CUDA, then CPU fallback.
if torch.backends.mps.is_available():
    DEVICE = torch.device("mps")
    logger.info("Using MPS (Metal Performance Shaders) for GPU acceleration")
elif torch.cuda.is_available():
    DEVICE = torch.device("cuda")
    logger.info("Using CUDA for GPU acceleration")
else:
    DEVICE = torch.device("cpu")
    logger.info("Using CPU")
+
|
| 54 |
+
@dataclass
class AttributionGraphConfig:
    # Configuration for building the attribution graph.
    # Groups: model/CLT training hyperparameters, ablation-experiment limits,
    # random-baseline settings, and graph-construction thresholds.
    model_path: str = "./models/OLMo-2-1124-7B"  # local checkpoint directory
    max_seq_length: int = 512
    n_features_per_layer: int = 512  # Number of features in each CLT layer
    sparsity_lambda: float = 1e-3  # Updated for L1 sparsity
    reconstruction_loss_weight: float = 1.0
    batch_size: int = 8
    learning_rate: float = 1e-4
    training_steps: int = 1000
    device: str = str(DEVICE)  # resolved once at import time from module-level DEVICE
    pruning_threshold: float = 0.8  # For graph pruning
    intervention_strength: float = 5.0  # For perturbation experiments
    qwen_api_config: Optional[Dict[str, str]] = None  # API key/config for feature interpretation; None disables API use
    max_ablation_experiments: Optional[int] = None  # None = no cap
    ablation_top_k_tokens: int = 5
    ablation_features_per_layer: Optional[int] = 2
    summary_max_layers: Optional[int] = None  # None = summarize all layers
    summary_features_per_layer: Optional[int] = 2
    random_baseline_trials: int = 5
    random_baseline_features: int = 1
    random_baseline_seed: int = 1234
    path_ablation_top_k: int = 3
    random_path_baseline_trials: int = 5
    graph_max_features_per_layer: int = 40  # cap nodes per layer to keep the graph tractable
    graph_feature_activation_threshold: float = 0.01  # min activation for a feature node
    # NOTE(review): WORKFLOW_PER_PROMPT.md states this threshold is 0.003,
    # but the default here is 0.0 -- confirm which value is intended.
    graph_edge_weight_threshold: float = 0.0
    graph_max_edges_per_node: int = 12
+
|
| 84 |
+
class JumpReLU(nn.Module):
    """JumpReLU activation: pass values through unchanged when they exceed
    the threshold, zero them otherwise.

    Fix: the previous implementation computed ``F.relu(x - threshold)`` (a
    *shifted* ReLU), which matches JumpReLU only at ``threshold == 0.0`` --
    the single value used in this file, so current behavior is unchanged.
    For any positive threshold, JumpReLU must preserve the magnitude of
    above-threshold activations instead of shifting them down.
    """

    def __init__(self, threshold: float = 0.0):
        super().__init__()
        # Inputs <= threshold are gated to zero; inputs above pass unchanged.
        self.threshold = threshold

    def forward(self, x):
        # Gate rather than shift: x * 1[x > threshold].
        return x * (x > self.threshold).to(x.dtype)
| 94 |
+
class CrossLayerTranscoder(nn.Module):
    """Cross-Layer Transcoder (CLT).

    One encoder per transformer layer maps the residual stream into a sparse
    feature space; a decoder exists for every (source, target) layer pair
    with source <= target, so features found at one layer contribute to the
    reconstructed MLP output of that layer and every later one.
    """

    def __init__(self, model_config: Dict, clt_config: AttributionGraphConfig):
        super().__init__()
        self.config = clt_config
        self.model_config = model_config
        self.n_layers = model_config['num_hidden_layers']
        self.hidden_size = model_config['hidden_size']
        self.n_features = clt_config.n_features_per_layer

        # Per-layer encoders: residual stream -> feature activations.
        self.encoders = nn.ModuleList(
            nn.Linear(self.hidden_size, self.n_features, bias=False)
            for _ in range(self.n_layers)
        )

        # Cross-layer decoders keyed "{source}_to_{target}". This key format
        # is also used by compute_virtual_weights elsewhere in the file.
        self.decoders = nn.ModuleDict({
            f"{src}_to_{tgt}": nn.Linear(self.n_features, self.hidden_size, bias=False)
            for src in range(self.n_layers)
            for tgt in range(src, self.n_layers)
        })

        # Sparsifying activation (threshold 0.0 behaves like ReLU).
        self.activation = JumpReLU(threshold=0.0)

        self._init_weights()

    def _init_weights(self):
        # Small random init keeps initial activations and reconstructions near zero.
        for module in self.modules():
            if isinstance(module, nn.Linear):
                nn.init.normal_(module.weight, mean=0.0, std=0.01)

    def encode(self, layer_idx: int, residual_activations: torch.Tensor) -> torch.Tensor:
        """Map residual-stream activations at one layer to feature activations."""
        pre_activation = self.encoders[layer_idx](residual_activations)
        return self.activation(pre_activation)

    def decode(self, source_layer: int, target_layer: int, feature_activations: torch.Tensor) -> torch.Tensor:
        """Project source-layer features into the target layer's output space."""
        return self.decoders[f"{source_layer}_to_{target_layer}"](feature_activations)

    def forward(self, residual_activations: List[torch.Tensor]) -> Tuple[List[torch.Tensor], List[torch.Tensor]]:
        """Encode every layer, then reconstruct each layer's MLP output.

        Args:
            residual_activations: One residual-stream tensor per layer.

        Returns:
            (feature_activations, reconstructed_mlp_outputs), one tensor per
            layer in each list; the reconstruction for target layer t is the
            sum of decoded contributions from every source layer s <= t.
        """
        feature_activations = [
            self.encode(idx, residual)
            for idx, residual in enumerate(residual_activations)
        ]

        reconstructed_mlp_outputs = []
        for tgt in range(self.n_layers):
            total = torch.zeros_like(residual_activations[tgt])
            # Accumulate cross-layer contributions from all earlier features.
            for src in range(tgt + 1):
                total = total + self.decode(src, tgt, feature_activations[src])
            reconstructed_mlp_outputs.append(total)

        return feature_activations, reconstructed_mlp_outputs
| 161 |
+
|
| 162 |
+
class FeatureVisualizer:
    # A class to visualize and interpret individual features.
    # Interpretations are memoized in memory and, when cache_dir is set,
    # persisted to <cache_dir>/feature_interpretations.json so repeated runs
    # do not repeat (rate-limited) API calls.

    def __init__(self, tokenizer, cache_dir: Optional[Path] = None):
        # tokenizer is stored but not referenced by the methods in this
        # class -- presumably kept for callers/future use; TODO confirm.
        self.tokenizer = tokenizer
        # Maps "L{layer}_F{feature}" -> human-readable interpretation.
        self.feature_interpretations: Dict[str, str] = {}
        self.cache_dir = cache_dir
        if self.cache_dir is not None:
            self.cache_dir = Path(self.cache_dir)
            self.cache_dir.mkdir(parents=True, exist_ok=True)
            self._load_cache()

    def _cache_file(self) -> Optional[Path]:
        # Location of the on-disk interpretation cache; None disables caching.
        if self.cache_dir is None:
            return None
        return self.cache_dir / "feature_interpretations.json"

    def _load_cache(self):
        # Merge previously saved interpretations into the in-memory dict.
        # Failures are non-fatal: the cache is a best-effort optimization.
        cache_file = self._cache_file()
        if cache_file is None or not cache_file.exists():
            return
        try:
            with open(cache_file, 'r', encoding='utf-8') as f:
                data = json.load(f)
            if isinstance(data, dict):
                self.feature_interpretations.update({str(k): str(v) for k, v in data.items()})
        except Exception as e:
            logger.warning(f"Failed to load feature interpretation cache: {e}")

    def _save_cache(self):
        # Persist the in-memory interpretations; best-effort, never raises.
        cache_file = self._cache_file()
        if cache_file is None:
            return
        try:
            with open(cache_file, 'w', encoding='utf-8') as f:
                json.dump(self.feature_interpretations, f, indent=2)
        except Exception as e:
            logger.warning(f"Failed to save feature interpretation cache: {e}")

    def visualize_feature(self, feature_idx: int, layer_idx: int,
                          activations: torch.Tensor, input_tokens: List[str],
                          top_k: int = 10) -> Dict:
        # Creates a visualization for a single feature.
        # activations is indexed [:, feature_idx], so it is assumed to be
        # [seq_len, n_features] aligned with input_tokens -- TODO confirm.
        feature_acts = activations[:, feature_idx].detach().cpu().numpy()

        # Find the top activating positions (strongest first).
        top_positions = np.argsort(feature_acts)[-top_k:][::-1]

        visualization = {
            'feature_idx': feature_idx,
            'layer_idx': layer_idx,
            'max_activation': float(feature_acts.max()),
            'mean_activation': float(feature_acts.mean()),
            # Fraction of positions where the feature fires above 0.1.
            'sparsity': float((feature_acts > 0.1).mean()),
            'top_activations': []
        }

        for pos in top_positions:
            # Guard against positions beyond the provided token list.
            if pos < len(input_tokens):
                visualization['top_activations'].append({
                    'token': input_tokens[pos],
                    'position': int(pos),
                    'activation': float(feature_acts[pos])
                })

        return visualization

    def interpret_feature(self, feature_idx: int, layer_idx: int,
                          visualization_data: Dict,
                          qwen_api_config: Optional[Dict[str, str]] = None) -> str:
        # Interprets a feature based on its top activating tokens.
        # Returns a cached interpretation when available; otherwise queries
        # the Qwen API (if configured) or applies simple token heuristics,
        # then caches the result both in memory and on disk.
        top_tokens = [item['token'] for item in visualization_data['top_activations']]

        cache_key = f"L{layer_idx}_F{feature_idx}"

        if cache_key in self.feature_interpretations:
            return self.feature_interpretations[cache_key]

        # Use the Qwen API if it is configured.
        if qwen_api_config and qwen_api_config.get('api_key'):
            feature_name = cache_key
            interpretation = get_feature_interpretation_with_qwen(
                qwen_api_config, top_tokens, feature_name, layer_idx
            )
        else:
            # Use a simple heuristic as a fallback.
            if len(set(top_tokens)) == 1 and top_tokens:
                interpretation = f"Specific token: '{top_tokens[0]}'"
            elif top_tokens and all(token.isalpha() for token in top_tokens):
                interpretation = "Word/alphabetic tokens"
            elif top_tokens and all(token.isdigit() for token in top_tokens):
                interpretation = "Numeric tokens"
            elif top_tokens and all(token in '.,!?;:' for token in top_tokens):
                interpretation = "Punctuation"
            else:
                interpretation = "Mixed/polysemantic feature"

        self.feature_interpretations[cache_key] = interpretation
        self._save_cache()
        return interpretation
| 262 |
+
|
| 263 |
+
class AttributionGraph:
|
| 264 |
+
# A class to construct and analyze attribution graphs.
|
| 265 |
+
|
| 266 |
+
    def __init__(self, clt: CrossLayerTranscoder, tokenizer, config: AttributionGraphConfig):
        """Set up an empty attribution graph over a trained CLT.

        Args:
            clt: Trained cross-layer transcoder whose features become nodes.
            tokenizer: Tokenizer used when labeling token-level nodes.
            config: Thresholds and limits used during graph construction.
        """
        self.clt = clt
        self.tokenizer = tokenizer
        self.config = config
        self.graph = nx.DiGraph()
        self.node_types = {}  # Track node types (feature, embedding, error, output)
        self.edge_weights = {}  # populated during graph construction; cleared on rebuild
        self.feature_metadata: Dict[str, Dict[str, Any]] = {}  # node id -> per-feature info
| 274 |
+
|
| 275 |
+
    def compute_virtual_weights(self, source_layer: int, target_layer: int,
                                source_feature: int, target_feature: int) -> float:
        """Virtual (decoder->encoder) weight from a source feature to a
        strictly later target feature.

        Sums, over every intermediate layer the source feature decodes into
        before the target layer, the inner product of that decoder column
        with the target feature's encoder row. NOTE(review): this treats
        each decoder write as landing directly in the residual stream read
        by the target encoder (layer norms / nonlinearities between layers
        are ignored), so it is a linear approximation of feature-to-feature
        influence -- confirm against the intended attribution formulation.
        """
        # Attribution flows only forward; same-layer or backward pairs get 0.
        if target_layer <= source_layer:
            return 0.0

        # Get the encoder and decoder weights.
        encoder_weight = self.clt.encoders[target_layer].weight[target_feature]  # [hidden_size]

        total_weight = 0.0
        for intermediate_layer in range(source_layer, target_layer):
            decoder_key = f"{source_layer}_to_{intermediate_layer}"
            if decoder_key in self.clt.decoders:
                decoder_weight = self.clt.decoders[decoder_key].weight[:, source_feature]  # [hidden_size]
                # The virtual weight is inner product
                virtual_weight = torch.dot(decoder_weight, encoder_weight).item()
                total_weight += virtual_weight

        return total_weight
| 294 |
+
|
| 295 |
+
def construct_graph(self, input_tokens: List[str],
|
| 296 |
+
feature_activations: List[torch.Tensor],
|
| 297 |
+
target_token_idx: int = -1) -> nx.DiGraph:
|
| 298 |
+
# Constructs the attribution graph for a prompt.
|
| 299 |
+
self.graph.clear()
|
| 300 |
+
self.node_types.clear()
|
| 301 |
+
self.edge_weights.clear()
|
| 302 |
+
|
| 303 |
+
seq_len = len(input_tokens)
|
| 304 |
+
n_layers = len(feature_activations)
|
| 305 |
+
|
| 306 |
+
# Add embedding nodes for the input tokens.
|
| 307 |
+
for i, token in enumerate(input_tokens):
|
| 308 |
+
node_id = f"emb_{i}_{token}"
|
| 309 |
+
self.graph.add_node(node_id)
|
| 310 |
+
self.node_types[node_id] = "embedding"
|
| 311 |
+
|
| 312 |
+
# Add nodes for the features.
|
| 313 |
+
active_features = {} # Track which features are significantly active
|
| 314 |
+
max_features_per_layer = self.config.graph_max_features_per_layer or 20 # Limit features per layer to prevent explosion
|
| 315 |
+
activation_threshold = self.config.graph_feature_activation_threshold
|
| 316 |
+
edge_weight_threshold = self.config.graph_edge_weight_threshold
|
| 317 |
+
max_edges_per_node_cfg = self.config.graph_max_edges_per_node or 5
|
| 318 |
+
|
| 319 |
+
for layer_idx, features in enumerate(feature_activations):
|
| 320 |
+
# features shape: [batch_size, seq_len, n_features]
|
| 321 |
+
batch_size, seq_len_layer, n_features = features.shape
|
| 322 |
+
|
| 323 |
+
# Get the top activating features for this layer.
|
| 324 |
+
layer_activations = features[0].mean(dim=0) # Average across sequence
|
| 325 |
+
top_features = torch.topk(layer_activations,
|
| 326 |
+
k=min(max_features_per_layer, n_features)).indices
|
| 327 |
+
|
| 328 |
+
for token_pos in range(min(seq_len, seq_len_layer)):
|
| 329 |
+
for feat_idx in top_features:
|
| 330 |
+
activation = features[0, token_pos, feat_idx.item()].item()
|
| 331 |
+
if activation > activation_threshold:
|
| 332 |
+
node_id = f"feat_L{layer_idx}_T{token_pos}_F{feat_idx.item()}"
|
| 333 |
+
self.graph.add_node(node_id)
|
| 334 |
+
self.node_types[node_id] = "feature"
|
| 335 |
+
active_features[node_id] = {
|
| 336 |
+
'layer': layer_idx,
|
| 337 |
+
'token_pos': token_pos,
|
| 338 |
+
'feature_idx': feat_idx.item(),
|
| 339 |
+
'activation': activation
|
| 340 |
+
}
|
| 341 |
+
self.feature_metadata[node_id] = {
|
| 342 |
+
'layer': layer_idx,
|
| 343 |
+
'token_position': token_pos,
|
| 344 |
+
'feature_index': feat_idx.item(),
|
| 345 |
+
'activation': activation,
|
| 346 |
+
'input_token': input_tokens[token_pos] if token_pos < len(input_tokens) else None
|
| 347 |
+
}
|
| 348 |
+
|
| 349 |
+
# Add an output node for the target token.
|
| 350 |
+
output_node = f"output_{target_token_idx}"
|
| 351 |
+
self.graph.add_node(output_node)
|
| 352 |
+
self.node_types[output_node] = "output"
|
| 353 |
+
|
| 354 |
+
# Add edges based on virtual weights and activations.
|
| 355 |
+
feature_nodes = [node for node, type_ in self.node_types.items() if type_ == "feature"]
|
| 356 |
+
print(f" Building attribution graph: {len(feature_nodes)} feature nodes, {len(self.graph.nodes())} total nodes")
|
| 357 |
+
|
| 358 |
+
# Limit the number of edges to compute.
|
| 359 |
+
max_edges_per_node = max(max_edges_per_node_cfg, 1) # Limit connections per node
|
| 360 |
+
|
| 361 |
+
for i, source_node in enumerate(feature_nodes):
|
| 362 |
+
if i % 50 == 0: # Progress indicator
|
| 363 |
+
print(f" Processing node {i+1}/{len(feature_nodes)}")
|
| 364 |
+
|
| 365 |
+
edges_added = 0
|
| 366 |
+
source_info = active_features[source_node]
|
| 367 |
+
source_activation = source_info['activation']
|
| 368 |
+
|
| 369 |
+
# Add edges to other features.
|
| 370 |
+
for target_node in feature_nodes:
|
| 371 |
+
if source_node == target_node or edges_added >= max_edges_per_node:
|
| 372 |
+
continue
|
| 373 |
+
|
| 374 |
+
target_info = active_features[target_node]
|
| 375 |
+
|
| 376 |
+
# Only add edges that go forward in the network.
|
| 377 |
+
if (target_info['layer'] > source_info['layer'] or
|
| 378 |
+
(target_info['layer'] == source_info['layer'] and
|
| 379 |
+
target_info['token_pos'] > source_info['token_pos'])):
|
| 380 |
+
|
| 381 |
+
virtual_weight = self.compute_virtual_weights(
|
| 382 |
+
source_info['layer'], target_info['layer'],
|
| 383 |
+
source_info['feature_idx'], target_info['feature_idx']
|
| 384 |
+
)
|
| 385 |
+
|
| 386 |
+
if abs(virtual_weight) > edge_weight_threshold:
|
| 387 |
+
edge_weight = source_activation * virtual_weight
|
| 388 |
+
self.graph.add_edge(source_node, target_node, weight=edge_weight)
|
| 389 |
+
self.edge_weights[(source_node, target_node)] = edge_weight
|
| 390 |
+
edges_added += 1
|
| 391 |
+
|
| 392 |
+
# Add edges to the output node.
|
| 393 |
+
layer_position = source_info['layer']
|
| 394 |
+
# Allow contributions from all layers, with smaller weights for early layers.
|
| 395 |
+
layer_scale = 0.1 if layer_position >= n_layers - 2 else max(0.05, 0.1 * (layer_position + 1) / n_layers)
|
| 396 |
+
output_weight = source_activation * layer_scale
|
| 397 |
+
if abs(output_weight) > 0:
|
| 398 |
+
self.graph.add_edge(source_node, output_node, weight=output_weight)
|
| 399 |
+
self.edge_weights[(source_node, output_node)] = output_weight
|
| 400 |
+
|
| 401 |
+
# Add edges from embeddings to early features.
|
| 402 |
+
for emb_node in [node for node, type_ in self.node_types.items() if type_ == "embedding"]:
|
| 403 |
+
token_idx = int(emb_node.split('_')[1])
|
| 404 |
+
for feat_node in feature_nodes:
|
| 405 |
+
feat_info = active_features[feat_node]
|
| 406 |
+
if feat_info['layer'] == 0 and feat_info['token_pos'] == token_idx:
|
| 407 |
+
# Direct connection from an embedding to a first-layer feature.
|
| 408 |
+
weight = feat_info['activation'] * 0.5 # Simplified
|
| 409 |
+
self.graph.add_edge(emb_node, feat_node, weight=weight)
|
| 410 |
+
self.edge_weights[(emb_node, feat_node)] = weight
|
| 411 |
+
|
| 412 |
+
return self.graph
|
| 413 |
+
|
| 414 |
+
def prune_graph(self, threshold: float = 0.8) -> nx.DiGraph:
|
| 415 |
+
# Prunes the graph to keep only the most important nodes.
|
| 416 |
+
# Calculate node importance based on edge weights.
|
| 417 |
+
node_importance = defaultdict(float)
|
| 418 |
+
|
| 419 |
+
for (source, target), weight in self.edge_weights.items():
|
| 420 |
+
node_importance[source] += abs(weight)
|
| 421 |
+
node_importance[target] += abs(weight)
|
| 422 |
+
|
| 423 |
+
# Keep the top nodes by importance.
|
| 424 |
+
sorted_nodes = sorted(node_importance.items(), key=lambda x: x[1], reverse=True)
|
| 425 |
+
n_keep = int(len(sorted_nodes) * threshold)
|
| 426 |
+
important_nodes = set([node for node, _ in sorted_nodes[:n_keep]])
|
| 427 |
+
|
| 428 |
+
# Always keep the output and embedding nodes.
|
| 429 |
+
for node, type_ in self.node_types.items():
|
| 430 |
+
if type_ in ["output", "embedding"]:
|
| 431 |
+
important_nodes.add(node)
|
| 432 |
+
|
| 433 |
+
# Create the pruned graph.
|
| 434 |
+
pruned_graph = self.graph.subgraph(important_nodes).copy()
|
| 435 |
+
|
| 436 |
+
return pruned_graph
|
| 437 |
+
|
| 438 |
+
def visualize_graph(self, graph: nx.DiGraph = None, save_path: str = None):
|
| 439 |
+
# Visualizes the attribution graph.
|
| 440 |
+
if graph is None:
|
| 441 |
+
graph = self.graph
|
| 442 |
+
|
| 443 |
+
plt.figure(figsize=(12, 8))
|
| 444 |
+
|
| 445 |
+
# Create a layout for the graph.
|
| 446 |
+
pos = nx.spring_layout(graph, k=1, iterations=50)
|
| 447 |
+
|
| 448 |
+
# Color the nodes by type.
|
| 449 |
+
node_colors = []
|
| 450 |
+
for node in graph.nodes():
|
| 451 |
+
node_type = self.node_types.get(node, "unknown")
|
| 452 |
+
if node_type == "embedding":
|
| 453 |
+
node_colors.append('lightblue')
|
| 454 |
+
elif node_type == "feature":
|
| 455 |
+
node_colors.append('lightgreen')
|
| 456 |
+
elif node_type == "output":
|
| 457 |
+
node_colors.append('orange')
|
| 458 |
+
else:
|
| 459 |
+
node_colors.append('gray')
|
| 460 |
+
|
| 461 |
+
# Draw the nodes.
|
| 462 |
+
nx.draw_networkx_nodes(graph, pos, node_color=node_colors,
|
| 463 |
+
node_size=300, alpha=0.8)
|
| 464 |
+
|
| 465 |
+
# Draw the edges with thickness based on weight.
|
| 466 |
+
edges = graph.edges()
|
| 467 |
+
edge_weights = [abs(self.edge_weights.get((u, v), 0.1)) for u, v in edges]
|
| 468 |
+
max_weight = max(edge_weights) if edge_weights else 1
|
| 469 |
+
edge_widths = [w / max_weight * 3 for w in edge_weights]
|
| 470 |
+
|
| 471 |
+
nx.draw_networkx_edges(graph, pos, width=edge_widths, alpha=0.6,
|
| 472 |
+
edge_color='gray', arrows=True)
|
| 473 |
+
|
| 474 |
+
# Draw the labels.
|
| 475 |
+
nx.draw_networkx_labels(graph, pos, font_size=8)
|
| 476 |
+
|
| 477 |
+
plt.title("Attribution Graph")
|
| 478 |
+
plt.axis('off')
|
| 479 |
+
|
| 480 |
+
if save_path:
|
| 481 |
+
plt.savefig(save_path, dpi=300, bbox_inches='tight')
|
| 482 |
+
plt.show()
|
| 483 |
+
|
| 484 |
+
class PerturbationExperiments:
    """Runs feature-ablation (perturbation) experiments to validate hypotheses.

    An ablation subtracts a CLT feature set's reconstructed contribution from
    the transformer blocks' residual outputs via forward hooks, then compares
    the perturbed next-token distribution against the baseline.
    """

    def __init__(self, model, clt: CrossLayerTranscoder, tokenizer):
        """Store the base model, trained CLT, and tokenizer; block list is lazy."""
        self.model = model
        self.clt = clt
        self.tokenizer = tokenizer
        self._transformer_blocks: Optional[List[nn.Module]] = None  # cached by _get_transformer_blocks

    def _get_transformer_blocks(self) -> List[nn.Module]:
        """Locate and cache the model's per-layer transformer blocks.

        Scans named modules for an nn.ModuleList whose length equals
        config.num_hidden_layers, preferring names ending in "layers",
        "blocks", or "h" (common HF naming conventions).

        Raises:
            ValueError: if the config lacks num_hidden_layers or no
                matching ModuleList is found.
        """
        if self._transformer_blocks is not None:
            return self._transformer_blocks

        n_layers = getattr(self.model.config, "num_hidden_layers", None)
        if n_layers is None:
            raise ValueError("Model config does not expose num_hidden_layers; cannot resolve transformer blocks.")

        candidate_lists: List[Tuple[str, nn.ModuleList]] = []
        for name, module in self.model.named_modules():
            if isinstance(module, nn.ModuleList) and len(module) == n_layers:
                candidate_lists.append((name, module))

        if not candidate_lists:
            raise ValueError("Unable to locate transformer block ModuleList in model.")

        # Prefer names that look like transformer blocks.
        def _score(name: str) -> Tuple[int, str]:
            preferred_suffixes = ("layers", "blocks", "h")
            for idx, suffix in enumerate(preferred_suffixes):
                if name.endswith(suffix):
                    return (idx, name)
            return (len(preferred_suffixes), name)

        selected_name, selected_list = sorted(candidate_lists, key=lambda item: _score(item[0]))[0]
        self._transformer_blocks = list(selected_list)
        logger.debug(f"Resolved transformer blocks from ModuleList '{selected_name}'.")
        return self._transformer_blocks

    def _format_top_tokens(self, top_tokens: torch.return_types.topk) -> List[Tuple[str, float]]:
        """Convert a topk result over vocab probabilities into (token_str, prob) pairs."""
        return [
            (self.tokenizer.decode([idx]), prob.item())
            for idx, prob in zip(top_tokens.indices, top_tokens.values)
        ]

    def _prepare_inputs(self, input_text: str, top_k: int) -> Dict[str, Any]:
        """Tokenize, run the baseline forward pass, and extract CLT activations.

        Returns a dict bundling the tokenized inputs, baseline logits/probs at
        the final position, the top-k baseline tokens, post-embedding hidden
        states, and CLT feature activations.

        Raises:
            ValueError: if tokenization produces a batch larger than 1.
        """
        if torch.backends.mps.is_available():
            torch.mps.empty_cache()

        device = next(self.model.parameters()).device
        inputs = self.tokenizer(
            input_text,
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=512
        )
        if inputs["input_ids"].size(0) != 1:
            raise ValueError("Perturbation experiments currently support only batch size 1.")
        inputs = {k: v.to(device) for k, v in inputs.items()}

        with torch.no_grad():
            baseline_outputs = self.model(**inputs, output_hidden_states=True, return_dict=True)

        baseline_logits = baseline_outputs.logits[0]
        # Score the prediction at the last sequence position.
        target_position = baseline_logits.size(0) - 1
        baseline_last_token_logits = baseline_logits[target_position]
        baseline_probs = F.softmax(baseline_last_token_logits, dim=-1)
        baseline_top_tokens = torch.topk(baseline_probs, k=top_k)

        # Drop the embedding layer's hidden state; the CLT consumes per-block outputs.
        hidden_states: List[torch.Tensor] = list(baseline_outputs.hidden_states[1:])
        with torch.no_grad():
            feature_activations, _ = self.clt(hidden_states)

        return {
            'inputs': inputs,
            'baseline_outputs': baseline_outputs,
            'baseline_logits': baseline_logits,
            'baseline_last_token_logits': baseline_last_token_logits,
            'baseline_probs': baseline_probs,
            'baseline_top_tokens': baseline_top_tokens,
            'target_position': target_position,
            'hidden_states': hidden_states,
            'feature_activations': feature_activations,
            'default_target_token_id': baseline_top_tokens.indices[0].item()
        }

    def _compute_feature_contributions(
        self,
        feature_activations: List[torch.Tensor],
        feature_set: List[Tuple[int, int]]
    ) -> Dict[int, torch.Tensor]:
        """Sum each feature's decoded contribution per destination layer.

        For every (layer, feature) pair, multiplies the feature's activation
        trace by the decoder column for each destination layer it writes to.
        Out-of-range layer or feature indices are silently skipped.

        Returns:
            dest_layer -> contribution tensor of shape [batch, seq, hidden].
        """
        contributions: Dict[int, torch.Tensor] = {}
        with torch.no_grad():
            for layer_idx, feature_idx in feature_set:
                if layer_idx >= len(feature_activations):
                    continue
                features = feature_activations[layer_idx]
                if feature_idx >= features.size(-1):
                    continue
                feature_values = features[:, :, feature_idx].detach()

                for dest_layer in range(layer_idx, self.clt.n_layers):
                    decoder_key = f"{layer_idx}_to_{dest_layer}"
                    if decoder_key not in self.clt.decoders:
                        continue
                    decoder = self.clt.decoders[decoder_key]
                    weight_column = decoder.weight[:, feature_idx]
                    # Outer product over the hidden dim: [b,s] x [h] -> [b,s,h].
                    contrib = torch.einsum('bs,h->bsh', feature_values, weight_column).detach()
                    if dest_layer in contributions:
                        contributions[dest_layer] += contrib
                    else:
                        contributions[dest_layer] = contrib
        return contributions

    def _run_with_hooks(
        self,
        inputs: Dict[str, torch.Tensor],
        contributions: Dict[int, torch.Tensor],
        intervention_strength: float
    ):
        """Run a forward pass with hooks subtracting scaled contributions.

        A forward hook on each affected transformer block subtracts
        `intervention_strength * contribution` from that block's hidden-state
        output.  Hooks are always removed, even on failure.
        """
        blocks = self._get_transformer_blocks()
        handles: List[Any] = []

        def _make_hook(cached_contrib: torch.Tensor):
            # Close over one layer's contribution tensor.
            def hook(module, module_input, module_output):
                # Block outputs vary by architecture: tensor, tuple/list with the
                # hidden state first, or an output object with last_hidden_state.
                if isinstance(module_output, torch.Tensor):
                    target_tensor = module_output
                elif isinstance(module_output, (tuple, list)):
                    target_tensor = module_output[0]
                elif hasattr(module_output, "last_hidden_state"):
                    target_tensor = module_output.last_hidden_state
                else:
                    raise TypeError(
                        f"Unsupported module output type '{type(module_output)}' for perturbation hook."
                    )

                tensor_contrib = cached_contrib.to(target_tensor.device).to(target_tensor.dtype)
                scaled = intervention_strength * tensor_contrib

                # Return the modified output in the same container type it arrived in.
                if isinstance(module_output, torch.Tensor):
                    return module_output - scaled
                elif isinstance(module_output, tuple):
                    modified = module_output[0] - scaled
                    return (modified,) + tuple(module_output[1:])
                elif isinstance(module_output, list):
                    modified = [module_output[0] - scaled, *module_output[1:]]
                    return modified
                else:
                    module_output.last_hidden_state = module_output.last_hidden_state - scaled
                    return module_output
            return hook

        try:
            for dest_layer, contrib in contributions.items():
                if dest_layer >= len(blocks):
                    continue
                handles.append(blocks[dest_layer].register_forward_hook(_make_hook(contrib)))

            with torch.no_grad():
                outputs = self.model(**inputs, output_hidden_states=True, return_dict=True)
        finally:
            # Always detach the hooks so later forward passes are unperturbed.
            for handle in handles:
                handle.remove()

        return outputs

    def feature_set_ablation_experiment(
        self,
        input_text: str,
        feature_set: List[Tuple[int, int]],
        intervention_strength: float = 5.0,
        target_token_id: Optional[int] = None,
        top_k: int = 5,
        ablation_label: str = "feature_set",
        extra_metadata: Optional[Dict[str, Any]] = None
    ) -> Dict[str, Any]:
        """Ablate a set of (layer, feature) pairs and measure the effect.

        Compares baseline vs. ablated next-token distributions for the token
        at the final position, reporting probability/logit changes, KL
        divergence, entropy change, hidden-state deltas, and whether the top
        prediction flipped.  If no contributions are found the baseline is
        echoed back with a 'no_contributions_found' warning; on any exception
        a zeroed result with an 'error' key is returned instead of raising.

        Args:
            input_text: Prompt to analyze.
            feature_set: (layer_idx, feature_idx) pairs to ablate together.
            intervention_strength: Scale applied to the subtracted contribution.
            target_token_id: Token whose probability is tracked; defaults to
                the baseline top-1 prediction.
            top_k: How many top tokens to report for each distribution.
            ablation_label: Tag stored under 'ablation_type' in the result.
            extra_metadata: Optional extra keys merged into the result dict.
        """
        try:
            baseline_data = self._prepare_inputs(input_text, top_k)
            if target_token_id is None:
                target_token_id = baseline_data['default_target_token_id']

            feature_set_normalized = [
                (int(layer_idx), int(feature_idx)) for layer_idx, feature_idx in feature_set
            ]
            contributions = self._compute_feature_contributions(
                baseline_data['feature_activations'],
                feature_set_normalized
            )

            baseline_probs = baseline_data['baseline_probs']
            baseline_top_tokens = baseline_data['baseline_top_tokens']
            baseline_last_token_logits = baseline_data['baseline_last_token_logits']
            target_position = baseline_data['target_position']
            hidden_states = baseline_data['hidden_states']

            baseline_prob = baseline_probs[target_token_id].item()
            baseline_logit = baseline_last_token_logits[target_token_id].item()
            baseline_summary = {
                'baseline_top_tokens': self._format_top_tokens(baseline_top_tokens),
                'baseline_probability': baseline_prob,
                'baseline_logit': baseline_logit
            }

            if not contributions:
                # Nothing to ablate: report a no-op result mirroring the baseline.
                result = {
                    **baseline_summary,
                    'ablated_top_tokens': baseline_summary['baseline_top_tokens'],
                    'ablated_probability': baseline_prob,
                    'ablated_logit': baseline_logit,
                    'probability_change': 0.0,
                    'logit_change': 0.0,
                    'kl_divergence': 0.0,
                    'entropy_change': 0.0,
                    'hidden_state_delta_norm': 0.0,
                    'hidden_state_relative_change': 0.0,
                    'ablation_flips_top_prediction': False,
                    'feature_set': [
                        {'layer': layer_idx, 'feature': feature_idx}
                        for layer_idx, feature_idx in feature_set_normalized
                    ],
                    'feature_set_size': len(feature_set_normalized),
                    'intervention_strength': intervention_strength,
                    'target_token_id': target_token_id,
                    'target_token': self.tokenizer.decode([target_token_id]),
                    'contributing_layers': [],
                    'ablation_applied': False,
                    'ablation_type': ablation_label,
                    'warning': 'no_contributions_found'
                }
                if extra_metadata:
                    result.update(extra_metadata)
                return result

            ablated_outputs = self._run_with_hooks(
                baseline_data['inputs'],
                contributions,
                intervention_strength
            )

            ablated_logits = ablated_outputs.logits[0, target_position]
            ablated_probs = F.softmax(ablated_logits, dim=-1)
            ablated_top_tokens = torch.topk(ablated_probs, k=top_k)

            ablated_prob = ablated_probs[target_token_id].item()
            ablated_logit = ablated_logits[target_token_id].item()

            # KL(baseline || ablated); epsilon guards against log(0).
            epsilon = 1e-9
            kl_divergence = torch.sum(
                baseline_probs * (torch.log(baseline_probs + epsilon) - torch.log(ablated_probs + epsilon))
            ).item()
            if not np.isfinite(kl_divergence):
                kl_divergence = 0.0

            entropy_baseline = -(baseline_probs * torch.log(baseline_probs + epsilon)).sum().item()
            entropy_ablated = -(ablated_probs * torch.log(ablated_probs + epsilon)).sum().item()
            entropy_change = entropy_ablated - entropy_baseline
            if not np.isfinite(entropy_change):
                entropy_change = 0.0

            # Compare final-layer hidden states at the target position.
            baseline_hidden = hidden_states[-1][:, target_position, :]
            ablated_hidden = ablated_outputs.hidden_states[-1][:, target_position, :]
            hidden_delta_norm = torch.norm(baseline_hidden - ablated_hidden, dim=-1).item()
            hidden_baseline_norm = torch.norm(baseline_hidden, dim=-1).item()
            hidden_relative_change = hidden_delta_norm / (hidden_baseline_norm + 1e-9)

            result = {
                **baseline_summary,
                'ablated_top_tokens': self._format_top_tokens(ablated_top_tokens),
                'ablated_probability': ablated_prob,
                'ablated_logit': ablated_logit,
                'probability_change': baseline_prob - ablated_prob,
                'logit_change': baseline_logit - ablated_logit,
                'kl_divergence': kl_divergence,
                'entropy_change': entropy_change,
                'hidden_state_delta_norm': hidden_delta_norm,
                'hidden_state_relative_change': hidden_relative_change,
                'ablation_flips_top_prediction': bool(
                    baseline_top_tokens.indices[0].item() != ablated_top_tokens.indices[0].item()
                ),
                'feature_set': [
                    {'layer': layer_idx, 'feature': feature_idx}
                    for layer_idx, feature_idx in feature_set_normalized
                ],
                'feature_set_size': len(feature_set_normalized),
                'intervention_strength': intervention_strength,
                'target_token_id': target_token_id,
                'target_token': self.tokenizer.decode([target_token_id]),
                'contributing_layers': sorted(list(contributions.keys())),
                'ablation_applied': True,
                'ablation_type': ablation_label
            }
            if extra_metadata:
                result.update(extra_metadata)
            return result

        except Exception as e:
            # Best-effort design: experiment failures are reported, not raised.
            logger.warning(f"Perturbation experiment failed: {e}")
            return {
                'baseline_top_tokens': [],
                'ablated_top_tokens': [],
                'feature_set': [
                    {'layer': layer_idx, 'feature': feature_idx}
                    for layer_idx, feature_idx in feature_set
                ],
                'feature_set_size': len(feature_set),
                'intervention_strength': intervention_strength,
                'probability_change': 0.0,
                'logit_change': 0.0,
                'kl_divergence': 0.0,
                'entropy_change': 0.0,
                'hidden_state_delta_norm': 0.0,
                'hidden_state_relative_change': 0.0,
                'ablation_flips_top_prediction': False,
                'ablation_applied': False,
                'ablation_type': ablation_label,
                'error': str(e)
            }

    def feature_ablation_experiment(
        self,
        input_text: str,
        target_layer: int,
        target_feature: int,
        intervention_strength: float = 5.0,
        target_token_id: Optional[int] = None,
        top_k: int = 5,
    ) -> Dict[str, Any]:
        """Convenience wrapper: ablate a single (layer, feature) pair."""
        return self.feature_set_ablation_experiment(
            input_text=input_text,
            feature_set=[(target_layer, target_feature)],
            intervention_strength=intervention_strength,
            target_token_id=target_token_id,
            top_k=top_k,
            ablation_label="targeted_feature"
        )

    def random_feature_ablation_experiment(
        self,
        input_text: str,
        num_features: int = 1,
        intervention_strength: float = 5.0,
        target_token_id: Optional[int] = None,
        top_k: int = 5,
        seed: Optional[int] = None
    ) -> Dict[str, Any]:
        """Control experiment: ablate uniformly random features.

        Samples `num_features` (layer, feature) pairs (with possible repeats)
        from a seeded RNG and runs the same ablation, labeled
        "random_baseline" with the seed recorded in the result.
        """
        rng = random.Random(seed)
        num_features = max(1, int(num_features))
        feature_set: List[Tuple[int, int]] = []
        for _ in range(num_features):
            layer_idx = rng.randrange(self.clt.n_layers)
            feature_idx = rng.randrange(self.clt.n_features)
            feature_set.append((layer_idx, feature_idx))

        result = self.feature_set_ablation_experiment(
            input_text=input_text,
            feature_set=feature_set,
            intervention_strength=intervention_strength,
            target_token_id=target_token_id,
            top_k=top_k,
            ablation_label="random_baseline",
            extra_metadata={'random_seed': seed}
        )
        return result
|
| 847 |
+
|
| 848 |
+
class AttributionGraphsPipeline:
|
| 849 |
+
# The main pipeline for the attribution graph analysis.
|
| 850 |
+
|
| 851 |
+
def __init__(self, config: AttributionGraphConfig):
|
| 852 |
+
self.config = config
|
| 853 |
+
self.device = torch.device(config.device)
|
| 854 |
+
|
| 855 |
+
# Load the model and tokenizer.
|
| 856 |
+
logger.info(f"Loading OLMo2 7B model from {config.model_path}")
|
| 857 |
+
self.tokenizer = AutoTokenizer.from_pretrained(config.model_path)
|
| 858 |
+
|
| 859 |
+
# Configure model loading based on the device.
|
| 860 |
+
if "mps" in config.device:
|
| 861 |
+
# MPS supports float16 but not device_map.
|
| 862 |
+
self.model = AutoModelForCausalLM.from_pretrained(
|
| 863 |
+
config.model_path,
|
| 864 |
+
torch_dtype=torch.float16,
|
| 865 |
+
device_map=None
|
| 866 |
+
).to(self.device)
|
| 867 |
+
elif "cuda" in config.device:
|
| 868 |
+
self.model = AutoModelForCausalLM.from_pretrained(
|
| 869 |
+
config.model_path,
|
| 870 |
+
torch_dtype=torch.float16,
|
| 871 |
+
device_map="auto"
|
| 872 |
+
)
|
| 873 |
+
else:
|
| 874 |
+
# CPU
|
| 875 |
+
self.model = AutoModelForCausalLM.from_pretrained(
|
| 876 |
+
config.model_path,
|
| 877 |
+
torch_dtype=torch.float32,
|
| 878 |
+
device_map=None
|
| 879 |
+
).to(self.device)
|
| 880 |
+
|
| 881 |
+
if self.tokenizer.pad_token is None:
|
| 882 |
+
self.tokenizer.pad_token = self.tokenizer.eos_token
|
| 883 |
+
|
| 884 |
+
# Initialize the CLT.
|
| 885 |
+
model_config = self.model.config.to_dict()
|
| 886 |
+
self.clt = CrossLayerTranscoder(model_config, config).to(self.device)
|
| 887 |
+
|
| 888 |
+
# Initialize the other components.
|
| 889 |
+
# cache_dir = Path(RESULTS_DIR) / "feature_interpretations_cache"
|
| 890 |
+
# Disable persistent caching to ensure interpretations are prompt-specific and not reused from other contexts.
|
| 891 |
+
self.feature_visualizer = FeatureVisualizer(self.tokenizer, cache_dir=None)
|
| 892 |
+
self.attribution_graph = AttributionGraph(self.clt, self.tokenizer, config)
|
| 893 |
+
self.perturbation_experiments = PerturbationExperiments(self.model, self.clt, self.tokenizer)
|
| 894 |
+
|
| 895 |
+
logger.info("Attribution Graphs Pipeline initialized successfully")
|
| 896 |
+
|
| 897 |
+
    def train_clt(self, training_texts: List[str]) -> Dict:
        """Train the Cross-Layer Transcoder on activations from sample texts.

        For each training step, samples a batch of texts (with replacement),
        extracts the base model's hidden states under no_grad, runs the CLT,
        and optimizes a reconstruction-plus-sparsity objective with Adam.

        Args:
            training_texts: Candidate texts to sample training batches from.

        Returns:
            Dict of per-step loss histories with keys 'total_losses',
            'reconstruction_losses', and 'sparsity_losses'.
        """
        logger.info("Starting CLT training...")

        optimizer = torch.optim.Adam(self.clt.parameters(), lr=self.config.learning_rate)

        training_stats = {
            'reconstruction_losses': [],
            'sparsity_losses': [],
            'total_losses': []
        }

        for step in tqdm(range(self.config.training_steps), desc="Training CLT"):
            # Sample a batch of texts.
            batch_texts = np.random.choice(training_texts, size=self.config.batch_size)

            # Accumulated across the batch; become tensors after the first `+=`.
            total_loss = 0.0
            total_recon_loss = 0.0
            total_sparsity_loss = 0.0

            for text in batch_texts:
                # Tokenize the text.
                inputs = self.tokenizer(text, return_tensors="pt", max_length=self.config.max_seq_length,
                                        truncation=True, padding=True).to(self.device)

                # Get the model activations (frozen base model, no grads).
                with torch.no_grad():
                    outputs = self.model(**inputs, output_hidden_states=True)
                    hidden_states = outputs.hidden_states[1:]

                # Forward pass through the CLT.
                feature_activations, reconstructed_outputs = self.clt(hidden_states)

                # Compute the reconstruction loss (MSE summed over layers).
                recon_loss = 0.0
                for i, (target, pred) in enumerate(zip(hidden_states, reconstructed_outputs)):
                    recon_loss += F.mse_loss(pred, target)

                # Compute the sparsity loss (tanh saturates large activations).
                sparsity_loss = 0.0
                for features in feature_activations:
                    sparsity_loss += torch.mean(torch.tanh(self.config.sparsity_lambda * features))

                # Total loss.
                # NOTE(review): sparsity_lambda is applied both inside tanh above
                # and as the weight here — confirm this double scaling is intended.
                loss = (self.config.reconstruction_loss_weight * recon_loss +
                        self.config.sparsity_lambda * sparsity_loss)

                total_loss += loss
                total_recon_loss += recon_loss
                total_sparsity_loss += sparsity_loss

            # Average the losses.
            total_loss /= self.config.batch_size
            total_recon_loss /= self.config.batch_size
            total_sparsity_loss /= self.config.batch_size

            # Backward pass.
            optimizer.zero_grad()
            total_loss.backward()
            optimizer.step()

            # Log the progress.
            training_stats['total_losses'].append(total_loss.item())
            training_stats['reconstruction_losses'].append(total_recon_loss.item())
            training_stats['sparsity_losses'].append(total_sparsity_loss.item())

            if step % 100 == 0:
                logger.info(f"Step {step}: Total Loss = {total_loss.item():.4f}, "
                            f"Recon Loss = {total_recon_loss.item():.4f}, "
                            f"Sparsity Loss = {total_sparsity_loss.item():.4f}")

        logger.info("CLT training completed")
        return training_stats
|
| 970 |
+
|
| 971 |
+
def analyze_prompt(self, prompt: str, target_token_idx: int = -1) -> Dict:
    """Run the full interpretability pipeline for a single prompt.

    Pipeline: tokenize -> capture hidden states -> CLT feature activations ->
    per-layer feature visualization/interpretation -> attribution graph
    construction and pruning -> important-path extraction -> targeted,
    random-baseline, path, and random-path ablation experiments -> summary
    statistics over all experiment groups.

    Args:
        prompt: Text to analyze.
        target_token_idx: Token position handed to graph construction
            (default -1; presumably the last token — confirm against
            the attribution-graph implementation).

    Returns:
        Dict with the prompt, its tokens, per-layer feature visualizations,
        full/pruned graph statistics, the pruned graph object itself, the
        top-5 important paths, all experiment result lists, and
        ``summary_statistics`` comparing targeted vs. random ablations.
    """
    # Performs a complete analysis for a single prompt.
    logger.info(f"Analyzing prompt: '{prompt[:50]}...'")

    # Tokenize the prompt.
    inputs = self.tokenizer(prompt, return_tensors="pt").to(self.device)
    input_tokens = self.tokenizer.convert_ids_to_tokens(inputs["input_ids"][0])

    # Get the model activations (skip the embedding layer's hidden state).
    with torch.no_grad():
        outputs = self.model(**inputs, output_hidden_states=True)
    hidden_states = outputs.hidden_states[1:]

    # Forward pass through the CLT.
    feature_activations, reconstructed_outputs = self.clt(hidden_states)

    logger.info(" > Starting feature visualization and interpretation...")
    feature_visualizations = {}
    for layer_idx, features in enumerate(feature_activations):
        logger.info(f" - Processing Layer {layer_idx}...")
        layer_viz = {}
        # Analyze the top features for this layer.
        # features shape: [batch_size, seq_len, n_features]
        feature_importance = torch.mean(features, dim=(0, 1))  # Average over batch and sequence
        top_features = torch.topk(feature_importance, k=min(5, feature_importance.size(0))).indices

        for feat_idx in top_features:
            # features[0]: single-prompt batch, so take the only batch element.
            viz = self.feature_visualizer.visualize_feature(
                feat_idx.item(), layer_idx, features[0], input_tokens
            )
            interpretation = self.feature_visualizer.interpret_feature(
                feat_idx.item(), layer_idx, viz, self.config.qwen_api_config
            )
            viz['interpretation'] = interpretation
            layer_viz[f"feature_{feat_idx.item()}"] = viz

        feature_visualizations[f"layer_{layer_idx}"] = layer_viz

    # Construct the attribution graph.
    graph = self.attribution_graph.construct_graph(
        input_tokens, feature_activations, target_token_idx
    )

    # Prune the graph. NOTE(review): prune_graph takes no graph argument, so it
    # presumably operates on state stored by construct_graph — confirm.
    pruned_graph = self.attribution_graph.prune_graph(self.config.pruning_threshold)

    # Analyze the most important paths (embedding -> ... -> output).
    important_paths = []
    if len(pruned_graph.nodes()) > 0:
        # Find paths from embeddings to the output.
        embedding_nodes = [node for node, type_ in self.attribution_graph.node_types.items()
                           if type_ == "embedding" and node in pruned_graph]
        output_nodes = [node for node, type_ in self.attribution_graph.node_types.items()
                        if type_ == "output" and node in pruned_graph]

        for emb_node in embedding_nodes[:3]:  # Top 3 embedding nodes
            for out_node in output_nodes:
                try:
                    # cutoff=5 bounds path length to keep enumeration tractable.
                    paths = list(nx.all_simple_paths(pruned_graph, emb_node, out_node, cutoff=5))
                    for path in paths[:2]:  # Top 2 paths
                        # Path weight = product of |edge weights| along the path.
                        path_weight = 1.0
                        for i in range(len(path) - 1):
                            edge_weight = self.attribution_graph.edge_weights.get(
                                (path[i], path[i+1]), 0.0
                            )
                            path_weight *= abs(edge_weight)

                        important_paths.append({
                            'path': path,
                            'weight': path_weight,
                            'description': self._describe_path(path)
                        })
                except nx.NetworkXNoPath:
                    continue

    # Sort paths by importance.
    important_paths.sort(key=lambda x: x['weight'], reverse=True)

    # Run targeted perturbation experiments for highlighted features.
    targeted_feature_ablation_results: List[Dict[str, Any]] = []
    max_total_experiments = self.config.max_ablation_experiments
    per_layer_limit = self.config.ablation_features_per_layer
    total_run = 0
    stop_all = False  # set once the global experiment budget is exhausted
    for layer_name, layer_features in feature_visualizations.items():
        if stop_all:
            break
        try:
            # Keys are "layer_{idx}" — recover the numeric layer index.
            layer_idx = int(layer_name.split('_')[1])
        except (IndexError, ValueError):
            logger.warning(f"Unable to parse layer index from key '{layer_name}'. Skipping perturbation experiments for this layer.")
            continue

        feature_items = list(layer_features.items())
        if per_layer_limit is not None:
            feature_items = feature_items[:per_layer_limit]

        for feature_name, feature_payload in feature_items:
            if max_total_experiments is not None and total_run >= max_total_experiments:
                stop_all = True
                break
            try:
                # Keys are "feature_{idx}" — recover the numeric feature index.
                feature_idx = int(feature_name.split('_')[1])
            except (IndexError, ValueError):
                logger.warning(f"Unable to parse feature index from key '{feature_name}'. Skipping perturbation experiment.")
                continue

            ablation = self.perturbation_experiments.feature_ablation_experiment(
                prompt,
                layer_idx,
                feature_idx,
                intervention_strength=self.config.intervention_strength,
                target_token_id=None,
                top_k=self.config.ablation_top_k_tokens,
            )
            # Attach provenance so each result is self-describing.
            ablation.update({
                'layer_name': layer_name,
                'feature_name': feature_name,
                'feature_interpretation': feature_payload.get('interpretation'),
                'feature_max_activation': feature_payload.get('max_activation'),
            })
            targeted_feature_ablation_results.append(ablation)
            total_run += 1

    # Random baseline perturbations for comparison.
    random_baseline_results: List[Dict[str, Any]] = []
    baseline_trials = self.config.random_baseline_trials
    if baseline_trials and baseline_trials > 0:
        num_features = self.config.random_baseline_features or 1
        for trial_idx in range(baseline_trials):
            # Derive a per-trial seed from the configured base seed (if any)
            # so trials are reproducible yet distinct.
            seed = None
            if self.config.random_baseline_seed is not None:
                seed = self.config.random_baseline_seed + trial_idx
            random_result = self.perturbation_experiments.random_feature_ablation_experiment(
                prompt,
                num_features=num_features,
                intervention_strength=self.config.intervention_strength,
                target_token_id=None,
                top_k=self.config.ablation_top_k_tokens,
                seed=seed
            )
            random_result['trial_index'] = trial_idx
            random_baseline_results.append(random_result)

    # Path-level ablations for the most important circuits.
    path_ablation_results: List[Dict[str, Any]] = []
    max_paths = self.config.path_ablation_top_k or 0
    extracted_paths: List[Dict[str, Any]] = []
    if max_paths > 0 and important_paths:
        for path_info in important_paths[:max_paths]:
            feature_set = self._extract_feature_set_from_path(path_info.get('path', []))
            if not feature_set:
                continue
            path_result = self.perturbation_experiments.feature_set_ablation_experiment(
                prompt,
                feature_set=feature_set,
                intervention_strength=self.config.intervention_strength,
                target_token_id=None,
                top_k=self.config.ablation_top_k_tokens,
                ablation_label="path",
                extra_metadata={
                    'path_nodes': path_info.get('path'),
                    'path_description': path_info.get('description'),
                    'path_weight': path_info.get('weight')
                }
            )
            path_ablation_results.append(path_result)
            # Keep the resolved feature set with the path so the random
            # baseline below can sample size-matched comparisons.
            enriched_path_info = path_info.copy()
            enriched_path_info['feature_set'] = feature_set
            extracted_paths.append(enriched_path_info)

    # Random-path baseline: ablate a random feature set matched in size and
    # layer range to a real path, to control for "any ablation hurts".
    random_path_baseline_results: List[Dict[str, Any]] = []
    path_baseline_trials = self.config.random_path_baseline_trials
    if path_baseline_trials and path_baseline_trials > 0 and extracted_paths:
        rng = random.Random(self.config.random_baseline_seed)
        available_nodes = [
            data for data in self.attribution_graph.node_types.items()
            if data[1] == "feature"
        ]
        for trial in range(path_baseline_trials):
            # Cycle through the extracted paths as references.
            selected_path = extracted_paths[min(trial % len(extracted_paths), len(extracted_paths) - 1)]
            target_length = len(selected_path.get('feature_set', []))
            source_layers = [layer for layer, _ in selected_path.get('feature_set', [])]
            min_layer = min(source_layers) if source_layers else 0
            max_layer = max(source_layers) if source_layers else self.clt.n_layers - 1
            # Never resample the reference path's own features.
            excluded_keys = {
                (layer, feature)
                for layer, feature in selected_path.get('feature_set', [])
            }
            random_feature_set: List[Tuple[int, int]] = []
            attempts = 0
            # Rejection-sample with a bounded attempt budget (5x target size).
            while len(random_feature_set) < target_length and attempts < target_length * 5:
                attempts += 1
                if not available_nodes:
                    break
                node_name, node_type = rng.choice(available_nodes)
                metadata = self.attribution_graph.feature_metadata.get(node_name)
                if metadata is None:
                    continue
                if metadata['layer'] < min_layer or metadata['layer'] > max_layer:
                    continue
                key = (metadata['layer'], metadata['feature_index'])
                if key in excluded_keys:
                    continue
                if key not in random_feature_set:
                    random_feature_set.append(key)
            if not random_feature_set:
                continue
            # Require an exact size match so the comparison is fair.
            if len(random_feature_set) < max(1, target_length):
                continue
            random_path_result = self.perturbation_experiments.feature_set_ablation_experiment(
                prompt,
                feature_set=random_feature_set,
                intervention_strength=self.config.intervention_strength,
                target_token_id=None,
                top_k=self.config.ablation_top_k_tokens,
                ablation_label="random_path_baseline",
                extra_metadata={
                    'trial_index': trial,
                    'sampled_feature_set': random_feature_set,
                    'reference_path_weight': selected_path.get('weight')
                }
            )
            random_path_baseline_results.append(random_path_result)

    # Summarize each experiment group and the targeted-vs-random contrasts.
    targeted_summary = self._summarize_ablation_results(targeted_feature_ablation_results)
    random_summary = self._summarize_ablation_results(random_baseline_results)
    path_summary = self._summarize_ablation_results(path_ablation_results)
    random_path_summary = self._summarize_ablation_results(random_path_baseline_results)
    summary_statistics = {
        'targeted': targeted_summary,
        'random_baseline': random_summary,
        'path': path_summary,
        'random_path_baseline': random_path_summary,
        'target_minus_random_abs_probability_change': targeted_summary.get('avg_abs_probability_change', 0.0) - random_summary.get('avg_abs_probability_change', 0.0),
        'target_flip_rate_minus_random': targeted_summary.get('flip_rate', 0.0) - random_summary.get('flip_rate', 0.0),
        'path_minus_random_abs_probability_change': path_summary.get('avg_abs_probability_change', 0.0) - random_path_summary.get('avg_abs_probability_change', 0.0),
        'path_flip_rate_minus_random': path_summary.get('flip_rate', 0.0) - random_path_summary.get('flip_rate', 0.0)
    }

    results = {
        'prompt': prompt,
        'input_tokens': input_tokens,
        'feature_visualizations': feature_visualizations,
        'full_graph_stats': {
            'n_nodes': len(graph.nodes()),
            'n_edges': len(graph.edges()),
            'node_types': dict(self.attribution_graph.node_types)
        },
        'pruned_graph_stats': {
            'n_nodes': len(pruned_graph.nodes()),
            'n_edges': len(pruned_graph.edges())
        },
        'important_paths': important_paths[:5],  # Top 5 paths
        'graph': pruned_graph,  # NOTE: a live networkx object, not JSON-safe (see save_results)
        'perturbation_experiments': targeted_feature_ablation_results,
        'random_baseline_experiments': random_baseline_results,
        'path_ablation_experiments': path_ablation_results,
        'random_path_baseline_experiments': random_path_baseline_results,
        'summary_statistics': summary_statistics
    }

    return results
|
| 1234 |
+
|
| 1235 |
+
def _extract_feature_set_from_path(self, path: List[str]) -> List[Tuple[int, int]]:
|
| 1236 |
+
feature_set: List[Tuple[int, int]] = []
|
| 1237 |
+
seen: Set[Tuple[int, int]] = set()
|
| 1238 |
+
for node in path:
|
| 1239 |
+
if not isinstance(node, str):
|
| 1240 |
+
continue
|
| 1241 |
+
if not node.startswith("feat_"):
|
| 1242 |
+
continue
|
| 1243 |
+
parts = node.split('_')
|
| 1244 |
+
try:
|
| 1245 |
+
layer_str = parts[1] # e.g., "L0"
|
| 1246 |
+
feature_str = parts[3] # e.g., "F123"
|
| 1247 |
+
layer_idx = int(layer_str[1:])
|
| 1248 |
+
feature_idx = int(feature_str[1:])
|
| 1249 |
+
except (IndexError, ValueError):
|
| 1250 |
+
continue
|
| 1251 |
+
key = (layer_idx, feature_idx)
|
| 1252 |
+
if key not in seen:
|
| 1253 |
+
seen.add(key)
|
| 1254 |
+
feature_set.append(key)
|
| 1255 |
+
return feature_set
|
| 1256 |
+
|
| 1257 |
+
def _summarize_ablation_results(self, experiments: List[Dict[str, Any]]) -> Dict[str, Any]:
|
| 1258 |
+
summary = {
|
| 1259 |
+
'count': len(experiments),
|
| 1260 |
+
'avg_probability_change': 0.0,
|
| 1261 |
+
'avg_abs_probability_change': 0.0,
|
| 1262 |
+
'std_probability_change': 0.0,
|
| 1263 |
+
'avg_logit_change': 0.0,
|
| 1264 |
+
'avg_abs_logit_change': 0.0,
|
| 1265 |
+
'std_logit_change': 0.0,
|
| 1266 |
+
'avg_kl_divergence': 0.0,
|
| 1267 |
+
'avg_entropy_change': 0.0,
|
| 1268 |
+
'avg_hidden_state_delta_norm': 0.0,
|
| 1269 |
+
'avg_hidden_state_relative_change': 0.0,
|
| 1270 |
+
'flip_rate': 0.0,
|
| 1271 |
+
'count_flipped': 0
|
| 1272 |
+
}
|
| 1273 |
+
if not experiments:
|
| 1274 |
+
return summary
|
| 1275 |
+
|
| 1276 |
+
probability_changes = np.array([exp.get('probability_change', 0.0) for exp in experiments], dtype=float)
|
| 1277 |
+
logit_changes = np.array([exp.get('logit_change', 0.0) for exp in experiments], dtype=float)
|
| 1278 |
+
kl_divergences = np.array([exp.get('kl_divergence', 0.0) for exp in experiments], dtype=float)
|
| 1279 |
+
entropy_changes = np.array([exp.get('entropy_change', 0.0) for exp in experiments], dtype=float)
|
| 1280 |
+
hidden_norms = np.array([exp.get('hidden_state_delta_norm', 0.0) for exp in experiments], dtype=float)
|
| 1281 |
+
hidden_relative = np.array([exp.get('hidden_state_relative_change', 0.0) for exp in experiments], dtype=float)
|
| 1282 |
+
flip_flags = np.array([1.0 if exp.get('ablation_flips_top_prediction') else 0.0 for exp in experiments], dtype=float)
|
| 1283 |
+
|
| 1284 |
+
# Helper to safely compute mean/std ignoring NaNs
|
| 1285 |
+
def safe_mean(arr):
|
| 1286 |
+
with np.errstate(all='ignore'):
|
| 1287 |
+
m = np.nanmean(arr)
|
| 1288 |
+
return float(m) if np.isfinite(m) else 0.0
|
| 1289 |
+
|
| 1290 |
+
def safe_std(arr):
|
| 1291 |
+
with np.errstate(all='ignore'):
|
| 1292 |
+
s = np.nanstd(arr)
|
| 1293 |
+
return float(s) if np.isfinite(s) else 0.0
|
| 1294 |
+
|
| 1295 |
+
summary.update({
|
| 1296 |
+
'avg_probability_change': safe_mean(probability_changes),
|
| 1297 |
+
'avg_abs_probability_change': safe_mean(np.abs(probability_changes)),
|
| 1298 |
+
'std_probability_change': safe_std(probability_changes),
|
| 1299 |
+
'avg_logit_change': safe_mean(logit_changes),
|
| 1300 |
+
'avg_abs_logit_change': safe_mean(np.abs(logit_changes)),
|
| 1301 |
+
'std_logit_change': safe_std(logit_changes),
|
| 1302 |
+
'avg_kl_divergence': safe_mean(kl_divergences),
|
| 1303 |
+
'avg_entropy_change': safe_mean(entropy_changes),
|
| 1304 |
+
'avg_hidden_state_delta_norm': safe_mean(hidden_norms),
|
| 1305 |
+
'avg_hidden_state_relative_change': safe_mean(hidden_relative),
|
| 1306 |
+
'flip_rate': safe_mean(flip_flags),
|
| 1307 |
+
'count_flipped': int(np.round(np.nansum(flip_flags)))
|
| 1308 |
+
})
|
| 1309 |
+
return summary
|
| 1310 |
+
|
| 1311 |
+
def analyze_prompts_batch(self, prompts: List[str]) -> Dict[str, Any]:
    """Run analyze_prompt over every prompt and aggregate ablation summaries.

    Returns per-prompt analyses (keyed ``prompt_1`` ...), an aggregate
    summary over the pooled experiments of all prompts, and the prompt list.
    """
    analyses: Dict[str, Dict[str, Any]] = {}
    # Pool experiment results across prompts, keyed by their result-dict name.
    pooled: Dict[str, List[Dict[str, Any]]] = {
        'perturbation_experiments': [],
        'random_baseline_experiments': [],
        'path_ablation_experiments': [],
        'random_path_baseline_experiments': [],
    }

    for position, prompt in enumerate(prompts, start=1):
        logger.info(f"[Batch Eval] Processing prompt {position}/{len(prompts)}")
        analysis = self.analyze_prompt(prompt)
        analyses[f"prompt_{position}"] = analysis
        for bucket, collected in pooled.items():
            collected.extend(analysis.get(bucket, []))

    aggregate_summary: Dict[str, Any] = {
        'targeted': self._summarize_ablation_results(pooled['perturbation_experiments']),
        'random_baseline': self._summarize_ablation_results(pooled['random_baseline_experiments']),
        'path': self._summarize_ablation_results(pooled['path_ablation_experiments']),
        'random_path_baseline': self._summarize_ablation_results(pooled['random_path_baseline_experiments']),
    }

    # Targeted-vs-random contrasts mirror the per-prompt summary keys.
    for diff_key, lhs, rhs, metric in (
        ('target_minus_random_abs_probability_change', 'targeted', 'random_baseline', 'avg_abs_probability_change'),
        ('target_flip_rate_minus_random', 'targeted', 'random_baseline', 'flip_rate'),
        ('path_minus_random_abs_probability_change', 'path', 'random_path_baseline', 'avg_abs_probability_change'),
        ('path_flip_rate_minus_random', 'path', 'random_path_baseline', 'flip_rate'),
    ):
        aggregate_summary[diff_key] = (
            aggregate_summary[lhs].get(metric, 0.0) - aggregate_summary[rhs].get(metric, 0.0)
        )

    return {
        'analyses': analyses,
        'aggregate_summary': aggregate_summary,
        'prompt_texts': prompts
    }
|
| 1360 |
+
|
| 1361 |
+
def _describe_path(self, path: List[str]) -> str:
    """Render a graph path as a human-readable, arrow-joined description."""
    pieces = []
    for node in path:
        node_type = self.attribution_graph.node_types[node]
        if node_type == "embedding":
            # Embedding node names carry the token text in the third segment.
            pieces.append(f"Token '{node.split('_')[2]}'")
        elif node_type == "feature":
            segments = node.split('_')
            layer = segments[1][1:]    # strip the leading 'L'
            feature = segments[3][1:]  # strip the leading 'F'
            # Look up the cached interpretation, if one exists.
            interpretation = self.feature_visualizer.feature_interpretations.get(
                f"L{layer}_F{feature}", "unknown"
            )
            pieces.append(f"Feature L{layer}F{feature} ({interpretation})")
        elif node_type == "output":
            pieces.append("Output")

    return " → ".join(pieces)
|
| 1380 |
+
|
| 1381 |
+
def save_results(self, results: Dict, save_path: str):
    """Write an analysis results dict to *save_path* as JSON.

    NetworkX graphs (top-level or nested under 'analyses') are converted to
    node-link dicts first; anything else non-serializable falls back to str().
    """
    payload = copy.deepcopy(results)

    if 'graph' in payload:
        payload['graph'] = nx.node_link_data(payload['graph'])
    for analysis in payload.get('analyses', {}).values():
        if 'graph' in analysis:
            analysis['graph'] = nx.node_link_data(analysis['graph'])

    with open(save_path, 'w') as f:
        # default=str keeps the dump from failing on exotic values (deliberate).
        json.dump(payload, f, indent=2, default=str)

    logger.info(f"Results saved to {save_path}")
|
| 1397 |
+
|
| 1398 |
+
def save_clt(self, path: str):
    """Persist the trained CLT's parameters to *path*."""
    state = self.clt.state_dict()
    torch.save(state, path)
    logger.info(f"CLT model saved to {path}")
|
| 1402 |
+
|
| 1403 |
+
def load_clt(self, path: str):
    """Restore CLT weights from *path*, move to the configured device, and eval()."""
    state = torch.load(path, map_location=self.device)
    self.clt.load_state_dict(state)
    self.clt.to(self.device)
    self.clt.eval()  # inference only from here on
    logger.info(f"Loaded CLT model from {path}")
|
| 1409 |
+
|
| 1410 |
+
# --- Configuration ---
MAX_SEQ_LEN = 256             # Max tokens per tokenized prompt (used by train_transcoder / visualization).
N_FEATURES_PER_LAYER = 512    # CLT feature-dictionary size per layer — presumably consumed at CLT construction; confirm.
TRAINING_STEPS = 2500         # Number of CLT optimizer steps.
BATCH_SIZE = 64               # Prompts sampled per training step.
LEARNING_RATE = 1e-3          # Optimizer learning rate for CLT training.
|
| 1416 |
+
|
| 1417 |
+
# Prompts for generating the final analysis.
# One factual-recall prompt, one code prompt, and one literary-analysis
# prompt — covering distinct capability domains.
ANALYSIS_PROMPTS = [
    "The capital of France is",
    "def factorial(n):",
    "The literary device in the phrase 'The wind whispered through the trees' is"
]
|
| 1423 |
+
|
| 1424 |
+
# A larger set of prompts for training.
# Mixes factual recall, idioms, translation, code, classification, and
# completion tasks so the CLT sees a diverse activation distribution.
TRAINING_PROMPTS = [
    "The capital of France is", "To be or not to be, that is the", "A stitch in time saves",
    "The first person to walk on the moon was", "The chemical formula for water is H2O.",
    "Translate to German: 'The cat sits on the mat.'", "def factorial(n):", "import numpy as np",
    "The main ingredients in a pizza are", "What is the powerhouse of the cell?",
    "The equation E=mc^2 relates energy to", "Continue the story: Once upon a time, there was a",
    "Classify the sentiment: 'I am overjoyed!'", "Extract the entities: 'Apple Inc. is in Cupertino.'",
    "What is the next number: 2, 4, 8, 16, __?", "A rolling stone gathers no",
    "The opposite of hot is", "import torch", "import pandas as pd", "class MyClass:",
    "def __init__(self):", "The primary colors are", "What is the capital of Japan?",
    "Who wrote 'Hamlet'?", "The square root of 64 is", "The sun rises in the",
    "The Pacific Ocean is the largest ocean on Earth.", "The mitochondria is the powerhouse of the cell.",
    "What is the capital of Mongolia?", "The movie 'The Matrix' can be classified into the following genre:",
    "The French translation of 'I would like to order a coffee, please.' is:",
    "The literary device in the phrase 'The wind whispered through the trees' is",
    "A Python function that calculates the factorial of a number is:",
    "The main ingredient in a Negroni cocktail is",
    "Summarize the plot of 'Hamlet' in one sentence:",
    "The sentence 'The cake was eaten by the dog' is in the following voice:",
    "A good headline for an article about a new breakthrough in battery technology would be:"
]
|
| 1446 |
+
|
| 1447 |
+
|
| 1448 |
+
# --- Qwen API for Feature Interpretation ---
|
| 1449 |
+
@torch.no_grad()  # NOTE(review): no tensors are used in this function — the decorator appears to be a no-op here.
def get_feature_interpretation_with_qwen(
    api_config: dict,
    top_tokens: list[str],
    feature_name: str,
    layer_index: int,
    max_retries: int = 3,
    initial_backoff: float = 2.0
) -> str:
    """Ask the Qwen chat-completions API for a short label describing a CLT feature.

    Args:
        api_config: Dict with 'api_key', 'api_endpoint', and 'model' keys.
        top_tokens: Tokens that most strongly activate the feature.
        feature_name: Display name of the feature (used in the prompt and logs).
        layer_index: Layer the feature belongs to.
        max_retries: Attempts before giving up on request errors.
        initial_backoff: Base delay (seconds) for exponential backoff between retries.

    Returns:
        The model's short interpretation phrase, or an "API ..." sentinel
        string on configuration/request/parse failure (never raises).
    """
    # Generates a high-quality interpretation for a feature using the Qwen API.
    if not api_config or not api_config.get('api_key'):
        logger.warning("Qwen API not configured. Skipping interpretation.")
        return "API not configured"

    headers = {
        "Authorization": f"Bearer {api_config['api_key']}",
        "Content-Type": "application/json"
    }

    # Create a specialized prompt.
    prompt_text = f"""
You are an expert in transformer interpretability. A feature in a language model (feature '{feature_name}' at layer {layer_index}) is most strongly activated by the following tokens:

{', '.join(f"'{token}'" for token in top_tokens)}

Based *only* on these tokens, what is the most likely function or role of this feature?
Your answer must be a short, concise phrase (e.g., "Detecting proper nouns", "Identifying JSON syntax", "Completing lists", "Recognizing negative sentiment"). Do not write a full sentence.
"""

    data = {
        "model": api_config["model"],
        "messages": [
            {
                "role": "user",
                "content": [{"type": "text", "text": prompt_text}]
            }
        ],
        "max_tokens": 50,
        # Low temperature + fixed seed: keep interpretations near-deterministic.
        "temperature": 0.1,
        "top_p": 0.9,
        "seed": 42
    }

    logger.info(f" > Interpreting {feature_name} (Layer {layer_index})...")

    for attempt in range(max_retries):
        try:
            logger.info(f" - Attempt {attempt + 1}/{max_retries}: Sending request to Qwen API...")
            response = requests.post(
                f"{api_config['api_endpoint']}/chat/completions",
                headers=headers,
                json=data,
                timeout=60
            )
            response.raise_for_status()  # Raise an exception for bad status codes (4xx or 5xx)

            result = response.json()
            interpretation = result["choices"][0]["message"]["content"].strip()

            # Remove quotes from the output.
            if interpretation.startswith('"') and interpretation.endswith('"'):
                interpretation = interpretation[1:-1]

            logger.info(f" - Success! Interpretation: '{interpretation}'")
            return interpretation

        except requests.exceptions.RequestException as e:
            logger.warning(f" - Qwen API request failed (Attempt {attempt + 1}/{max_retries}): {e}")
            if attempt < max_retries - 1:
                # Exponential backoff: initial_backoff * 2^attempt.
                backoff_time = initial_backoff * (2 ** attempt)
                logger.info(f" - Retrying in {backoff_time:.1f} seconds...")
                time.sleep(backoff_time)
            else:
                logger.error(" - Max retries reached. Failing.")
                return f"API Error: {e}"
        except (KeyError, IndexError) as e:
            # NOTE(review): a non-JSON body would raise a decode error that is
            # not caught here — confirm whether that is intended.
            logger.error(f" - Failed to parse Qwen API response: {e}")
            return "API Error: Invalid response format"
        finally:
            # Add a delay to respect API rate limits.
            # NOTE: this runs on every attempt, including the successful-return
            # and final-failure paths, so each call pays at least 2.1s.
            time.sleep(2.1)

    # Unreachable in practice (all loop exits return), kept as a safety net.
    return "API Error: Max retries exceeded"
|
| 1532 |
+
|
| 1533 |
+
|
| 1534 |
+
def train_transcoder(transcoder, model, tokenizer, training_prompts, device, steps=1000, batch_size=16, optimizer=None,
                     sparsity_lambda=0.01, reconstruction_loss_weight=0.8, sparsity_loss_weight=0.2):
    """Train the Cross-Layer Transcoder on activations from random prompt batches.

    Args:
        transcoder: The CLT module; called on the list of per-layer hidden states
            and expected to return (feature_activations, reconstructed_outputs).
        model: Frozen language model supplying hidden states (run under no_grad).
        tokenizer: Tokenizer matching *model*.
        training_prompts: Pool of prompt strings to sample batches from.
        device: Device to move tokenized inputs to.
        steps: Number of training steps.
        batch_size: Prompts sampled (with replacement) per step.
        optimizer: Optimizer stepping the transcoder's parameters; if None,
            only the forward pass and loss logging run.
        sparsity_lambda: Scale inside the tanh sparsity penalty. Previously
            hard-coded as 0.01 (the inline comment asked for it to come from
            config); now a parameter with the same default — callers unchanged.
        reconstruction_loss_weight: Weight on the summed per-layer MSE
            (previously hard-coded 0.8).
        sparsity_loss_weight: Weight on the summed sparsity penalty
            (previously hard-coded 0.2).
    """
    transcoder.train()

    # Use a progress bar for visual feedback.
    progress_bar = tqdm(range(steps), desc="Training CLT")

    for step in progress_bar:
        # Get a random batch of prompts (sampled with replacement).
        batch_prompts = random.choices(training_prompts, k=batch_size)

        # Tokenize the batch.
        inputs = tokenizer(
            batch_prompts,
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=MAX_SEQ_LEN
        )
        inputs = {k: v.to(device) for k, v in inputs.items()}

        # Get the model activations (drop the embedding-layer hidden state).
        with torch.no_grad():
            outputs = model(**inputs, output_hidden_states=True)
        hidden_states = outputs.hidden_states[1:]

        # Forward pass through the CLT.
        feature_activations, reconstructed_outputs = transcoder(hidden_states)

        # Reconstruction loss: summed per-layer MSE against the true hidden states.
        recon_loss = 0.0
        for target, pred in zip(hidden_states, reconstructed_outputs):
            recon_loss += F.mse_loss(pred, target)

        # Sparsity loss: saturating tanh penalty on feature activations.
        sparsity_loss = 0.0
        for features in feature_activations:
            sparsity_loss += torch.mean(torch.tanh(sparsity_lambda * features))

        # Total loss (weights now parameterized instead of hard-coded).
        loss = reconstruction_loss_weight * recon_loss + sparsity_loss_weight * sparsity_loss

        if optimizer:
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        progress_bar.set_postfix({
            "Recon Loss": f"{recon_loss.item():.4f}",
            "Sparsity Loss": f"{sparsity_loss.item():.4f}",
            "Total Loss": f"{loss.item():.4f}"
        })
|
| 1586 |
+
|
| 1587 |
+
def generate_feature_visualizations(transcoder, model, tokenizer, prompt, device, qwen_api_config=None, graph_config: Optional[AttributionGraphConfig] = None):
|
| 1588 |
+
# Generates feature visualizations and interpretations for a prompt.
|
| 1589 |
+
# Tokenize the prompt.
|
| 1590 |
+
inputs = tokenizer(
|
| 1591 |
+
prompt,
|
| 1592 |
+
return_tensors="pt",
|
| 1593 |
+
padding=True,
|
| 1594 |
+
truncation=True,
|
| 1595 |
+
max_length=MAX_SEQ_LEN
|
| 1596 |
+
)
|
| 1597 |
+
inputs = {k: v.to(device) for k, v in inputs.items()}
|
| 1598 |
+
|
| 1599 |
+
# Get the model activations.
|
| 1600 |
+
with torch.no_grad():
|
| 1601 |
+
outputs = model(**inputs, output_hidden_states=True)
|
| 1602 |
+
hidden_states = outputs.hidden_states[1:]
|
| 1603 |
+
|
| 1604 |
+
# Forward pass through the CLT.
|
| 1605 |
+
feature_activations, reconstructed_outputs = transcoder(hidden_states)
|
| 1606 |
+
|
| 1607 |
+
# Visualize the features.
|
| 1608 |
+
feature_visualizations = {}
|
| 1609 |
+
for layer_idx, features in enumerate(feature_activations):
|
| 1610 |
+
layer_viz = {}
|
| 1611 |
+
# Analyze the top features for this layer.
|
| 1612 |
+
# features shape: [batch_size, seq_len, n_features]
|
| 1613 |
+
feature_importance = torch.mean(features, dim=(0, 1)) # Average over batch and sequence
|
| 1614 |
+
top_features = torch.topk(feature_importance, k=min(5, feature_importance.size(0))).indices
|
| 1615 |
+
|
| 1616 |
+
for feat_idx in top_features:
|
| 1617 |
+
viz = FeatureVisualizer(tokenizer).visualize_feature(
|
| 1618 |
+
feat_idx.item(), layer_idx, features[0], tokenizer.convert_ids_to_tokens(inputs["input_ids"][0])
|
| 1619 |
+
)
|
| 1620 |
+
interpretation = FeatureVisualizer(tokenizer).interpret_feature(
|
| 1621 |
+
feat_idx.item(), layer_idx, viz, qwen_api_config
|
| 1622 |
+
)
|
| 1623 |
+
viz['interpretation'] = interpretation
|
| 1624 |
+
layer_viz[f"feature_{feat_idx.item()}"] = viz
|
| 1625 |
+
|
| 1626 |
+
feature_visualizations[f"layer_{layer_idx}"] = layer_viz
|
| 1627 |
+
|
| 1628 |
+
# Construct the attribution graph.
|
| 1629 |
+
if graph_config is None:
|
| 1630 |
+
graph_config = AttributionGraphConfig()
|
| 1631 |
+
attribution_graph = AttributionGraph(transcoder, tokenizer, graph_config)
|
| 1632 |
+
graph = attribution_graph.construct_graph(
|
| 1633 |
+
tokenizer.convert_ids_to_tokens(inputs["input_ids"][0]), feature_activations, -1 # No target token for visualization
|
| 1634 |
+
)
|
| 1635 |
+
|
| 1636 |
+
# Prune the graph.
|
| 1637 |
+
pruned_graph = attribution_graph.prune_graph(0.8) # Use config.pruning_threshold
|
| 1638 |
+
|
| 1639 |
+
# Analyze the most important paths.
|
| 1640 |
+
important_paths = []
|
| 1641 |
+
if len(pruned_graph.nodes()) > 0:
|
| 1642 |
+
# Find paths from embeddings to the output.
|
| 1643 |
+
embedding_nodes = [node for node, type_ in attribution_graph.node_types.items()
|
| 1644 |
+
if type_ == "embedding" and node in pruned_graph]
|
| 1645 |
+
output_nodes = [node for node, type_ in attribution_graph.node_types.items()
|
| 1646 |
+
if type_ == "output" and node in pruned_graph]
|
| 1647 |
+
|
| 1648 |
+
for emb_node in embedding_nodes[:3]: # Top 3 embedding nodes
|
| 1649 |
+
for out_node in output_nodes:
|
| 1650 |
+
try:
|
| 1651 |
+
paths = list(nx.all_simple_paths(pruned_graph, emb_node, out_node, cutoff=5))
|
| 1652 |
+
for path in paths[:2]: # Top 2 paths
|
| 1653 |
+
path_weight = 1.0
|
| 1654 |
+
for i in range(len(path) - 1):
|
| 1655 |
+
edge_weight = attribution_graph.edge_weights.get(
|
| 1656 |
+
(path[i], path[i+1]), 0.0
|
| 1657 |
+
)
|
| 1658 |
+
path_weight *= abs(edge_weight)
|
| 1659 |
+
|
| 1660 |
+
important_paths.append({
|
| 1661 |
+
'path': path,
|
| 1662 |
+
'weight': path_weight,
|
| 1663 |
+
'description': attribution_graph._describe_path(path)
|
| 1664 |
+
})
|
| 1665 |
+
except nx.NetworkXNoPath:
|
| 1666 |
+
continue
|
| 1667 |
+
|
| 1668 |
+
# Sort paths by importance.
|
| 1669 |
+
important_paths.sort(key=lambda x: x['weight'], reverse=True)
|
| 1670 |
+
|
| 1671 |
+
return {
|
| 1672 |
+
"prompt": prompt,
|
| 1673 |
+
"full_graph_stats": {
|
| 1674 |
+
"n_nodes": len(graph.nodes()),
|
| 1675 |
+
"n_edges": len(graph.edges()),
|
| 1676 |
+
"node_types": dict(attribution_graph.node_types)
|
| 1677 |
+
},
|
| 1678 |
+
"pruned_graph_stats": {
|
| 1679 |
+
"n_nodes": len(pruned_graph.nodes()),
|
| 1680 |
+
"n_edges": len(pruned_graph.edges())
|
| 1681 |
+
},
|
| 1682 |
+
"feature_visualizations": feature_visualizations,
|
| 1683 |
+
"important_paths": important_paths[:5] # Top 5 paths
|
| 1684 |
+
}
|
| 1685 |
+
|
| 1686 |
+
def main():
|
| 1687 |
+
# Main function to run the analysis for a single prompt.
|
| 1688 |
+
|
| 1689 |
+
# Set a seed for reproducibility.
|
| 1690 |
+
set_seed()
|
| 1691 |
+
|
| 1692 |
+
# --- Argument Parser ---
|
| 1693 |
+
parser = argparse.ArgumentParser(description="Run Attribution Graph analysis for a single prompt.")
|
| 1694 |
+
parser.add_argument(
|
| 1695 |
+
'--prompt-index',
|
| 1696 |
+
type=int,
|
| 1697 |
+
required=True,
|
| 1698 |
+
help=f"The 0-based index of the prompt to analyze from the ANALYSIS_PROMPTS list (0 to {len(ANALYSIS_PROMPTS) - 1})."
|
| 1699 |
+
)
|
| 1700 |
+
parser.add_argument(
|
| 1701 |
+
'--force-retrain-clt',
|
| 1702 |
+
action='store_true',
|
| 1703 |
+
help="Force re-training of the Cross-Layer Transcoder, even if a saved model exists."
|
| 1704 |
+
)
|
| 1705 |
+
parser.add_argument(
|
| 1706 |
+
'--batch-eval',
|
| 1707 |
+
action='store_true',
|
| 1708 |
+
help="Analyze all predefined prompts and compute aggregate faithfulness metrics."
|
| 1709 |
+
)
|
| 1710 |
+
args = parser.parse_args()
|
| 1711 |
+
|
| 1712 |
+
prompt_idx = args.prompt_index
|
| 1713 |
+
if not (0 <= prompt_idx < len(ANALYSIS_PROMPTS)):
|
| 1714 |
+
print(f"❌ Error: --prompt-index must be between 0 and {len(ANALYSIS_PROMPTS) - 1}.")
|
| 1715 |
+
return
|
| 1716 |
+
|
| 1717 |
+
# Get the API config from the utility function.
|
| 1718 |
+
qwen_api_config = init_qwen_api()
|
| 1719 |
+
|
| 1720 |
+
# Configuration - Use consistent settings matching trained CLT
|
| 1721 |
+
config = AttributionGraphConfig(
|
| 1722 |
+
model_path="./models/OLMo-2-1124-7B",
|
| 1723 |
+
n_features_per_layer=512, # Match trained CLT
|
| 1724 |
+
training_steps=500,
|
| 1725 |
+
batch_size=4,
|
| 1726 |
+
max_seq_length=256,
|
| 1727 |
+
learning_rate=1e-4,
|
| 1728 |
+
sparsity_lambda=1e-3, # Match training (L1 sparsity)
|
| 1729 |
+
graph_feature_activation_threshold=0.01,
|
| 1730 |
+
graph_edge_weight_threshold=0.003,
|
| 1731 |
+
graph_max_features_per_layer=40,
|
| 1732 |
+
graph_max_edges_per_node=20,
|
| 1733 |
+
qwen_api_config=qwen_api_config
|
| 1734 |
+
)
|
| 1735 |
+
|
| 1736 |
+
print("Attribution Graphs for OLMo2 7B - Single Prompt Pipeline")
|
| 1737 |
+
print("=" * 50)
|
| 1738 |
+
print(f"Model path: {config.model_path}")
|
| 1739 |
+
print(f"Device: {config.device}")
|
| 1740 |
+
|
| 1741 |
+
try:
|
| 1742 |
+
# Initialize the full pipeline.
|
| 1743 |
+
print("🚀 Initializing Attribution Graphs Pipeline...")
|
| 1744 |
+
pipeline = AttributionGraphsPipeline(config)
|
| 1745 |
+
print("✓ Pipeline initialized successfully")
|
| 1746 |
+
print()
|
| 1747 |
+
|
| 1748 |
+
# Load an existing CLT model or train a new one.
|
| 1749 |
+
if os.path.exists(CLT_SAVE_PATH) and not args.force_retrain_clt:
|
| 1750 |
+
print(f"🧠 Loading existing CLT model from {CLT_SAVE_PATH}...")
|
| 1751 |
+
pipeline.load_clt(CLT_SAVE_PATH)
|
| 1752 |
+
print("✓ CLT model loaded successfully.")
|
| 1753 |
+
else:
|
| 1754 |
+
if args.force_retrain_clt and os.path.exists(CLT_SAVE_PATH):
|
| 1755 |
+
print("��♂️ --force-retrain-clt flag is set. Overwriting existing model.")
|
| 1756 |
+
|
| 1757 |
+
# Train a new CLT model.
|
| 1758 |
+
print("📚 Training a new CLT model...")
|
| 1759 |
+
print(f" Training on {len(TRAINING_PROMPTS)} example texts...")
|
| 1760 |
+
training_stats = pipeline.train_clt(TRAINING_PROMPTS)
|
| 1761 |
+
print("✓ CLT training completed.")
|
| 1762 |
+
|
| 1763 |
+
# Save the training statistics.
|
| 1764 |
+
stats_save_path = os.path.join(RESULTS_DIR, "clt_training_stats.json")
|
| 1765 |
+
with open(stats_save_path, 'w') as f:
|
| 1766 |
+
json.dump(training_stats, f, indent=2)
|
| 1767 |
+
print(f" Saved training stats to {stats_save_path}")
|
| 1768 |
+
|
| 1769 |
+
# Save the new model.
|
| 1770 |
+
pipeline.save_clt(CLT_SAVE_PATH)
|
| 1771 |
+
print(f" Saved trained model to {CLT_SAVE_PATH} for future use.")
|
| 1772 |
+
|
| 1773 |
+
print()
|
| 1774 |
+
|
| 1775 |
+
if args.batch_eval:
|
| 1776 |
+
print("📊 Running batch faithfulness evaluation across all prompts...")
|
| 1777 |
+
batch_payload = pipeline.analyze_prompts_batch(ANALYSIS_PROMPTS)
|
| 1778 |
+
final_results = copy.deepcopy(batch_payload)
|
| 1779 |
+
final_results['config'] = config.__dict__
|
| 1780 |
+
final_results['timestamp'] = str(time.time())
|
| 1781 |
+
for analysis_entry in final_results['analyses'].values():
|
| 1782 |
+
analysis_entry.pop('graph', None)
|
| 1783 |
+
batch_save_path = os.path.join(RESULTS_DIR, "attribution_graphs_batch_results.json")
|
| 1784 |
+
pipeline.save_results(final_results, batch_save_path)
|
| 1785 |
+
print(f"💾 Batch results saved to {batch_save_path}")
|
| 1786 |
+
|
| 1787 |
+
aggregate_summary = batch_payload['aggregate_summary']
|
| 1788 |
+
targeted_summary = aggregate_summary.get('targeted', {})
|
| 1789 |
+
random_summary = aggregate_summary.get('random_baseline', {})
|
| 1790 |
+
path_summary = aggregate_summary.get('path', {})
|
| 1791 |
+
|
| 1792 |
+
def _format_summary(label: str, summary: Dict[str, Any]) -> str:
|
| 1793 |
+
return (
|
| 1794 |
+
f"{label}: count={summary.get('count', 0)}, "
|
| 1795 |
+
f"avg|Δp|={summary.get('avg_abs_probability_change', 0.0):.4f}, "
|
| 1796 |
+
f"flip_rate={summary.get('flip_rate', 0.0):.2%}"
|
| 1797 |
+
)
|
| 1798 |
+
|
| 1799 |
+
print("📈 Aggregate faithfulness summary")
|
| 1800 |
+
print(f" {_format_summary('Targeted', targeted_summary)}")
|
| 1801 |
+
print(f" {_format_summary('Random baseline', random_summary)}")
|
| 1802 |
+
print(f" {_format_summary('Path', path_summary)}")
|
| 1803 |
+
print(f" {_format_summary('Random path baseline', aggregate_summary.get('random_path_baseline', {}))}")
|
| 1804 |
+
diff_abs = aggregate_summary.get('target_minus_random_abs_probability_change', 0.0)
|
| 1805 |
+
diff_flip = aggregate_summary.get('target_flip_rate_minus_random', 0.0)
|
| 1806 |
+
path_diff_abs = aggregate_summary.get('path_minus_random_abs_probability_change', 0.0)
|
| 1807 |
+
path_diff_flip = aggregate_summary.get('path_flip_rate_minus_random', 0.0)
|
| 1808 |
+
print(f" Targeted vs Random |Δp| difference: {diff_abs:.4f}")
|
| 1809 |
+
print(f" Targeted vs Random flip rate difference: {diff_flip:.4f}")
|
| 1810 |
+
print(f" Path vs Random path |Δp| difference: {path_diff_abs:.4f}")
|
| 1811 |
+
print(f" Path vs Random path flip rate difference: {path_diff_flip:.4f}")
|
| 1812 |
+
print("\n🎉 Batch evaluation completed successfully!")
|
| 1813 |
+
return
|
| 1814 |
+
|
| 1815 |
+
# Analyze the selected prompt.
|
| 1816 |
+
prompt_to_analyze = ANALYSIS_PROMPTS[prompt_idx]
|
| 1817 |
+
print(f"🔍 Analyzing prompt {prompt_idx + 1}/{len(ANALYSIS_PROMPTS)}: '{prompt_to_analyze}'")
|
| 1818 |
+
|
| 1819 |
+
analysis = pipeline.analyze_prompt(prompt_to_analyze, target_token_idx=-1)
|
| 1820 |
+
|
| 1821 |
+
# Display the key results.
|
| 1822 |
+
print(f" ✓ Tokenized into {len(analysis['input_tokens'])} tokens")
|
| 1823 |
+
print(f" ✓ Full graph: {analysis['full_graph_stats']['n_nodes']} nodes, {analysis['full_graph_stats']['n_edges']} edges")
|
| 1824 |
+
print(f" ✓ Pruned graph: {analysis['pruned_graph_stats']['n_nodes']} nodes, {analysis['pruned_graph_stats']['n_edges']} edges")
|
| 1825 |
+
|
| 1826 |
+
# Show the top features.
|
| 1827 |
+
print(" 📊 Top active features:")
|
| 1828 |
+
feature_layers_items = list(analysis['feature_visualizations'].items())
|
| 1829 |
+
if config.summary_max_layers is not None:
|
| 1830 |
+
feature_layers_items = feature_layers_items[:config.summary_max_layers]
|
| 1831 |
+
for layer_name, layer_features in feature_layers_items:
|
| 1832 |
+
print(f" {layer_name}:")
|
| 1833 |
+
feature_items = layer_features.items()
|
| 1834 |
+
if config.summary_features_per_layer is not None:
|
| 1835 |
+
feature_items = list(feature_items)[:config.summary_features_per_layer]
|
| 1836 |
+
for feat_name, feat_data in feature_items:
|
| 1837 |
+
print(f" {feat_name}: {feat_data['interpretation']} (max: {feat_data['max_activation']:.3f})")
|
| 1838 |
+
|
| 1839 |
+
print()
|
| 1840 |
+
|
| 1841 |
+
# Summarize perturbation experiments and baselines.
|
| 1842 |
+
print("🧪 Targeted feature ablations:")
|
| 1843 |
+
targeted_results = analysis.get('perturbation_experiments', [])
|
| 1844 |
+
if targeted_results:
|
| 1845 |
+
for experiment in targeted_results:
|
| 1846 |
+
layer_name = experiment.get('layer_name', f"L{experiment.get('feature_set', [{}])[0].get('layer', '?')}")
|
| 1847 |
+
feature_name = experiment.get('feature_name', f"F{experiment.get('feature_set', [{}])[0].get('feature', '?')}")
|
| 1848 |
+
prob_delta = experiment.get('probability_change', 0.0)
|
| 1849 |
+
logit_delta = experiment.get('logit_change', 0.0)
|
| 1850 |
+
flips = experiment.get('ablation_flips_top_prediction', False)
|
| 1851 |
+
print(f" {layer_name}/{feature_name}: Δp={prob_delta:.4f}, Δlogit={logit_delta:.4f}, flips_top={flips}")
|
| 1852 |
+
else:
|
| 1853 |
+
print(" - No targeted ablations were recorded.")
|
| 1854 |
+
|
| 1855 |
+
print("\n🎲 Random baseline ablations:")
|
| 1856 |
+
random_baseline = analysis.get('random_baseline_experiments', [])
|
| 1857 |
+
if random_baseline:
|
| 1858 |
+
for experiment in random_baseline:
|
| 1859 |
+
prob_delta = experiment.get('probability_change', 0.0)
|
| 1860 |
+
logit_delta = experiment.get('logit_change', 0.0)
|
| 1861 |
+
flips = experiment.get('ablation_flips_top_prediction', False)
|
| 1862 |
+
trial_idx = experiment.get('trial_index', '?')
|
| 1863 |
+
print(f" Trial {trial_idx}: Δp={prob_delta:.4f}, Δlogit={logit_delta:.4f}, flips_top={flips}")
|
| 1864 |
+
else:
|
| 1865 |
+
print(" - No random baseline trials were run.")
|
| 1866 |
+
|
| 1867 |
+
print("\n🛤️ Path ablations:")
|
| 1868 |
+
path_results = analysis.get('path_ablation_experiments', [])
|
| 1869 |
+
if path_results:
|
| 1870 |
+
for path_exp in path_results:
|
| 1871 |
+
description = path_exp.get('path_description', 'Path')
|
| 1872 |
+
prob_delta = path_exp.get('probability_change', 0.0)
|
| 1873 |
+
logit_delta = path_exp.get('logit_change', 0.0)
|
| 1874 |
+
flips = path_exp.get('ablation_flips_top_prediction', False)
|
| 1875 |
+
print(f" {description}: Δp={prob_delta:.4f}, Δlogit={logit_delta:.4f}, flips_top={flips}")
|
| 1876 |
+
else:
|
| 1877 |
+
print(" - No path ablations were run.")
|
| 1878 |
+
|
| 1879 |
+
summary_stats = analysis.get('summary_statistics', {})
|
| 1880 |
+
targeted_summary = summary_stats.get('targeted', {})
|
| 1881 |
+
random_summary = summary_stats.get('random_baseline', {})
|
| 1882 |
+
path_summary = summary_stats.get('path', {})
|
| 1883 |
+
random_path_summary = summary_stats.get('random_path_baseline', {})
|
| 1884 |
+
print("\n📈 Summary statistics:")
|
| 1885 |
+
print(f" Targeted: avg|Δp|={targeted_summary.get('avg_abs_probability_change', 0.0):.4f}, flip_rate={targeted_summary.get('flip_rate', 0.0):.2%}")
|
| 1886 |
+
print(f" Random baseline: avg|Δp|={random_summary.get('avg_abs_probability_change', 0.0):.4f}, flip_rate={random_summary.get('flip_rate', 0.0):.2%}")
|
| 1887 |
+
print(f" Path: avg|Δp|={path_summary.get('avg_abs_probability_change', 0.0):.4f}, flip_rate={path_summary.get('flip_rate', 0.0):.2%}")
|
| 1888 |
+
print(f" Random path baseline: avg|Δp|={random_path_summary.get('avg_abs_probability_change', 0.0):.4f}, flip_rate={random_path_summary.get('flip_rate', 0.0):.2%}")
|
| 1889 |
+
print(f" Targeted vs Random |Δp| diff: {summary_stats.get('target_minus_random_abs_probability_change', 0.0):.4f}")
|
| 1890 |
+
print(f" Targeted vs Random flip diff: {summary_stats.get('target_flip_rate_minus_random', 0.0):.4f}")
|
| 1891 |
+
print(f" Path vs Random path |Δp| diff: {summary_stats.get('path_minus_random_abs_probability_change', 0.0):.4f}")
|
| 1892 |
+
print(f" Path vs Random path flip diff: {summary_stats.get('path_flip_rate_minus_random', 0.0):.4f}")
|
| 1893 |
+
print("\n✓ Faithfulness experiments summarized\n")
|
| 1894 |
+
|
| 1895 |
+
# Generate a visualization for the prompt.
|
| 1896 |
+
print("📈 Generating visualization...")
|
| 1897 |
+
if 'graph' in analysis and analysis['pruned_graph_stats']['n_nodes'] > 0:
|
| 1898 |
+
viz_path = os.path.join(RESULTS_DIR, f"attribution_graph_prompt_{prompt_idx + 1}.png")
|
| 1899 |
+
pipeline.attribution_graph.visualize_graph(analysis['graph'], save_path=viz_path)
|
| 1900 |
+
print(f" ✓ Graph visualization saved to {viz_path}")
|
| 1901 |
+
else:
|
| 1902 |
+
print(" - Skipping visualization as no graph was generated or it was empty.")
|
| 1903 |
+
|
| 1904 |
+
# Save the results in a format for the web app.
|
| 1905 |
+
save_path = os.path.join(RESULTS_DIR, f"attribution_graphs_results_prompt_{prompt_idx + 1}.json")
|
| 1906 |
+
|
| 1907 |
+
# Create a JSON file that can be merged with others.
|
| 1908 |
+
final_results = {
|
| 1909 |
+
"analyses": {
|
| 1910 |
+
f"prompt_{prompt_idx + 1}": analysis
|
| 1911 |
+
},
|
| 1912 |
+
"config": config.__dict__,
|
| 1913 |
+
"timestamp": str(time.time())
|
| 1914 |
+
}
|
| 1915 |
+
|
| 1916 |
+
# The web page doesn't use the graph object, so remove it.
|
| 1917 |
+
if 'graph' in final_results['analyses'][f"prompt_{prompt_idx + 1}"]:
|
| 1918 |
+
del final_results['analyses'][f"prompt_{prompt_idx + 1}"]['graph']
|
| 1919 |
+
|
| 1920 |
+
pipeline.save_results(final_results, save_path)
|
| 1921 |
+
print(f"💾 Results saved to {save_path}")
|
| 1922 |
+
|
| 1923 |
+
print("\n🎉 Analysis for this prompt completed successfully!")
|
| 1924 |
+
|
| 1925 |
+
except Exception as e:
|
| 1926 |
+
print(f"❌ Error during execution: {e}")
|
| 1927 |
+
import traceback
|
| 1928 |
+
traceback.print_exc()
|
| 1929 |
+
|
| 1930 |
+
if __name__ == "__main__":
|
| 1931 |
+
main()
|
circuit_analysis/attribution_graphs_olmo_de.py
ADDED
|
@@ -0,0 +1,1165 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
# This script generates attribution graphs for the German OLMo2 7B model.
|
| 3 |
+
|
| 4 |
+
import torch
|
| 5 |
+
import torch.nn as nn
|
| 6 |
+
import torch.nn.functional as F
|
| 7 |
+
import numpy as np
|
| 8 |
+
import matplotlib.pyplot as plt
|
| 9 |
+
import seaborn as sns
|
| 10 |
+
from typing import Dict, List, Tuple, Optional, Any
|
| 11 |
+
import json
|
| 12 |
+
import logging
|
| 13 |
+
from pathlib import Path
|
| 14 |
+
from transformers import AutoTokenizer, AutoModelForCausalLM
|
| 15 |
+
from collections import defaultdict
|
| 16 |
+
import networkx as nx
|
| 17 |
+
from dataclasses import dataclass
|
| 18 |
+
from tqdm import tqdm
|
| 19 |
+
import pickle
|
| 20 |
+
import requests
|
| 21 |
+
import time
|
| 22 |
+
import random
|
| 23 |
+
import os
|
| 24 |
+
import argparse
|
| 25 |
+
|
| 26 |
+
# --- Add this block to fix the import path ---
|
| 27 |
+
import sys
|
| 28 |
+
from pathlib import Path
|
| 29 |
+
sys.path.append(str(Path(__file__).resolve().parent.parent))
|
| 30 |
+
# ---------------------------------------------
|
| 31 |
+
|
| 32 |
+
from utils import init_qwen_api, set_seed
|
| 33 |
+
|
| 34 |
+
# --- Constants ---
|
| 35 |
+
# Configuration for the attribution graph generation pipeline.
|
| 36 |
+
RESULTS_DIR = "circuit_analysis/results"
|
| 37 |
+
CLT_SAVE_PATH = "circuit_analysis/models/clt_model_de.pth"
|
| 38 |
+
|
| 39 |
+
# Configure logging.
|
| 40 |
+
logging.basicConfig(level=logging.INFO, format='%(asctime=s - %(levelname=s - %(message=s')
|
| 41 |
+
logger = logging.getLogger(__name__)
|
| 42 |
+
|
| 43 |
+
# Set the device for training.
|
| 44 |
+
if torch.backends.mps.is_available():
|
| 45 |
+
DEVICE = torch.device("mps")
|
| 46 |
+
logger.info("Using MPS (Metal Performance Shaders) for GPU acceleration")
|
| 47 |
+
elif torch.cuda.is_available():
|
| 48 |
+
DEVICE = torch.device("cuda")
|
| 49 |
+
logger.info("Using CUDA for GPU acceleration")
|
| 50 |
+
else:
|
| 51 |
+
DEVICE = torch.device("cpu")
|
| 52 |
+
logger.info("Using CPU")
|
| 53 |
+
|
| 54 |
+
@dataclass
|
| 55 |
+
class AttributionGraphConfig:
|
| 56 |
+
# Configuration for building the attribution graph.
|
| 57 |
+
model_path: str = "./models/OLMo-2-1124-7B"
|
| 58 |
+
max_seq_length: int = 512
|
| 59 |
+
n_features_per_layer: int = 512
|
| 60 |
+
sparsity_lambda: float = 0.01
|
| 61 |
+
reconstruction_loss_weight: float = 1.0
|
| 62 |
+
batch_size: int = 8
|
| 63 |
+
learning_rate: float = 1e-4
|
| 64 |
+
training_steps: int = 1000
|
| 65 |
+
device: str = str(DEVICE)
|
| 66 |
+
pruning_threshold: float = 0.8 # For graph pruning
|
| 67 |
+
intervention_strength: float = 5.0 # For perturbation experiments
|
| 68 |
+
qwen_api_config: Optional[Dict[str, str]] = None
|
| 69 |
+
|
| 70 |
+
class JumpReLU(nn.Module):
|
| 71 |
+
# The JumpReLU activation function.
|
| 72 |
+
|
| 73 |
+
def __init__(self, threshold: float = 0.0):
|
| 74 |
+
super().__init__()
|
| 75 |
+
self.threshold = threshold
|
| 76 |
+
|
| 77 |
+
def forward(self, x):
|
| 78 |
+
return F.relu(x - self.threshold)
|
| 79 |
+
|
| 80 |
+
class CrossLayerTranscoder(nn.Module):
|
| 81 |
+
# The Cross-Layer Transcoder (CLT) model.
|
| 82 |
+
|
| 83 |
+
def __init__(self, model_config: Dict, clt_config: AttributionGraphConfig):
|
| 84 |
+
super().__init__()
|
| 85 |
+
self.config = clt_config
|
| 86 |
+
self.model_config = model_config
|
| 87 |
+
self.n_layers = model_config['num_hidden_layers']
|
| 88 |
+
self.hidden_size = model_config['hidden_size']
|
| 89 |
+
self.n_features = clt_config.n_features_per_layer
|
| 90 |
+
|
| 91 |
+
# Encoder weights for each layer.
|
| 92 |
+
self.encoders = nn.ModuleList([
|
| 93 |
+
nn.Linear(self.hidden_size, self.n_features, bias=False)
|
| 94 |
+
for _ in range(self.n_layers)
|
| 95 |
+
])
|
| 96 |
+
|
| 97 |
+
# Decoder weights for cross-layer connections.
|
| 98 |
+
self.decoders = nn.ModuleDict()
|
| 99 |
+
for source_layer in range(self.n_layers):
|
| 100 |
+
for target_layer in range(source_layer, self.n_layers):
|
| 101 |
+
key = f"{source_layer}_to_{target_layer}"
|
| 102 |
+
self.decoders[key] = nn.Linear(self.n_features, self.hidden_size, bias=False)
|
| 103 |
+
|
| 104 |
+
# The activation function.
|
| 105 |
+
self.activation = JumpReLU(threshold=0.0)
|
| 106 |
+
|
| 107 |
+
# Initialize the weights.
|
| 108 |
+
self._init_weights()
|
| 109 |
+
|
| 110 |
+
def _init_weights(self):
|
| 111 |
+
# Initializes the weights with small random values.
|
| 112 |
+
for module in self.modules():
|
| 113 |
+
if isinstance(module, nn.Linear):
|
| 114 |
+
nn.init.normal_(module.weight, mean=0.0, std=0.01)
|
| 115 |
+
|
| 116 |
+
def encode(self, layer_idx: int, residual_activations: torch.Tensor) -> torch.Tensor:
|
| 117 |
+
# Encodes residual stream activations to feature activations.
|
| 118 |
+
return self.activation(self.encoders[layer_idx](residual_activations))
|
| 119 |
+
|
| 120 |
+
def decode(self, source_layer: int, target_layer: int, feature_activations: torch.Tensor) -> torch.Tensor:
|
| 121 |
+
# Decodes feature activations to the MLP output space.
|
| 122 |
+
key = f"{source_layer}_to_{target_layer}"
|
| 123 |
+
return self.decoders[key](feature_activations)
|
| 124 |
+
|
| 125 |
+
def forward(self, residual_activations: List[torch.Tensor]) -> Tuple[List[torch.Tensor], List[torch.Tensor]]:
|
| 126 |
+
# The forward pass of the CLT.
|
| 127 |
+
feature_activations = []
|
| 128 |
+
reconstructed_mlp_outputs = []
|
| 129 |
+
|
| 130 |
+
# Encode features for each layer.
|
| 131 |
+
for layer_idx, residual in enumerate(residual_activations):
|
| 132 |
+
features = self.encode(layer_idx, residual)
|
| 133 |
+
feature_activations.append(features)
|
| 134 |
+
|
| 135 |
+
# Reconstruct MLP outputs with cross-layer connections.
|
| 136 |
+
for target_layer in range(self.n_layers):
|
| 137 |
+
reconstruction = torch.zeros_like(residual_activations[target_layer])
|
| 138 |
+
|
| 139 |
+
# Sum contributions from all previous layers.
|
| 140 |
+
for source_layer in range(target_layer + 1):
|
| 141 |
+
decoded = self.decode(source_layer, target_layer, feature_activations[source_layer])
|
| 142 |
+
reconstruction += decoded
|
| 143 |
+
|
| 144 |
+
reconstructed_mlp_outputs.append(reconstruction)
|
| 145 |
+
|
| 146 |
+
return feature_activations, reconstructed_mlp_outputs
|
| 147 |
+
|
| 148 |
+
class FeatureVisualizer:
    """Visualizes and interprets individual CLT features.

    Records, per feature, the top activating token positions and derives a
    short human-readable interpretation — via the Qwen API when configured,
    otherwise via a simple token-shape heuristic.
    """

    def __init__(self, tokenizer):
        self.tokenizer = tokenizer
        # Maps "L{layer}_F{feature}" -> interpretation string.
        self.feature_interpretations = {}

    def visualize_feature(self, feature_idx: int, layer_idx: int,
                          activations: torch.Tensor, input_tokens: List[str],
                          top_k: int = 10) -> Dict:
        """Summarize one feature's activations over a token sequence.

        Args:
            feature_idx: Index of the feature within the layer.
            layer_idx: Index of the layer the feature belongs to.
            activations: Tensor of shape [seq_len, n_features].
            input_tokens: Tokens corresponding to the sequence positions.
            top_k: Number of top-activating positions to record.

        Returns:
            Dict with summary statistics and the top activating tokens.
        """
        feature_acts = activations[:, feature_idx].detach().cpu().numpy()

        # Positions ordered by activation, strongest first.
        top_positions = np.argsort(feature_acts)[-top_k:][::-1]

        visualization = {
            'feature_idx': feature_idx,
            'layer_idx': layer_idx,
            'max_activation': float(feature_acts.max()),
            'mean_activation': float(feature_acts.mean()),
            # Fraction of positions whose activation exceeds 0.1.
            'sparsity': float((feature_acts > 0.1).mean()),
            'top_activations': []
        }

        for pos in top_positions:
            # Guard against positions beyond the provided token list.
            if pos < len(input_tokens):
                visualization['top_activations'].append({
                    'token': input_tokens[pos],
                    'position': int(pos),
                    'activation': float(feature_acts[pos])
                })

        return visualization

    def interpret_feature(self, feature_idx: int, layer_idx: int,
                          visualization_data: Dict,
                          qwen_api_config: Optional[Dict[str, str]] = None) -> str:
        """Derive a short interpretation from a feature's top tokens.

        Uses the Qwen API when configured; otherwise falls back to a simple
        heuristic over token shapes. The result is cached in
        ``self.feature_interpretations`` under "L{layer}_F{feature}".
        """
        top_tokens = [item['token'] for item in visualization_data['top_activations']]

        # Use the Qwen API if it is configured.
        if qwen_api_config and qwen_api_config.get('api_key'):
            feature_name = f"L{layer_idx}_F{feature_idx}"
            interpretation = get_feature_interpretation_with_qwen(
                qwen_api_config, top_tokens, feature_name, layer_idx
            )
        elif not top_tokens:
            # Bug fix: ``all()`` over an empty list is vacuously True, so a
            # feature with no recorded activations used to be mislabeled as
            # alphabetic tokens. Handle the empty case explicitly.
            interpretation = "Keine aktivierenden Tokens"
        elif len(set(top_tokens)) == 1:
            interpretation = f"Spezifischer Token: '{top_tokens[0]}'"
        elif all(token.isalpha() for token in top_tokens):
            interpretation = "Wort/alphabetische Tokens"
        elif all(token.isdigit() for token in top_tokens):
            interpretation = "Numerische Tokens"
        elif all(token in '.,!?;:' for token in top_tokens):
            interpretation = "Interpunktion"
        else:
            interpretation = "Gemischte/polysemische Merkmale"

        self.feature_interpretations[f"L{layer_idx}_F{feature_idx}"] = interpretation
        return interpretation
|
| 210 |
+
|
| 211 |
+
class AttributionGraph:
    """Constructs and analyzes attribution graphs over CLT features.

    Nodes are token embeddings, active CLT features, and the model output;
    edges carry heuristic attribution weights derived from the CLT's
    encoder/decoder weights and the observed feature activations.
    """

    def __init__(self, clt: CrossLayerTranscoder, tokenizer):
        self.clt = clt
        self.tokenizer = tokenizer
        self.graph = nx.DiGraph()
        self.node_types = {}  # Track node types (feature, embedding, error, output)
        self.edge_weights = {}  # (source_node, target_node) -> signed edge weight

    def compute_virtual_weights(self, source_layer: int, target_layer: int,
                                source_feature: int, target_feature: int) -> float:
        """Compute the virtual weight between two features.

        Sums inner products between the source feature's decoder columns and
        the target feature's encoder row; returns 0.0 unless ``target_layer``
        is strictly after ``source_layer``.
        """
        if target_layer <= source_layer:
            return 0.0

        # Get the encoder and decoder weights.
        # nn.Linear stores weight as [out_features, in_features], so this row is
        # the target feature's read-in direction over the residual stream.
        encoder_weight = self.clt.encoders[target_layer].weight[target_feature]

        total_weight = 0.0
        # NOTE(review): the decoder key targets intermediate_layer (every layer
        # below target_layer), not target_layer itself — confirm this matches
        # the intended virtual-weight definition.
        for intermediate_layer in range(source_layer, target_layer):
            decoder_key = f"{source_layer}_to_{intermediate_layer}"
            if decoder_key in self.clt.decoders:
                decoder_weight = self.clt.decoders[decoder_key].weight[:, source_feature]
                # The virtual weight is the inner product.
                virtual_weight = torch.dot(decoder_weight, encoder_weight).item()
                total_weight += virtual_weight

        return total_weight

    def construct_graph(self, input_tokens: List[str],
                        feature_activations: List[torch.Tensor],
                        target_token_idx: int = -1) -> nx.DiGraph:
        """Construct the attribution graph for one prompt.

        Resets internal state, adds embedding nodes, the per-position active
        feature nodes (top features per layer, activation > 0.05), an output
        node, then weighted edges feature→feature, feature→output, and
        embedding→layer-0 feature.
        """
        self.graph.clear()
        self.node_types.clear()
        self.edge_weights.clear()

        seq_len = len(input_tokens)
        n_layers = len(feature_activations)

        # Add embedding nodes for the input tokens.
        for i, token in enumerate(input_tokens):
            node_id = f"emb_{i}_{token}"
            self.graph.add_node(node_id)
            self.node_types[node_id] = "embedding"

        # Add nodes for the features.
        active_features = {}  # node_id -> {layer, token_pos, feature_idx, activation}
        max_features_per_layer = 20

        for layer_idx, features in enumerate(feature_activations):
            # features shape: [batch_size, seq_len, n_features]
            batch_size, seq_len_layer, n_features = features.shape

            # Get the top activating features for this layer (mean over positions
            # of the first batch element).
            layer_activations = features[0].mean(dim=0)
            top_features = torch.topk(layer_activations,
                                      k=min(max_features_per_layer, n_features)).indices

            for token_pos in range(min(seq_len, seq_len_layer)):
                for feat_idx in top_features:
                    activation = features[0, token_pos, feat_idx.item()].item()
                    # Only positions where the feature actually fires become nodes.
                    if activation > 0.05:
                        node_id = f"feat_L{layer_idx}_T{token_pos}_F{feat_idx.item()}"
                        self.graph.add_node(node_id)
                        self.node_types[node_id] = "feature"
                        active_features[node_id] = {
                            'layer': layer_idx,
                            'token_pos': token_pos,
                            'feature_idx': feat_idx.item(),
                            'activation': activation
                        }

        # Add an output node for the target token.
        output_node = f"output_{target_token_idx}"
        self.graph.add_node(output_node)
        self.node_types[output_node] = "output"

        # Add edges based on virtual weights and activations.
        feature_nodes = [node for node, type_ in self.node_types.items() if type_ == "feature"]
        print(f" Building attribution graph: {len(feature_nodes)} feature nodes, {len(self.graph.nodes())} total nodes")

        # Limit the number of edges to compute (keeps the O(n^2) pass bounded).
        max_edges_per_node = 5

        for i, source_node in enumerate(feature_nodes):
            if i % 50 == 0:
                print(f" Processing node {i+1}/{len(feature_nodes)}")

            edges_added = 0
            source_info = active_features[source_node]
            source_activation = source_info['activation']

            # Add edges to other features.
            for target_node in feature_nodes:
                if source_node == target_node or edges_added >= max_edges_per_node:
                    continue

                target_info = active_features[target_node]

                # Only add edges that go forward in the network (later layer, or
                # same layer at a later token position).
                if (target_info['layer'] > source_info['layer'] or
                    (target_info['layer'] == source_info['layer'] and
                     target_info['token_pos'] > source_info['token_pos'])):

                    virtual_weight = self.compute_virtual_weights(
                        source_info['layer'], target_info['layer'],
                        source_info['feature_idx'], target_info['feature_idx']
                    )

                    # Edge weight = activation-scaled virtual weight, thresholded.
                    if abs(virtual_weight) > 0.05:
                        edge_weight = source_activation * virtual_weight
                        self.graph.add_edge(source_node, target_node, weight=edge_weight)
                        self.edge_weights[(source_node, target_node)] = edge_weight
                        edges_added += 1

            # Add edges to the output node (only from the last two layers; the
            # 0.1 factor is a fixed heuristic scale).
            if source_info['layer'] >= n_layers - 2:
                output_weight = source_activation * 0.1
                self.graph.add_edge(source_node, output_node, weight=output_weight)
                self.edge_weights[(source_node, output_node)] = output_weight

        # Add edges from embeddings to early features.
        for emb_node in [node for node, type_ in self.node_types.items() if type_ == "embedding"]:
            # NOTE(review): node ids are "emb_{i}_{token}"; this split assumes the
            # position field is the second underscore-delimited part — confirm
            # tokens containing underscores cannot corrupt parsing elsewhere.
            token_idx = int(emb_node.split('_')[1])
            for feat_node in feature_nodes:
                feat_info = active_features[feat_node]
                if feat_info['layer'] == 0 and feat_info['token_pos'] == token_idx:
                    # Direct connection from an embedding to a first-layer feature.
                    weight = feat_info['activation'] * 0.5
                    self.graph.add_edge(emb_node, feat_node, weight=weight)
                    self.edge_weights[(emb_node, feat_node)] = weight

        return self.graph

    def prune_graph(self, threshold: float = 0.8) -> nx.DiGraph:
        """Prune the graph to keep only the most important nodes.

        ``threshold`` is the fraction of weighted nodes to keep (not a weight
        cutoff); output and embedding nodes are always retained.
        """
        # Calculate node importance based on edge weights.
        node_importance = defaultdict(float)

        for (source, target), weight in self.edge_weights.items():
            node_importance[source] += abs(weight)
            node_importance[target] += abs(weight)

        # Keep the top nodes by importance.
        sorted_nodes = sorted(node_importance.items(), key=lambda x: x[1], reverse=True)
        n_keep = int(len(sorted_nodes) * threshold)
        important_nodes = set([node for node, _ in sorted_nodes[:n_keep]])

        # Always keep the output and embedding nodes.
        for node, type_ in self.node_types.items():
            if type_ in ["output", "embedding"]:
                important_nodes.add(node)

        # Create the pruned graph.
        pruned_graph = self.graph.subgraph(important_nodes).copy()

        return pruned_graph

    def visualize_graph(self, graph: nx.DiGraph = None, save_path: str = None):
        """Draw the attribution graph with matplotlib.

        Node colors encode node type and edge widths scale with absolute edge
        weight; saves to ``save_path`` when given, then calls ``plt.show()``.
        """
        if graph is None:
            graph = self.graph

        plt.figure(figsize=(12, 8))

        # Create a layout for the graph.
        pos = nx.spring_layout(graph, k=1, iterations=50)

        # Color the nodes by type.
        node_colors = []
        for node in graph.nodes():
            node_type = self.node_types.get(node, "unknown")
            if node_type == "embedding":
                node_colors.append('lightblue')
            elif node_type == "feature":
                node_colors.append('lightgreen')
            elif node_type == "output":
                node_colors.append('orange')
            else:
                node_colors.append('gray')

        # Draw the nodes.
        nx.draw_networkx_nodes(graph, pos, node_color=node_colors,
                               node_size=300, alpha=0.8)

        # Draw the edges with thickness based on weight (0.1 fallback for edges
        # without a recorded weight).
        edges = graph.edges()
        edge_weights = [abs(self.edge_weights.get((u, v), 0.1)) for u, v in edges]
        max_weight = max(edge_weights) if edge_weights else 1
        edge_widths = [w / max_weight * 3 for w in edge_weights]

        nx.draw_networkx_edges(graph, pos, width=edge_widths, alpha=0.6,
                               edge_color='gray', arrows=True)

        # Draw the labels.
        nx.draw_networkx_labels(graph, pos, font_size=8)

        plt.title("Attribution Graph (German)")
        plt.axis('off')

        if save_path:
            plt.savefig(save_path, dpi=300, bbox_inches='tight')
        plt.show()
|
| 416 |
+
|
| 417 |
+
class PerturbationExperiments:
    """Runs feature-perturbation experiments against the base model.

    Currently records the model's baseline next-token distribution and
    returns a placeholder intervention effect — the actual feature
    intervention is still a TODO.
    """

    def __init__(self, model, clt: CrossLayerTranscoder, tokenizer):
        self.model = model
        self.clt = clt
        self.tokenizer = tokenizer

    def feature_ablation_experiment(self, input_text: str,
                                    target_layer: int, target_feature: int,
                                    intervention_strength: float = 5.0) -> Dict:
        """Ablate a feature and measure the effect on the model's output.

        Returns a dict with the baseline top-5 next-token predictions and
        the (currently simulated) intervention effect; on any failure, a
        zeroed-out result that carries the error message.
        """
        try:
            # Clear the MPS cache to prevent memory issues.
            if torch.backends.mps.is_available():
                torch.mps.empty_cache()

            # Tokenize, then move every tensor onto the model's device.
            encoded = self.tokenizer(input_text, return_tensors="pt", padding=True,
                                     truncation=True, max_length=512)
            device = next(self.model.parameters()).device
            model_inputs = {name: tensor.to(device) for name, tensor in encoded.items()}

            # Baseline next-token distribution at the final position.
            with torch.no_grad():
                baseline = self.model(**model_inputs)
                last_logits = baseline.logits[0, -1, :]
                probs = F.softmax(last_logits, dim=-1)
                top5 = torch.topk(probs, k=5)

            # TODO: Implement the actual feature intervention.

            # Simulated effect of the intervention.
            return {
                'baseline_top_tokens': [
                    (self.tokenizer.decode([token_id]), p.item())
                    for token_id, p in zip(top5.indices, top5.values)
                ],
                'intervention_layer': target_layer,
                'intervention_feature': target_feature,
                'intervention_strength': intervention_strength,
                'effect_magnitude': 0.1,
                'probability_change': 0.05
            }

        except Exception as e:
            # Handle MPS memory issues.
            print(f" Warning: Perturbation experiment failed due to device issue: {e}")
            return {
                'baseline_top_tokens': [],
                'intervention_layer': target_layer,
                'intervention_feature': target_feature,
                'intervention_strength': intervention_strength,
                'effect_magnitude': 0.0,
                'probability_change': 0.0,
                'error': str(e)
            }
|
| 478 |
+
|
| 479 |
+
class AttributionGraphsPipeline:
    """End-to-end pipeline for the attribution-graph analysis.

    Loads the base model and tokenizer, owns the CrossLayerTranscoder (CLT)
    plus the visualization / graph / perturbation helpers, and exposes CLT
    training, per-prompt analysis, and (de)serialization entry points.
    """

    def __init__(self, config: AttributionGraphConfig):
        self.config = config
        self.device = torch.device(config.device)

        # Load the model and tokenizer.
        logger.info(f"Loading OLMo2 7B model from {config.model_path}")
        self.tokenizer = AutoTokenizer.from_pretrained(config.model_path)

        # Configure model loading based on the device.
        if "mps" in config.device:
            # MPS supports float16 but not device_map.
            self.model = AutoModelForCausalLM.from_pretrained(
                config.model_path,
                torch_dtype=torch.float16,
                device_map=None
            ).to(self.device)
        elif "cuda" in config.device:
            self.model = AutoModelForCausalLM.from_pretrained(
                config.model_path,
                torch_dtype=torch.float16,
                device_map="auto"
            )
        else:
            # CPU
            self.model = AutoModelForCausalLM.from_pretrained(
                config.model_path,
                torch_dtype=torch.float32,
                device_map=None
            ).to(self.device)

        # Some tokenizers ship without a pad token; reuse EOS so padding works.
        if self.tokenizer.pad_token is None:
            self.tokenizer.pad_token = self.tokenizer.eos_token

        # Initialize the CLT.
        model_config = self.model.config.to_dict()
        self.clt = CrossLayerTranscoder(model_config, config).to(self.device)

        # Initialize the other components.
        self.feature_visualizer = FeatureVisualizer(self.tokenizer)
        self.attribution_graph = AttributionGraph(self.clt, self.tokenizer)
        self.perturbation_experiments = PerturbationExperiments(self.model, self.clt, self.tokenizer)

        logger.info("Attribution Graphs Pipeline initialized successfully")

    def train_clt(self, training_texts: List[str]) -> Dict:
        """Train the Cross-Layer Transcoder on a corpus of texts.

        Optimizes hidden-state reconstruction plus a tanh-based sparsity
        penalty; returns per-step loss curves.
        """
        logger.info("Starting CLT training...")

        optimizer = torch.optim.Adam(self.clt.parameters(), lr=self.config.learning_rate)

        training_stats = {
            'reconstruction_losses': [],
            'sparsity_losses': [],
            'total_losses': []
        }

        for step in tqdm(range(self.config.training_steps), desc="Training CLT"):
            # Sample a batch of texts.
            batch_texts = np.random.choice(training_texts, size=self.config.batch_size)

            total_loss = 0.0
            total_recon_loss = 0.0
            total_sparsity_loss = 0.0

            for text in batch_texts:
                # Tokenize the text.
                inputs = self.tokenizer(text, return_tensors="pt", max_length=self.config.max_seq_length,
                                        truncation=True, padding=True).to(self.device)

                # Get the model activations (base model frozen via no_grad).
                with torch.no_grad():
                    outputs = self.model(**inputs, output_hidden_states=True)
                    # Index 0 is the embedding-layer output per HF convention; skip it.
                    hidden_states = outputs.hidden_states[1:]

                # Forward pass through the CLT.
                feature_activations, reconstructed_outputs = self.clt(hidden_states)

                # Compute the reconstruction loss (summed MSE over layers).
                recon_loss = 0.0
                for i, (target, pred) in enumerate(zip(hidden_states, reconstructed_outputs)):
                    recon_loss += F.mse_loss(pred, target)

                # Compute the sparsity loss.
                sparsity_loss = 0.0
                for features in feature_activations:
                    sparsity_loss += torch.mean(torch.tanh(self.config.sparsity_lambda * features))

                # Total loss.
                # NOTE(review): sparsity_lambda is used both inside tanh and as the
                # loss weight here — confirm the double use is intended.
                loss = (self.config.reconstruction_loss_weight * recon_loss +
                        self.config.sparsity_lambda * sparsity_loss)

                total_loss += loss
                total_recon_loss += recon_loss
                total_sparsity_loss += sparsity_loss

            # Average the losses.
            total_loss /= self.config.batch_size
            total_recon_loss /= self.config.batch_size
            total_sparsity_loss /= self.config.batch_size

            # Backward pass.
            optimizer.zero_grad()
            total_loss.backward()
            optimizer.step()

            # Log the progress.
            training_stats['total_losses'].append(total_loss.item())
            training_stats['reconstruction_losses'].append(total_recon_loss.item())
            training_stats['sparsity_losses'].append(total_sparsity_loss.item())

            if step % 100 == 0:
                logger.info(f"Step {step}: Total Loss = {total_loss.item():.4f}, "
                            f"Recon Loss = {total_recon_loss.item():.4f}, "
                            f"Sparsity Loss = {total_sparsity_loss.item():.4f}")

        logger.info("CLT training completed")
        return training_stats

    def analyze_prompt(self, prompt: str, target_token_idx: int = -1) -> Dict:
        """Run the full attribution analysis for a single prompt.

        Extracts hidden states, encodes them with the CLT, visualizes and
        interprets the top features per layer, builds and prunes the
        attribution graph, and ranks embedding→output paths by weight.
        """
        logger.info(f"Analyzing prompt: '{prompt[:50]}...'")

        # Tokenize the prompt.
        inputs = self.tokenizer(prompt, return_tensors="pt").to(self.device)
        input_tokens = self.tokenizer.convert_ids_to_tokens(inputs["input_ids"][0])

        # Get the model activations.
        with torch.no_grad():
            outputs = self.model(**inputs, output_hidden_states=True)
            hidden_states = outputs.hidden_states[1:]

        # Forward pass through the CLT.
        feature_activations, reconstructed_outputs = self.clt(hidden_states)

        logger.info(" > Starting feature visualization and interpretation...")
        feature_visualizations = {}
        for layer_idx, features in enumerate(feature_activations):
            logger.info(f" - Processing Layer {layer_idx}...")
            layer_viz = {}
            # Analyze the top features for this layer.
            # features shape: [batch_size, seq_len, n_features]
            feature_importance = torch.mean(features, dim=(0, 1))
            top_features = torch.topk(feature_importance, k=min(5, feature_importance.size(0))).indices

            for feat_idx in top_features:
                viz = self.feature_visualizer.visualize_feature(
                    feat_idx.item(), layer_idx, features[0], input_tokens
                )
                interpretation = self.feature_visualizer.interpret_feature(
                    feat_idx.item(), layer_idx, viz, self.config.qwen_api_config
                )
                viz['interpretation'] = interpretation
                layer_viz[f"feature_{feat_idx.item()}"] = viz

            feature_visualizations[f"layer_{layer_idx}"] = layer_viz

        # Construct the attribution graph.
        graph = self.attribution_graph.construct_graph(
            input_tokens, feature_activations, target_token_idx
        )

        # Prune the graph.
        pruned_graph = self.attribution_graph.prune_graph(self.config.pruning_threshold)

        # Analyze the most important paths.
        important_paths = []
        if len(pruned_graph.nodes()) > 0:
            # Find paths from embeddings to the output.
            embedding_nodes = [node for node, type_ in self.attribution_graph.node_types.items()
                               if type_ == "embedding" and node in pruned_graph]
            output_nodes = [node for node, type_ in self.attribution_graph.node_types.items()
                            if type_ == "output" and node in pruned_graph]

            for emb_node in embedding_nodes[:3]:
                for out_node in output_nodes:
                    try:
                        paths = list(nx.all_simple_paths(pruned_graph, emb_node, out_node, cutoff=5))
                        for path in paths[:2]:
                            # Path weight = product of absolute edge weights along the path.
                            path_weight = 1.0
                            for i in range(len(path) - 1):
                                edge_weight = self.attribution_graph.edge_weights.get(
                                    (path[i], path[i+1]), 0.0
                                )
                                path_weight *= abs(edge_weight)

                            important_paths.append({
                                'path': path,
                                'weight': path_weight,
                                'description': self._describe_path(path)
                            })
                    # NOTE(review): all_simple_paths yields an empty iterator when no
                    # path exists; NetworkXNoPath is likely never raised here (a
                    # missing node raises NodeNotFound instead) — confirm.
                    except nx.NetworkXNoPath:
                        continue

        # Sort paths by importance.
        important_paths.sort(key=lambda x: x['weight'], reverse=True)

        results = {
            'prompt': prompt,
            'input_tokens': input_tokens,
            'feature_visualizations': feature_visualizations,
            'full_graph_stats': {
                'n_nodes': len(graph.nodes()),
                'n_edges': len(graph.edges()),
                'node_types': dict(self.attribution_graph.node_types)
            },
            'pruned_graph_stats': {
                'n_nodes': len(pruned_graph.nodes()),
                'n_edges': len(pruned_graph.edges())
            },
            'important_paths': important_paths[:5],
            'graph': pruned_graph
        }

        return results

    def _describe_path(self, path: List[str]) -> str:
        """Generate a human-readable description of a graph path."""
        descriptions = []
        for node in path:
            if self.attribution_graph.node_types[node] == "embedding":
                # Node id format "emb_{i}_{token}"; tokens containing '_' would
                # be truncated here — acceptable for display purposes.
                token = node.split('_')[2]
                descriptions.append(f"Token '{token}'")
            elif self.attribution_graph.node_types[node] == "feature":
                # Node id format "feat_L{layer}_T{pos}_F{idx}".
                parts = node.split('_')
                layer = parts[1][1:]
                feature = parts[3][1:]
                # Try to get the interpretation.
                key = f"L{layer}_F{feature}"
                interpretation = self.feature_visualizer.feature_interpretations.get(key, "unknown")
                descriptions.append(f"Feature L{layer}F{feature} ({interpretation})")
            elif self.attribution_graph.node_types[node] == "output":
                descriptions.append("Output")

        return " → ".join(descriptions)

    def save_results(self, results: Dict, save_path: str):
        """Save the analysis results as JSON.

        The networkx graph is converted to node-link form; remaining
        non-serializable values are stringified via ``default=str``.
        """
        # Convert the graph to a serializable format.
        serializable_results = results.copy()
        if 'graph' in serializable_results:
            graph_data = nx.node_link_data(serializable_results['graph'])
            serializable_results['graph'] = graph_data

        with open(save_path, 'w', encoding='utf-8') as f:
            json.dump(serializable_results, f, indent=2, default=str, ensure_ascii=False)

        logger.info(f"Results saved to {save_path}")

    def save_clt(self, path: str):
        """Save the trained CLT's state dict to ``path``."""
        torch.save(self.clt.state_dict(), path)
        logger.info(f"CLT model saved to {path}")

    def load_clt(self, path: str):
        """Load a trained CLT state dict and switch the CLT to eval mode."""
        self.clt.load_state_dict(torch.load(path, map_location=self.device))
        self.clt.to(self.device)
        self.clt.eval()
        logger.info(f"Loaded CLT model from {path}")
|
| 741 |
+
|
| 742 |
+
# --- Configuration ---
MAX_SEQ_LEN = 256  # Maximum token length when tokenizing training prompts.
N_FEATURES_PER_LAYER = 512  # CLT dictionary size per layer.
TRAINING_STEPS = 2500  # Number of optimizer steps for CLT training.
BATCH_SIZE = 64  # Prompts per training step.
LEARNING_RATE = 1e-3  # Adam learning rate for the CLT.

# German prompts for the final analysis.
ANALYSIS_PROMPTS = [
    "Die Hauptstadt von Frankreich ist",
    "def fakultaet(n):",
    "Das literarische Stilmittel im Satz 'Der Wind flüsterte durch die Bäume' ist"
]

# A larger set of German prompts for training. Mixes factual completion,
# translation, code, sentiment, and pattern-continuation tasks so the CLT
# sees a variety of feature-eliciting contexts.
TRAINING_PROMPTS = [
    "Die Hauptstadt von Frankreich ist", "Sein oder Nichtsein, das ist hier die Frage", "Was du heute kannst besorgen, das verschiebe nicht auf morgen",
    "Der erste Mensch auf dem Mond war", "Die chemische Formel für Wasser ist H2O.",
    "Übersetze ins Englische: 'Die Katze sitzt auf der Matte.'", "def fakultaet(n):", "import numpy as np",
    "Die Hauptzutaten einer Pizza sind", "Was ist das Kraftwerk der Zelle?",
    "Die Gleichung E=mc^2 beschreibt die Beziehung zwischen Energie und", "Setze die Geschichte fort: Es war einmal, da war ein",
    "Klassifiziere das Sentiment: 'Ich bin überglücklich!'", "Extrahiere die Entitäten: 'Apple Inc. ist in Cupertino.'",
    "Was ist die nächste Zahl: 2, 4, 8, 16, __?", "Ein rollender Stein setzt kein Moos an",
    "Das Gegenteil von heiß ist", "import torch", "import pandas as pd", "class MeineKlasse:",
    "def __init__(self):", "Die Primärfarben sind", "Was ist die Hauptstadt von Japan?",
    "Wer hat 'Hamlet' geschrieben?", "Die Quadratwurzel von 64 ist", "Die Sonne geht im Osten auf",
    "Der Pazifische Ozean ist der größte Ozean der Erde.", "Die Mitochondrien sind das Kraftwerk der Zelle.",
    "Was ist die Hauptstadt der Mongolei?", "Der Film 'Matrix' kann folgendem Genre zugeordnet werden:",
    "Die englische Übersetzung von 'Ich möchte bitte einen Kaffee bestellen.' lautet:",
    "Das literarische Stilmittel im Satz 'Der Wind flüsterte durch die Bäume' ist",
    "Eine Python-Funktion, die die Fakultät einer Zahl berechnet, lautet:",
    "Die Hauptzutat eines Negroni-Cocktails ist",
    "Fasse die Handlung von 'Hamlet' in einem Satz zusammen:",
    "Der Satz 'Der Kuchen wurde vom Hund gefressen' steht in folgender Form:",
    "Eine gute Überschrift für einen Artikel über einen neuen Durchbruch in der Batterietechnologie wäre:"
]
|
| 778 |
+
|
| 779 |
+
# --- Qwen API for Feature Interpretation ---
|
| 780 |
+
def get_feature_interpretation_with_qwen(
    api_config: dict,
    top_tokens: list[str],
    feature_name: str,
    layer_index: int,
    max_retries: int = 3,
    initial_backoff: float = 2.0
) -> str:
    """Generate a short interpretation for a CLT feature via the Qwen API.

    Fix: removed the stray ``@torch.no_grad()`` decorator — this function
    performs only HTTP requests and string handling, no tensor operations,
    so the decorator was dead weight and misleading.

    Args:
        api_config: Dict with 'api_key', 'api_endpoint', and 'model'.
        top_tokens: The tokens that most strongly activate the feature.
        feature_name: Label used in the prompt (e.g. "L3_F17").
        layer_index: Layer the feature belongs to.
        max_retries: Number of request attempts on transient failures.
        initial_backoff: Base delay (seconds) for exponential backoff.

    Returns:
        The interpretation string, or an "API Error: ..." / "API not
        configured" message on failure.
    """
    if not api_config or not api_config.get('api_key'):
        logger.warning("Qwen API not configured. Skipping interpretation.")
        return "API not configured"

    headers = {
        "Authorization": f"Bearer {api_config['api_key']}",
        "Content-Type": "application/json"
    }

    # Create a specialized German prompt.
    prompt_text = f"""
Sie sind ein Experte für die Interpretierbarkeit von Transformern. Ein Merkmal in einem Sprachmodell (Merkmal '{feature_name}' auf Schicht {layer_index}) wird am stärksten durch die folgenden Token aktiviert:

{', '.join(f"'{token}'" for token in top_tokens)}

Was ist, basierend *nur* auf diesen Token, die wahrscheinlichste Funktion oder Rolle dieses Merkmals?
Ihre Antwort muss ein kurzer, prägnanter Ausdruck sein (z.B. "Erkennen von Eigennamen", "Identifizieren von JSON-Syntax", "Vervollständigen von Listen", "Erkennen negativer Stimmung"). Schreiben Sie keinen ganzen Satz.
"""

    # Low temperature + fixed seed for reproducible interpretations.
    data = {
        "model": api_config["model"],
        "messages": [
            {
                "role": "user",
                "content": [{"type": "text", "text": prompt_text}]
            }
        ],
        "max_tokens": 50,
        "temperature": 0.1,
        "top_p": 0.9,
        "seed": 42
    }

    logger.info(f"  > Interpreting {feature_name} (Layer {layer_index})...")

    for attempt in range(max_retries):
        try:
            logger.info(f"    - Attempt {attempt + 1}/{max_retries}: Sending request to Qwen API...")
            response = requests.post(
                f"{api_config['api_endpoint']}/chat/completions",
                headers=headers,
                json=data,
                timeout=60
            )
            response.raise_for_status()

            result = response.json()
            interpretation = result["choices"][0]["message"]["content"].strip()

            # Remove quotes from the output.
            if interpretation.startswith('"') and interpretation.endswith('"'):
                interpretation = interpretation[1:-1]

            logger.info(f"    - Success! Interpretation: '{interpretation}'")
            return interpretation

        except requests.exceptions.RequestException as e:
            logger.warning(f"    - Qwen API request failed (Attempt {attempt + 1}/{max_retries}): {e}")
            if attempt < max_retries - 1:
                # Exponential backoff before the next attempt.
                backoff_time = initial_backoff * (2 ** attempt)
                logger.info(f"    - Retrying in {backoff_time:.1f} seconds...")
                time.sleep(backoff_time)
            else:
                logger.error("    - Max retries reached. Failing.")
                return f"API Error: {e}"
        except (KeyError, IndexError) as e:
            # Malformed response body is not retried — it is unlikely to be transient.
            logger.error(f"    - Failed to parse Qwen API response: {e}")
            return "API Error: Invalid response format"
        finally:
            # Add a delay to respect API rate limits. This runs on every exit
            # from the try, including successful returns.
            time.sleep(2.1)

    return "API Error: Max retries exceeded"
|
| 863 |
+
|
| 864 |
+
|
| 865 |
+
def train_transcoder(transcoder, model, tokenizer, training_prompts, device, steps=1000, batch_size=16, optimizer=None):
    """Train the Cross-Layer Transcoder (CLT) against frozen model activations.

    Args:
        transcoder: The CrossLayerTranscoder to optimize (put into train mode here).
        model: Frozen language model used only to produce hidden states.
        tokenizer: Tokenizer matching ``model``.
        training_prompts: Pool of text prompts to sample batches from.
        device: Device the tokenized inputs are moved to.
        steps: Number of optimization steps.
        batch_size: Prompts sampled (with replacement) per step.
        optimizer: Optional optimizer; when None, losses are computed but no
            parameter update is performed (useful for dry-run evaluation).
    """
    transcoder.train()

    # Progress bar for visual feedback during training.
    progress_bar = tqdm(range(steps), desc="Training CLT")

    for _ in progress_bar:
        # Sample a random batch of prompts (with replacement).
        batch_prompts = random.choices(training_prompts, k=batch_size)

        # Tokenize the batch and move it to the target device.
        inputs = tokenizer(
            batch_prompts,
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=MAX_SEQ_LEN
        )
        inputs = {k: v.to(device) for k, v in inputs.items()}

        # Collect frozen-model activations; drop the embedding-layer output.
        with torch.no_grad():
            outputs = model(**inputs, output_hidden_states=True)
            hidden_states = outputs.hidden_states[1:]

        # Forward pass through the CLT.
        feature_activations, reconstructed_outputs = transcoder(hidden_states)

        # Reconstruction loss: per-layer MSE, summed across layers.
        recon_loss = sum(
            F.mse_loss(pred, target)
            for target, pred in zip(hidden_states, reconstructed_outputs)
        )

        # Sparsity loss: tanh saturates for large activations, so the penalty
        # pushes features toward exact zeros rather than uniformly small values.
        sparsity_loss = sum(
            torch.mean(torch.tanh(0.01 * features))
            for features in feature_activations
        )

        # Weighted total loss (fixed 80/20 reconstruction/sparsity split).
        loss = 0.8 * recon_loss + 0.2 * sparsity_loss

        if optimizer:
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        progress_bar.set_postfix({
            "Recon Loss": f"{recon_loss.item():.4f}",
            "Sparsity Loss": f"{sparsity_loss.item():.4f}",
            "Total Loss": f"{loss.item():.4f}"
        })
|
| 917 |
+
|
| 918 |
+
def generate_feature_visualizations(transcoder, model, tokenizer, prompt, device, qwen_api_config=None, graph_config: Optional[AttributionGraphConfig] = None):
    """Generate feature visualizations, interpretations, and an attribution graph for one prompt.

    Args:
        transcoder: Trained CrossLayerTranscoder.
        model: Frozen language model used to produce hidden states.
        tokenizer: Tokenizer matching ``model``.
        prompt: The text prompt to analyze.
        device: Device the tokenized inputs are moved to.
        qwen_api_config: Optional API config passed through to feature interpretation.
        graph_config: Optional graph configuration; defaults to AttributionGraphConfig().

    Returns:
        Dict with the prompt, full/pruned graph statistics, per-layer feature
        visualizations, and the top-5 highest-weight embedding-to-output paths.
    """
    # Tokenize the prompt and move it to the target device.
    inputs = tokenizer(
        prompt,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=MAX_SEQ_LEN
    )
    inputs = {k: v.to(device) for k, v in inputs.items()}

    # Frozen-model activations; drop the embedding-layer output.
    with torch.no_grad():
        outputs = model(**inputs, output_hidden_states=True)
        hidden_states = outputs.hidden_states[1:]

    # Forward pass through the CLT (reconstructions are not needed here).
    feature_activations, _ = transcoder(hidden_states)

    # Hoisted: token strings and a single visualizer instance, instead of
    # recomputing tokens and constructing two FeatureVisualizers per feature.
    input_tokens = tokenizer.convert_ids_to_tokens(inputs["input_ids"][0])
    visualizer = FeatureVisualizer(tokenizer)

    # Visualize the top features of each layer.
    feature_visualizations = {}
    for layer_idx, features in enumerate(feature_activations):
        layer_viz = {}
        # features shape: [batch_size, seq_len, n_features]
        feature_importance = torch.mean(features, dim=(0, 1))
        top_features = torch.topk(feature_importance, k=min(5, feature_importance.size(0))).indices

        for feat_idx in top_features:
            viz = visualizer.visualize_feature(
                feat_idx.item(), layer_idx, features[0], input_tokens
            )
            viz['interpretation'] = visualizer.interpret_feature(
                feat_idx.item(), layer_idx, viz, qwen_api_config
            )
            layer_viz[f"feature_{feat_idx.item()}"] = viz

        feature_visualizations[f"layer_{layer_idx}"] = layer_viz

    # Construct the attribution graph (target token: last position).
    if graph_config is None:
        graph_config = AttributionGraphConfig()
    attribution_graph = AttributionGraph(transcoder, tokenizer, graph_config)
    graph = attribution_graph.construct_graph(input_tokens, feature_activations, -1)

    # Prune the graph.
    pruned_graph = attribution_graph.prune_graph(0.8)

    # Analyze the most important embedding-to-output paths.
    important_paths = []
    if len(pruned_graph.nodes()) > 0:
        embedding_nodes = [node for node, type_ in attribution_graph.node_types.items()
                           if type_ == "embedding" and node in pruned_graph]
        output_nodes = [node for node, type_ in attribution_graph.node_types.items()
                        if type_ == "output" and node in pruned_graph]

        for emb_node in embedding_nodes[:3]:
            for out_node in output_nodes:
                try:
                    paths = list(nx.all_simple_paths(pruned_graph, emb_node, out_node, cutoff=5))
                    for path in paths[:2]:
                        # Path importance = product of |edge weight| along the path.
                        path_weight = 1.0
                        for i in range(len(path) - 1):
                            edge_weight = attribution_graph.edge_weights.get(
                                (path[i], path[i + 1]), 0.0
                            )
                            path_weight *= abs(edge_weight)

                        important_paths.append({
                            'path': path,
                            'weight': path_weight,
                            'description': attribution_graph._describe_path(path)
                        })
                except nx.NetworkXNoPath:
                    continue

    # Sort paths by importance, highest weight first.
    important_paths.sort(key=lambda x: x['weight'], reverse=True)

    return {
        "prompt": prompt,
        "full_graph_stats": {
            "n_nodes": len(graph.nodes()),
            "n_edges": len(graph.edges()),
            "node_types": dict(attribution_graph.node_types)
        },
        "pruned_graph_stats": {
            "n_nodes": len(pruned_graph.nodes()),
            "n_edges": len(pruned_graph.edges())
        },
        "feature_visualizations": feature_visualizations,
        "important_paths": important_paths[:5]
    }
|
| 1016 |
+
|
| 1017 |
+
def main():
    """Run the Attribution Graph analysis pipeline for a single (German) prompt.

    Parses ``--prompt-index`` and ``--force-retrain-clt`` from the command
    line, loads or trains the Cross-Layer Transcoder, analyzes the selected
    prompt, runs a perturbation experiment, and writes the visualization and
    JSON results to disk.
    """
    # Set a seed for reproducibility.
    set_seed()

    # --- Argument Parser ---
    parser = argparse.ArgumentParser(description="Run Attribution Graph analysis for a single prompt.")
    parser.add_argument(
        '--prompt-index',
        type=int,
        required=True,
        help=f"The 0-based index of the prompt to analyze from the ANALYSIS_PROMPTS list (0 to {len(ANALYSIS_PROMPTS) - 1})."
    )
    parser.add_argument(
        '--force-retrain-clt',
        action='store_true',
        help="Force re-training of the Cross-Layer Transcoder, even if a saved model exists."
    )
    args = parser.parse_args()

    prompt_idx = args.prompt_index
    if not (0 <= prompt_idx < len(ANALYSIS_PROMPTS)):
        print(f"❌ Error: --prompt-index must be between 0 and {len(ANALYSIS_PROMPTS) - 1}.")
        return

    # Get the API config from the utility function.
    qwen_api_config = init_qwen_api()

    # Configuration for this run (overrides the dataclass defaults).
    config = AttributionGraphConfig(
        model_path="./models/OLMo-2-1124-7B",
        n_features_per_layer=512,
        training_steps=500,
        batch_size=4,
        max_seq_length=256,
        learning_rate=1e-4,
        sparsity_lambda=0.01,
        qwen_api_config=qwen_api_config
    )

    print("Attribution Graphs for OLMo2 7B - Single Prompt Pipeline (German)")
    print("=" * 50)
    print(f"Model path: {config.model_path}")
    print(f"Device: {config.device}")

    try:
        # Initialize the full pipeline.
        print("🚀 Initializing Attribution Graphs Pipeline...")
        pipeline = AttributionGraphsPipeline(config)
        print("✓ Pipeline initialized successfully")
        print()

        # Load an existing CLT model or train a new one.
        if os.path.exists(CLT_SAVE_PATH) and not args.force_retrain_clt:
            print(f"🧠 Loading existing CLT model from {CLT_SAVE_PATH}...")
            pipeline.load_clt(CLT_SAVE_PATH)
            print("✓ CLT model loaded successfully.")
        else:
            if args.force_retrain_clt and os.path.exists(CLT_SAVE_PATH):
                print("🏃♂️ --force-retrain-clt flag is set. Overwriting existing model.")

            # Train a new CLT model. The returned training stats are not
            # needed by anything downstream, so they are not kept.
            print("📚 Training a new CLT model...")
            print(f" Training on {len(TRAINING_PROMPTS)} example texts...")
            pipeline.train_clt(TRAINING_PROMPTS)
            print("✓ CLT training completed.")

            # Save the new model.
            pipeline.save_clt(CLT_SAVE_PATH)
            print(f" Saved trained model to {CLT_SAVE_PATH} for future use.")

        print()

        # Analyze the selected prompt.
        prompt_to_analyze = ANALYSIS_PROMPTS[prompt_idx]
        print(f"🔍 Analyzing prompt {prompt_idx + 1}/{len(ANALYSIS_PROMPTS)}: '{prompt_to_analyze}'")

        analysis = pipeline.analyze_prompt(prompt_to_analyze, target_token_idx=-1)

        # Display the key results.
        print(f" ✓ Tokenized into {len(analysis['input_tokens'])} tokens")
        print(f" ✓ Full graph: {analysis['full_graph_stats']['n_nodes']} nodes, {analysis['full_graph_stats']['n_edges']} edges")
        print(f" ✓ Pruned graph: {analysis['pruned_graph_stats']['n_nodes']} nodes, {analysis['pruned_graph_stats']['n_edges']} edges")

        # Show the top features (first 3 layers, 2 features each).
        print(" 📊 Top active features:")
        for layer_name, layer_features in list(analysis['feature_visualizations'].items())[:3]:
            print(f" {layer_name}:")
            for feat_name, feat_data in list(layer_features.items())[:2]:
                print(f" {feat_name}: {feat_data['interpretation']} (max: {feat_data['max_activation']:.3f})")

        print()

        # Run a perturbation experiment on the first visualized feature.
        print("🧪 Running perturbation experiment...")
        if analysis['feature_visualizations']:
            first_layer_key = next(iter(analysis['feature_visualizations']), None)
            if first_layer_key:
                layer_idx = int(first_layer_key.split('_')[1])
                first_feature_key = next(iter(analysis['feature_visualizations'][first_layer_key]), None)
                if first_feature_key:
                    feature_idx = int(first_feature_key.split('_')[1])

                    ablation_result = pipeline.perturbation_experiments.feature_ablation_experiment(
                        prompt_to_analyze, layer_idx, feature_idx, intervention_strength=3.0
                    )
                    print(f" Ablated L{layer_idx}F{feature_idx}: Δ probability = {ablation_result['probability_change']:.4f}")
        print("✓ Perturbation experiment completed")
        print()

        # Generate a visualization for the prompt.
        print("📈 Generating visualization...")
        if 'graph' in analysis and analysis['pruned_graph_stats']['n_nodes'] > 0:
            viz_path = os.path.join(RESULTS_DIR, f"attribution_graph_prompt_de_{prompt_idx + 1}.png")
            pipeline.attribution_graph.visualize_graph(analysis['graph'], save_path=viz_path)
            print(f" ✓ Graph visualization saved to {viz_path}")
        else:
            print(" - Skipping visualization as no graph was generated or it was empty.")

        # Save the results in a format for the web app.
        save_path = os.path.join(RESULTS_DIR, f"attribution_graphs_results_de_prompt_{prompt_idx + 1}.json")

        # Create a JSON file that can be merged with other per-prompt results.
        final_results = {
            "analyses": {
                f"prompt_de_{prompt_idx + 1}": analysis
            },
            "config": config.__dict__,
            "timestamp": str(time.time())
        }

        # The web page doesn't use the graph object (and it isn't
        # JSON-serializable), so remove it before saving.
        if 'graph' in final_results['analyses'][f"prompt_de_{prompt_idx + 1}"]:
            del final_results['analyses'][f"prompt_de_{prompt_idx + 1}"]['graph']

        pipeline.save_results(final_results, save_path)
        print(f"💾 Results saved to {save_path}")

        print("\n🎉 Analysis for this prompt completed successfully!")

    except Exception as e:
        # Top-level CLI boundary: report the error with a traceback instead of
        # crashing with an unhandled exception.
        print(f"❌ Error during execution: {e}")
        import traceback
        traceback.print_exc()

if __name__ == "__main__":
    main()
|
circuit_analysis/attribution_graphs_olmo_offline.py
ADDED
|
@@ -0,0 +1,1922 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
# This script generates attribution graphs for the OLMo2 7B model.
|
| 3 |
+
|
| 4 |
+
import torch
|
| 5 |
+
import torch.nn as nn
|
| 6 |
+
import torch.nn.functional as F
|
| 7 |
+
import numpy as np
|
| 8 |
+
import matplotlib.pyplot as plt
|
| 9 |
+
import seaborn as sns
|
| 10 |
+
from typing import Dict, List, Tuple, Optional, Any, Set
|
| 11 |
+
import json
|
| 12 |
+
import logging
|
| 13 |
+
from pathlib import Path
|
| 14 |
+
from transformers import AutoTokenizer, AutoModelForCausalLM
|
| 15 |
+
from collections import defaultdict
|
| 16 |
+
import networkx as nx
|
| 17 |
+
from dataclasses import dataclass
|
| 18 |
+
from tqdm import tqdm
|
| 19 |
+
import pickle
|
| 20 |
+
import requests
|
| 21 |
+
import time
|
| 22 |
+
import random
|
| 23 |
+
import copy
|
| 24 |
+
import os
|
| 25 |
+
import argparse
|
| 26 |
+
|
| 27 |
+
# --- Add this block to fix the import path ---
|
| 28 |
+
import sys
|
| 29 |
+
from pathlib import Path
|
| 30 |
+
sys.path.append(str(Path(__file__).resolve().parent.parent))
|
| 31 |
+
# ---------------------------------------------
|
| 32 |
+
|
| 33 |
+
from utilities.utils import init_qwen_api, set_seed
|
| 34 |
+
|
| 35 |
+
# --- Constants ---
# Output paths are relative to the repository root — presumably the scripts
# are run from there; TODO confirm against the invocation docs.
RESULTS_DIR = "circuit_analysis/results"
CLT_SAVE_PATH = "circuit_analysis/models/clt_model.pth"

# Configure logging.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# Set the device for training: prefer Apple MPS, then CUDA, else CPU.
if torch.backends.mps.is_available():
    DEVICE = torch.device("mps")
    logger.info("Using MPS (Metal Performance Shaders) for GPU acceleration")
elif torch.cuda.is_available():
    DEVICE = torch.device("cuda")
    logger.info("Using CUDA for GPU acceleration")
else:
    DEVICE = torch.device("cpu")
    logger.info("Using CPU")
|
| 53 |
+
|
| 54 |
+
@dataclass
class AttributionGraphConfig:
    """Configuration for CLT training and attribution-graph construction.

    Field order and defaults are part of the public constructor signature;
    callers elsewhere construct this with keyword arguments.
    """
    # --- Model / tokenization ---
    model_path: str = "./models/OLMo-2-1124-7B"
    max_seq_length: int = 512
    # --- CLT training ---
    n_features_per_layer: int = 512  # Number of features in each CLT layer
    sparsity_lambda: float = 0.01  # Weight of the sparsity penalty
    reconstruction_loss_weight: float = 1.0
    batch_size: int = 8
    learning_rate: float = 1e-4
    training_steps: int = 1000
    device: str = str(DEVICE)  # Resolved at import time by the module-level device probe
    # --- Graph construction / experiments ---
    pruning_threshold: float = 0.8  # For graph pruning
    intervention_strength: float = 5.0  # For perturbation experiments
    qwen_api_config: Optional[Dict[str, str]] = None  # Endpoint/credentials for the Qwen interpreter API
    max_ablation_experiments: Optional[int] = None  # None means no cap
    ablation_top_k_tokens: int = 5
    ablation_features_per_layer: Optional[int] = 6
    summary_max_layers: Optional[int] = None
    summary_features_per_layer: Optional[int] = 2
    feature_visualization_top_k: int = 24
    random_baseline_trials: int = 12
    random_baseline_features: int = 1
    random_baseline_seed: int = 1234
    path_ablation_top_k: int = 6
    path_search_cutoff: int = 6  # Max path length when enumerating graph paths
    random_path_baseline_trials: int = 12
    graph_max_features_per_layer: int = 48
    graph_feature_activation_threshold: float = 0.01
    graph_edge_weight_threshold: float = 0.0
    graph_max_edges_per_node: int = 12
|
| 85 |
+
|
| 86 |
+
class JumpReLU(nn.Module):
    """JumpReLU activation: zero below ``threshold``, shifted-linear above it.

    Computes ``relu(x - threshold)``; with the default threshold of 0.0 this
    is a plain ReLU.
    """

    def __init__(self, threshold: float = 0.0):
        super().__init__()
        self.threshold = threshold

    def forward(self, x):
        # Shift by the threshold, then clip negatives to zero.
        shifted = x - self.threshold
        return F.relu(shifted)
|
| 95 |
+
|
| 96 |
+
class CrossLayerTranscoder(nn.Module):
    """Cross-Layer Transcoder (CLT).

    Learns, per transformer layer, a sparse feature dictionary over the
    residual stream, plus decoders that reconstruct each layer's MLP output
    as the sum of contributions from that layer's features and the features
    of every earlier layer.
    """

    def __init__(self, model_config: Dict, clt_config: AttributionGraphConfig):
        """Build encoders and cross-layer decoders sized from the host model.

        Args:
            model_config: Host model config dict; must provide
                'num_hidden_layers' and 'hidden_size'.
            clt_config: CLT hyperparameters (feature count, etc.).
        """
        super().__init__()
        self.config = clt_config
        self.model_config = model_config
        self.n_layers = model_config['num_hidden_layers']
        self.hidden_size = model_config['hidden_size']
        self.n_features = clt_config.n_features_per_layer

        # One bias-free linear encoder per transformer layer.
        self.encoders = nn.ModuleList([
            nn.Linear(self.hidden_size, self.n_features, bias=False)
            for _ in range(self.n_layers)
        ])

        # One decoder per (source, target) layer pair with source <= target,
        # i.e. O(n_layers^2) decoders realizing the cross-layer connections.
        self.decoders = nn.ModuleDict()
        for source_layer in range(self.n_layers):
            for target_layer in range(source_layer, self.n_layers):
                key = f"{source_layer}_to_{target_layer}"
                self.decoders[key] = nn.Linear(self.n_features, self.hidden_size, bias=False)

        # The activation function (threshold 0.0 behaves like a plain ReLU).
        self.activation = JumpReLU(threshold=0.0)

        # Initialize the weights.
        self._init_weights()

    def _init_weights(self):
        # Initialize every linear layer with small Gaussian weights
        # (std=0.01) so features start near-inactive.
        for module in self.modules():
            if isinstance(module, nn.Linear):
                nn.init.normal_(module.weight, mean=0.0, std=0.01)

    def encode(self, layer_idx: int, residual_activations: torch.Tensor) -> torch.Tensor:
        # Encode residual-stream activations into sparse feature activations.
        return self.activation(self.encoders[layer_idx](residual_activations))

    def decode(self, source_layer: int, target_layer: int, feature_activations: torch.Tensor) -> torch.Tensor:
        # Decode source-layer features into the target layer's MLP-output space.
        key = f"{source_layer}_to_{target_layer}"
        return self.decoders[key](feature_activations)

    def forward(self, residual_activations: List[torch.Tensor]) -> Tuple[List[torch.Tensor], List[torch.Tensor]]:
        """Encode all layers, then reconstruct each layer's MLP output.

        Args:
            residual_activations: One residual-stream tensor per layer
                (the caller passes hidden_states[1:] of the host model).

        Returns:
            Tuple of (per-layer feature activations, per-layer reconstructions);
            the reconstruction for layer t sums decoded contributions from
            every source layer s <= t.
        """
        feature_activations = []
        reconstructed_mlp_outputs = []

        # Encode features for each layer.
        for layer_idx, residual in enumerate(residual_activations):
            features = self.encode(layer_idx, residual)
            feature_activations.append(features)

        # Reconstruct MLP outputs with cross-layer connections.
        for target_layer in range(self.n_layers):
            reconstruction = torch.zeros_like(residual_activations[target_layer])

            # Sum contributions from this layer and all previous layers.
            for source_layer in range(target_layer + 1):
                decoded = self.decode(source_layer, target_layer, feature_activations[source_layer])
                reconstruction += decoded

            reconstructed_mlp_outputs.append(reconstruction)

        return feature_activations, reconstructed_mlp_outputs
|
| 163 |
+
|
| 164 |
+
class FeatureVisualizer:
    """Visualizes and interprets individual CLT features.

    Builds per-feature summaries (top activating tokens, activation stats) and
    produces short natural-language interpretations — via an external Qwen API
    when configured, otherwise via simple lexical heuristics. Interpretations
    are memoized in memory and optionally persisted as JSON under ``cache_dir``.
    """

    def __init__(self, tokenizer, cache_dir: Optional[Path] = None):
        self.tokenizer = tokenizer
        # Maps cache keys of the form "L{layer}_F{feature}" to interpretation strings.
        self.feature_interpretations: Dict[str, str] = {}
        self.cache_dir = cache_dir
        if self.cache_dir is not None:
            # Normalize to Path, ensure the directory exists, and warm the cache.
            self.cache_dir = Path(self.cache_dir)
            self.cache_dir.mkdir(parents=True, exist_ok=True)
            self._load_cache()

    def _cache_file(self) -> Optional[Path]:
        """Return the JSON cache path, or None when caching is disabled."""
        if self.cache_dir is None:
            return None
        return self.cache_dir / "feature_interpretations.json"

    def _load_cache(self):
        """Best-effort load of persisted interpretations; failures only warn."""
        cache_file = self._cache_file()
        if cache_file is None or not cache_file.exists():
            return
        try:
            with open(cache_file, 'r', encoding='utf-8') as f:
                data = json.load(f)
                # Coerce keys/values to str defensively before merging.
                if isinstance(data, dict):
                    self.feature_interpretations.update({str(k): str(v) for k, v in data.items()})
        except Exception as e:
            logger.warning(f"Failed to load feature interpretation cache: {e}")

    def _save_cache(self):
        """Best-effort persistence of the interpretation cache; failures only warn."""
        cache_file = self._cache_file()
        if cache_file is None:
            return
        try:
            with open(cache_file, 'w', encoding='utf-8') as f:
                json.dump(self.feature_interpretations, f, indent=2)
        except Exception as e:
            logger.warning(f"Failed to save feature interpretation cache: {e}")

    def visualize_feature(self, feature_idx: int, layer_idx: int,
                          activations: torch.Tensor, input_tokens: List[str],
                          top_k: int = 10) -> Dict:
        """Create a summary dict for one feature: stats plus top activating tokens.

        NOTE(review): indexing assumes ``activations`` is 2-D
        [seq_len, n_features] — confirm against callers.
        """
        feature_acts = activations[:, feature_idx].detach().cpu().numpy()

        # Find the top-k activating positions, highest activation first.
        top_positions = np.argsort(feature_acts)[-top_k:][::-1]

        visualization = {
            'feature_idx': feature_idx,
            'layer_idx': layer_idx,
            'max_activation': float(feature_acts.max()),
            'mean_activation': float(feature_acts.mean()),
            # Fraction of positions whose activation exceeds a fixed 0.1 threshold.
            'sparsity': float((feature_acts > 0.1).mean()),
            'top_activations': []
        }

        for pos in top_positions:
            # Guard against positions past the end of the provided token list.
            if pos < len(input_tokens):
                visualization['top_activations'].append({
                    'token': input_tokens[pos],
                    'position': int(pos),
                    'activation': float(feature_acts[pos])
                })

        return visualization

    def interpret_feature(self, feature_idx: int, layer_idx: int,
                          visualization_data: Dict,
                          qwen_api_config: Optional[Dict[str, str]] = None) -> str:
        """Return a short interpretation string for a feature.

        Results are memoized under "L{layer}_F{feature}" and persisted via
        ``_save_cache``. Uses the Qwen API when an api_key is configured,
        otherwise falls back to lexical heuristics over the top tokens.
        """
        top_tokens = [item['token'] for item in visualization_data['top_activations']]

        cache_key = f"L{layer_idx}_F{feature_idx}"

        # Return a previously computed interpretation when available.
        if cache_key in self.feature_interpretations:
            return self.feature_interpretations[cache_key]

        # Use the Qwen API if it is configured.
        if qwen_api_config and qwen_api_config.get('api_key'):
            feature_name = cache_key
            interpretation = get_feature_interpretation_with_qwen(
                qwen_api_config, top_tokens, feature_name, layer_idx
            )
        else:
            # Use a simple heuristic as a fallback: single repeated token,
            # all-alphabetic, all-digit, all-punctuation, else polysemantic.
            if len(set(top_tokens)) == 1 and top_tokens:
                interpretation = f"Specific token: '{top_tokens[0]}'"
            elif top_tokens and all(token.isalpha() for token in top_tokens):
                interpretation = "Word/alphabetic tokens"
            elif top_tokens and all(token.isdigit() for token in top_tokens):
                interpretation = "Numeric tokens"
            elif top_tokens and all(token in '.,!?;:' for token in top_tokens):
                interpretation = "Punctuation"
            else:
                interpretation = "Mixed/polysemantic feature"

        # Memoize and persist so repeated queries are cheap across runs.
        self.feature_interpretations[cache_key] = interpretation
        self._save_cache()
        return interpretation
class AttributionGraph:
    """Constructs and analyzes attribution graphs over CLT features.

    Nodes are token embeddings, active features (per layer/token position),
    and a single output node; edge weights combine feature activations with
    encoder/decoder "virtual weights" from the CLT.
    """

    def __init__(self, clt: CrossLayerTranscoder, tokenizer, config: AttributionGraphConfig):
        self.clt = clt
        self.tokenizer = tokenizer
        self.config = config
        self.graph = nx.DiGraph()
        self.node_types = {}  # Track node types (feature, embedding, error, output)
        self.edge_weights = {}  # (source_node, target_node) -> signed edge weight
        self.feature_metadata: Dict[str, Dict[str, Any]] = {}

    def compute_virtual_weights(self, source_layer: int, target_layer: int,
                                source_feature: int, target_feature: int) -> float:
        """Compute the virtual weight connecting a source feature to a target feature.

        Sums decoder-column / encoder-row inner products over intermediate layers.
        Returns 0.0 when ``target_layer <= source_layer`` (no forward path).

        NOTE(review): the encoder row is taken from ``target_layer`` while the
        decoder is indexed "<source>_to_<intermediate>" — confirm this matches
        the intended virtual-weight definition for the CLT.
        """
        if target_layer <= source_layer:
            return 0.0

        # Get the encoder and decoder weights.
        encoder_weight = self.clt.encoders[target_layer].weight[target_feature]  # [hidden_size]

        total_weight = 0.0
        for intermediate_layer in range(source_layer, target_layer):
            decoder_key = f"{source_layer}_to_{intermediate_layer}"
            if decoder_key in self.clt.decoders:
                decoder_weight = self.clt.decoders[decoder_key].weight[:, source_feature]  # [hidden_size]
                # The virtual weight is the inner product of decoder column and encoder row.
                virtual_weight = torch.dot(decoder_weight, encoder_weight).item()
                total_weight += virtual_weight

        return total_weight

    def construct_graph(self, input_tokens: List[str],
                        feature_activations: List[torch.Tensor],
                        target_token_idx: int = -1) -> nx.DiGraph:
        """Build the attribution graph for one prompt.

        Adds embedding nodes, active-feature nodes (top-k per layer, above the
        configured activation threshold), an output node, feature-to-feature
        edges gated by virtual-weight magnitude, feature-to-output edges with a
        layer-dependent scale, and embedding-to-layer-0-feature edges.
        Mutates and returns ``self.graph``.
        """
        self.graph.clear()
        self.node_types.clear()
        self.edge_weights.clear()

        seq_len = len(input_tokens)
        n_layers = len(feature_activations)

        # Add embedding nodes for the input tokens.
        for i, token in enumerate(input_tokens):
            node_id = f"emb_{i}_{token}"
            self.graph.add_node(node_id)
            self.node_types[node_id] = "embedding"

        # Add nodes for the features.
        active_features = {}  # Track which features are significantly active
        max_features_per_layer = self.config.graph_max_features_per_layer or 20  # Limit features per layer to prevent explosion
        activation_threshold = self.config.graph_feature_activation_threshold
        edge_weight_threshold = self.config.graph_edge_weight_threshold
        max_edges_per_node_cfg = self.config.graph_max_edges_per_node or 5

        for layer_idx, features in enumerate(feature_activations):
            # features shape: [batch_size, seq_len, n_features]
            batch_size, seq_len_layer, n_features = features.shape

            # Get the top activating features for this layer.
            layer_activations = features[0].mean(dim=0)  # Average across sequence
            top_features = torch.topk(layer_activations,
                                      k=min(max_features_per_layer, n_features)).indices

            for token_pos in range(min(seq_len, seq_len_layer)):
                for feat_idx in top_features:
                    activation = features[0, token_pos, feat_idx.item()].item()
                    # Only materialize nodes for features above the threshold.
                    if activation > activation_threshold:
                        node_id = f"feat_L{layer_idx}_T{token_pos}_F{feat_idx.item()}"
                        self.graph.add_node(node_id)
                        self.node_types[node_id] = "feature"
                        active_features[node_id] = {
                            'layer': layer_idx,
                            'token_pos': token_pos,
                            'feature_idx': feat_idx.item(),
                            'activation': activation
                        }
                        # Parallel record kept for downstream reporting/export.
                        self.feature_metadata[node_id] = {
                            'layer': layer_idx,
                            'token_position': token_pos,
                            'feature_index': feat_idx.item(),
                            'activation': activation,
                            'input_token': input_tokens[token_pos] if token_pos < len(input_tokens) else None
                        }

        # Add an output node for the target token.
        output_node = f"output_{target_token_idx}"
        self.graph.add_node(output_node)
        self.node_types[output_node] = "output"

        # Add edges based on virtual weights and activations.
        feature_nodes = [node for node, type_ in self.node_types.items() if type_ == "feature"]
        tqdm.write(f" Building attribution graph: {len(feature_nodes)} feature nodes, {len(self.graph.nodes())} total nodes")

        # Limit the number of edges to compute.
        max_edges_per_node = max(max_edges_per_node_cfg, 1)  # Limit connections per node

        for i, source_node in enumerate(tqdm(feature_nodes, desc="Adding feature edges")):
            edges_added = 0
            source_info = active_features[source_node]
            source_activation = source_info['activation']

            for target_node in feature_nodes:
                # Skip self-edges and stop once the per-node edge budget is spent.
                if source_node == target_node or edges_added >= max_edges_per_node:
                    continue

                target_info = active_features[target_node]

                # Only connect "forward": later layer, or same layer but later token.
                if (target_info['layer'] > source_info['layer'] or
                        (target_info['layer'] == source_info['layer'] and
                         target_info['token_pos'] > source_info['token_pos'])):

                    virtual_weight = self.compute_virtual_weights(
                        source_info['layer'], target_info['layer'],
                        source_info['feature_idx'], target_info['feature_idx']
                    )

                    if abs(virtual_weight) > edge_weight_threshold:
                        edge_weight = source_activation * virtual_weight
                        self.graph.add_edge(source_node, target_node, weight=edge_weight)
                        self.edge_weights[(source_node, target_node)] = edge_weight
                        edges_added += 1

            # Add edges to the output node.
            layer_position = source_info['layer']
            # Allow contributions from all layers, with smaller weights for early layers.
            layer_scale = 0.1 if layer_position >= n_layers - 2 else max(0.05, 0.1 * (layer_position + 1) / n_layers)
            output_weight = source_activation * layer_scale
            if abs(output_weight) > 0:
                self.graph.add_edge(source_node, output_node, weight=output_weight)
                self.edge_weights[(source_node, output_node)] = output_weight

        # Link each embedding node to the layer-0 features at the same token position.
        first_layer_features = [
            node for node in feature_nodes if active_features[node]['layer'] == 0
        ]
        embedding_nodes = [node for node, type_ in self.node_types.items() if type_ == "embedding"]

        for emb_idx, emb_node in enumerate(tqdm(embedding_nodes, desc="Linking embeddings"), start=1):
            # Recover the token position from the "emb_{i}_{token}" node id.
            token_idx = int(emb_node.split('_')[1])
            linked = 0
            for feat_node in first_layer_features:
                feat_info = active_features[feat_node]
                if feat_info['token_pos'] == token_idx:
                    weight = feat_info['activation'] * 0.5
                    self.graph.add_edge(emb_node, feat_node, weight=weight)
                    self.edge_weights[(emb_node, feat_node)] = weight
                    linked += 1

        return self.graph

    def prune_graph(self, threshold: float = 0.8) -> nx.DiGraph:
        """Return a copy of the graph keeping the top ``threshold`` fraction of nodes.

        Importance is the sum of absolute incident edge weights; output and
        embedding nodes are always retained. ``self.graph`` is not modified.
        """
        # Calculate node importance based on edge weights.
        node_importance = defaultdict(float)

        for (source, target), weight in self.edge_weights.items():
            node_importance[source] += abs(weight)
            node_importance[target] += abs(weight)

        # Keep the top nodes by importance.
        sorted_nodes = sorted(node_importance.items(), key=lambda x: x[1], reverse=True)
        n_keep = int(len(sorted_nodes) * threshold)
        important_nodes = set([node for node, _ in sorted_nodes[:n_keep]])

        # Always keep the output and embedding nodes.
        for node, type_ in self.node_types.items():
            if type_ in ["output", "embedding"]:
                important_nodes.add(node)

        # Create the pruned graph.
        pruned_graph = self.graph.subgraph(important_nodes).copy()

        return pruned_graph

    def visualize_graph(self, graph: nx.DiGraph = None, save_path: str = None):
        """Render the attribution graph with matplotlib.

        Nodes are colored by type (embedding=blue, feature=green, output=orange)
        and edge widths scale with absolute weight. Saves to ``save_path`` when
        given; always calls ``plt.show()``.
        """
        if graph is None:
            graph = self.graph

        plt.figure(figsize=(12, 8))

        # Create a layout for the graph.
        pos = nx.spring_layout(graph, k=1, iterations=50)

        # Color the nodes by type.
        node_colors = []
        for node in graph.nodes():
            node_type = self.node_types.get(node, "unknown")
            if node_type == "embedding":
                node_colors.append('lightblue')
            elif node_type == "feature":
                node_colors.append('lightgreen')
            elif node_type == "output":
                node_colors.append('orange')
            else:
                node_colors.append('gray')

        # Draw the nodes.
        nx.draw_networkx_nodes(graph, pos, node_color=node_colors,
                               node_size=300, alpha=0.8)

        # Draw the edges with thickness based on weight.
        edges = graph.edges()
        edge_weights = [abs(self.edge_weights.get((u, v), 0.1)) for u, v in edges]
        max_weight = max(edge_weights) if edge_weights else 1
        edge_widths = [w / max_weight * 3 for w in edge_weights]

        nx.draw_networkx_edges(graph, pos, width=edge_widths, alpha=0.6,
                               edge_color='gray', arrows=True)

        # Draw the labels.
        nx.draw_networkx_labels(graph, pos, font_size=8)

        plt.title("Attribution Graph")
        plt.axis('off')

        if save_path:
            plt.savefig(save_path, dpi=300, bbox_inches='tight')
        plt.show()
class PerturbationExperiments:
    """Runs feature-ablation experiments to validate attribution hypotheses.

    CLT feature contributions are subtracted (scaled by an intervention
    strength) from transformer block outputs via forward hooks, and the effect
    on the next-token distribution (probability/logit change, KL divergence,
    entropy, hidden-state shift) is measured against a baseline forward pass.
    """

    def __init__(self, model, clt: CrossLayerTranscoder, tokenizer):
        self.model = model
        self.clt = clt
        self.tokenizer = tokenizer
        # Lazily-resolved list of transformer blocks; populated on first use.
        self._transformer_blocks: Optional[List[nn.Module]] = None

    def _get_transformer_blocks(self) -> List[nn.Module]:
        """Locate and cache the model's per-layer transformer blocks.

        Heuristic: an ``nn.ModuleList`` whose length equals
        ``num_hidden_layers``, preferring names ending in "layers"/"blocks"/"h".

        Raises:
            ValueError: if the config lacks ``num_hidden_layers`` or no
                matching ``ModuleList`` is found.
        """
        if self._transformer_blocks is not None:
            return self._transformer_blocks

        n_layers = getattr(self.model.config, "num_hidden_layers", None)
        if n_layers is None:
            raise ValueError("Model config does not expose num_hidden_layers; cannot resolve transformer blocks.")

        candidate_lists: List[Tuple[str, nn.ModuleList]] = []
        for name, module in self.model.named_modules():
            if isinstance(module, nn.ModuleList) and len(module) == n_layers:
                candidate_lists.append((name, module))

        if not candidate_lists:
            raise ValueError("Unable to locate transformer block ModuleList in model.")

        # Prefer names that look like transformer blocks.
        def _score(name: str) -> Tuple[int, str]:
            preferred_suffixes = ("layers", "blocks", "h")
            for idx, suffix in enumerate(preferred_suffixes):
                if name.endswith(suffix):
                    return (idx, name)
            return (len(preferred_suffixes), name)

        selected_name, selected_list = sorted(candidate_lists, key=lambda item: _score(item[0]))[0]
        self._transformer_blocks = list(selected_list)
        logger.debug(f"Resolved transformer blocks from ModuleList '{selected_name}'.")
        return self._transformer_blocks

    def _format_top_tokens(self, top_tokens: torch.return_types.topk) -> List[Tuple[str, float]]:
        """Convert a topk result into (decoded_token, probability) pairs."""
        return [
            (self.tokenizer.decode([idx]), prob.item())
            for idx, prob in zip(top_tokens.indices, top_tokens.values)
        ]

    def _prepare_inputs(self, input_text: str, top_k: int) -> Dict[str, Any]:
        """Tokenize, run the baseline forward pass, and encode CLT features.

        Returns a dict bundling the tokenized inputs, baseline logits/probs at
        the last position, per-layer hidden states (embedding layer excluded),
        CLT feature activations, and the baseline top-1 token id.

        Raises:
            ValueError: if tokenization yields a batch size other than 1.
        """
        # Release cached MPS memory before a potentially large forward pass.
        if torch.backends.mps.is_available():
            torch.mps.empty_cache()

        device = next(self.model.parameters()).device
        inputs = self.tokenizer(
            input_text,
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=512
        )
        if inputs["input_ids"].size(0) != 1:
            raise ValueError("Perturbation experiments currently support only batch size 1.")
        inputs = {k: v.to(device) for k, v in inputs.items()}

        with torch.no_grad():
            baseline_outputs = self.model(**inputs, output_hidden_states=True, return_dict=True)

        baseline_logits = baseline_outputs.logits[0]
        # Score the prediction at the final sequence position.
        target_position = baseline_logits.size(0) - 1
        baseline_last_token_logits = baseline_logits[target_position]
        baseline_probs = F.softmax(baseline_last_token_logits, dim=-1)
        baseline_top_tokens = torch.topk(baseline_probs, k=top_k)

        # Skip the embedding-layer hidden state; the CLT operates on block outputs.
        hidden_states: List[torch.Tensor] = list(baseline_outputs.hidden_states[1:])
        with torch.no_grad():
            feature_activations, _ = self.clt(hidden_states)

        return {
            'inputs': inputs,
            'baseline_outputs': baseline_outputs,
            'baseline_logits': baseline_logits,
            'baseline_last_token_logits': baseline_last_token_logits,
            'baseline_probs': baseline_probs,
            'baseline_top_tokens': baseline_top_tokens,
            'target_position': target_position,
            'hidden_states': hidden_states,
            'feature_activations': feature_activations,
            'default_target_token_id': baseline_top_tokens.indices[0].item()
        }

    def _compute_feature_contributions(
        self,
        feature_activations: List[torch.Tensor],
        feature_set: List[Tuple[int, int]]
    ) -> Dict[int, torch.Tensor]:
        """Accumulate each feature's decoded contribution per destination layer.

        For every (layer, feature) pair, the outer product of the feature's
        activations with the relevant decoder column is summed into the
        corresponding destination layer. Out-of-range pairs are skipped
        silently. Returns {dest_layer: [batch, seq, hidden] tensor}.
        """
        contributions: Dict[int, torch.Tensor] = {}
        with torch.no_grad():
            for layer_idx, feature_idx in feature_set:
                # Skip pairs that fall outside the available layers/features.
                if layer_idx >= len(feature_activations):
                    continue
                features = feature_activations[layer_idx]
                if feature_idx >= features.size(-1):
                    continue
                feature_values = features[:, :, feature_idx].detach()

                for dest_layer in range(layer_idx, self.clt.n_layers):
                    decoder_key = f"{layer_idx}_to_{dest_layer}"
                    if decoder_key not in self.clt.decoders:
                        continue
                    decoder = self.clt.decoders[decoder_key]
                    weight_column = decoder.weight[:, feature_idx]
                    # Outer product: [batch, seq] x [hidden] -> [batch, seq, hidden].
                    contrib = torch.einsum('bs,h->bsh', feature_values, weight_column).detach()
                    if dest_layer in contributions:
                        contributions[dest_layer] += contrib
                    else:
                        contributions[dest_layer] = contrib
        return contributions

    def _run_with_hooks(
        self,
        inputs: Dict[str, torch.Tensor],
        contributions: Dict[int, torch.Tensor],
        intervention_strength: float
    ):
        """Forward pass with hooks that subtract scaled contributions per block.

        Hooks are always removed in the ``finally`` block, even on failure.
        Handles block outputs that are tensors, tuples/lists (first element is
        the hidden state), or objects exposing ``last_hidden_state``.
        """
        blocks = self._get_transformer_blocks()
        handles: List[Any] = []

        def _make_hook(cached_contrib: torch.Tensor):
            # Factory binds one layer's contribution tensor per hook.
            def hook(module, module_input, module_output):
                # Identify the hidden-state tensor within the module output.
                if isinstance(module_output, torch.Tensor):
                    target_tensor = module_output
                elif isinstance(module_output, (tuple, list)):
                    target_tensor = module_output[0]
                elif hasattr(module_output, "last_hidden_state"):
                    target_tensor = module_output.last_hidden_state
                else:
                    raise TypeError(
                        f"Unsupported module output type '{type(module_output)}' for perturbation hook."
                    )

                # Match device/dtype before subtracting the scaled contribution.
                tensor_contrib = cached_contrib.to(target_tensor.device).to(target_tensor.dtype)
                scaled = intervention_strength * tensor_contrib

                # Re-wrap the modified hidden state in the original container shape.
                if isinstance(module_output, torch.Tensor):
                    return module_output - scaled
                elif isinstance(module_output, tuple):
                    modified = module_output[0] - scaled
                    return (modified,) + tuple(module_output[1:])
                elif isinstance(module_output, list):
                    modified = [module_output[0] - scaled, *module_output[1:]]
                    return modified
                else:
                    module_output.last_hidden_state = module_output.last_hidden_state - scaled
                    return module_output
            return hook

        try:
            for dest_layer, contrib in contributions.items():
                if dest_layer >= len(blocks):
                    continue
                handles.append(blocks[dest_layer].register_forward_hook(_make_hook(contrib)))

            with torch.no_grad():
                outputs = self.model(**inputs, output_hidden_states=True, return_dict=True)
        finally:
            # Always detach hooks so later forward passes are unperturbed.
            for handle in handles:
                handle.remove()

        return outputs

    def feature_set_ablation_experiment(
        self,
        input_text: str,
        feature_set: List[Tuple[int, int]],
        intervention_strength: float = 5.0,
        target_token_id: Optional[int] = None,
        top_k: int = 5,
        ablation_label: str = "feature_set",
        extra_metadata: Optional[Dict[str, Any]] = None
    ) -> Dict[str, Any]:
        """Ablate a set of (layer, feature) pairs and measure distribution shift.

        Returns a metrics dict (probability/logit change, KL divergence,
        entropy change, hidden-state deltas, top tokens before/after). When no
        contributions can be computed, baseline values are echoed back with
        ``ablation_applied=False`` and a warning field. Exceptions are caught
        and reported via an ``error`` field rather than raised.
        """
        try:
            baseline_data = self._prepare_inputs(input_text, top_k)
            # Default to ablating the baseline top-1 prediction's token.
            if target_token_id is None:
                target_token_id = baseline_data['default_target_token_id']

            feature_set_normalized = [
                (int(layer_idx), int(feature_idx)) for layer_idx, feature_idx in feature_set
            ]
            contributions = self._compute_feature_contributions(
                baseline_data['feature_activations'],
                feature_set_normalized
            )

            baseline_probs = baseline_data['baseline_probs']
            baseline_top_tokens = baseline_data['baseline_top_tokens']
            baseline_last_token_logits = baseline_data['baseline_last_token_logits']
            target_position = baseline_data['target_position']
            hidden_states = baseline_data['hidden_states']

            baseline_prob = baseline_probs[target_token_id].item()
            baseline_logit = baseline_last_token_logits[target_token_id].item()
            baseline_summary = {
                'baseline_top_tokens': self._format_top_tokens(baseline_top_tokens),
                'baseline_probability': baseline_prob,
                'baseline_logit': baseline_logit
            }

            # No computable contributions: return a zero-effect result without
            # running a second forward pass.
            if not contributions:
                result = {
                    **baseline_summary,
                    'ablated_top_tokens': baseline_summary['baseline_top_tokens'],
                    'ablated_probability': baseline_prob,
                    'ablated_logit': baseline_logit,
                    'probability_change': 0.0,
                    'logit_change': 0.0,
                    'kl_divergence': 0.0,
                    'entropy_change': 0.0,
                    'hidden_state_delta_norm': 0.0,
                    'hidden_state_relative_change': 0.0,
                    'ablation_flips_top_prediction': False,
                    'feature_set': [
                        {'layer': layer_idx, 'feature': feature_idx}
                        for layer_idx, feature_idx in feature_set_normalized
                    ],
                    'feature_set_size': len(feature_set_normalized),
                    'intervention_strength': intervention_strength,
                    'target_token_id': target_token_id,
                    'target_token': self.tokenizer.decode([target_token_id]),
                    'contributing_layers': [],
                    'ablation_applied': False,
                    'ablation_type': ablation_label,
                    'warning': 'no_contributions_found'
                }
                if extra_metadata:
                    result.update(extra_metadata)
                return result

            ablated_outputs = self._run_with_hooks(
                baseline_data['inputs'],
                contributions,
                intervention_strength
            )

            ablated_logits = ablated_outputs.logits[0, target_position]
            ablated_probs = F.softmax(ablated_logits, dim=-1)
            ablated_top_tokens = torch.topk(ablated_probs, k=top_k)

            ablated_prob = ablated_probs[target_token_id].item()
            ablated_logit = ablated_logits[target_token_id].item()

            # Distribution-level effect sizes (epsilon guards log(0)).
            epsilon = 1e-9
            kl_divergence = torch.sum(
                baseline_probs * (torch.log(baseline_probs + epsilon) - torch.log(ablated_probs + epsilon))
            ).item()
            entropy_baseline = -(baseline_probs * torch.log(baseline_probs + epsilon)).sum().item()
            entropy_ablated = -(ablated_probs * torch.log(ablated_probs + epsilon)).sum().item()

            # Hidden-state shift at the scored position in the final layer.
            baseline_hidden = hidden_states[-1][:, target_position, :]
            ablated_hidden = ablated_outputs.hidden_states[-1][:, target_position, :]
            hidden_delta_norm = torch.norm(baseline_hidden - ablated_hidden, dim=-1).item()
            hidden_baseline_norm = torch.norm(baseline_hidden, dim=-1).item()
            hidden_relative_change = hidden_delta_norm / (hidden_baseline_norm + 1e-9)

            result = {
                **baseline_summary,
                'ablated_top_tokens': self._format_top_tokens(ablated_top_tokens),
                'ablated_probability': ablated_prob,
                'ablated_logit': ablated_logit,
                'probability_change': baseline_prob - ablated_prob,
                'logit_change': baseline_logit - ablated_logit,
                'kl_divergence': kl_divergence,
                'entropy_change': entropy_ablated - entropy_baseline,
                'hidden_state_delta_norm': hidden_delta_norm,
                'hidden_state_relative_change': hidden_relative_change,
                'ablation_flips_top_prediction': bool(
                    baseline_top_tokens.indices[0].item() != ablated_top_tokens.indices[0].item()
                ),
                'feature_set': [
                    {'layer': layer_idx, 'feature': feature_idx}
                    for layer_idx, feature_idx in feature_set_normalized
                ],
                'feature_set_size': len(feature_set_normalized),
                'intervention_strength': intervention_strength,
                'target_token_id': target_token_id,
                'target_token': self.tokenizer.decode([target_token_id]),
                'contributing_layers': sorted(list(contributions.keys())),
                'ablation_applied': True,
                'ablation_type': ablation_label
            }
            if extra_metadata:
                result.update(extra_metadata)
            return result

        except Exception as e:
            # Deliberate best-effort contract: failures are reported in the
            # result dict so batch experiment runs can continue.
            logger.warning(f"Perturbation experiment failed: {e}")
            return {
                'baseline_top_tokens': [],
                'ablated_top_tokens': [],
                'feature_set': [
                    {'layer': layer_idx, 'feature': feature_idx}
                    for layer_idx, feature_idx in feature_set
                ],
                'feature_set_size': len(feature_set),
                'intervention_strength': intervention_strength,
                'probability_change': 0.0,
                'logit_change': 0.0,
                'kl_divergence': 0.0,
                'entropy_change': 0.0,
                'hidden_state_delta_norm': 0.0,
                'hidden_state_relative_change': 0.0,
                'ablation_flips_top_prediction': False,
                'ablation_applied': False,
                'ablation_type': ablation_label,
                'error': str(e)
            }

    def feature_ablation_experiment(
        self,
        input_text: str,
        target_layer: int,
        target_feature: int,
        intervention_strength: float = 5.0,
        target_token_id: Optional[int] = None,
        top_k: int = 5,
    ) -> Dict[str, Any]:
        """Convenience wrapper: ablate a single (layer, feature) pair."""
        return self.feature_set_ablation_experiment(
            input_text=input_text,
            feature_set=[(target_layer, target_feature)],
            intervention_strength=intervention_strength,
            target_token_id=target_token_id,
            top_k=top_k,
            ablation_label="targeted_feature"
        )

    def random_feature_ablation_experiment(
        self,
        input_text: str,
        num_features: int = 1,
        intervention_strength: float = 5.0,
        target_token_id: Optional[int] = None,
        top_k: int = 5,
        seed: Optional[int] = None
    ) -> Dict[str, Any]:
        """Control experiment: ablate ``num_features`` uniformly random features.

        Sampling is with replacement (duplicates possible); ``seed`` makes the
        draw reproducible and is recorded in the result metadata.
        """
        rng = random.Random(seed)
        num_features = max(1, int(num_features))
        feature_set: List[Tuple[int, int]] = []
        for _ in range(num_features):
            layer_idx = rng.randrange(self.clt.n_layers)
            feature_idx = rng.randrange(self.clt.n_features)
            feature_set.append((layer_idx, feature_idx))

        result = self.feature_set_ablation_experiment(
            input_text=input_text,
            feature_set=feature_set,
            intervention_strength=intervention_strength,
            target_token_id=target_token_id,
            top_k=top_k,
            ablation_label="random_baseline",
            extra_metadata={'random_seed': seed}
        )
        return result
class AttributionGraphsPipeline:
|
| 845 |
+
# The main pipeline for the attribution graph analysis.
|
| 846 |
+
|
| 847 |
+
def __init__(self, config: AttributionGraphConfig):
    """Load the base model/tokenizer and wire up all analysis components.

    Device handling follows the config string: CUDA loads fp16 with
    ``device_map="auto"``; MPS loads fp16 and is moved explicitly (MPS does
    not support device_map); anything else falls back to fp32 on CPU.
    """
    self.config = config
    self.device = torch.device(config.device)

    # Load the model and tokenizer.
    logger.info(f"Loading OLMo2 7B model from {config.model_path}")
    self.tokenizer = AutoTokenizer.from_pretrained(config.model_path)

    if "cuda" in config.device:
        # CUDA: half precision with automatic device placement.
        self.model = AutoModelForCausalLM.from_pretrained(
            config.model_path,
            torch_dtype=torch.float16,
            device_map="auto"
        )
    else:
        # MPS supports float16 but not device_map; CPU needs float32.
        dtype = torch.float16 if "mps" in config.device else torch.float32
        self.model = AutoModelForCausalLM.from_pretrained(
            config.model_path,
            torch_dtype=dtype,
            device_map=None
        ).to(self.device)

    # Ensure a pad token exists so batch tokenization with padding works.
    if self.tokenizer.pad_token is None:
        self.tokenizer.pad_token = self.tokenizer.eos_token

    # Build the cross-layer transcoder from the base model's configuration.
    self.clt = CrossLayerTranscoder(self.model.config.to_dict(), config).to(self.device)

    # Supporting components: feature interpretation, graph construction, ablations.
    cache_dir = Path(RESULTS_DIR) / "feature_interpretations_cache"
    self.feature_visualizer = FeatureVisualizer(self.tokenizer, cache_dir=cache_dir)
    self.attribution_graph = AttributionGraph(self.clt, self.tokenizer, config)
    self.perturbation_experiments = PerturbationExperiments(self.model, self.clt, self.tokenizer)

    logger.info("Attribution Graphs Pipeline initialized successfully")
|
| 892 |
+
def train_clt(self, training_texts: List[str]) -> Dict:
    """Train the Cross-Layer Transcoder on activations from ``training_texts``.

    The base model is frozen (activations are captured under ``no_grad``);
    only the CLT parameters are optimized, against a reconstruction (MSE)
    plus tanh-sparsity objective.

    Args:
        training_texts: Pool of prompts sampled (with replacement) each step.

    Returns:
        Dict with per-step lists: 'reconstruction_losses', 'sparsity_losses',
        'total_losses'.
    """
    # Trains the Cross-Layer Transcoder.
    logger.info("Starting CLT training...")

    optimizer = torch.optim.Adam(self.clt.parameters(), lr=self.config.learning_rate)

    training_stats = {
        'reconstruction_losses': [],
        'sparsity_losses': [],
        'total_losses': []
    }

    for step in tqdm(range(self.config.training_steps), desc="Training CLT"):
        # Sample a batch of texts (with replacement).
        batch_texts = np.random.choice(training_texts, size=self.config.batch_size)

        # Accumulators start as floats and become tensors after the first +=.
        total_loss = 0.0
        total_recon_loss = 0.0
        total_sparsity_loss = 0.0

        # NOTE(review): texts are processed one at a time rather than as a
        # padded batch — simpler, but each step does batch_size forward passes.
        for text in batch_texts:
            # Tokenize the text.
            inputs = self.tokenizer(text, return_tensors="pt", max_length=self.config.max_seq_length,
                                    truncation=True, padding=True).to(self.device)

            # Get the model activations (skip the embedding layer output).
            with torch.no_grad():
                outputs = self.model(**inputs, output_hidden_states=True)
                hidden_states = outputs.hidden_states[1:]

            # Forward pass through the CLT.
            feature_activations, reconstructed_outputs = self.clt(hidden_states)

            # Compute the reconstruction loss (summed MSE over layers).
            recon_loss = 0.0
            for i, (target, pred) in enumerate(zip(hidden_states, reconstructed_outputs)):
                recon_loss += F.mse_loss(pred, target)

            # Compute the sparsity loss: tanh saturates so large activations
            # are penalized roughly equally, encouraging many exact zeros.
            sparsity_loss = 0.0
            for features in feature_activations:
                sparsity_loss += torch.mean(torch.tanh(self.config.sparsity_lambda * features))

            # Total loss.
            loss = (self.config.reconstruction_loss_weight * recon_loss +
                    self.config.sparsity_lambda * sparsity_loss)

            total_loss += loss
            total_recon_loss += recon_loss
            total_sparsity_loss += sparsity_loss

        # Average the losses over the batch.
        total_loss /= self.config.batch_size
        total_recon_loss /= self.config.batch_size
        total_sparsity_loss /= self.config.batch_size

        # Backward pass (single optimizer step per batch).
        optimizer.zero_grad()
        total_loss.backward()
        optimizer.step()

        # Log the progress.
        training_stats['total_losses'].append(total_loss.item())
        training_stats['reconstruction_losses'].append(total_recon_loss.item())
        training_stats['sparsity_losses'].append(total_sparsity_loss.item())

        if step % 100 == 0:
            logger.info(f"Step {step}: Total Loss = {total_loss.item():.4f}, "
                        f"Recon Loss = {total_recon_loss.item():.4f}, "
                        f"Sparsity Loss = {total_sparsity_loss.item():.4f}")

    logger.info("CLT training completed")
    return training_stats
def analyze_prompt(self, prompt: str, target_token_idx: int = -1) -> Dict:
    """Run the full interpretability pipeline on a single prompt.

    Steps: tokenize and capture activations -> CLT feature extraction ->
    per-layer feature visualization/interpretation -> attribution-graph
    construction and pruning -> path enumeration -> targeted, random-baseline,
    path and random-path ablation experiments -> summary statistics.

    Args:
        prompt: Text to analyze.
        target_token_idx: Token position the attribution graph targets
            (default -1, i.e. the last position).

    Returns:
        Dict with visualizations, graph statistics, the pruned graph object,
        all ablation experiment results and their summary statistics.
    """
    # Performs a complete analysis for a single prompt.
    logger.info(f"Analyzing prompt: '{prompt[:50]}...'")

    # Tokenize the prompt.
    inputs = self.tokenizer(prompt, return_tensors="pt").to(self.device)
    input_tokens = self.tokenizer.convert_ids_to_tokens(inputs["input_ids"][0])

    # Get the model activations (skip the embedding layer output).
    with torch.no_grad():
        outputs = self.model(**inputs, output_hidden_states=True)
        hidden_states = outputs.hidden_states[1:]

    # Forward pass through the CLT.
    feature_activations, reconstructed_outputs = self.clt(hidden_states)

    logger.info(" > Starting feature visualization and interpretation...")
    feature_visualizations = {}
    for layer_idx, features in enumerate(tqdm(feature_activations, desc="Interpreting features")):
        logger.info(f" - Processing Layer {layer_idx}...")
        layer_viz = {}
        # Analyze the top features for this layer.
        # features shape: [batch_size, seq_len, n_features]
        feature_importance = torch.mean(features, dim=(0, 1)) # Average over batch and sequence
        top_k = min(self.config.feature_visualization_top_k, feature_importance.size(0))
        top_features = torch.topk(feature_importance, k=top_k).indices

        for feat_idx in top_features:
            viz = self.feature_visualizer.visualize_feature(
                feat_idx.item(), layer_idx, features[0], input_tokens
            )
            interpretation = self.feature_visualizer.interpret_feature(
                feat_idx.item(), layer_idx, viz, self.config.qwen_api_config
            )
            viz['interpretation'] = interpretation
            layer_viz[f"feature_{feat_idx.item()}"] = viz

        feature_visualizations[f"layer_{layer_idx}"] = layer_viz

    # Construct the attribution graph.
    graph = self.attribution_graph.construct_graph(
        input_tokens, feature_activations, target_token_idx
    )

    # Prune the graph.
    pruned_graph = self.attribution_graph.prune_graph(self.config.pruning_threshold)

    # Analyze the most important paths (embedding -> ... -> output).
    important_paths = []
    if len(pruned_graph.nodes()) > 0:
        embedding_nodes = [
            node for node, type_ in self.attribution_graph.node_types.items()
            if type_ == "embedding" and node in pruned_graph
        ]
        output_nodes = [
            node for node, type_ in self.attribution_graph.node_types.items()
            if type_ == "output" and node in pruned_graph
        ]
        # Only the first 3 embedding nodes are explored to bound runtime.
        path_loop = tqdm(embedding_nodes[:3], desc="Enumerating paths")
        for emb_node in path_loop:
            for out_node in output_nodes:
                try:
                    cutoff = max(2, self.config.path_search_cutoff or 5)
                    paths = list(nx.all_simple_paths(pruned_graph, emb_node, out_node, cutoff=cutoff))
                    # Keep at most 2 paths per (embedding, output) pair.
                    for path in paths[:2]:
                        # Path weight = product of |edge weights| along it.
                        path_weight = 1.0
                        for i in range(len(path) - 1):
                            edge_weight = self.attribution_graph.edge_weights.get(
                                (path[i], path[i + 1]), 0.0
                            )
                            path_weight *= abs(edge_weight)
                        important_paths.append({
                            'path': path,
                            'weight': path_weight,
                            'description': self._describe_path(path)
                        })
                except nx.NetworkXNoPath:
                    continue

    # Sort paths by importance.
    important_paths.sort(key=lambda x: x['weight'], reverse=True)

    # Run targeted perturbation experiments for highlighted features.
    targeted_feature_ablation_results: List[Dict[str, Any]] = []
    max_total_experiments = self.config.max_ablation_experiments
    per_layer_limit = self.config.ablation_features_per_layer
    total_run = 0
    stop_all = False
    layer_items = list(feature_visualizations.items())
    for layer_name, layer_features in tqdm(layer_items, desc="Targeted ablations"):
        if stop_all:
            break
        try:
            # Keys look like "layer_<idx>".
            layer_idx = int(layer_name.split('_')[1])
        except (IndexError, ValueError):
            logger.warning(f"Unable to parse layer index from key '{layer_name}'. Skipping perturbation experiments for this layer.")
            continue

        feature_items = list(layer_features.items())
        if per_layer_limit is not None:
            feature_items = feature_items[:per_layer_limit]

        feature_bar = tqdm(feature_items, desc=f"{layer_name} features", leave=False)
        for feature_name, feature_payload in feature_bar:
            # Global experiment budget across all layers.
            if max_total_experiments is not None and total_run >= max_total_experiments:
                stop_all = True
                break
            try:
                # Keys look like "feature_<idx>".
                feature_idx = int(feature_name.split('_')[1])
            except (IndexError, ValueError):
                logger.warning(f"Unable to parse feature index from key '{feature_name}'. Skipping perturbation experiment.")
                continue

            ablation = self.perturbation_experiments.feature_ablation_experiment(
                prompt,
                layer_idx,
                feature_idx,
                intervention_strength=self.config.intervention_strength,
                target_token_id=None,
                top_k=self.config.ablation_top_k_tokens,
            )
            ablation.update({
                'layer_name': layer_name,
                'feature_name': feature_name,
                'feature_interpretation': feature_payload.get('interpretation'),
                'feature_max_activation': feature_payload.get('max_activation'),
            })
            targeted_feature_ablation_results.append(ablation)
            total_run += 1

    # Random baseline perturbations for comparison.
    random_baseline_results: List[Dict[str, Any]] = []
    baseline_trials = self.config.random_baseline_trials
    if baseline_trials and baseline_trials > 0:
        num_features = self.config.random_baseline_features or 1
        for trial_idx in tqdm(range(baseline_trials), desc="Random feature baselines"):
            # Derive a per-trial seed so trials are reproducible but distinct.
            seed = None
            if self.config.random_baseline_seed is not None:
                seed = self.config.random_baseline_seed + trial_idx
            random_result = self.perturbation_experiments.random_feature_ablation_experiment(
                prompt,
                num_features=num_features,
                intervention_strength=self.config.intervention_strength,
                target_token_id=None,
                top_k=self.config.ablation_top_k_tokens,
                seed=seed
            )
            random_result['trial_index'] = trial_idx
            random_baseline_results.append(random_result)

    # Path-level ablations for the most important circuits.
    path_ablation_results: List[Dict[str, Any]] = []
    max_paths = self.config.path_ablation_top_k or 0
    extracted_paths: List[Dict[str, Any]] = []
    if max_paths > 0 and important_paths:
        for path_info in tqdm(important_paths[:max_paths], desc="Path ablations"):
            feature_set = self._extract_feature_set_from_path(path_info.get('path', []))
            if not feature_set:
                continue
            path_result = self.perturbation_experiments.feature_set_ablation_experiment(
                prompt,
                feature_set=feature_set,
                intervention_strength=self.config.intervention_strength,
                target_token_id=None,
                top_k=self.config.ablation_top_k_tokens,
                ablation_label="path",
                extra_metadata={
                    'path_nodes': path_info.get('path'),
                    'path_description': path_info.get('description'),
                    'path_weight': path_info.get('weight')
                }
            )
            path_ablation_results.append(path_result)
            enriched_path_info = path_info.copy()
            enriched_path_info['feature_set'] = feature_set
            extracted_paths.append(enriched_path_info)

    # Random-path baselines: ablate randomly sampled feature sets matched in
    # size and layer range to the real paths above.
    random_path_baseline_results: List[Dict[str, Any]] = []
    path_baseline_trials = self.config.random_path_baseline_trials
    if path_baseline_trials and path_baseline_trials > 0 and extracted_paths:
        rng = random.Random(self.config.random_baseline_seed)
        available_nodes = [
            data for data in self.attribution_graph.node_types.items()
            if data[1] == "feature"
        ]
        for trial in tqdm(range(path_baseline_trials), desc="Random path baselines"):
            # Cycle through the extracted paths as size/layer templates.
            selected_path = extracted_paths[min(trial % len(extracted_paths), len(extracted_paths) - 1)]
            target_length = len(selected_path.get('feature_set', []))
            source_layers = [layer for layer, _ in selected_path.get('feature_set', [])]
            min_layer = min(source_layers) if source_layers else 0
            max_layer = max(source_layers) if source_layers else self.clt.n_layers - 1
            excluded_keys = {
                (layer, feature)
                for layer, feature in selected_path.get('feature_set', [])
            }
            random_feature_set: List[Tuple[int, int]] = []
            attempts = 0
            # Rejection-sample features within the template's layer range,
            # excluding the template's own features, with a bounded retry cap.
            while len(random_feature_set) < target_length and attempts < target_length * 10:
                attempts += 1
                if not available_nodes:
                    break
                node_name, node_type = rng.choice(available_nodes)
                metadata = self.attribution_graph.feature_metadata.get(node_name)
                if metadata is None:
                    continue
                if metadata['layer'] < min_layer or metadata['layer'] > max_layer:
                    continue
                key = (metadata['layer'], metadata['feature_index'])
                if key in excluded_keys:
                    continue
                if key not in random_feature_set:
                    random_feature_set.append(key)
            if not random_feature_set:
                continue
            # Require a full-size match so baselines are comparable.
            if len(random_feature_set) < max(1, target_length):
                continue
            random_path_result = self.perturbation_experiments.feature_set_ablation_experiment(
                prompt,
                feature_set=random_feature_set,
                intervention_strength=self.config.intervention_strength,
                target_token_id=None,
                top_k=self.config.ablation_top_k_tokens,
                ablation_label="random_path_baseline",
                extra_metadata={
                    'trial_index': trial,
                    'sampled_feature_set': random_feature_set,
                    'reference_path_weight': selected_path.get('weight')
                }
            )
            random_path_baseline_results.append(random_path_result)

    # Aggregate per-category summaries and targeted-vs-random deltas.
    targeted_summary = self._summarize_ablation_results(targeted_feature_ablation_results)
    random_summary = self._summarize_ablation_results(random_baseline_results)
    path_summary = self._summarize_ablation_results(path_ablation_results)
    random_path_summary = self._summarize_ablation_results(random_path_baseline_results)
    summary_statistics = {
        'targeted': targeted_summary,
        'random_baseline': random_summary,
        'path': path_summary,
        'random_path_baseline': random_path_summary,
        'target_minus_random_abs_probability_change': targeted_summary.get('avg_abs_probability_change', 0.0) - random_summary.get('avg_abs_probability_change', 0.0),
        'target_flip_rate_minus_random': targeted_summary.get('flip_rate', 0.0) - random_summary.get('flip_rate', 0.0),
        'path_minus_random_abs_probability_change': path_summary.get('avg_abs_probability_change', 0.0) - random_path_summary.get('avg_abs_probability_change', 0.0),
        'path_flip_rate_minus_random': path_summary.get('flip_rate', 0.0) - random_path_summary.get('flip_rate', 0.0)
    }

    results = {
        'prompt': prompt,
        'input_tokens': input_tokens,
        'feature_visualizations': feature_visualizations,
        'full_graph_stats': {
            'n_nodes': len(graph.nodes()),
            'n_edges': len(graph.edges()),
            'node_types': dict(self.attribution_graph.node_types)
        },
        'pruned_graph_stats': {
            'n_nodes': len(pruned_graph.nodes()),
            'n_edges': len(pruned_graph.edges())
        },
        'important_paths': important_paths[:5], # Top 5 paths
        'graph': pruned_graph,
        'perturbation_experiments': targeted_feature_ablation_results,
        'random_baseline_experiments': random_baseline_results,
        'path_ablation_experiments': path_ablation_results,
        'random_path_baseline_experiments': random_path_baseline_results,
        'summary_statistics': summary_statistics
    }

    return results
def _extract_feature_set_from_path(self, path: List[str]) -> List[Tuple[int, int]]:
|
| 1237 |
+
feature_set: List[Tuple[int, int]] = []
|
| 1238 |
+
seen: Set[Tuple[int, int]] = set()
|
| 1239 |
+
for node in path:
|
| 1240 |
+
if not isinstance(node, str):
|
| 1241 |
+
continue
|
| 1242 |
+
if not node.startswith("feat_"):
|
| 1243 |
+
continue
|
| 1244 |
+
parts = node.split('_')
|
| 1245 |
+
try:
|
| 1246 |
+
layer_str = parts[1] # e.g., "L0"
|
| 1247 |
+
feature_str = parts[3] # e.g., "F123"
|
| 1248 |
+
layer_idx = int(layer_str[1:])
|
| 1249 |
+
feature_idx = int(feature_str[1:])
|
| 1250 |
+
except (IndexError, ValueError):
|
| 1251 |
+
continue
|
| 1252 |
+
key = (layer_idx, feature_idx)
|
| 1253 |
+
if key not in seen:
|
| 1254 |
+
seen.add(key)
|
| 1255 |
+
feature_set.append(key)
|
| 1256 |
+
return feature_set
|
| 1257 |
+
|
| 1258 |
+
def _summarize_ablation_results(self, experiments: List[Dict[str, Any]]) -> Dict[str, Any]:
|
| 1259 |
+
summary = {
|
| 1260 |
+
'count': len(experiments),
|
| 1261 |
+
'avg_probability_change': 0.0,
|
| 1262 |
+
'avg_abs_probability_change': 0.0,
|
| 1263 |
+
'std_probability_change': 0.0,
|
| 1264 |
+
'avg_logit_change': 0.0,
|
| 1265 |
+
'avg_abs_logit_change': 0.0,
|
| 1266 |
+
'std_logit_change': 0.0,
|
| 1267 |
+
'avg_kl_divergence': 0.0,
|
| 1268 |
+
'avg_entropy_change': 0.0,
|
| 1269 |
+
'avg_hidden_state_delta_norm': 0.0,
|
| 1270 |
+
'avg_hidden_state_relative_change': 0.0,
|
| 1271 |
+
'flip_rate': 0.0,
|
| 1272 |
+
'count_flipped': 0
|
| 1273 |
+
}
|
| 1274 |
+
if not experiments:
|
| 1275 |
+
return summary
|
| 1276 |
+
|
| 1277 |
+
probability_changes = np.array([exp.get('probability_change', 0.0) for exp in experiments], dtype=float)
|
| 1278 |
+
logit_changes = np.array([exp.get('logit_change', 0.0) for exp in experiments], dtype=float)
|
| 1279 |
+
kl_divergences = np.array([exp.get('kl_divergence', 0.0) for exp in experiments], dtype=float)
|
| 1280 |
+
entropy_changes = np.array([exp.get('entropy_change', 0.0) for exp in experiments], dtype=float)
|
| 1281 |
+
hidden_norms = np.array([exp.get('hidden_state_delta_norm', 0.0) for exp in experiments], dtype=float)
|
| 1282 |
+
hidden_relative = np.array([exp.get('hidden_state_relative_change', 0.0) for exp in experiments], dtype=float)
|
| 1283 |
+
flip_flags = np.array([1.0 if exp.get('ablation_flips_top_prediction') else 0.0 for exp in experiments], dtype=float)
|
| 1284 |
+
|
| 1285 |
+
summary.update({
|
| 1286 |
+
'avg_probability_change': float(np.mean(probability_changes)),
|
| 1287 |
+
'avg_abs_probability_change': float(np.mean(np.abs(probability_changes))),
|
| 1288 |
+
'std_probability_change': float(np.std(probability_changes)),
|
| 1289 |
+
'avg_logit_change': float(np.mean(logit_changes)),
|
| 1290 |
+
'avg_abs_logit_change': float(np.mean(np.abs(logit_changes))),
|
| 1291 |
+
'std_logit_change': float(np.std(logit_changes)),
|
| 1292 |
+
'avg_kl_divergence': float(np.mean(kl_divergences)),
|
| 1293 |
+
'avg_entropy_change': float(np.mean(entropy_changes)),
|
| 1294 |
+
'avg_hidden_state_delta_norm': float(np.mean(hidden_norms)),
|
| 1295 |
+
'avg_hidden_state_relative_change': float(np.mean(hidden_relative)),
|
| 1296 |
+
'flip_rate': float(np.mean(flip_flags)),
|
| 1297 |
+
'count_flipped': int(np.round(np.sum(flip_flags)))
|
| 1298 |
+
})
|
| 1299 |
+
return summary
|
| 1300 |
+
|
| 1301 |
+
def analyze_prompts_batch(self, prompts: List[str]) -> Dict[str, Any]:
    """Analyze every prompt and aggregate the ablation summaries across them.

    Returns a dict with the per-prompt analyses (keyed ``prompt_<n>``), an
    aggregate summary over all experiment categories, and the prompt texts.
    """
    analyses: Dict[str, Dict[str, Any]] = {}
    for idx, prompt in enumerate(tqdm(prompts, desc="Analyzing prompts")):
        logger.info(f"[Batch Eval] Processing prompt {idx + 1}/{len(prompts)}")
        analyses[f"prompt_{idx + 1}"] = self.analyze_prompt(prompt)

    def _collect(result_key: str) -> List[Dict[str, Any]]:
        # Flatten one experiment category across all analyses, in prompt order.
        return [
            exp
            for analysis in analyses.values()
            for exp in analysis.get(result_key, [])
        ]

    aggregate_summary = {
        'targeted': self._summarize_ablation_results(_collect('perturbation_experiments')),
        'random_baseline': self._summarize_ablation_results(_collect('random_baseline_experiments')),
        'path': self._summarize_ablation_results(_collect('path_ablation_experiments')),
        'random_path_baseline': self._summarize_ablation_results(_collect('random_path_baseline_experiments')),
    }
    # Headline deltas: how much stronger targeted/path ablations are than
    # their matched random baselines.
    aggregate_summary['target_minus_random_abs_probability_change'] = (
        aggregate_summary['targeted'].get('avg_abs_probability_change', 0.0)
        - aggregate_summary['random_baseline'].get('avg_abs_probability_change', 0.0)
    )
    aggregate_summary['target_flip_rate_minus_random'] = (
        aggregate_summary['targeted'].get('flip_rate', 0.0)
        - aggregate_summary['random_baseline'].get('flip_rate', 0.0)
    )
    aggregate_summary['path_minus_random_abs_probability_change'] = (
        aggregate_summary['path'].get('avg_abs_probability_change', 0.0)
        - aggregate_summary['random_path_baseline'].get('avg_abs_probability_change', 0.0)
    )
    aggregate_summary['path_flip_rate_minus_random'] = (
        aggregate_summary['path'].get('flip_rate', 0.0)
        - aggregate_summary['random_path_baseline'].get('flip_rate', 0.0)
    )

    return {
        'analyses': analyses,
        'aggregate_summary': aggregate_summary,
        'prompt_texts': prompts
    }
def _describe_path(self, path: List[str]) -> str:
|
| 1352 |
+
# Generates a human-readable description of a path.
|
| 1353 |
+
descriptions = []
|
| 1354 |
+
for node in path:
|
| 1355 |
+
if self.attribution_graph.node_types[node] == "embedding":
|
| 1356 |
+
token = node.split('_')[2]
|
| 1357 |
+
descriptions.append(f"Token '{token}'")
|
| 1358 |
+
elif self.attribution_graph.node_types[node] == "feature":
|
| 1359 |
+
parts = node.split('_')
|
| 1360 |
+
layer = parts[1][1:] # Remove 'L'
|
| 1361 |
+
feature = parts[3][1:] # Remove 'F'
|
| 1362 |
+
# Try to get the interpretation.
|
| 1363 |
+
key = f"L{layer}_F{feature}"
|
| 1364 |
+
interpretation = self.feature_visualizer.feature_interpretations.get(key, "unknown")
|
| 1365 |
+
descriptions.append(f"Feature L{layer}F{feature} ({interpretation})")
|
| 1366 |
+
elif self.attribution_graph.node_types[node] == "output":
|
| 1367 |
+
descriptions.append("Output")
|
| 1368 |
+
|
| 1369 |
+
return " → ".join(descriptions)
|
| 1370 |
+
|
| 1371 |
+
def save_results(self, results: Dict, save_path: str):
    """Serialize analysis results to JSON at ``save_path``.

    Works on a deep copy so the caller's dict is untouched; networkx graphs
    (top-level and per-analysis) are converted to node-link dicts first.
    Non-serializable leaves fall back to ``str()``.
    """
    payload = copy.deepcopy(results)

    # Single-prompt results carry one graph at the top level.
    if 'graph' in payload:
        payload['graph'] = nx.node_link_data(payload['graph'])

    # Batch results nest one graph per prompt under 'analyses'.
    for analysis in payload.get('analyses', {}).values():
        if 'graph' in analysis:
            analysis['graph'] = nx.node_link_data(analysis['graph'])

    with open(save_path, 'w') as f:
        json.dump(payload, f, indent=2, default=str)

    logger.info(f"Results saved to {save_path}")
def save_clt(self, path: str):
    """Persist the trained CLT weights (state dict only) to ``path``."""
    # Saves the trained CLT model.
    torch.save(self.clt.state_dict(), path)
    logger.info(f"CLT model saved to {path}")
def load_clt(self, path: str):
    """Load trained CLT weights from ``path`` and prepare for inference.

    Args:
        path: Filesystem path to a state dict written by ``save_clt``.
    """
    # Loads a trained CLT model.
    # weights_only=True restricts deserialization to tensor data, so an
    # untrusted/corrupted checkpoint cannot execute arbitrary pickle code.
    state_dict = torch.load(path, map_location=self.device, weights_only=True)
    self.clt.load_state_dict(state_dict)
    self.clt.to(self.device)
    self.clt.eval()  # Set the model to evaluation mode
    logger.info(f"Loaded CLT model from {path}")
# --- Configuration ---
MAX_SEQ_LEN = 256               # Max tokens per training prompt (truncation limit).
N_FEATURES_PER_LAYER = 512      # CLT dictionary size per transformer layer.
TRAINING_STEPS = 2500           # Optimizer steps for CLT training.
BATCH_SIZE = 64                 # Prompts sampled per training step.
LEARNING_RATE = 1e-3            # Adam learning rate for the CLT.

# Prompts for generating the final analysis.
ANALYSIS_PROMPTS = [
    "The capital of France is",
    "def factorial(n):",
    "The literary device in the phrase 'The wind whispered through the trees' is"
]

# A larger set of prompts for training.
# Mix of factual recall, code, translation, sentiment and completion tasks so
# the CLT sees a diverse range of activations.
TRAINING_PROMPTS = [
    "The capital of France is", "To be or not to be, that is the", "A stitch in time saves",
    "The first person to walk on the moon was", "The chemical formula for water is H2O.",
    "Translate to German: 'The cat sits on the mat.'", "def factorial(n):", "import numpy as np",
    "The main ingredients in a pizza are", "What is the powerhouse of the cell?",
    "The equation E=mc^2 relates energy to", "Continue the story: Once upon a time, there was a",
    "Classify the sentiment: 'I am overjoyed!'", "Extract the entities: 'Apple Inc. is in Cupertino.'",
    "What is the next number: 2, 4, 8, 16, __?", "A rolling stone gathers no",
    "The opposite of hot is", "import torch", "import pandas as pd", "class MyClass:",
    "def __init__(self):", "The primary colors are", "What is the capital of Japan?",
    "Who wrote 'Hamlet'?", "The square root of 64 is", "The sun rises in the",
    "The Pacific Ocean is the largest ocean on Earth.", "The mitochondria is the powerhouse of the cell.",
    "What is the capital of Mongolia?", "The movie 'The Matrix' can be classified into the following genre:",
    "The French translation of 'I would like to order a coffee, please.' is:",
    "The literary device in the phrase 'The wind whispered through the trees' is",
    "A Python function that calculates the factorial of a number is:",
    "The main ingredient in a Negroni cocktail is",
    "Summarize the plot of 'Hamlet' in one sentence:",
    "The sentence 'The cake was eaten by the dog' is in the following voice:",
    "A good headline for an article about a new breakthrough in battery technology would be:"
]
+
|
| 1438 |
+
# --- Qwen API for Feature Interpretation ---
# NOTE(review): @torch.no_grad() has no effect here — the function performs
# only HTTP calls, no tensor ops. Presumably kept for uniformity; confirm
# before removing.
@torch.no_grad()
def get_feature_interpretation_with_qwen(
    api_config: dict,
    top_tokens: list[str],
    feature_name: str,
    layer_index: int,
    max_retries: int = 3,
    initial_backoff: float = 2.0
) -> str:
    """Ask the Qwen chat API for a short interpretation of a CLT feature.

    Args:
        api_config: Dict with 'api_key', 'api_endpoint' and 'model'; an empty
            or key-less config short-circuits with "API not configured".
        top_tokens: Tokens that most strongly activate the feature.
        feature_name: Identifier used in the prompt and logs.
        layer_index: Layer of the feature, for context in the prompt.
        max_retries: Request attempts before giving up.
        initial_backoff: Base of the exponential backoff (seconds).

    Returns:
        The interpretation phrase, or an "API Error: ..." / "API not
        configured" string on failure.
    """
    # Generates a high-quality interpretation for a feature using the Qwen API.
    if not api_config or not api_config.get('api_key'):
        logger.warning("Qwen API not configured. Skipping interpretation.")
        return "API not configured"

    headers = {
        "Authorization": f"Bearer {api_config['api_key']}",
        "Content-Type": "application/json"
    }

    # Create a specialized prompt.
    prompt_text = f"""
You are an expert in transformer interpretability. A feature in a language model (feature '{feature_name}' at layer {layer_index}) is most strongly activated by the following tokens:

{', '.join(f"'{token}'" for token in top_tokens)}

Based *only* on these tokens, what is the most likely function or role of this feature?
Your answer must be a short, concise phrase (e.g., "Detecting proper nouns", "Identifying JSON syntax", "Completing lists", "Recognizing negative sentiment"). Do not write a full sentence.
"""

    # Low temperature + fixed seed for near-deterministic labels.
    data = {
        "model": api_config["model"],
        "messages": [
            {
                "role": "user",
                "content": [{"type": "text", "text": prompt_text}]
            }
        ],
        "max_tokens": 50,
        "temperature": 0.1,
        "top_p": 0.9,
        "seed": 42
    }

    logger.info(f" > Interpreting {feature_name} (Layer {layer_index})...")

    for attempt in range(max_retries):
        try:
            logger.info(f" - Attempt {attempt + 1}/{max_retries}: Sending request to Qwen API...")
            response = requests.post(
                f"{api_config['api_endpoint']}/chat/completions",
                headers=headers,
                json=data,
                timeout=60
            )
            response.raise_for_status() # Raise an exception for bad status codes (4xx or 5xx)

            result = response.json()
            interpretation = result["choices"][0]["message"]["content"].strip()

            # Remove quotes from the output.
            if interpretation.startswith('"') and interpretation.endswith('"'):
                interpretation = interpretation[1:-1]

            logger.info(f" - Success! Interpretation: '{interpretation}'")
            return interpretation

        except requests.exceptions.RequestException as e:
            logger.warning(f" - Qwen API request failed (Attempt {attempt + 1}/{max_retries}): {e}")
            if attempt < max_retries - 1:
                # Exponential backoff: initial_backoff, 2x, 4x, ...
                backoff_time = initial_backoff * (2 ** attempt)
                logger.info(f" - Retrying in {backoff_time:.1f} seconds...")
                time.sleep(backoff_time)
            else:
                logger.error(" - Max retries reached. Failing.")
                return f"API Error: {e}"
        except (KeyError, IndexError) as e:
            logger.error(f" - Failed to parse Qwen API response: {e}")
            return "API Error: Invalid response format"
        finally:
            # Add a delay to respect API rate limits.
            # NOTE: runs on every exit path, including successful returns.
            time.sleep(2.1)

    return "API Error: Max retries exceeded"
| 1523 |
+
|
| 1524 |
+
def train_transcoder(transcoder, model, tokenizer, training_prompts, device, steps=1000, batch_size=16, optimizer=None,
                     sparsity_lambda=0.01, reconstruction_weight=0.8, sparsity_weight=0.2):
    """Train the Cross-Layer Transcoder (CLT) to reconstruct frozen model activations.

    Args:
        transcoder: CLT module; called as ``transcoder(hidden_states)`` and expected to
            return ``(feature_activations, reconstructed_outputs)``.
        model: Frozen language model, used only to produce hidden states (no gradients).
        tokenizer: Tokenizer matching ``model``.
        training_prompts: Pool of text prompts sampled with replacement each step.
        device: Device the tokenized batches are moved to.
        steps: Number of training steps.
        batch_size: Number of prompts per step.
        optimizer: Optional optimizer; if ``None``, losses are computed but no update is applied.
        sparsity_lambda: Scale inside the tanh sparsity penalty (previously hard-coded 0.01).
        reconstruction_weight: Weight of the reconstruction term (previously hard-coded 0.8).
        sparsity_weight: Weight of the sparsity term (previously hard-coded 0.2).
    """
    transcoder.train()

    # Progress bar for visual feedback.
    progress_bar = tqdm(range(steps), desc="Training CLT")

    for step in progress_bar:
        # Sample a random batch of prompts (with replacement).
        batch_prompts = random.choices(training_prompts, k=batch_size)

        # Tokenize the batch.
        inputs = tokenizer(
            batch_prompts,
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=MAX_SEQ_LEN
        )
        inputs = {k: v.to(device) for k, v in inputs.items()}

        # Get the model activations; the base model itself is never updated.
        with torch.no_grad():
            outputs = model(**inputs, output_hidden_states=True)
            # Drop index 0 (embedding-layer output); keep per-block hidden states.
            hidden_states = outputs.hidden_states[1:]

        # Forward pass through the CLT.
        feature_activations, reconstructed_outputs = transcoder(hidden_states)

        # Reconstruction loss: per-layer MSE between CLT output and true activations.
        recon_loss = 0.0
        for target, pred in zip(hidden_states, reconstructed_outputs):
            recon_loss += F.mse_loss(pred, target)

        # Sparsity loss: smooth tanh penalty on feature activations.
        sparsity_loss = 0.0
        for features in feature_activations:
            sparsity_loss += torch.mean(torch.tanh(sparsity_lambda * features))

        # Total loss as a weighted combination of the two terms.
        loss = reconstruction_weight * recon_loss + sparsity_weight * sparsity_loss

        if optimizer:
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        progress_bar.set_postfix({
            "Recon Loss": f"{recon_loss.item():.4f}",
            "Sparsity Loss": f"{sparsity_loss.item():.4f}",
            "Total Loss": f"{loss.item():.4f}"
        })
| 1576 |
+
|
| 1577 |
+
def generate_feature_visualizations(transcoder, model, tokenizer, prompt, device, qwen_api_config=None, graph_config: Optional[AttributionGraphConfig] = None):
    """Generate per-layer feature visualizations, interpretations, and an attribution graph for one prompt.

    Args:
        transcoder: Trained CLT; called as ``transcoder(hidden_states)``.
        model: Frozen language model providing hidden states.
        tokenizer: Tokenizer matching ``model``.
        prompt: Text prompt to analyze.
        device: Device the tokenized inputs are moved to.
        qwen_api_config: Optional API config forwarded to feature interpretation.
        graph_config: Optional graph configuration; a default ``AttributionGraphConfig``
            is created for graph construction when omitted.

    Returns:
        Dict with the prompt, full/pruned graph statistics, per-layer feature
        visualizations, and the top-5 most important graph paths.
    """
    # Tokenize the prompt.
    inputs = tokenizer(
        prompt,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=MAX_SEQ_LEN
    )
    inputs = {k: v.to(device) for k, v in inputs.items()}

    # Get the model activations (index 0 is the embedding output and is skipped).
    with torch.no_grad():
        outputs = model(**inputs, output_hidden_states=True)
        hidden_states = outputs.hidden_states[1:]

    # Forward pass through the CLT.
    feature_activations, reconstructed_outputs = transcoder(hidden_states)

    # Hoisted out of the loops: one visualizer instance and one token decode.
    # (Previously a FeatureVisualizer was constructed twice per feature.)
    visualizer = FeatureVisualizer(tokenizer)
    input_tokens = tokenizer.convert_ids_to_tokens(inputs["input_ids"][0])

    # Visualize the features.
    feature_visualizations = {}
    for layer_idx, features in enumerate(tqdm(feature_activations, desc="Interpreting features")):
        layer_viz = {}
        # features shape: [batch_size, seq_len, n_features]
        feature_importance = torch.mean(features, dim=(0, 1))  # Average over batch and sequence
        top_lim = getattr(graph_config, "feature_visualization_top_k", 5) if graph_config else 5
        top_features = torch.topk(feature_importance, k=min(top_lim, feature_importance.size(0))).indices

        for feat_idx in top_features:
            viz = visualizer.visualize_feature(
                feat_idx.item(), layer_idx, features[0], input_tokens
            )
            interpretation = visualizer.interpret_feature(
                feat_idx.item(), layer_idx, viz, qwen_api_config
            )
            viz['interpretation'] = interpretation
            layer_viz[f"feature_{feat_idx.item()}"] = viz

        feature_visualizations[f"layer_{layer_idx}"] = layer_viz

    # Construct the attribution graph.
    if graph_config is None:
        graph_config = AttributionGraphConfig()
    attribution_graph = AttributionGraph(transcoder, tokenizer, graph_config)
    graph = attribution_graph.construct_graph(
        input_tokens, feature_activations, -1  # No target token for visualization
    )

    # Prune the graph, honoring the configured threshold (falls back to 0.8).
    pruned_graph = attribution_graph.prune_graph(getattr(graph_config, "pruning_threshold", 0.8))

    # Analyze the most important paths.
    important_paths = []
    if len(pruned_graph.nodes()) > 0:
        embedding_nodes = [
            node for node, type_ in attribution_graph.node_types.items()
            if type_ == "embedding" and node in pruned_graph
        ]
        output_nodes = [
            node for node, type_ in attribution_graph.node_types.items()
            if type_ == "output" and node in pruned_graph
        ]
        # Loop-invariant path cutoff, clamped to at least 2 hops.
        cutoff = max(2, getattr(graph_config, "path_search_cutoff", 5))
        path_loop = tqdm(embedding_nodes[:3], desc="Enumerating paths")
        for emb_node in path_loop:
            for out_node in output_nodes:
                try:
                    paths = list(nx.all_simple_paths(pruned_graph, emb_node, out_node, cutoff=cutoff))
                    for path in paths[:2]:
                        # Path weight = product of absolute edge weights along the path.
                        path_weight = 1.0
                        for i in range(len(path) - 1):
                            edge_weight = attribution_graph.edge_weights.get(
                                (path[i], path[i + 1]), 0.0
                            )
                            path_weight *= abs(edge_weight)
                        important_paths.append({
                            'path': path,
                            'weight': path_weight,
                            'description': attribution_graph._describe_path(path)
                        })
                except nx.NetworkXNoPath:
                    continue

    # Sort paths by importance.
    important_paths.sort(key=lambda x: x['weight'], reverse=True)

    return {
        "prompt": prompt,
        "full_graph_stats": {
            "n_nodes": len(graph.nodes()),
            "n_edges": len(graph.edges()),
            "node_types": dict(attribution_graph.node_types)
        },
        "pruned_graph_stats": {
            "n_nodes": len(pruned_graph.nodes()),
            "n_edges": len(pruned_graph.edges())
        },
        "feature_visualizations": feature_visualizations,
        "important_paths": important_paths[:5]  # Top 5 paths
    }
|
| 1680 |
+
|
| 1681 |
+
def main():
    """Run the attribution-graph analysis for a single prompt (or all prompts in batch mode).

    Parses CLI arguments, loads or trains the CLT, analyzes the selected prompt,
    prints faithfulness summaries, and saves results/visualizations to RESULTS_DIR.
    """
    # Set a seed for reproducibility.
    set_seed()

    # --- Argument Parser ---
    parser = argparse.ArgumentParser(description="Run Attribution Graph analysis for a single prompt.")
    parser.add_argument(
        '--prompt-index',
        type=int,
        required=True,
        help=f"The 0-based index of the prompt to analyze from the ANALYSIS_PROMPTS list (0 to {len(ANALYSIS_PROMPTS) - 1})."
    )
    parser.add_argument(
        '--force-retrain-clt',
        action='store_true',
        help="Force re-training of the Cross-Layer Transcoder, even if a saved model exists."
    )
    parser.add_argument(
        '--batch-eval',
        action='store_true',
        help="Analyze all predefined prompts and compute aggregate faithfulness metrics."
    )
    args = parser.parse_args()

    # Validate the prompt index before doing any expensive work.
    prompt_idx = args.prompt_index
    if not (0 <= prompt_idx < len(ANALYSIS_PROMPTS)):
        print(f"❌ Error: --prompt-index must be between 0 and {len(ANALYSIS_PROMPTS) - 1}.")
        return

    # Get the API config from the utility function.
    qwen_api_config = init_qwen_api()

    # Configuration
    config = AttributionGraphConfig(
        model_path="./models/OLMo-2-1124-7B",
        n_features_per_layer=512,
        training_steps=500,
        batch_size=4,
        max_seq_length=256,
        learning_rate=1e-4,
        sparsity_lambda=0.01,
        qwen_api_config=qwen_api_config
    )

    print("Attribution Graphs for OLMo2 7B - Single Prompt Pipeline")
    print("=" * 50)
    print(f"Model path: {config.model_path}")
    print(f"Device: {config.device}")

    try:
        # Initialize the full pipeline.
        print("🚀 Initializing Attribution Graphs Pipeline...")
        pipeline = AttributionGraphsPipeline(config)
        print("✓ Pipeline initialized successfully")
        print()

        # Load an existing CLT model or train a new one.
        if os.path.exists(CLT_SAVE_PATH) and not args.force_retrain_clt:
            print(f"🧠 Loading existing CLT model from {CLT_SAVE_PATH}...")
            pipeline.load_clt(CLT_SAVE_PATH)
            print("✓ CLT model loaded successfully.")
        else:
            if args.force_retrain_clt and os.path.exists(CLT_SAVE_PATH):
                # Fixed mojibake in the original emoji string.
                print("🏃‍♂️ --force-retrain-clt flag is set. Overwriting existing model.")

            # Train a new CLT model.
            print("📚 Training a new CLT model...")
            print(f" Training on {len(TRAINING_PROMPTS)} example texts...")
            training_stats = pipeline.train_clt(TRAINING_PROMPTS)
            print("✓ CLT training completed.")

            # Save the training statistics.
            stats_save_path = os.path.join(RESULTS_DIR, "clt_training_stats.json")
            with open(stats_save_path, 'w') as f:
                json.dump(training_stats, f, indent=2)
            print(f" Saved training stats to {stats_save_path}")

            # Save the new model.
            pipeline.save_clt(CLT_SAVE_PATH)
            print(f" Saved trained model to {CLT_SAVE_PATH} for future use.")

        print()

        if args.batch_eval:
            # Batch mode: analyze every prompt and report aggregate metrics, then exit.
            print("📊 Running batch faithfulness evaluation across all prompts...")
            batch_payload = pipeline.analyze_prompts_batch(ANALYSIS_PROMPTS)
            final_results = copy.deepcopy(batch_payload)
            final_results['config'] = config.__dict__
            final_results['timestamp'] = str(time.time())
            # Graph objects are not JSON-serializable and not used by the web page.
            for analysis_entry in final_results['analyses'].values():
                analysis_entry.pop('graph', None)
            batch_save_path = os.path.join(RESULTS_DIR, "attribution_graphs_batch_results.json")
            pipeline.save_results(final_results, batch_save_path)
            print(f"💾 Batch results saved to {batch_save_path}")

            aggregate_summary = batch_payload['aggregate_summary']
            targeted_summary = aggregate_summary.get('targeted', {})
            random_summary = aggregate_summary.get('random_baseline', {})
            path_summary = aggregate_summary.get('path', {})

            def _format_summary(label: str, summary: Dict[str, Any]) -> str:
                # One-line human-readable summary of a faithfulness metric bucket.
                return (
                    f"{label}: count={summary.get('count', 0)}, "
                    f"avg|Δp|={summary.get('avg_abs_probability_change', 0.0):.4f}, "
                    f"flip_rate={summary.get('flip_rate', 0.0):.2%}"
                )

            print("📈 Aggregate faithfulness summary")
            print(f" {_format_summary('Targeted', targeted_summary)}")
            print(f" {_format_summary('Random baseline', random_summary)}")
            print(f" {_format_summary('Path', path_summary)}")
            print(f" {_format_summary('Random path baseline', aggregate_summary.get('random_path_baseline', {}))}")
            diff_abs = aggregate_summary.get('target_minus_random_abs_probability_change', 0.0)
            diff_flip = aggregate_summary.get('target_flip_rate_minus_random', 0.0)
            path_diff_abs = aggregate_summary.get('path_minus_random_abs_probability_change', 0.0)
            path_diff_flip = aggregate_summary.get('path_flip_rate_minus_random', 0.0)
            print(f" Targeted vs Random |Δp| difference: {diff_abs:.4f}")
            print(f" Targeted vs Random flip rate difference: {diff_flip:.4f}")
            print(f" Path vs Random path |Δp| difference: {path_diff_abs:.4f}")
            print(f" Path vs Random path flip rate difference: {path_diff_flip:.4f}")
            print("\n🎉 Batch evaluation completed successfully!")
            return

        # Analyze the selected prompt.
        prompt_to_analyze = ANALYSIS_PROMPTS[prompt_idx]
        print(f"🔍 Analyzing prompt {prompt_idx + 1}/{len(ANALYSIS_PROMPTS)}: '{prompt_to_analyze}'")

        analysis = pipeline.analyze_prompt(prompt_to_analyze, target_token_idx=-1)

        # Display the key results.
        print(f" ✓ Tokenized into {len(analysis['input_tokens'])} tokens")
        print(f" ✓ Full graph: {analysis['full_graph_stats']['n_nodes']} nodes, {analysis['full_graph_stats']['n_edges']} edges")
        print(f" ✓ Pruned graph: {analysis['pruned_graph_stats']['n_nodes']} nodes, {analysis['pruned_graph_stats']['n_edges']} edges")

        # Show the top features (optionally truncated by summary config knobs).
        print(" 📊 Top active features:")
        feature_layers_items = list(analysis['feature_visualizations'].items())
        if config.summary_max_layers is not None:
            feature_layers_items = feature_layers_items[:config.summary_max_layers]
        for layer_name, layer_features in feature_layers_items:
            print(f" {layer_name}:")
            feature_items = layer_features.items()
            if config.summary_features_per_layer is not None:
                feature_items = list(feature_items)[:config.summary_features_per_layer]
            for feat_name, feat_data in feature_items:
                print(f" {feat_name}: {feat_data['interpretation']} (max: {feat_data['max_activation']:.3f})")

        print()

        # Summarize perturbation experiments and baselines.
        print("🧪 Targeted feature ablations:")
        targeted_results = analysis.get('perturbation_experiments', [])
        if targeted_results:
            for experiment in targeted_results:
                layer_name = experiment.get('layer_name', f"L{experiment.get('feature_set', [{}])[0].get('layer', '?')}")
                feature_name = experiment.get('feature_name', f"F{experiment.get('feature_set', [{}])[0].get('feature', '?')}")
                prob_delta = experiment.get('probability_change', 0.0)
                logit_delta = experiment.get('logit_change', 0.0)
                flips = experiment.get('ablation_flips_top_prediction', False)
                print(f" {layer_name}/{feature_name}: Δp={prob_delta:.4f}, Δlogit={logit_delta:.4f}, flips_top={flips}")
        else:
            print(" - No targeted ablations were recorded.")

        print("\n🎲 Random baseline ablations:")
        random_baseline = analysis.get('random_baseline_experiments', [])
        if random_baseline:
            for experiment in random_baseline:
                prob_delta = experiment.get('probability_change', 0.0)
                logit_delta = experiment.get('logit_change', 0.0)
                flips = experiment.get('ablation_flips_top_prediction', False)
                trial_idx = experiment.get('trial_index', '?')
                print(f" Trial {trial_idx}: Δp={prob_delta:.4f}, Δlogit={logit_delta:.4f}, flips_top={flips}")
        else:
            print(" - No random baseline trials were run.")

        print("\n🛤️ Path ablations:")
        path_results = analysis.get('path_ablation_experiments', [])
        if path_results:
            for path_exp in path_results:
                description = path_exp.get('path_description', 'Path')
                prob_delta = path_exp.get('probability_change', 0.0)
                logit_delta = path_exp.get('logit_change', 0.0)
                flips = path_exp.get('ablation_flips_top_prediction', False)
                print(f" {description}: Δp={prob_delta:.4f}, Δlogit={logit_delta:.4f}, flips_top={flips}")
        else:
            print(" - No path ablations were run.")

        summary_stats = analysis.get('summary_statistics', {})
        targeted_summary = summary_stats.get('targeted', {})
        random_summary = summary_stats.get('random_baseline', {})
        path_summary = summary_stats.get('path', {})
        random_path_summary = summary_stats.get('random_path_baseline', {})
        print("\n📈 Summary statistics:")
        print(f" Targeted: avg|Δp|={targeted_summary.get('avg_abs_probability_change', 0.0):.4f}, flip_rate={targeted_summary.get('flip_rate', 0.0):.2%}")
        print(f" Random baseline: avg|Δp|={random_summary.get('avg_abs_probability_change', 0.0):.4f}, flip_rate={random_summary.get('flip_rate', 0.0):.2%}")
        print(f" Path: avg|Δp|={path_summary.get('avg_abs_probability_change', 0.0):.4f}, flip_rate={path_summary.get('flip_rate', 0.0):.2%}")
        print(f" Random path baseline: avg|Δp|={random_path_summary.get('avg_abs_probability_change', 0.0):.4f}, flip_rate={random_path_summary.get('flip_rate', 0.0):.2%}")
        print(f" Targeted vs Random |Δp| diff: {summary_stats.get('target_minus_random_abs_probability_change', 0.0):.4f}")
        print(f" Targeted vs Random flip diff: {summary_stats.get('target_flip_rate_minus_random', 0.0):.4f}")
        print(f" Path vs Random path |Δp| diff: {summary_stats.get('path_minus_random_abs_probability_change', 0.0):.4f}")
        print(f" Path vs Random path flip diff: {summary_stats.get('path_flip_rate_minus_random', 0.0):.4f}")
        print("\n✓ Faithfulness experiments summarized\n")

        # Generate a visualization for the prompt.
        print("📈 Generating visualization...")
        if 'graph' in analysis and analysis['pruned_graph_stats']['n_nodes'] > 0:
            viz_path = os.path.join(RESULTS_DIR, f"attribution_graph_prompt_{prompt_idx + 1}.png")
            pipeline.attribution_graph.visualize_graph(analysis['graph'], save_path=viz_path)
            print(f" ✓ Graph visualization saved to {viz_path}")
        else:
            print(" - Skipping visualization as no graph was generated or it was empty.")

        # Save the results in a format for the web app.
        save_path = os.path.join(RESULTS_DIR, f"attribution_graphs_results_prompt_{prompt_idx + 1}.json")

        # Create a JSON file that can be merged with others.
        final_results = {
            "analyses": {
                f"prompt_{prompt_idx + 1}": analysis
            },
            "config": config.__dict__,
            "timestamp": str(time.time())
        }

        # The web page doesn't use the graph object, so remove it
        # (same .pop() pattern as the batch-eval branch above).
        final_results['analyses'][f"prompt_{prompt_idx + 1}"].pop('graph', None)

        pipeline.save_results(final_results, save_path)
        print(f"💾 Results saved to {save_path}")

        print("\n🎉 Analysis for this prompt completed successfully!")

    except Exception as e:
        # Top-level boundary: report the failure with a traceback instead of crashing silently.
        print(f"❌ Error during execution: {e}")
        import traceback
        traceback.print_exc()
|
| 1920 |
+
|
| 1921 |
+
# Script entry point: run the single-prompt attribution analysis pipeline.
if __name__ == "__main__":
    main()
|
circuit_analysis/calculate_cpr_cmd.py
ADDED
|
@@ -0,0 +1,338 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
import numpy as np
|
| 3 |
+
import networkx as nx
|
| 4 |
+
import argparse
|
| 5 |
+
import json
|
| 6 |
+
import os
|
| 7 |
+
import sys
|
| 8 |
+
import logging
|
| 9 |
+
from typing import List, Tuple
|
| 10 |
+
from pathlib import Path
|
| 11 |
+
import math
|
| 12 |
+
|
| 13 |
+
# Ensure we can import the pipeline
|
| 14 |
+
sys.path.append(str(Path(__file__).resolve().parent.parent))
|
| 15 |
+
|
| 16 |
+
from circuit_analysis.attribution_graphs_olmo import (
|
| 17 |
+
AttributionGraphsPipeline,
|
| 18 |
+
AttributionGraphConfig,
|
| 19 |
+
ANALYSIS_PROMPTS,
|
| 20 |
+
AttributionGraph
|
| 21 |
+
)
|
| 22 |
+
|
| 23 |
+
# Configure logging
|
| 24 |
+
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
|
| 25 |
+
logger = logging.getLogger(__name__)
|
| 26 |
+
|
| 27 |
+
def compute_cpr(k_values: List[float], f_values: List[float]) -> float:
    """
    Compute CPR (Integrated Circuit Performance Ratio) using the trapezoidal rule.
    CPR = Integral of f(C_k) dk
    """
    segments = zip(k_values, k_values[1:], f_values, f_values[1:])
    return sum(0.5 * (f0 + f1) * (k1 - k0) for k0, k1, f0, f1 in segments)
|
| 36 |
+
|
| 37 |
+
def compute_cmd(k_values: List[float], f_values: List[float]) -> float:
    """
    Compute CMD (Integrated Circuit-Model Distance) using the trapezoidal rule.
    CMD = Integral of |1 - f(C_k)| dk
    """
    total = 0.0
    for idx in range(1, len(k_values)):
        left = abs(1.0 - f_values[idx - 1])
        right = abs(1.0 - f_values[idx])
        total += 0.5 * (left + right) * (k_values[idx] - k_values[idx - 1])
    return total
|
| 48 |
+
|
| 49 |
+
def get_active_features_from_graph(graph: "nx.DiGraph") -> List[Tuple[int, int]]:
    """
    Extracts the list of feature nodes (as layer_idx, feature_idx tuples) from the graph.

    Node IDs follow the format ``feat_L{layer}_T{token}_F{feature}``; the token
    position is intentionally dropped so the result is the set of unique
    (layer, feature) pairs used for ablation.

    Returns:
        Sorted list of unique (layer_idx, feature_idx) tuples. Sorting replaces
        the previous ``list(set(...))``, whose ordering was arbitrary and made
        downstream runs non-deterministic.
    """
    features = []
    for node in graph.nodes():
        if node.startswith("feat_"):
            parts = node.split('_')
            try:
                # Format: feat_L{layer}_T{token}_F{feature}
                layer_idx = int(parts[1][1:])
                feature_idx = int(parts[3][1:])
                # We only care about unique (layer, feature) pairs for ablation
                features.append((layer_idx, feature_idx))
            except (IndexError, ValueError):
                # Malformed feature node id — skip it rather than crash.
                continue
    return sorted(set(features))
|
| 66 |
+
|
| 67 |
+
def calculate_graph_importance(attribution_graph_obj: "AttributionGraph", graph: "nx.DiGraph") -> List[Tuple[str, float]]:
    """
    Calculates the importance of each feature node in the graph based on edge weights.

    Importance is the sum of absolute weights of ALL connected edges (incoming and
    outgoing), matching the convention used by ``prune_graph`` in
    attribution_graphs_olmo.py.

    Args:
        attribution_graph_obj: Object exposing ``node_types`` (node -> type string)
            and ``edge_weights`` ((src, dst) -> float) mappings.
        graph: Directed graph whose "feature"-typed nodes are scored.

    Returns:
        List of (node_id, importance_score) tuples sorted by importance descending.
    """
    # Hoist the weight lookup out of the loops.
    edge_weights = attribution_graph_obj.edge_weights
    node_importance = {}

    for node in graph.nodes():
        if attribution_graph_obj.node_types.get(node) != "feature":
            continue
        # Outgoing contribution.
        out_total = sum(abs(edge_weights.get((node, target), 0.0))
                        for _, target in graph.out_edges(node))
        # Incoming contribution.
        in_total = sum(abs(edge_weights.get((source, node), 0.0))
                       for source, _ in graph.in_edges(node))
        node_importance[node] = out_total + in_total

    return sorted(node_importance.items(), key=lambda item: item[1], reverse=True)
|
| 94 |
+
|
| 95 |
+
def get_edges_count(graph: "nx.DiGraph", nodes: List[str]) -> int:
    """
    Returns the number of edges whose endpoints are BOTH in the given node set.

    For the metric k = |C|/|N| we need a consistent definition of circuit size:
    |C| is the number of edges in the subgraph induced by
    (selected features + always-kept embedding/output nodes).

    Args:
        graph: Directed graph to count edges in.
        nodes: Node ids forming the circuit; membership is tested on both
            endpoints of every edge.

    Returns:
        Count of edges fully contained in ``nodes``.
    """
    nodes_set = set(nodes)
    # Idiomatic sum over a generator instead of a manual counter loop.
    return sum(1 for u, v in graph.edges() if u in nodes_set and v in nodes_set)
|
| 113 |
+
|
| 114 |
+
def run_cpr_cmd_analysis(pipeline: AttributionGraphsPipeline, prompt_idx: int):
    """
    Compute CPR and CMD for a given prompt, using:

    - Universe: all feature nodes present in the attribution graph
    - Metric m: logit(target) only (no foil)
    - Interventions: ablation of feature sets with intervention_strength=1.0

    Returns:
        Dict with the prompt, target token, baseline metrics, the (k, f) curve,
        and the CPR/CMD integrals — or None if the prompt produces no usable data.

    Bug fix vs. the original: when a k-point was skipped because m_Ck was
    non-finite, its k value had already been appended to ``actual_k_values``
    while its f value never reached ``f_values``, so the later
    ``zip(actual_k_values, f_values)`` paired k-points with the WRONG f-values.
    The k value is now appended only together with its f value.
    """
    prompt = ANALYSIS_PROMPTS[prompt_idx]
    logger.info(f"Analyzing prompt {prompt_idx}: '{prompt}'")

    # Build/prune the attribution graph for this prompt
    pipeline.analyze_prompt(prompt)
    full_graph = pipeline.attribution_graph.graph

    # Baseline: run once to get logits & feature activations
    baseline_data = pipeline.perturbation_experiments._prepare_inputs(prompt, top_k=1)
    target_token_id = baseline_data['baseline_top_tokens'].indices[0].item()
    baseline_logits = baseline_data['baseline_last_token_logits']
    m_N = baseline_logits[target_token_id].item()

    logger.info(
        f"Baseline m(N) = {m_N:.4f} "
        f"(Token: {pipeline.tokenizer.decode([target_token_id])})"
    )

    # Universe: all feature nodes in the graph
    universe_features = get_active_features_from_graph(full_graph)
    logger.info(f"Graph Universe size: {len(universe_features)} features")

    if not universe_features:
        logger.warning("No features found in graph. Skipping.")
        return None

    # Empty circuit: ablate all universe features
    empty_res = pipeline.perturbation_experiments.feature_set_ablation_experiment(
        prompt,
        feature_set=universe_features,
        intervention_strength=1.0,
        target_token_id=target_token_id
    )
    m_empty = empty_res["ablated_logit"]
    logger.info(f"Empty m(Ø) = {m_empty:.4f}")

    if not math.isfinite(m_empty):
        logger.warning(
            f"m_empty is non-finite ({m_empty}) for prompt {prompt_idx}; "
            "skipping CPR/CMD for this prompt."
        )
        return None

    # Node importance within the graph
    sorted_nodes = calculate_graph_importance(pipeline.attribution_graph, full_graph)

    total_edges = full_graph.number_of_edges()
    k_grid = [0.001, 0.002, 0.005, 0.01, 0.02, 0.05, 0.1, 0.2, 0.5, 1.0]
    f_values = []
    actual_k_values = []

    # Embeddings/output are always kept
    always_kept_nodes = [n for n in full_graph.nodes() if not n.startswith("feat_")]

    logger.info("Computing faithfulness curve...")

    for k in k_grid:
        target_edge_count = int(k * total_edges)

        current_circuit_nodes = list(always_kept_nodes)
        current_feature_tuples = []

        # Greedily add the most important feature nodes until the circuit
        # reaches the target edge budget (always keep at least one feature).
        for node, _ in sorted_nodes:
            current_edge_count = get_edges_count(full_graph, current_circuit_nodes)
            if current_edge_count >= target_edge_count and len(current_feature_tuples) > 0:
                break

            current_circuit_nodes.append(node)
            parts = node.split("_")
            l = int(parts[1][1:])
            f = int(parts[3][1:])
            current_feature_tuples.append((l, f))

        actual_edges = get_edges_count(full_graph, current_circuit_nodes)
        actual_k = actual_edges / total_edges if total_edges > 0 else 0.0

        # Complement = universe \ current features
        current_set = set(current_feature_tuples)
        complement_set = [ft for ft in universe_features if ft not in current_set]

        if not complement_set:
            # Nothing left to ablate: the circuit is the full model.
            m_Ck = m_N
        else:
            res = pipeline.perturbation_experiments.feature_set_ablation_experiment(
                prompt,
                feature_set=complement_set,
                intervention_strength=1.0,
                target_token_id=target_token_id
            )
            m_Ck = res["ablated_logit"]

        if not math.isfinite(m_Ck):
            logger.warning(
                f"Non-finite m_Ck={m_Ck} for k={k:.4f} on prompt {prompt_idx}; "
                "skipping this k point."
            )
            continue

        # Normalized faithfulness f(C_k) in [0, 1].
        if abs(m_N - m_empty) < 1e-6:
            f_k = 0.0
        else:
            raw_f = (m_Ck - m_empty) / (m_N - m_empty)
            f_k = max(0.0, min(1.0, raw_f))

        # Append k and f TOGETHER so the curve arrays always stay aligned,
        # even when earlier k points were skipped.
        actual_k_values.append(actual_k)
        f_values.append(f_k)

    if not actual_k_values or not f_values:
        logger.warning(f"No valid k-points for prompt {prompt_idx}; skipping.")
        return None

    # Sort the curve by k and anchor it at k=0 and k=1 for the integrals.
    pairs = sorted(zip(actual_k_values, f_values), key=lambda x: x[0])
    sorted_k = [p[0] for p in pairs]
    sorted_f = [p[1] for p in pairs]

    if sorted_k[0] > 0.0:
        sorted_k.insert(0, 0.0)
        sorted_f.insert(0, 0.0)
    if sorted_k[-1] < 1.0:
        last_f = sorted_f[-1]
        sorted_k.append(1.0)
        sorted_f.append(last_f)

    cpr = compute_cpr(sorted_k, sorted_f)
    cmd = compute_cmd(sorted_k, sorted_f)

    logger.info(f"Result: CPR={cpr:.4f}, CMD={cmd:.4f}")

    return {
        "prompt": prompt,
        "target_token": pipeline.tokenizer.decode([target_token_id]),
        "m_N": m_N,
        "m_empty": m_empty,
        "curve_k": sorted_k,
        "curve_f": sorted_f,
        "CPR": cpr,
        "CMD": cmd
    }
|
| 260 |
+
|
| 261 |
+
def main():
    """CLI entry point: compute CPR/CMD for every analysis prompt and save a JSON report.

    Loads the OLMo model and the trained CLT, runs ``run_cpr_cmd_analysis`` for
    each prompt in ANALYSIS_PROMPTS, then writes per-prompt curves plus the
    averaged CPR/CMD metrics to --output.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--output", type=str, default="circuit_analysis/results/cpr_cmd_results.json")
    args = parser.parse_args()

    # Graph-construction settings chosen to give a fairly rich graph that can
    # be pruned down later; n_features_per_layer kept at 512 for memory reasons.
    config = AttributionGraphConfig(
        model_path="models/OLMo-2-1124-7B",  # relative path; resolved below if missing
        n_features_per_layer=512,
        graph_feature_activation_threshold=0.01,
        graph_edge_weight_threshold=0.003,  # lower threshold for more edges (prev: 0.005)
        graph_max_features_per_layer=40,  # increased from 24 (prev: 100 was too slow)
        graph_max_edges_per_node=20,  # increased from 12 (prev: 50 was too slow)
        intervention_strength=1.0,  # config default is 5.0; 1.0 is used here deliberately
    )

    # If the relative model path does not resolve from the current working
    # directory, fall back to a path anchored at the repository root.
    if not os.path.exists(config.model_path):
        root_path = Path(__file__).resolve().parent.parent
        possible_path = root_path / "models" / "OLMo-2-1124-7B"
        if possible_path.exists():
            config.model_path = str(possible_path)

    pipeline = AttributionGraphsPipeline(config)

    # Locate the trained CLT checkpoint (cwd-relative first, then script-relative).
    clt_path = "circuit_analysis/models/clt_model.pth"
    if not os.path.exists(clt_path):
        clt_path = str(Path(__file__).resolve().parent / "models" / "clt_model.pth")

    if os.path.exists(clt_path):
        pipeline.load_clt(clt_path)
    else:
        logger.error(f"CLT model not found at {clt_path}. Please train it first.")
        return

    # Run every prompt independently; a failure on one prompt must not abort the rest.
    results = []
    for i in range(len(ANALYSIS_PROMPTS)):
        try:
            res = run_cpr_cmd_analysis(pipeline, i)
            if res:
                results.append(res)
        except Exception as e:
            logger.error(f"Failed prompt {i}: {e}", exc_info=True)

    # Average CPR/CMD across the prompts that succeeded (0.0 when none did).
    if results:
        avg_cpr = np.mean([r['CPR'] for r in results])
        avg_cmd = np.mean([r['CMD'] for r in results])
    else:
        avg_cpr = 0.0
        avg_cmd = 0.0

    final_output = {
        "results": results,
        "average_CPR": avg_cpr,
        "average_CMD": avg_cmd
    }

    # Save. Guard against --output having no directory component:
    # os.makedirs("") raises FileNotFoundError.
    out_dir = os.path.dirname(args.output)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    with open(args.output, 'w') as f:
        json.dump(final_output, f, indent=2)

    print(f"\n\nFinal Average CPR: {avg_cpr:.4f}")
    print(f"Final Average CMD: {avg_cmd:.4f}")
    print(f"Results saved to {args.output}")
|
| 335 |
+
|
| 336 |
+
# Script entry point: run the full CPR/CMD pipeline when executed directly.
if __name__ == "__main__":
    main()
|
| 338 |
+
|
circuit_analysis/circuit_trace_page.py
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
circuit_analysis/merge_circuit_results.py
ADDED
|
@@ -0,0 +1,57 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import json
|
| 2 |
+
import sys
|
| 3 |
+
import os
|
| 4 |
+
|
| 5 |
+
def merge_json_results(base_file: str, new_file: str) -> None:
    """Merge the 'analyses' from a new results file into a base results file.

    Both filenames are resolved relative to ``circuit_analysis/results``.
    The base file is created (with an empty ``analyses`` map) when missing;
    its ``timestamp`` and ``config`` are overwritten by the new file's values
    when present. Errors are reported on stdout instead of raised, since this
    is a best-effort CLI helper.
    """
    try:
        # Ensure the results directory exists (race-free, idempotent).
        results_dir = "circuit_analysis/results"
        os.makedirs(results_dir, exist_ok=True)

        base_path = os.path.join(results_dir, base_file)
        new_path = os.path.join(results_dir, new_file)

        # Load the base file, or create a new one if it doesn't exist.
        if os.path.exists(base_path):
            with open(base_path, 'r') as f:
                base_data = json.load(f)
        else:
            print(f"Base file '{base_file}' not found. Creating a new one.")
            base_data = {"analyses": {}}

        # Load the new results file.
        with open(new_path, 'r') as f:
            new_data = json.load(f)

        # Ensure both files have the 'analyses' key.
        if 'analyses' not in base_data or 'analyses' not in new_data:
            print("Error: Both files must contain an 'analyses' key.")
            return

        # Merge: entries from the new file override duplicates in the base file.
        base_data['analyses'].update(new_data['analyses'])

        # Carry over the timestamp and config from the new file when present.
        base_data['timestamp'] = new_data.get('timestamp', base_data.get('timestamp'))
        base_data['config'] = new_data.get('config', base_data.get('config'))

        # Write the merged data back to the base file.
        with open(base_path, 'w') as f:
            json.dump(base_data, f, indent=2)

        print(f"Successfully merged '{new_file}' into '{base_file}'.")

    except FileNotFoundError as e:
        print(f"Error: File not found - {e.filename}")
    except json.JSONDecodeError:
        print("Error: Invalid JSON format in one of the files.")
    except Exception as e:
        print(f"An unexpected error occurred: {e}")
|
| 52 |
+
|
| 53 |
+
# CLI: python merge_circuit_results.py <base_json_file> <new_json_file>
if __name__ == "__main__":
    if len(sys.argv) != 3:
        print("Usage: python merge_circuit_results.py <base_json_file> <new_json_file>")
    else:
        merge_json_results(sys.argv[1], sys.argv[2])
|
circuit_analysis/offline_circuit_metrics.py
ADDED
|
@@ -0,0 +1,194 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""
|
| 3 |
+
Run attribution-graph ablation experiments outside the Streamlit UI.
|
| 4 |
+
|
| 5 |
+
This script executes the same targeted/random/path perturbations as the
|
| 6 |
+
interactive tool and emits aggregate metrics so we can verify that the
|
| 7 |
+
visual plots actually reflect causal differences.
|
| 8 |
+
"""
|
| 9 |
+
|
| 10 |
+
from __future__ import annotations
|
| 11 |
+
|
| 12 |
+
import argparse
|
| 13 |
+
import json
|
| 14 |
+
import os
|
| 15 |
+
import sys
|
| 16 |
+
from pathlib import Path
|
| 17 |
+
from typing import Dict, List, Any
|
| 18 |
+
|
| 19 |
+
# Ensure we can import the pipeline when this script is executed directly.
|
| 20 |
+
SCRIPT_DIR = Path(__file__).resolve().parent
|
| 21 |
+
PROJECT_ROOT = SCRIPT_DIR.parent
|
| 22 |
+
if str(PROJECT_ROOT) not in sys.path:
|
| 23 |
+
sys.path.append(str(PROJECT_ROOT))
|
| 24 |
+
|
| 25 |
+
from circuit_analysis.attribution_graphs_olmo import ( # noqa: E402
|
| 26 |
+
AttributionGraphsPipeline,
|
| 27 |
+
AttributionGraphConfig,
|
| 28 |
+
ANALYSIS_PROMPTS,
|
| 29 |
+
)
|
| 30 |
+
|
| 31 |
+
RESULTS_DIR = SCRIPT_DIR / "results"
|
| 32 |
+
DEFAULT_OUTPUT = RESULTS_DIR / "offline_circuit_metrics.json"
|
| 33 |
+
|
| 34 |
+
|
| 35 |
+
def _load_prompts(args: argparse.Namespace) -> List[str]:
|
| 36 |
+
if args.prompts_file:
|
| 37 |
+
path = Path(args.prompts_file)
|
| 38 |
+
if not path.exists():
|
| 39 |
+
raise FileNotFoundError(f"Prompts file not found: {path}")
|
| 40 |
+
with open(path, "r", encoding="utf-8") as f:
|
| 41 |
+
prompts = [line.strip() for line in f if line.strip()]
|
| 42 |
+
if not prompts:
|
| 43 |
+
raise ValueError(f"No prompts found in {path}")
|
| 44 |
+
return prompts
|
| 45 |
+
|
| 46 |
+
if args.prompt_text:
|
| 47 |
+
return [args.prompt_text]
|
| 48 |
+
|
| 49 |
+
if args.prompt_index is not None:
|
| 50 |
+
idx = args.prompt_index
|
| 51 |
+
if not (0 <= idx < len(ANALYSIS_PROMPTS)):
|
| 52 |
+
raise ValueError(f"--prompt-index must be between 0 and {len(ANALYSIS_PROMPTS)-1}")
|
| 53 |
+
return [ANALYSIS_PROMPTS[idx]]
|
| 54 |
+
|
| 55 |
+
if args.use_all:
|
| 56 |
+
return ANALYSIS_PROMPTS
|
| 57 |
+
|
| 58 |
+
# Default: run the canonical prompt set.
|
| 59 |
+
return ANALYSIS_PROMPTS
|
| 60 |
+
|
| 61 |
+
|
| 62 |
+
def _format_summary(label: str, summary: Dict[str, Any]) -> str:
|
| 63 |
+
return (
|
| 64 |
+
f"{label:<20} "
|
| 65 |
+
f"count={summary.get('count', 0):3d} "
|
| 66 |
+
f"avg|Δp|={summary.get('avg_abs_probability_change', 0.0):.4f} "
|
| 67 |
+
f"flip_rate={summary.get('flip_rate', 0.0):.2%} "
|
| 68 |
+
f"avg|Δlogit|={summary.get('avg_abs_logit_change', 0.0):.4f}"
|
| 69 |
+
)
|
| 70 |
+
|
| 71 |
+
|
| 72 |
+
def _top_experiments(experiments: List[Dict[str, Any]], top_n: int = 5) -> List[Dict[str, Any]]:
|
| 73 |
+
sorted_exps = sorted(
|
| 74 |
+
experiments,
|
| 75 |
+
key=lambda exp: abs(exp.get("probability_change", 0.0)),
|
| 76 |
+
reverse=True,
|
| 77 |
+
)
|
| 78 |
+
summary = []
|
| 79 |
+
for exp in sorted_exps[:top_n]:
|
| 80 |
+
summary.append(
|
| 81 |
+
{
|
| 82 |
+
"label": exp.get("feature_name") or exp.get("path_description") or "feature_set",
|
| 83 |
+
"probability_change": exp.get("probability_change", 0.0),
|
| 84 |
+
"logit_change": exp.get("logit_change", 0.0),
|
| 85 |
+
"flip": bool(exp.get("ablation_flips_top_prediction")),
|
| 86 |
+
}
|
| 87 |
+
)
|
| 88 |
+
return summary
|
| 89 |
+
|
| 90 |
+
|
| 91 |
+
def _sanitize_analysis(analysis: Dict[str, Any]) -> Dict[str, Any]:
    """Reduce one per-prompt analysis to the compact fields worth persisting."""
    targeted = analysis.get("perturbation_experiments", []) or []
    random_features = analysis.get("random_baseline_experiments", []) or []
    paths = analysis.get("path_ablation_experiments", []) or []
    random_paths = analysis.get("random_path_baseline_experiments", []) or []
    return {
        "prompt": analysis.get("prompt"),
        "summary_statistics": analysis.get("summary_statistics", {}),
        "counts": {
            "targeted": len(targeted),
            "random": len(random_features),
            "path": len(paths),
            "random_path": len(random_paths),
        },
        "top_targeted": _top_experiments(targeted),
        "top_paths": _top_experiments(paths),
    }
|
| 104 |
+
|
| 105 |
+
|
| 106 |
+
def main():
    """CLI entry point: run the batch faithfulness experiments and save a JSON summary."""
    parser = argparse.ArgumentParser(description="Run offline attribution-graph ablation metrics.")
    parser.add_argument("--prompt-index", type=int, help="Run a single prompt by index from ANALYSIS_PROMPTS.")
    parser.add_argument("--prompt-text", type=str, help="Run a single custom prompt.")
    parser.add_argument("--prompts-file", type=str, help="Path to a text file with one prompt per line.")
    parser.add_argument("--use-all", action="store_true", help="Run all predefined analysis prompts.")
    # NOTE(review): --feature-top-k is parsed but never read below — confirm
    # whether it should be forwarded into AttributionGraphConfig.
    parser.add_argument(
        "--feature-top-k",
        type=int,
        default=12,
        help="Number of top features per layer to analyze for targeted ablations.",
    )
    parser.add_argument(
        "--ablation-features-per-layer",
        type=int,
        default=4,
        help="Limit of targeted feature ablations per layer.",
    )
    parser.add_argument(
        "--output",
        type=str,
        default=str(DEFAULT_OUTPUT),
        help="Where to store the JSON summary of the offline metrics.",
    )
    args = parser.parse_args()

    # Resolve prompts per the precedence implemented in _load_prompts.
    prompts = _load_prompts(args)

    # Use a config consistent with the trained CLT and the constructed
    # (pruned) graph, not the full feature universe.
    config = AttributionGraphConfig(
        n_features_per_layer=512,  # Match trained CLT
        sparsity_lambda=1e-3,  # Match training
        graph_feature_activation_threshold=0.01,
        graph_edge_weight_threshold=0.003,
        graph_max_features_per_layer=40,
        graph_max_edges_per_node=20,
        ablation_features_per_layer=args.ablation_features_per_layer,
        # Default pruning_threshold (0.8) keeps the constructed graph, not full universe
    )

    pipeline = AttributionGraphsPipeline(config)
    print(f"Running offline faithfulness experiments for {len(prompts)} prompt(s)...")
    # Single batch call runs the targeted/random/path perturbations for all prompts.
    batch_payload = pipeline.analyze_prompts_batch(prompts)

    # Echo the aggregate metrics to the console in a fixed-width table.
    aggregate_summary = batch_payload.get("aggregate_summary", {})
    print("\n=== Aggregate Metrics ===")
    print(_format_summary("Targeted", aggregate_summary.get("targeted", {})))
    print(_format_summary("Random baseline", aggregate_summary.get("random_baseline", {})))
    print(_format_summary("Path", aggregate_summary.get("path", {})))
    print(_format_summary("Random path", aggregate_summary.get("random_path_baseline", {})))
    print(
        f"\nTargeted − Random |Δp| = "
        f"{aggregate_summary.get('target_minus_random_abs_probability_change', 0.0):.4f}"
    )
    print(
        f"Path − Random path |Δp| = "
        f"{aggregate_summary.get('path_minus_random_abs_probability_change', 0.0):.4f}"
    )
    print(
        f"Targeted − Random flip rate = "
        f"{aggregate_summary.get('target_flip_rate_minus_random', 0.0):.4f}"
    )
    print(
        f"Path − Random path flip rate = "
        f"{aggregate_summary.get('path_flip_rate_minus_random', 0.0):.4f}"
    )

    # Persist only the compact per-prompt fields (see _sanitize_analysis).
    sanitized_per_prompt = {
        key: _sanitize_analysis(analysis) for key, analysis in batch_payload.get("analyses", {}).items()
    }

    output_payload = {
        "prompts_ran": prompts,
        "aggregate_summary": aggregate_summary,
        "per_prompt": sanitized_per_prompt,
        "config": config.__dict__,
    }

    os.makedirs(Path(args.output).parent, exist_ok=True)
    with open(args.output, "w", encoding="utf-8") as f:
        json.dump(output_payload, f, indent=2)

    print(f"\nSaved offline metrics to {args.output}")
|
| 190 |
+
|
| 191 |
+
|
| 192 |
+
# Script entry point: run the offline metrics pipeline when executed directly.
if __name__ == "__main__":
    main()
|
| 194 |
+
|
circuit_analysis/plot_offline_metrics.py
ADDED
|
@@ -0,0 +1,239 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""
|
| 3 |
+
Visualize the aggregate metrics produced by offline_circuit_metrics.py
|
| 4 |
+
both overall and per prompt.
|
| 5 |
+
"""
|
| 6 |
+
|
| 7 |
+
from __future__ import annotations
|
| 8 |
+
|
| 9 |
+
import argparse
|
| 10 |
+
import json
|
| 11 |
+
from pathlib import Path
|
| 12 |
+
from typing import Dict, Any
|
| 13 |
+
from textwrap import fill
|
| 14 |
+
|
| 15 |
+
import matplotlib.pyplot as plt
|
| 16 |
+
import numpy as np
|
| 17 |
+
import seaborn as sns
|
| 18 |
+
|
| 19 |
+
DEFAULT_RESULTS = Path(__file__).parent / "results" / "offline_circuit_metrics.json"
|
| 20 |
+
DEFAULT_CPR_CMD = Path(__file__).parent / "results" / "cpr_cmd_results.json"
|
| 21 |
+
# Save directly to the paper figures directory
|
| 22 |
+
DEFAULT_FIG = Path(__file__).parent.parent / "writing" / "ELIA__EACL_2026_System_Demonstrations_" / "figures" / "offline_circuit_metrics_combined.png"
|
| 23 |
+
|
| 24 |
+
|
| 25 |
+
def _load_payload(path: Path) -> Dict[str, Any]:
|
| 26 |
+
with open(path, "r", encoding="utf-8") as f:
|
| 27 |
+
data = json.load(f)
|
| 28 |
+
if "aggregate_summary" not in data or "per_prompt" not in data:
|
| 29 |
+
raise ValueError(f"Expected 'aggregate_summary' and 'per_prompt' in {path}")
|
| 30 |
+
return data
|
| 31 |
+
|
| 32 |
+
def _configure_plot_style() -> None:
    """Apply the shared publication look: seaborn theme plus matplotlib rcParams."""
    sns.set_theme(style="ticks", palette="colorblind")
    plt.rcParams.update(
        {
            "font.family": "sans-serif",
            "font.sans-serif": "Arial",
            "axes.labelweight": "normal",
            "axes.titleweight": "bold",
            "figure.titleweight": "bold",
            "savefig.dpi": 300,
            "figure.facecolor": "white",
            "axes.facecolor": "white",
            "grid.alpha": 0.2,
            "axes.spines.top": False,
            "axes.spines.right": False,
        }
    )
|
| 45 |
+
|
| 46 |
+
|
| 47 |
+
def _load_cpr_cmd(path: Path) -> Dict[str, Any]:
|
| 48 |
+
"""Load CPR/CMD results if available."""
|
| 49 |
+
if not path.exists():
|
| 50 |
+
return None
|
| 51 |
+
try:
|
| 52 |
+
with open(path, "r", encoding="utf-8") as f:
|
| 53 |
+
data = json.load(f)
|
| 54 |
+
return data
|
| 55 |
+
except Exception as e:
|
| 56 |
+
print(f"Warning: Could not load CPR/CMD results from {path}: {e}")
|
| 57 |
+
return None
|
| 58 |
+
|
| 59 |
+
def plot_combined(summary: Dict[str, Any], per_prompt: Dict[str, Any], output_path: Path, cpr_cmd_data: Dict[str, Any] = None):
    """Plot aggregate + per-prompt |Δp| bars, optionally overlaying CPR on a twin axis.

    The first x position is the aggregate; each per_prompt entry adds one
    grouped set of four bars (targeted/random features, traced/random paths).
    When cpr_cmd_data is given, CPR values are drawn as a dashed purple line
    on a secondary y-axis. The figure is saved to output_path at 300 dpi.
    """
    _configure_plot_style()

    # Seed each series with the aggregate value; per-prompt values follow.
    labels = [r"$\mathbf{Aggregate}$"]
    targeted_vals = [summary["targeted"]["avg_abs_probability_change"]]
    random_vals = [summary["random_baseline"]["avg_abs_probability_change"]]
    path_vals = [summary["path"]["avg_abs_probability_change"]]
    random_path_vals = [summary["random_path_baseline"]["avg_abs_probability_change"]]

    # Optional CPR overlay data.
    cpr_vals = []
    if cpr_cmd_data:
        # Aggregate CPR only appended when per-prompt results exist.
        # NOTE(review): if "results" is empty but per_prompt prompts still
        # append CPR entries below, cpr_vals ends up shorter than x and the
        # twin-axis plot would misalign — confirm this case cannot occur.
        results = cpr_cmd_data.get("results", [])
        if results:
            avg_cpr = cpr_cmd_data.get("average_CPR", 0.0)
            cpr_vals.append(avg_cpr)

        # Index CPR by full prompt text for per-prompt lookup below.
        prompt_to_cpr = {}
        for result in results:
            prompt_text = result.get("prompt", "")
            prompt_to_cpr[prompt_text] = result.get("CPR", 0.0)

    for key, data in per_prompt.items():
        # Use the prompt text itself as the x-axis label (wrapped later).
        prompt_text = data.get("prompt", key)
        labels.append(prompt_text)

        stats = data.get("summary_statistics", {})
        targeted_vals.append(stats.get("targeted", {}).get("avg_abs_probability_change", 0.0))
        random_vals.append(stats.get("random_baseline", {}).get("avg_abs_probability_change", 0.0))
        path_vals.append(stats.get("path", {}).get("avg_abs_probability_change", 0.0))
        random_path_vals.append(stats.get("random_path_baseline", {}).get("avg_abs_probability_change", 0.0))

        # Per-prompt CPR: matched by exact prompt text, zero when absent.
        if cpr_cmd_data and prompt_text in prompt_to_cpr:
            cpr_vals.append(prompt_to_cpr[prompt_text])
        elif cpr_cmd_data:
            cpr_vals.append(0.0)

    x = np.arange(len(labels))
    width = 0.2  # four bars per group -> each takes 0.2 of the unit slot

    # Aspect ratio chosen to fit a paper column.
    fig, ax = plt.subplots(figsize=(10, 6), constrained_layout=True)

    # Secondary y-axis only when CPR data is present.
    ax2 = None
    if cpr_cmd_data and cpr_vals:
        ax2 = ax.twinx()

    # Colorblind-safe palette; indices picked for contrast between pairs.
    palette = sns.color_palette("colorblind")
    c_target = palette[0]    # blue
    c_random = palette[7]    # grey-ish
    c_path = palette[2]      # green
    c_path_rnd = palette[3]  # red

    # Four bars per x position, centered around the tick.
    features_targeted = ax.bar(x - width * 1.5, targeted_vals, width, label="Targeted Features", color=c_target)
    features_random = ax.bar(x - width/2, random_vals, width, label="Random Features", color=c_random, alpha=0.7)
    paths_targeted = ax.bar(x + width/2, path_vals, width, label="Traced Circuits", color=c_path)
    paths_random = ax.bar(x + width * 1.5, random_path_vals, width, label="Random Path Baseline", color=c_path_rnd, alpha=0.7)

    def autolabel(rects):
        # Annotate each bar with its value; skip near-zero bars to avoid clutter.
        for rect in rects:
            height = rect.get_height()
            if height > 0.01:
                ax.annotate(
                    f"{height:.2f}",
                    xy=(rect.get_x() + rect.get_width() / 2, height),
                    xytext=(0, 3),
                    textcoords="offset points",
                    ha="center",
                    va="bottom",
                    fontsize=14,
                    fontweight="normal",
                    color="black"
                )

    autolabel(features_targeted)
    autolabel(features_random)
    autolabel(paths_targeted)
    autolabel(paths_random)

    # CPR overlay: dashed purple line on the secondary axis.
    if ax2 and cpr_vals:
        line1 = ax2.plot(x, cpr_vals, marker='o', linestyle='--', linewidth=2,
                         markersize=8, color='purple', label='CPR', zorder=5)
        ax2.set_ylabel("CPR", fontsize=16, fontweight="normal", color='black')
        ax2.tick_params(axis='y', labelcolor='black', labelsize=14)
        ax2.set_ylim(0, 1.1)  # CPR lives in [0, 1]; headroom for markers

        # Value labels offset left/below the markers so bars stay readable.
        for i, cpr_val in enumerate(cpr_vals):
            if cpr_val > 0.01:
                ax2.annotate(f'{cpr_val:.2f}', xy=(i, cpr_val), xytext=(-20, -5),
                             textcoords='offset points', fontsize=11, color='purple',
                             fontweight='bold', ha='center')

        # Merge both axes' handles into a single legend on the primary axis.
        lines1, labels1 = ax.get_legend_handles_labels()
        lines2, labels2 = ax2.get_legend_handles_labels()
        ax.legend(lines1 + lines2, labels1 + labels2, loc="upper left", ncol=3,
                  frameon=True, framealpha=0.9, edgecolor="white", fontsize=12)
    else:
        # Bar-only legend when no CPR overlay is drawn.
        ax.legend(loc="upper left", ncol=2, frameon=True, framealpha=0.9, edgecolor="white", fontsize=14)

    ax.set_ylabel("Avg. |Probability Change| (|Δp|)", fontsize=16, fontweight="normal")
    ax.set_xticks(x)

    # Wrap long prompt labels; keep the LaTeX-formatted "Aggregate" intact.
    wrapped_labels = []
    for label in labels:
        if r"$\mathbf{Aggregate}$" in label:
            wrapped_labels.append(label)
        else:
            wrapped_labels.append(fill(label, 20))
    ax.set_xticklabels(wrapped_labels, rotation=0, ha="center", fontsize=14)

    ax.grid(axis='y', linestyle='--', alpha=0.3)

    # Headroom above the tallest bar so value annotations don't clip.
    y_max = max(max(targeted_vals), max(path_vals), max(random_vals), max(random_path_vals))
    ax.set_ylim(0, y_max * 1.30)

    # Ensure the figures directory exists before saving.
    output_path.parent.mkdir(parents=True, exist_ok=True)

    fig.savefig(output_path, dpi=300)
    plt.close(fig)
|
| 199 |
+
|
| 200 |
+
|
| 201 |
+
def main():
    """Parse CLI options, load the metric payloads, and render the combined plot."""
    parser = argparse.ArgumentParser(description="Plot offline attribution metrics.")
    parser.add_argument(
        "--input",
        type=str,
        default=str(DEFAULT_RESULTS),
        help="Path to offline_circuit_metrics.json"
    )
    parser.add_argument(
        "--output",
        type=str,
        default=str(DEFAULT_FIG),
        help="Path to save the per-prompt figure (PNG)."
    )
    parser.add_argument(
        "--cpr-cmd",
        type=str,
        default=str(DEFAULT_CPR_CMD),
        help="Path to CPR/CMD results JSON file (optional)."
    )
    options = parser.parse_args()

    input_path = Path(options.input)
    if not input_path.exists():
        print(f"Error: Input file {options.input} not found. Please run offline_circuit_metrics.py first.")
        return

    # Required payload: aggregate + per-prompt metrics.
    metrics = _load_payload(input_path)

    # Optional payload: CPR/CMD curves (None when missing or unreadable).
    cpr_cmd_payload = _load_cpr_cmd(Path(options.cpr_cmd))

    plot_combined(
        metrics["aggregate_summary"],
        metrics["per_prompt"],
        Path(options.output),
        cpr_cmd_payload,
    )
    print(f"Saved combined plot to {options.output}")
|
| 236 |
+
|
| 237 |
+
|
| 238 |
+
# Script entry point: build the combined metrics figure when executed directly.
if __name__ == "__main__":
    main()
|
circuit_analysis/results/attribution_graphs_results.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
circuit_analysis/results/attribution_graphs_results_de.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
circuit_analysis/results/attribution_graphs_results_de_prompt_1.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
circuit_analysis/results/attribution_graphs_results_de_prompt_2.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
circuit_analysis/results/attribution_graphs_results_de_prompt_3.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
circuit_analysis/results/attribution_graphs_results_prompt_1.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
circuit_analysis/results/attribution_graphs_results_prompt_2.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
circuit_analysis/results/attribution_graphs_results_prompt_3.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
circuit_analysis/results/clt_training_stats.json
ADDED
|
@@ -0,0 +1,4508 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"reconstruction_losses": [
|
| 3 |
+
8.078125,
|
| 4 |
+
7.6015625,
|
| 5 |
+
7.625,
|
| 6 |
+
7.3828125,
|
| 7 |
+
7.3671875,
|
| 8 |
+
7.0390625,
|
| 9 |
+
7.171875,
|
| 10 |
+
6.875,
|
| 11 |
+
6.703125,
|
| 12 |
+
6.4375,
|
| 13 |
+
6.46875,
|
| 14 |
+
6.41015625,
|
| 15 |
+
6.56640625,
|
| 16 |
+
6.3671875,
|
| 17 |
+
6.44140625,
|
| 18 |
+
6.265625,
|
| 19 |
+
5.80078125,
|
| 20 |
+
6.21875,
|
| 21 |
+
6.1171875,
|
| 22 |
+
6.0546875,
|
| 23 |
+
6.1015625,
|
| 24 |
+
5.5,
|
| 25 |
+
5.390625,
|
| 26 |
+
5.4609375,
|
| 27 |
+
6.0625,
|
| 28 |
+
5.7890625,
|
| 29 |
+
5.8671875,
|
| 30 |
+
5.7578125,
|
| 31 |
+
5.8046875,
|
| 32 |
+
5.6015625,
|
| 33 |
+
5.37890625,
|
| 34 |
+
5.328125,
|
| 35 |
+
5.40625,
|
| 36 |
+
5.453125,
|
| 37 |
+
5.7265625,
|
| 38 |
+
5.375,
|
| 39 |
+
5.3828125,
|
| 40 |
+
5.11328125,
|
| 41 |
+
5.66015625,
|
| 42 |
+
5.30078125,
|
| 43 |
+
5.46875,
|
| 44 |
+
5.4609375,
|
| 45 |
+
5.60546875,
|
| 46 |
+
5.13671875,
|
| 47 |
+
5.02734375,
|
| 48 |
+
5.18359375,
|
| 49 |
+
4.84375,
|
| 50 |
+
5.05859375,
|
| 51 |
+
4.96875,
|
| 52 |
+
5.05078125,
|
| 53 |
+
4.9765625,
|
| 54 |
+
5.40234375,
|
| 55 |
+
5.25,
|
| 56 |
+
4.89453125,
|
| 57 |
+
4.84375,
|
| 58 |
+
5.1953125,
|
| 59 |
+
4.75,
|
| 60 |
+
4.859375,
|
| 61 |
+
4.2890625,
|
| 62 |
+
4.84765625,
|
| 63 |
+
4.97265625,
|
| 64 |
+
4.73046875,
|
| 65 |
+
4.62890625,
|
| 66 |
+
4.6015625,
|
| 67 |
+
4.796875,
|
| 68 |
+
4.33203125,
|
| 69 |
+
4.81640625,
|
| 70 |
+
4.734375,
|
| 71 |
+
4.49609375,
|
| 72 |
+
4.65625,
|
| 73 |
+
4.4453125,
|
| 74 |
+
4.56640625,
|
| 75 |
+
4.52734375,
|
| 76 |
+
4.09765625,
|
| 77 |
+
4.48046875,
|
| 78 |
+
4.4296875,
|
| 79 |
+
4.61328125,
|
| 80 |
+
4.5625,
|
| 81 |
+
4.39453125,
|
| 82 |
+
4.44140625,
|
| 83 |
+
4.6171875,
|
| 84 |
+
4.39453125,
|
| 85 |
+
4.4765625,
|
| 86 |
+
4.5078125,
|
| 87 |
+
4.05078125,
|
| 88 |
+
4.20703125,
|
| 89 |
+
4.54296875,
|
| 90 |
+
4.53515625,
|
| 91 |
+
4.33984375,
|
| 92 |
+
4.29296875,
|
| 93 |
+
4.2109375,
|
| 94 |
+
4.1171875,
|
| 95 |
+
4.18359375,
|
| 96 |
+
4.3671875,
|
| 97 |
+
4.140625,
|
| 98 |
+
4.16015625,
|
| 99 |
+
4.16015625,
|
| 100 |
+
4.08984375,
|
| 101 |
+
3.865234375,
|
| 102 |
+
4.1796875,
|
| 103 |
+
3.740234375,
|
| 104 |
+
4.3125,
|
| 105 |
+
4.2578125,
|
| 106 |
+
4.203125,
|
| 107 |
+
4.109375,
|
| 108 |
+
4.1484375,
|
| 109 |
+
3.666015625,
|
| 110 |
+
4.21484375,
|
| 111 |
+
3.89453125,
|
| 112 |
+
3.861328125,
|
| 113 |
+
4.05859375,
|
| 114 |
+
3.626953125,
|
| 115 |
+
4.1640625,
|
| 116 |
+
3.84375,
|
| 117 |
+
3.970703125,
|
| 118 |
+
4.09375,
|
| 119 |
+
3.70703125,
|
| 120 |
+
4.359375,
|
| 121 |
+
3.98046875,
|
| 122 |
+
3.89453125,
|
| 123 |
+
3.70703125,
|
| 124 |
+
3.7421875,
|
| 125 |
+
3.880859375,
|
| 126 |
+
3.7734375,
|
| 127 |
+
4.109375,
|
| 128 |
+
3.748046875,
|
| 129 |
+
3.83984375,
|
| 130 |
+
3.640625,
|
| 131 |
+
3.876953125,
|
| 132 |
+
4.0234375,
|
| 133 |
+
3.72265625,
|
| 134 |
+
4.140625,
|
| 135 |
+
3.89453125,
|
| 136 |
+
4.0703125,
|
| 137 |
+
3.853515625,
|
| 138 |
+
3.861328125,
|
| 139 |
+
3.99609375,
|
| 140 |
+
3.8046875,
|
| 141 |
+
3.6640625,
|
| 142 |
+
3.59375,
|
| 143 |
+
3.46875,
|
| 144 |
+
3.88671875,
|
| 145 |
+
3.78515625,
|
| 146 |
+
3.525390625,
|
| 147 |
+
3.9375,
|
| 148 |
+
3.5625,
|
| 149 |
+
3.45703125,
|
| 150 |
+
3.849609375,
|
| 151 |
+
3.58203125,
|
| 152 |
+
3.408203125,
|
| 153 |
+
3.67578125,
|
| 154 |
+
3.84765625,
|
| 155 |
+
3.8203125,
|
| 156 |
+
3.681640625,
|
| 157 |
+
3.67578125,
|
| 158 |
+
3.361328125,
|
| 159 |
+
3.47265625,
|
| 160 |
+
3.734375,
|
| 161 |
+
3.720703125,
|
| 162 |
+
3.369140625,
|
| 163 |
+
3.5546875,
|
| 164 |
+
3.556640625,
|
| 165 |
+
3.357421875,
|
| 166 |
+
3.44140625,
|
| 167 |
+
3.3671875,
|
| 168 |
+
3.46875,
|
| 169 |
+
3.767578125,
|
| 170 |
+
3.5546875,
|
| 171 |
+
3.26171875,
|
| 172 |
+
3.619140625,
|
| 173 |
+
3.837890625,
|
| 174 |
+
3.01171875,
|
| 175 |
+
3.4296875,
|
| 176 |
+
3.396484375,
|
| 177 |
+
3.658203125,
|
| 178 |
+
3.375,
|
| 179 |
+
3.458984375,
|
| 180 |
+
3.46875,
|
| 181 |
+
3.1640625,
|
| 182 |
+
3.412109375,
|
| 183 |
+
3.38671875,
|
| 184 |
+
3.361328125,
|
| 185 |
+
3.70703125,
|
| 186 |
+
3.31640625,
|
| 187 |
+
3.2578125,
|
| 188 |
+
3.32421875,
|
| 189 |
+
3.7578125,
|
| 190 |
+
3.65234375,
|
| 191 |
+
3.16796875,
|
| 192 |
+
3.474609375,
|
| 193 |
+
3.4375,
|
| 194 |
+
3.13671875,
|
| 195 |
+
3.478515625,
|
| 196 |
+
3.357421875,
|
| 197 |
+
3.17578125,
|
| 198 |
+
3.3515625,
|
| 199 |
+
3.32421875,
|
| 200 |
+
3.158203125,
|
| 201 |
+
3.64453125,
|
| 202 |
+
3.802734375,
|
| 203 |
+
3.1953125,
|
| 204 |
+
3.0859375,
|
| 205 |
+
3.474609375,
|
| 206 |
+
3.208984375,
|
| 207 |
+
3.1171875,
|
| 208 |
+
3.3046875,
|
| 209 |
+
3.169921875,
|
| 210 |
+
3.2109375,
|
| 211 |
+
3.296875,
|
| 212 |
+
3.125,
|
| 213 |
+
3.2578125,
|
| 214 |
+
3.328125,
|
| 215 |
+
3.412109375,
|
| 216 |
+
3.353515625,
|
| 217 |
+
3.265625,
|
| 218 |
+
3.419921875,
|
| 219 |
+
3.298828125,
|
| 220 |
+
3.373046875,
|
| 221 |
+
3.302734375,
|
| 222 |
+
3.05859375,
|
| 223 |
+
3.12890625,
|
| 224 |
+
3.248046875,
|
| 225 |
+
2.955078125,
|
| 226 |
+
3.0703125,
|
| 227 |
+
3.048828125,
|
| 228 |
+
3.36328125,
|
| 229 |
+
3.126953125,
|
| 230 |
+
3.0625,
|
| 231 |
+
3.240234375,
|
| 232 |
+
3.052734375,
|
| 233 |
+
3.1796875,
|
| 234 |
+
2.943359375,
|
| 235 |
+
3.080078125,
|
| 236 |
+
3.185546875,
|
| 237 |
+
2.98046875,
|
| 238 |
+
3.234375,
|
| 239 |
+
3.01171875,
|
| 240 |
+
2.9921875,
|
| 241 |
+
3.01953125,
|
| 242 |
+
2.71875,
|
| 243 |
+
3.1328125,
|
| 244 |
+
2.75,
|
| 245 |
+
3.23046875,
|
| 246 |
+
3.01953125,
|
| 247 |
+
2.81640625,
|
| 248 |
+
3.162109375,
|
| 249 |
+
3.080078125,
|
| 250 |
+
3.37890625,
|
| 251 |
+
3.06640625,
|
| 252 |
+
3.03125,
|
| 253 |
+
3.1875,
|
| 254 |
+
2.83984375,
|
| 255 |
+
2.94140625,
|
| 256 |
+
2.8203125,
|
| 257 |
+
3.07421875,
|
| 258 |
+
2.90234375,
|
| 259 |
+
2.87890625,
|
| 260 |
+
3.12890625,
|
| 261 |
+
2.94921875,
|
| 262 |
+
2.97265625,
|
| 263 |
+
2.73046875,
|
| 264 |
+
2.892578125,
|
| 265 |
+
2.79296875,
|
| 266 |
+
3.1640625,
|
| 267 |
+
2.974609375,
|
| 268 |
+
2.681640625,
|
| 269 |
+
2.98046875,
|
| 270 |
+
2.84375,
|
| 271 |
+
2.990234375,
|
| 272 |
+
2.62890625,
|
| 273 |
+
2.953125,
|
| 274 |
+
3.046875,
|
| 275 |
+
2.962890625,
|
| 276 |
+
3.126953125,
|
| 277 |
+
2.88671875,
|
| 278 |
+
2.9765625,
|
| 279 |
+
2.701171875,
|
| 280 |
+
2.7734375,
|
| 281 |
+
3.255859375,
|
| 282 |
+
2.939453125,
|
| 283 |
+
2.98046875,
|
| 284 |
+
2.705078125,
|
| 285 |
+
2.98828125,
|
| 286 |
+
2.869140625,
|
| 287 |
+
2.828125,
|
| 288 |
+
3.025390625,
|
| 289 |
+
2.765625,
|
| 290 |
+
2.91015625,
|
| 291 |
+
2.671875,
|
| 292 |
+
2.892578125,
|
| 293 |
+
3.181640625,
|
| 294 |
+
2.91796875,
|
| 295 |
+
2.908203125,
|
| 296 |
+
2.91796875,
|
| 297 |
+
2.904296875,
|
| 298 |
+
2.93359375,
|
| 299 |
+
2.802734375,
|
| 300 |
+
3.044921875,
|
| 301 |
+
2.9296875,
|
| 302 |
+
2.96875,
|
| 303 |
+
2.859375,
|
| 304 |
+
2.890625,
|
| 305 |
+
2.984375,
|
| 306 |
+
2.7265625,
|
| 307 |
+
2.78515625,
|
| 308 |
+
2.876953125,
|
| 309 |
+
2.798828125,
|
| 310 |
+
2.759765625,
|
| 311 |
+
2.6328125,
|
| 312 |
+
2.765625,
|
| 313 |
+
2.705078125,
|
| 314 |
+
2.7265625,
|
| 315 |
+
2.767578125,
|
| 316 |
+
2.9609375,
|
| 317 |
+
2.5234375,
|
| 318 |
+
2.65625,
|
| 319 |
+
2.701171875,
|
| 320 |
+
2.82421875,
|
| 321 |
+
2.677734375,
|
| 322 |
+
2.57421875,
|
| 323 |
+
2.90234375,
|
| 324 |
+
2.806640625,
|
| 325 |
+
2.94921875,
|
| 326 |
+
2.912109375,
|
| 327 |
+
2.865234375,
|
| 328 |
+
2.6328125,
|
| 329 |
+
2.787109375,
|
| 330 |
+
2.634765625,
|
| 331 |
+
2.66796875,
|
| 332 |
+
2.701171875,
|
| 333 |
+
2.81640625,
|
| 334 |
+
2.646484375,
|
| 335 |
+
2.72265625,
|
| 336 |
+
2.5859375,
|
| 337 |
+
2.8046875,
|
| 338 |
+
2.548828125,
|
| 339 |
+
2.6171875,
|
| 340 |
+
2.59375,
|
| 341 |
+
3.01171875,
|
| 342 |
+
2.828125,
|
| 343 |
+
2.85546875,
|
| 344 |
+
2.525390625,
|
| 345 |
+
2.751953125,
|
| 346 |
+
2.779296875,
|
| 347 |
+
2.77734375,
|
| 348 |
+
2.689453125,
|
| 349 |
+
2.638671875,
|
| 350 |
+
2.64453125,
|
| 351 |
+
2.748046875,
|
| 352 |
+
2.587890625,
|
| 353 |
+
2.634765625,
|
| 354 |
+
2.5625,
|
| 355 |
+
2.9375,
|
| 356 |
+
2.64453125,
|
| 357 |
+
2.8828125,
|
| 358 |
+
2.47265625,
|
| 359 |
+
2.63671875,
|
| 360 |
+
2.60546875,
|
| 361 |
+
2.4453125,
|
| 362 |
+
2.6796875,
|
| 363 |
+
2.56640625,
|
| 364 |
+
2.814453125,
|
| 365 |
+
2.623046875,
|
| 366 |
+
2.6640625,
|
| 367 |
+
2.892578125,
|
| 368 |
+
2.80078125,
|
| 369 |
+
2.67578125,
|
| 370 |
+
2.73046875,
|
| 371 |
+
2.42578125,
|
| 372 |
+
2.73046875,
|
| 373 |
+
2.853515625,
|
| 374 |
+
2.54296875,
|
| 375 |
+
2.70703125,
|
| 376 |
+
2.537109375,
|
| 377 |
+
2.65625,
|
| 378 |
+
2.44140625,
|
| 379 |
+
2.400390625,
|
| 380 |
+
2.76171875,
|
| 381 |
+
2.693359375,
|
| 382 |
+
2.72265625,
|
| 383 |
+
2.62109375,
|
| 384 |
+
2.421875,
|
| 385 |
+
2.4140625,
|
| 386 |
+
2.673828125,
|
| 387 |
+
2.515625,
|
| 388 |
+
2.48828125,
|
| 389 |
+
2.681640625,
|
| 390 |
+
2.40625,
|
| 391 |
+
2.6328125,
|
| 392 |
+
2.322265625,
|
| 393 |
+
2.482421875,
|
| 394 |
+
2.51953125,
|
| 395 |
+
2.486328125,
|
| 396 |
+
2.669921875,
|
| 397 |
+
2.50390625,
|
| 398 |
+
2.576171875,
|
| 399 |
+
2.494140625,
|
| 400 |
+
2.62109375,
|
| 401 |
+
2.72265625,
|
| 402 |
+
2.669921875,
|
| 403 |
+
2.587890625,
|
| 404 |
+
2.587890625,
|
| 405 |
+
2.42578125,
|
| 406 |
+
2.376953125,
|
| 407 |
+
2.466796875,
|
| 408 |
+
2.396484375,
|
| 409 |
+
2.513671875,
|
| 410 |
+
2.42578125,
|
| 411 |
+
2.408203125,
|
| 412 |
+
2.71484375,
|
| 413 |
+
2.482421875,
|
| 414 |
+
2.490234375,
|
| 415 |
+
2.75390625,
|
| 416 |
+
2.47265625,
|
| 417 |
+
2.439453125,
|
| 418 |
+
2.541015625,
|
| 419 |
+
2.466796875,
|
| 420 |
+
2.197265625,
|
| 421 |
+
2.294921875,
|
| 422 |
+
2.515625,
|
| 423 |
+
2.6328125,
|
| 424 |
+
2.37109375,
|
| 425 |
+
2.62109375,
|
| 426 |
+
2.3046875,
|
| 427 |
+
2.48828125,
|
| 428 |
+
2.435546875,
|
| 429 |
+
2.685546875,
|
| 430 |
+
2.28125,
|
| 431 |
+
2.591796875,
|
| 432 |
+
2.41015625,
|
| 433 |
+
2.5703125,
|
| 434 |
+
2.21484375,
|
| 435 |
+
2.4140625,
|
| 436 |
+
2.244140625,
|
| 437 |
+
2.5859375,
|
| 438 |
+
2.58984375,
|
| 439 |
+
2.56640625,
|
| 440 |
+
2.611328125,
|
| 441 |
+
2.3359375,
|
| 442 |
+
2.46875,
|
| 443 |
+
2.484375,
|
| 444 |
+
2.4375,
|
| 445 |
+
2.29296875,
|
| 446 |
+
2.46875,
|
| 447 |
+
2.36328125,
|
| 448 |
+
2.537109375,
|
| 449 |
+
2.525390625,
|
| 450 |
+
2.3515625,
|
| 451 |
+
2.484375,
|
| 452 |
+
2.443359375,
|
| 453 |
+
2.421875,
|
| 454 |
+
2.4375,
|
| 455 |
+
2.46484375,
|
| 456 |
+
2.591796875,
|
| 457 |
+
2.302734375,
|
| 458 |
+
2.3203125,
|
| 459 |
+
2.3046875,
|
| 460 |
+
2.4375,
|
| 461 |
+
2.28125,
|
| 462 |
+
2.37109375,
|
| 463 |
+
2.5859375,
|
| 464 |
+
2.51953125,
|
| 465 |
+
2.33203125,
|
| 466 |
+
2.16796875,
|
| 467 |
+
2.328125,
|
| 468 |
+
2.421875,
|
| 469 |
+
2.388671875,
|
| 470 |
+
2.42578125,
|
| 471 |
+
2.3671875,
|
| 472 |
+
2.48046875,
|
| 473 |
+
2.44140625,
|
| 474 |
+
2.28515625,
|
| 475 |
+
2.25,
|
| 476 |
+
2.279296875,
|
| 477 |
+
2.48828125,
|
| 478 |
+
2.26171875,
|
| 479 |
+
2.37109375,
|
| 480 |
+
2.369140625,
|
| 481 |
+
2.517578125,
|
| 482 |
+
2.419921875,
|
| 483 |
+
2.302734375,
|
| 484 |
+
2.4140625,
|
| 485 |
+
2.333984375,
|
| 486 |
+
2.251953125,
|
| 487 |
+
2.345703125,
|
| 488 |
+
2.259765625,
|
| 489 |
+
2.373046875,
|
| 490 |
+
2.38671875,
|
| 491 |
+
2.404296875,
|
| 492 |
+
2.306640625,
|
| 493 |
+
2.47265625,
|
| 494 |
+
2.12890625,
|
| 495 |
+
2.302734375,
|
| 496 |
+
2.56640625,
|
| 497 |
+
2.279296875,
|
| 498 |
+
2.314453125,
|
| 499 |
+
2.4296875,
|
| 500 |
+
2.470703125,
|
| 501 |
+
2.40234375,
|
| 502 |
+
2.283203125,
|
| 503 |
+
2.1484375,
|
| 504 |
+
2.48046875,
|
| 505 |
+
2.5,
|
| 506 |
+
2.357421875,
|
| 507 |
+
2.158203125,
|
| 508 |
+
2.357421875,
|
| 509 |
+
2.49609375,
|
| 510 |
+
2.375,
|
| 511 |
+
2.26171875,
|
| 512 |
+
2.40625,
|
| 513 |
+
2.2734375,
|
| 514 |
+
2.34375,
|
| 515 |
+
2.173828125,
|
| 516 |
+
2.326171875,
|
| 517 |
+
2.15234375,
|
| 518 |
+
2.125,
|
| 519 |
+
2.431640625,
|
| 520 |
+
2.083984375,
|
| 521 |
+
2.203125,
|
| 522 |
+
2.400390625,
|
| 523 |
+
2.365234375,
|
| 524 |
+
2.29296875,
|
| 525 |
+
2.060546875,
|
| 526 |
+
2.158203125,
|
| 527 |
+
2.150390625,
|
| 528 |
+
2.26171875,
|
| 529 |
+
1.931640625,
|
| 530 |
+
2.099609375,
|
| 531 |
+
2.3203125,
|
| 532 |
+
2.2109375,
|
| 533 |
+
2.236328125,
|
| 534 |
+
2.482421875,
|
| 535 |
+
2.203125,
|
| 536 |
+
2.2109375,
|
| 537 |
+
2.259765625,
|
| 538 |
+
2.10546875,
|
| 539 |
+
2.1953125,
|
| 540 |
+
2.16015625,
|
| 541 |
+
2.50390625,
|
| 542 |
+
2.501953125,
|
| 543 |
+
2.265625,
|
| 544 |
+
2.25390625,
|
| 545 |
+
1.9990234375,
|
| 546 |
+
2.255859375,
|
| 547 |
+
2.158203125,
|
| 548 |
+
2.21484375,
|
| 549 |
+
2.181640625,
|
| 550 |
+
2.2734375,
|
| 551 |
+
2.40234375,
|
| 552 |
+
2.345703125,
|
| 553 |
+
2.296875,
|
| 554 |
+
2.123046875,
|
| 555 |
+
2.337890625,
|
| 556 |
+
2.037109375,
|
| 557 |
+
2.08203125,
|
| 558 |
+
2.28125,
|
| 559 |
+
2.0234375,
|
| 560 |
+
2.169921875,
|
| 561 |
+
2.236328125,
|
| 562 |
+
2.248046875,
|
| 563 |
+
2.541015625,
|
| 564 |
+
2.40625,
|
| 565 |
+
2.111328125,
|
| 566 |
+
1.9775390625,
|
| 567 |
+
2.12109375,
|
| 568 |
+
2.181640625,
|
| 569 |
+
2.177734375,
|
| 570 |
+
1.9755859375,
|
| 571 |
+
2.10546875,
|
| 572 |
+
1.994140625,
|
| 573 |
+
2.044921875,
|
| 574 |
+
2.115234375,
|
| 575 |
+
2.392578125,
|
| 576 |
+
1.873046875,
|
| 577 |
+
2.02734375,
|
| 578 |
+
2.36328125,
|
| 579 |
+
2.056640625,
|
| 580 |
+
2.1640625,
|
| 581 |
+
2.048828125,
|
| 582 |
+
2.009765625,
|
| 583 |
+
2.095703125,
|
| 584 |
+
2.0625,
|
| 585 |
+
2.10546875,
|
| 586 |
+
2.0703125,
|
| 587 |
+
2.306640625,
|
| 588 |
+
2.201171875,
|
| 589 |
+
1.951171875,
|
| 590 |
+
1.9130859375,
|
| 591 |
+
2.20703125,
|
| 592 |
+
2.24609375,
|
| 593 |
+
2.337890625,
|
| 594 |
+
2.0625,
|
| 595 |
+
2.03515625,
|
| 596 |
+
2.259765625,
|
| 597 |
+
2.154296875,
|
| 598 |
+
1.966796875,
|
| 599 |
+
1.9765625,
|
| 600 |
+
2.248046875,
|
| 601 |
+
2.1796875,
|
| 602 |
+
2.099609375,
|
| 603 |
+
2.119140625,
|
| 604 |
+
2.130859375,
|
| 605 |
+
2.19140625,
|
| 606 |
+
2.0,
|
| 607 |
+
2.26953125,
|
| 608 |
+
2.181640625,
|
| 609 |
+
2.072265625,
|
| 610 |
+
2.0703125,
|
| 611 |
+
1.974609375,
|
| 612 |
+
2.1171875,
|
| 613 |
+
2.10546875,
|
| 614 |
+
2.3671875,
|
| 615 |
+
2.130859375,
|
| 616 |
+
2.001953125,
|
| 617 |
+
2.03125,
|
| 618 |
+
2.181640625,
|
| 619 |
+
1.962890625,
|
| 620 |
+
2.015625,
|
| 621 |
+
2.10546875,
|
| 622 |
+
2.32421875,
|
| 623 |
+
2.08984375,
|
| 624 |
+
2.0234375,
|
| 625 |
+
2.029296875,
|
| 626 |
+
1.904296875,
|
| 627 |
+
2.189453125,
|
| 628 |
+
2.2734375,
|
| 629 |
+
2.173828125,
|
| 630 |
+
2.060546875,
|
| 631 |
+
2.015625,
|
| 632 |
+
2.0625,
|
| 633 |
+
2.13671875,
|
| 634 |
+
1.94140625,
|
| 635 |
+
1.91796875,
|
| 636 |
+
1.9462890625,
|
| 637 |
+
1.9599609375,
|
| 638 |
+
2.013671875,
|
| 639 |
+
2.0234375,
|
| 640 |
+
1.96875,
|
| 641 |
+
1.9765625,
|
| 642 |
+
2.119140625,
|
| 643 |
+
2.056640625,
|
| 644 |
+
2.099609375,
|
| 645 |
+
2.06640625,
|
| 646 |
+
1.8115234375,
|
| 647 |
+
2.224609375,
|
| 648 |
+
1.9189453125,
|
| 649 |
+
2.08203125,
|
| 650 |
+
2.01953125,
|
| 651 |
+
1.8779296875,
|
| 652 |
+
1.908203125,
|
| 653 |
+
2.09765625,
|
| 654 |
+
2.22265625,
|
| 655 |
+
1.921875,
|
| 656 |
+
1.884765625,
|
| 657 |
+
1.9912109375,
|
| 658 |
+
2.052734375,
|
| 659 |
+
2.025390625,
|
| 660 |
+
2.185546875,
|
| 661 |
+
2.099609375,
|
| 662 |
+
1.9833984375,
|
| 663 |
+
2.1015625,
|
| 664 |
+
2.220703125,
|
| 665 |
+
2.244140625,
|
| 666 |
+
2.123046875,
|
| 667 |
+
1.9736328125,
|
| 668 |
+
1.828125,
|
| 669 |
+
1.982421875,
|
| 670 |
+
1.8388671875,
|
| 671 |
+
2.1953125,
|
| 672 |
+
1.9150390625,
|
| 673 |
+
1.8994140625,
|
| 674 |
+
1.9296875,
|
| 675 |
+
1.818359375,
|
| 676 |
+
2.06640625,
|
| 677 |
+
1.958984375,
|
| 678 |
+
2.056640625,
|
| 679 |
+
2.11328125,
|
| 680 |
+
1.9423828125,
|
| 681 |
+
2.26953125,
|
| 682 |
+
1.98046875,
|
| 683 |
+
1.806640625,
|
| 684 |
+
2.12890625,
|
| 685 |
+
2.07421875,
|
| 686 |
+
2.05859375,
|
| 687 |
+
2.09375,
|
| 688 |
+
2.14453125,
|
| 689 |
+
2.142578125,
|
| 690 |
+
1.93359375,
|
| 691 |
+
1.9990234375,
|
| 692 |
+
1.900390625,
|
| 693 |
+
1.8837890625,
|
| 694 |
+
1.880859375,
|
| 695 |
+
1.9931640625,
|
| 696 |
+
2.162109375,
|
| 697 |
+
2.064453125,
|
| 698 |
+
1.8603515625,
|
| 699 |
+
2.09375,
|
| 700 |
+
2.1015625,
|
| 701 |
+
2.03125,
|
| 702 |
+
2.154296875,
|
| 703 |
+
2.123046875,
|
| 704 |
+
1.822265625,
|
| 705 |
+
2.001953125,
|
| 706 |
+
2.04296875,
|
| 707 |
+
1.9951171875,
|
| 708 |
+
1.9677734375,
|
| 709 |
+
2.146484375,
|
| 710 |
+
2.060546875,
|
| 711 |
+
2.00390625,
|
| 712 |
+
1.978515625,
|
| 713 |
+
2.005859375,
|
| 714 |
+
1.943359375,
|
| 715 |
+
1.9375,
|
| 716 |
+
1.828125,
|
| 717 |
+
1.90625,
|
| 718 |
+
1.8935546875,
|
| 719 |
+
1.958984375,
|
| 720 |
+
2.009765625,
|
| 721 |
+
1.8984375,
|
| 722 |
+
2.03515625,
|
| 723 |
+
2.0625,
|
| 724 |
+
2.091796875,
|
| 725 |
+
1.8447265625,
|
| 726 |
+
2.11328125,
|
| 727 |
+
1.765625,
|
| 728 |
+
1.9130859375,
|
| 729 |
+
1.8974609375,
|
| 730 |
+
1.87109375,
|
| 731 |
+
2.033203125,
|
| 732 |
+
1.921875,
|
| 733 |
+
1.921875,
|
| 734 |
+
1.615234375,
|
| 735 |
+
1.9169921875,
|
| 736 |
+
1.9609375,
|
| 737 |
+
1.7412109375,
|
| 738 |
+
1.9296875,
|
| 739 |
+
1.8935546875,
|
| 740 |
+
2.109375,
|
| 741 |
+
1.8779296875,
|
| 742 |
+
1.875,
|
| 743 |
+
1.794921875,
|
| 744 |
+
2.076171875,
|
| 745 |
+
2.0546875,
|
| 746 |
+
1.828125,
|
| 747 |
+
1.8076171875,
|
| 748 |
+
1.951171875,
|
| 749 |
+
1.73046875,
|
| 750 |
+
1.9482421875,
|
| 751 |
+
2.109375,
|
| 752 |
+
1.978515625,
|
| 753 |
+
2.169921875,
|
| 754 |
+
1.943359375,
|
| 755 |
+
1.8896484375,
|
| 756 |
+
1.9443359375,
|
| 757 |
+
2.10546875,
|
| 758 |
+
1.986328125,
|
| 759 |
+
2.005859375,
|
| 760 |
+
1.93359375,
|
| 761 |
+
2.078125,
|
| 762 |
+
1.8740234375,
|
| 763 |
+
2.056640625,
|
| 764 |
+
2.015625,
|
| 765 |
+
2.02734375,
|
| 766 |
+
1.9296875,
|
| 767 |
+
1.5615234375,
|
| 768 |
+
2.1171875,
|
| 769 |
+
1.796875,
|
| 770 |
+
1.9716796875,
|
| 771 |
+
1.8515625,
|
| 772 |
+
2.046875,
|
| 773 |
+
1.6640625,
|
| 774 |
+
1.9560546875,
|
| 775 |
+
1.8271484375,
|
| 776 |
+
1.9677734375,
|
| 777 |
+
1.9912109375,
|
| 778 |
+
1.861328125,
|
| 779 |
+
1.935546875,
|
| 780 |
+
1.94140625,
|
| 781 |
+
2.01171875,
|
| 782 |
+
2.173828125,
|
| 783 |
+
2.01953125,
|
| 784 |
+
1.8701171875,
|
| 785 |
+
1.828125,
|
| 786 |
+
2.06640625,
|
| 787 |
+
2.021484375,
|
| 788 |
+
1.9677734375,
|
| 789 |
+
1.9453125,
|
| 790 |
+
1.76171875,
|
| 791 |
+
1.998046875,
|
| 792 |
+
2.103515625,
|
| 793 |
+
1.8115234375,
|
| 794 |
+
2.08984375,
|
| 795 |
+
1.8740234375,
|
| 796 |
+
1.787109375,
|
| 797 |
+
2.08203125,
|
| 798 |
+
2.017578125,
|
| 799 |
+
1.7421875,
|
| 800 |
+
1.865234375,
|
| 801 |
+
2.03515625,
|
| 802 |
+
1.87890625,
|
| 803 |
+
1.7744140625,
|
| 804 |
+
2.01171875,
|
| 805 |
+
1.779296875,
|
| 806 |
+
1.970703125,
|
| 807 |
+
1.79296875,
|
| 808 |
+
1.9130859375,
|
| 809 |
+
1.880859375,
|
| 810 |
+
1.814453125,
|
| 811 |
+
1.833984375,
|
| 812 |
+
1.8671875,
|
| 813 |
+
1.8017578125,
|
| 814 |
+
1.708984375,
|
| 815 |
+
1.9697265625,
|
| 816 |
+
1.9130859375,
|
| 817 |
+
2.033203125,
|
| 818 |
+
1.728515625,
|
| 819 |
+
1.7158203125,
|
| 820 |
+
1.8798828125,
|
| 821 |
+
1.9765625,
|
| 822 |
+
1.63671875,
|
| 823 |
+
1.9140625,
|
| 824 |
+
1.857421875,
|
| 825 |
+
1.8037109375,
|
| 826 |
+
1.75,
|
| 827 |
+
1.7978515625,
|
| 828 |
+
1.6875,
|
| 829 |
+
1.88671875,
|
| 830 |
+
1.84765625,
|
| 831 |
+
1.8828125,
|
| 832 |
+
1.8515625,
|
| 833 |
+
1.8359375,
|
| 834 |
+
1.931640625,
|
| 835 |
+
1.939453125,
|
| 836 |
+
1.970703125,
|
| 837 |
+
1.8662109375,
|
| 838 |
+
1.88671875,
|
| 839 |
+
1.6826171875,
|
| 840 |
+
1.87890625,
|
| 841 |
+
1.748046875,
|
| 842 |
+
1.779296875,
|
| 843 |
+
1.9384765625,
|
| 844 |
+
1.88671875,
|
| 845 |
+
1.8154296875,
|
| 846 |
+
1.767578125,
|
| 847 |
+
1.8798828125,
|
| 848 |
+
1.962890625,
|
| 849 |
+
1.89453125,
|
| 850 |
+
1.970703125,
|
| 851 |
+
1.966796875,
|
| 852 |
+
1.86328125,
|
| 853 |
+
1.947265625,
|
| 854 |
+
1.900390625,
|
| 855 |
+
1.8271484375,
|
| 856 |
+
1.9453125,
|
| 857 |
+
1.818359375,
|
| 858 |
+
1.8994140625,
|
| 859 |
+
1.8623046875,
|
| 860 |
+
1.8046875,
|
| 861 |
+
1.7509765625,
|
| 862 |
+
1.8525390625,
|
| 863 |
+
2.01171875,
|
| 864 |
+
1.7734375,
|
| 865 |
+
1.68359375,
|
| 866 |
+
2.01171875,
|
| 867 |
+
2.0,
|
| 868 |
+
1.880859375,
|
| 869 |
+
1.775390625,
|
| 870 |
+
1.828125,
|
| 871 |
+
1.716796875,
|
| 872 |
+
1.849609375,
|
| 873 |
+
1.806640625,
|
| 874 |
+
1.8271484375,
|
| 875 |
+
1.8193359375,
|
| 876 |
+
1.955078125,
|
| 877 |
+
1.970703125,
|
| 878 |
+
1.7529296875,
|
| 879 |
+
1.62890625,
|
| 880 |
+
1.861328125,
|
| 881 |
+
1.669921875,
|
| 882 |
+
1.888671875,
|
| 883 |
+
1.859375,
|
| 884 |
+
1.8427734375,
|
| 885 |
+
1.751953125,
|
| 886 |
+
1.7109375,
|
| 887 |
+
1.7470703125,
|
| 888 |
+
1.8095703125,
|
| 889 |
+
1.84765625,
|
| 890 |
+
1.771484375,
|
| 891 |
+
1.728515625,
|
| 892 |
+
1.818359375,
|
| 893 |
+
1.7841796875,
|
| 894 |
+
2.01953125,
|
| 895 |
+
1.94140625,
|
| 896 |
+
1.81640625,
|
| 897 |
+
1.974609375,
|
| 898 |
+
1.8525390625,
|
| 899 |
+
1.748046875,
|
| 900 |
+
1.962890625,
|
| 901 |
+
1.91796875,
|
| 902 |
+
1.822265625,
|
| 903 |
+
1.7099609375,
|
| 904 |
+
1.9775390625,
|
| 905 |
+
1.75390625,
|
| 906 |
+
1.775390625,
|
| 907 |
+
1.8955078125,
|
| 908 |
+
1.728515625,
|
| 909 |
+
1.8369140625,
|
| 910 |
+
2.068359375,
|
| 911 |
+
1.890625,
|
| 912 |
+
1.6982421875,
|
| 913 |
+
1.7509765625,
|
| 914 |
+
1.8125,
|
| 915 |
+
1.716796875,
|
| 916 |
+
1.8544921875,
|
| 917 |
+
1.6630859375,
|
| 918 |
+
1.646484375,
|
| 919 |
+
1.7802734375,
|
| 920 |
+
1.513671875,
|
| 921 |
+
1.92578125,
|
| 922 |
+
1.560546875,
|
| 923 |
+
1.8212890625,
|
| 924 |
+
1.7490234375,
|
| 925 |
+
1.8564453125,
|
| 926 |
+
1.765625,
|
| 927 |
+
1.8037109375,
|
| 928 |
+
1.7470703125,
|
| 929 |
+
1.60546875,
|
| 930 |
+
1.869140625,
|
| 931 |
+
1.7421875,
|
| 932 |
+
1.814453125,
|
| 933 |
+
1.6513671875,
|
| 934 |
+
1.7353515625,
|
| 935 |
+
1.8828125,
|
| 936 |
+
1.7529296875,
|
| 937 |
+
1.70703125,
|
| 938 |
+
1.927734375,
|
| 939 |
+
1.7099609375,
|
| 940 |
+
1.650390625,
|
| 941 |
+
1.857421875,
|
| 942 |
+
1.78125,
|
| 943 |
+
1.7998046875,
|
| 944 |
+
1.623046875,
|
| 945 |
+
1.7998046875,
|
| 946 |
+
1.8955078125,
|
| 947 |
+
1.9072265625,
|
| 948 |
+
1.662109375,
|
| 949 |
+
1.64453125,
|
| 950 |
+
1.7119140625,
|
| 951 |
+
1.85546875,
|
| 952 |
+
1.8505859375,
|
| 953 |
+
1.806640625,
|
| 954 |
+
1.5927734375,
|
| 955 |
+
1.90234375,
|
| 956 |
+
1.7626953125,
|
| 957 |
+
1.8935546875,
|
| 958 |
+
1.8115234375,
|
| 959 |
+
1.7109375,
|
| 960 |
+
1.994140625,
|
| 961 |
+
1.8896484375,
|
| 962 |
+
1.732421875,
|
| 963 |
+
1.6640625,
|
| 964 |
+
1.74609375,
|
| 965 |
+
1.6875,
|
| 966 |
+
1.71875,
|
| 967 |
+
1.80078125,
|
| 968 |
+
1.9140625,
|
| 969 |
+
1.6865234375,
|
| 970 |
+
1.646484375,
|
| 971 |
+
1.7646484375,
|
| 972 |
+
1.765625,
|
| 973 |
+
1.509765625,
|
| 974 |
+
1.7548828125,
|
| 975 |
+
1.9052734375,
|
| 976 |
+
1.615234375,
|
| 977 |
+
1.5146484375,
|
| 978 |
+
1.7548828125,
|
| 979 |
+
1.7451171875,
|
| 980 |
+
1.7626953125,
|
| 981 |
+
1.7353515625,
|
| 982 |
+
1.7607421875,
|
| 983 |
+
1.669921875,
|
| 984 |
+
1.7734375,
|
| 985 |
+
1.7900390625,
|
| 986 |
+
1.75390625,
|
| 987 |
+
1.9267578125,
|
| 988 |
+
1.8232421875,
|
| 989 |
+
1.6748046875,
|
| 990 |
+
1.5771484375,
|
| 991 |
+
1.740234375,
|
| 992 |
+
1.6904296875,
|
| 993 |
+
1.90625,
|
| 994 |
+
1.59375,
|
| 995 |
+
1.677734375,
|
| 996 |
+
1.6259765625,
|
| 997 |
+
1.658203125,
|
| 998 |
+
1.751953125,
|
| 999 |
+
1.6982421875,
|
| 1000 |
+
1.7294921875,
|
| 1001 |
+
1.8388671875,
|
| 1002 |
+
1.73046875,
|
| 1003 |
+
1.775390625,
|
| 1004 |
+
1.818359375,
|
| 1005 |
+
1.7734375,
|
| 1006 |
+
1.779296875,
|
| 1007 |
+
1.541015625,
|
| 1008 |
+
1.7744140625,
|
| 1009 |
+
1.5859375,
|
| 1010 |
+
1.896484375,
|
| 1011 |
+
1.6298828125,
|
| 1012 |
+
1.6962890625,
|
| 1013 |
+
1.666015625,
|
| 1014 |
+
2.01953125,
|
| 1015 |
+
1.65234375,
|
| 1016 |
+
1.7041015625,
|
| 1017 |
+
1.626953125,
|
| 1018 |
+
1.611328125,
|
| 1019 |
+
1.8544921875,
|
| 1020 |
+
1.8515625,
|
| 1021 |
+
1.8662109375,
|
| 1022 |
+
1.7353515625,
|
| 1023 |
+
1.787109375,
|
| 1024 |
+
1.791015625,
|
| 1025 |
+
1.8642578125,
|
| 1026 |
+
1.71875,
|
| 1027 |
+
1.703125,
|
| 1028 |
+
1.681640625,
|
| 1029 |
+
1.666015625,
|
| 1030 |
+
1.8740234375,
|
| 1031 |
+
1.7587890625,
|
| 1032 |
+
1.736328125,
|
| 1033 |
+
1.599609375,
|
| 1034 |
+
1.677734375,
|
| 1035 |
+
1.853515625,
|
| 1036 |
+
1.66796875,
|
| 1037 |
+
1.5537109375,
|
| 1038 |
+
1.8505859375,
|
| 1039 |
+
1.833984375,
|
| 1040 |
+
1.744140625,
|
| 1041 |
+
1.64453125,
|
| 1042 |
+
1.701171875,
|
| 1043 |
+
1.6796875,
|
| 1044 |
+
1.8955078125,
|
| 1045 |
+
1.8505859375,
|
| 1046 |
+
1.66015625,
|
| 1047 |
+
1.8330078125,
|
| 1048 |
+
1.6171875,
|
| 1049 |
+
1.7861328125,
|
| 1050 |
+
1.5546875,
|
| 1051 |
+
1.9013671875,
|
| 1052 |
+
1.763671875,
|
| 1053 |
+
1.6474609375,
|
| 1054 |
+
1.509765625,
|
| 1055 |
+
1.6513671875,
|
| 1056 |
+
1.791015625,
|
| 1057 |
+
1.8134765625,
|
| 1058 |
+
1.70703125,
|
| 1059 |
+
1.740234375,
|
| 1060 |
+
1.72265625,
|
| 1061 |
+
1.703125,
|
| 1062 |
+
1.63671875,
|
| 1063 |
+
1.5693359375,
|
| 1064 |
+
1.611328125,
|
| 1065 |
+
1.76953125,
|
| 1066 |
+
1.818359375,
|
| 1067 |
+
1.732421875,
|
| 1068 |
+
1.5029296875,
|
| 1069 |
+
1.583984375,
|
| 1070 |
+
1.64453125,
|
| 1071 |
+
1.5634765625,
|
| 1072 |
+
1.71484375,
|
| 1073 |
+
1.572265625,
|
| 1074 |
+
1.62109375,
|
| 1075 |
+
1.58203125,
|
| 1076 |
+
1.7080078125,
|
| 1077 |
+
1.6689453125,
|
| 1078 |
+
1.5244140625,
|
| 1079 |
+
1.732421875,
|
| 1080 |
+
1.64453125,
|
| 1081 |
+
1.67578125,
|
| 1082 |
+
1.669921875,
|
| 1083 |
+
1.76953125,
|
| 1084 |
+
1.767578125,
|
| 1085 |
+
1.6552734375,
|
| 1086 |
+
1.654296875,
|
| 1087 |
+
1.8671875,
|
| 1088 |
+
1.5791015625,
|
| 1089 |
+
1.572265625,
|
| 1090 |
+
1.9609375,
|
| 1091 |
+
1.5625,
|
| 1092 |
+
1.91015625,
|
| 1093 |
+
1.7001953125,
|
| 1094 |
+
1.90625,
|
| 1095 |
+
1.767578125,
|
| 1096 |
+
1.611328125,
|
| 1097 |
+
1.80078125,
|
| 1098 |
+
1.6865234375,
|
| 1099 |
+
1.73046875,
|
| 1100 |
+
1.6640625,
|
| 1101 |
+
1.611328125,
|
| 1102 |
+
1.560546875,
|
| 1103 |
+
1.75390625,
|
| 1104 |
+
1.9609375,
|
| 1105 |
+
1.720703125,
|
| 1106 |
+
1.7177734375,
|
| 1107 |
+
1.689453125,
|
| 1108 |
+
1.744140625,
|
| 1109 |
+
1.72265625,
|
| 1110 |
+
1.59375,
|
| 1111 |
+
1.634765625,
|
| 1112 |
+
1.5947265625,
|
| 1113 |
+
1.6748046875,
|
| 1114 |
+
1.53515625,
|
| 1115 |
+
1.8359375,
|
| 1116 |
+
1.70703125,
|
| 1117 |
+
1.666015625,
|
| 1118 |
+
1.626953125,
|
| 1119 |
+
1.560546875,
|
| 1120 |
+
1.6337890625,
|
| 1121 |
+
1.5947265625,
|
| 1122 |
+
1.626953125,
|
| 1123 |
+
1.6953125,
|
| 1124 |
+
1.421875,
|
| 1125 |
+
1.8046875,
|
| 1126 |
+
1.7890625,
|
| 1127 |
+
1.658203125,
|
| 1128 |
+
1.6796875,
|
| 1129 |
+
1.693359375,
|
| 1130 |
+
1.49609375,
|
| 1131 |
+
1.693359375,
|
| 1132 |
+
1.642578125,
|
| 1133 |
+
1.541015625,
|
| 1134 |
+
1.9150390625,
|
| 1135 |
+
1.8095703125,
|
| 1136 |
+
1.69140625,
|
| 1137 |
+
1.5439453125,
|
| 1138 |
+
1.6328125,
|
| 1139 |
+
1.6474609375,
|
| 1140 |
+
1.6640625,
|
| 1141 |
+
1.45703125,
|
| 1142 |
+
1.5166015625,
|
| 1143 |
+
1.552734375,
|
| 1144 |
+
1.912109375,
|
| 1145 |
+
1.646484375,
|
| 1146 |
+
1.791015625,
|
| 1147 |
+
1.4482421875,
|
| 1148 |
+
1.75390625,
|
| 1149 |
+
1.572265625,
|
| 1150 |
+
1.619140625,
|
| 1151 |
+
1.6591796875,
|
| 1152 |
+
1.5302734375,
|
| 1153 |
+
1.56640625,
|
| 1154 |
+
1.685546875,
|
| 1155 |
+
1.525390625,
|
| 1156 |
+
1.7041015625,
|
| 1157 |
+
1.6787109375,
|
| 1158 |
+
1.6943359375,
|
| 1159 |
+
1.8330078125,
|
| 1160 |
+
1.6142578125,
|
| 1161 |
+
1.720703125,
|
| 1162 |
+
1.791015625,
|
| 1163 |
+
1.6005859375,
|
| 1164 |
+
1.568359375,
|
| 1165 |
+
1.6318359375,
|
| 1166 |
+
1.5546875,
|
| 1167 |
+
1.6533203125,
|
| 1168 |
+
1.71484375,
|
| 1169 |
+
1.498046875,
|
| 1170 |
+
1.6220703125,
|
| 1171 |
+
1.7724609375,
|
| 1172 |
+
1.66796875,
|
| 1173 |
+
1.79296875,
|
| 1174 |
+
1.8359375,
|
| 1175 |
+
1.74609375,
|
| 1176 |
+
1.822265625,
|
| 1177 |
+
1.751953125,
|
| 1178 |
+
1.609375,
|
| 1179 |
+
1.63671875,
|
| 1180 |
+
1.6376953125,
|
| 1181 |
+
1.5,
|
| 1182 |
+
1.76171875,
|
| 1183 |
+
1.744140625,
|
| 1184 |
+
1.728515625,
|
| 1185 |
+
1.6533203125,
|
| 1186 |
+
1.6474609375,
|
| 1187 |
+
1.6689453125,
|
| 1188 |
+
1.771484375,
|
| 1189 |
+
1.59765625,
|
| 1190 |
+
1.7763671875,
|
| 1191 |
+
1.7158203125,
|
| 1192 |
+
1.4404296875,
|
| 1193 |
+
1.412109375,
|
| 1194 |
+
1.4833984375,
|
| 1195 |
+
1.6396484375,
|
| 1196 |
+
1.712890625,
|
| 1197 |
+
1.6171875,
|
| 1198 |
+
1.45703125,
|
| 1199 |
+
1.78515625,
|
| 1200 |
+
1.662109375,
|
| 1201 |
+
1.83984375,
|
| 1202 |
+
1.44921875,
|
| 1203 |
+
1.7392578125,
|
| 1204 |
+
1.7861328125,
|
| 1205 |
+
1.62109375,
|
| 1206 |
+
1.5791015625,
|
| 1207 |
+
1.52734375,
|
| 1208 |
+
1.724609375,
|
| 1209 |
+
1.7216796875,
|
| 1210 |
+
1.7041015625,
|
| 1211 |
+
1.73046875,
|
| 1212 |
+
1.7373046875,
|
| 1213 |
+
1.556640625,
|
| 1214 |
+
1.736328125,
|
| 1215 |
+
1.8095703125,
|
| 1216 |
+
1.701171875,
|
| 1217 |
+
1.48046875,
|
| 1218 |
+
1.7373046875,
|
| 1219 |
+
1.64453125,
|
| 1220 |
+
1.669921875,
|
| 1221 |
+
1.6162109375,
|
| 1222 |
+
1.6357421875,
|
| 1223 |
+
1.6181640625,
|
| 1224 |
+
1.7265625,
|
| 1225 |
+
1.69921875,
|
| 1226 |
+
1.6796875,
|
| 1227 |
+
1.6767578125,
|
| 1228 |
+
1.6171875,
|
| 1229 |
+
1.5341796875,
|
| 1230 |
+
1.8037109375,
|
| 1231 |
+
1.6298828125,
|
| 1232 |
+
1.59765625,
|
| 1233 |
+
1.689453125,
|
| 1234 |
+
1.595703125,
|
| 1235 |
+
1.693359375,
|
| 1236 |
+
1.6474609375,
|
| 1237 |
+
1.76171875,
|
| 1238 |
+
1.7265625,
|
| 1239 |
+
1.353515625,
|
| 1240 |
+
1.572265625,
|
| 1241 |
+
1.69921875,
|
| 1242 |
+
1.6484375,
|
| 1243 |
+
1.63671875,
|
| 1244 |
+
1.525390625,
|
| 1245 |
+
1.5830078125,
|
| 1246 |
+
1.6923828125,
|
| 1247 |
+
1.8232421875,
|
| 1248 |
+
1.685546875,
|
| 1249 |
+
1.529296875,
|
| 1250 |
+
1.6630859375,
|
| 1251 |
+
1.796875,
|
| 1252 |
+
1.6328125,
|
| 1253 |
+
1.626953125,
|
| 1254 |
+
1.7353515625,
|
| 1255 |
+
1.7109375,
|
| 1256 |
+
1.6220703125,
|
| 1257 |
+
1.5185546875,
|
| 1258 |
+
1.5615234375,
|
| 1259 |
+
1.5732421875,
|
| 1260 |
+
1.77734375,
|
| 1261 |
+
1.662109375,
|
| 1262 |
+
1.7158203125,
|
| 1263 |
+
1.703125,
|
| 1264 |
+
1.7119140625,
|
| 1265 |
+
1.7392578125,
|
| 1266 |
+
1.552734375,
|
| 1267 |
+
1.6787109375,
|
| 1268 |
+
1.59375,
|
| 1269 |
+
1.51953125,
|
| 1270 |
+
1.4970703125,
|
| 1271 |
+
1.7021484375,
|
| 1272 |
+
1.533203125,
|
| 1273 |
+
1.5673828125,
|
| 1274 |
+
1.5439453125,
|
| 1275 |
+
1.5390625,
|
| 1276 |
+
1.6162109375,
|
| 1277 |
+
1.7041015625,
|
| 1278 |
+
1.5029296875,
|
| 1279 |
+
1.6484375,
|
| 1280 |
+
1.62890625,
|
| 1281 |
+
1.6494140625,
|
| 1282 |
+
1.509765625,
|
| 1283 |
+
1.830078125,
|
| 1284 |
+
1.6845703125,
|
| 1285 |
+
1.68359375,
|
| 1286 |
+
1.330078125,
|
| 1287 |
+
1.58203125,
|
| 1288 |
+
1.7197265625,
|
| 1289 |
+
1.515625,
|
| 1290 |
+
1.70703125,
|
| 1291 |
+
1.603515625,
|
| 1292 |
+
1.583984375,
|
| 1293 |
+
1.5947265625,
|
| 1294 |
+
1.5478515625,
|
| 1295 |
+
1.572265625,
|
| 1296 |
+
1.5625,
|
| 1297 |
+
1.546875,
|
| 1298 |
+
1.5830078125,
|
| 1299 |
+
1.787109375,
|
| 1300 |
+
1.6435546875,
|
| 1301 |
+
1.6689453125,
|
| 1302 |
+
1.6796875,
|
| 1303 |
+
1.771484375,
|
| 1304 |
+
1.630859375,
|
| 1305 |
+
1.6923828125,
|
| 1306 |
+
1.72265625,
|
| 1307 |
+
1.5888671875,
|
| 1308 |
+
1.693359375,
|
| 1309 |
+
1.677734375,
|
| 1310 |
+
1.5205078125,
|
| 1311 |
+
1.64453125,
|
| 1312 |
+
1.748046875,
|
| 1313 |
+
1.84375,
|
| 1314 |
+
1.6357421875,
|
| 1315 |
+
1.623046875,
|
| 1316 |
+
1.705078125,
|
| 1317 |
+
1.763671875,
|
| 1318 |
+
1.6044921875,
|
| 1319 |
+
1.6640625,
|
| 1320 |
+
1.7421875,
|
| 1321 |
+
1.67578125,
|
| 1322 |
+
1.841796875,
|
| 1323 |
+
1.79296875,
|
| 1324 |
+
1.8046875,
|
| 1325 |
+
1.4951171875,
|
| 1326 |
+
1.5712890625,
|
| 1327 |
+
1.61328125,
|
| 1328 |
+
1.6015625,
|
| 1329 |
+
1.6298828125,
|
| 1330 |
+
1.701171875,
|
| 1331 |
+
1.59765625,
|
| 1332 |
+
1.71484375,
|
| 1333 |
+
1.5634765625,
|
| 1334 |
+
1.65234375,
|
| 1335 |
+
1.759765625,
|
| 1336 |
+
1.4267578125,
|
| 1337 |
+
1.748046875,
|
| 1338 |
+
1.62890625,
|
| 1339 |
+
1.50390625,
|
| 1340 |
+
1.712890625,
|
| 1341 |
+
1.7861328125,
|
| 1342 |
+
1.625,
|
| 1343 |
+
1.69921875,
|
| 1344 |
+
1.654296875,
|
| 1345 |
+
1.771484375,
|
| 1346 |
+
1.5048828125,
|
| 1347 |
+
1.666015625,
|
| 1348 |
+
1.4296875,
|
| 1349 |
+
1.8046875,
|
| 1350 |
+
1.634765625,
|
| 1351 |
+
1.6484375,
|
| 1352 |
+
1.6748046875,
|
| 1353 |
+
1.76953125,
|
| 1354 |
+
1.779296875,
|
| 1355 |
+
1.6669921875,
|
| 1356 |
+
1.814453125,
|
| 1357 |
+
1.677734375,
|
| 1358 |
+
1.7001953125,
|
| 1359 |
+
1.7412109375,
|
| 1360 |
+
1.91015625,
|
| 1361 |
+
1.654296875,
|
| 1362 |
+
1.5703125,
|
| 1363 |
+
1.6103515625,
|
| 1364 |
+
1.634765625,
|
| 1365 |
+
1.689453125,
|
| 1366 |
+
1.521484375,
|
| 1367 |
+
1.6748046875,
|
| 1368 |
+
1.6689453125,
|
| 1369 |
+
1.455078125,
|
| 1370 |
+
1.7490234375,
|
| 1371 |
+
1.5166015625,
|
| 1372 |
+
1.6611328125,
|
| 1373 |
+
1.779296875,
|
| 1374 |
+
1.640625,
|
| 1375 |
+
1.669921875,
|
| 1376 |
+
1.724609375,
|
| 1377 |
+
1.6201171875,
|
| 1378 |
+
1.677734375,
|
| 1379 |
+
1.654296875,
|
| 1380 |
+
1.7724609375,
|
| 1381 |
+
1.6396484375,
|
| 1382 |
+
1.689453125,
|
| 1383 |
+
1.58203125,
|
| 1384 |
+
1.4560546875,
|
| 1385 |
+
1.4609375,
|
| 1386 |
+
1.65234375,
|
| 1387 |
+
1.759765625,
|
| 1388 |
+
1.814453125,
|
| 1389 |
+
1.6455078125,
|
| 1390 |
+
1.783203125,
|
| 1391 |
+
1.58984375,
|
| 1392 |
+
1.734375,
|
| 1393 |
+
1.548828125,
|
| 1394 |
+
1.51953125,
|
| 1395 |
+
1.8203125,
|
| 1396 |
+
1.615234375,
|
| 1397 |
+
1.6044921875,
|
| 1398 |
+
1.673828125,
|
| 1399 |
+
1.6953125,
|
| 1400 |
+
1.771484375,
|
| 1401 |
+
1.6455078125,
|
| 1402 |
+
1.5439453125,
|
| 1403 |
+
1.73828125,
|
| 1404 |
+
1.7119140625,
|
| 1405 |
+
1.58203125,
|
| 1406 |
+
1.61328125,
|
| 1407 |
+
1.7109375,
|
| 1408 |
+
1.751953125,
|
| 1409 |
+
1.2490234375,
|
| 1410 |
+
1.529296875,
|
| 1411 |
+
1.48828125,
|
| 1412 |
+
1.7431640625,
|
| 1413 |
+
1.5986328125,
|
| 1414 |
+
1.6796875,
|
| 1415 |
+
1.6787109375,
|
| 1416 |
+
1.521484375,
|
| 1417 |
+
1.6875,
|
| 1418 |
+
1.716796875,
|
| 1419 |
+
1.5546875,
|
| 1420 |
+
1.8046875,
|
| 1421 |
+
1.7626953125,
|
| 1422 |
+
1.7099609375,
|
| 1423 |
+
1.60546875,
|
| 1424 |
+
1.6904296875,
|
| 1425 |
+
1.65234375,
|
| 1426 |
+
1.5693359375,
|
| 1427 |
+
1.791015625,
|
| 1428 |
+
1.5048828125,
|
| 1429 |
+
1.673828125,
|
| 1430 |
+
1.576171875,
|
| 1431 |
+
1.763671875,
|
| 1432 |
+
1.677734375,
|
| 1433 |
+
1.412109375,
|
| 1434 |
+
1.7265625,
|
| 1435 |
+
1.634765625,
|
| 1436 |
+
1.5732421875,
|
| 1437 |
+
1.5703125,
|
| 1438 |
+
1.5849609375,
|
| 1439 |
+
1.611328125,
|
| 1440 |
+
1.5732421875,
|
| 1441 |
+
1.6640625,
|
| 1442 |
+
1.6318359375,
|
| 1443 |
+
1.615234375,
|
| 1444 |
+
1.677734375,
|
| 1445 |
+
1.67578125,
|
| 1446 |
+
1.6142578125,
|
| 1447 |
+
1.62890625,
|
| 1448 |
+
1.611328125,
|
| 1449 |
+
1.634765625,
|
| 1450 |
+
1.4755859375,
|
| 1451 |
+
1.537109375,
|
| 1452 |
+
1.634765625,
|
| 1453 |
+
1.443359375,
|
| 1454 |
+
1.720703125,
|
| 1455 |
+
1.5390625,
|
| 1456 |
+
1.55078125,
|
| 1457 |
+
1.6884765625,
|
| 1458 |
+
1.4248046875,
|
| 1459 |
+
1.6201171875,
|
| 1460 |
+
1.681640625,
|
| 1461 |
+
1.623046875,
|
| 1462 |
+
1.5947265625,
|
| 1463 |
+
1.5703125,
|
| 1464 |
+
1.4716796875,
|
| 1465 |
+
1.5986328125,
|
| 1466 |
+
1.7255859375,
|
| 1467 |
+
1.505859375,
|
| 1468 |
+
1.6982421875,
|
| 1469 |
+
1.677734375,
|
| 1470 |
+
1.6962890625,
|
| 1471 |
+
1.662109375,
|
| 1472 |
+
1.677734375,
|
| 1473 |
+
1.6044921875,
|
| 1474 |
+
1.77734375,
|
| 1475 |
+
1.71484375,
|
| 1476 |
+
1.634765625,
|
| 1477 |
+
1.677734375,
|
| 1478 |
+
1.560546875,
|
| 1479 |
+
1.6865234375,
|
| 1480 |
+
1.5986328125,
|
| 1481 |
+
1.6767578125,
|
| 1482 |
+
1.564453125,
|
| 1483 |
+
1.7138671875,
|
| 1484 |
+
1.70703125,
|
| 1485 |
+
1.6298828125,
|
| 1486 |
+
1.583984375,
|
| 1487 |
+
1.462890625,
|
| 1488 |
+
1.681640625,
|
| 1489 |
+
1.515625,
|
| 1490 |
+
1.5625,
|
| 1491 |
+
1.70703125,
|
| 1492 |
+
1.74609375,
|
| 1493 |
+
1.783203125,
|
| 1494 |
+
1.6572265625,
|
| 1495 |
+
1.65234375,
|
| 1496 |
+
1.60546875,
|
| 1497 |
+
1.58203125,
|
| 1498 |
+
1.4873046875,
|
| 1499 |
+
1.6689453125,
|
| 1500 |
+
1.619140625,
|
| 1501 |
+
1.65625,
|
| 1502 |
+
1.6572265625
|
| 1503 |
+
],
|
| 1504 |
+
"sparsity_losses": [
|
| 1505 |
+
0.6328125,
|
| 1506 |
+
0.7421875,
|
| 1507 |
+
1.294921875,
|
| 1508 |
+
1.56640625,
|
| 1509 |
+
1.5380859375,
|
| 1510 |
+
1.6962890625,
|
| 1511 |
+
2.01171875,
|
| 1512 |
+
2.505859375,
|
| 1513 |
+
2.822265625,
|
| 1514 |
+
3.525390625,
|
| 1515 |
+
3.46875,
|
| 1516 |
+
3.8359375,
|
| 1517 |
+
3.884765625,
|
| 1518 |
+
3.958984375,
|
| 1519 |
+
4.16015625,
|
| 1520 |
+
3.935546875,
|
| 1521 |
+
4.234375,
|
| 1522 |
+
4.1796875,
|
| 1523 |
+
4.09375,
|
| 1524 |
+
3.986328125,
|
| 1525 |
+
3.931640625,
|
| 1526 |
+
4.05859375,
|
| 1527 |
+
4.1484375,
|
| 1528 |
+
4.09375,
|
| 1529 |
+
3.98828125,
|
| 1530 |
+
4.078125,
|
| 1531 |
+
4.0390625,
|
| 1532 |
+
4.265625,
|
| 1533 |
+
4.140625,
|
| 1534 |
+
4.296875,
|
| 1535 |
+
4.26953125,
|
| 1536 |
+
4.2265625,
|
| 1537 |
+
4.28125,
|
| 1538 |
+
4.19921875,
|
| 1539 |
+
4.1875,
|
| 1540 |
+
4.26953125,
|
| 1541 |
+
4.3359375,
|
| 1542 |
+
4.31640625,
|
| 1543 |
+
4.3046875,
|
| 1544 |
+
4.5078125,
|
| 1545 |
+
4.3203125,
|
| 1546 |
+
4.3671875,
|
| 1547 |
+
4.21484375,
|
| 1548 |
+
4.3046875,
|
| 1549 |
+
4.3046875,
|
| 1550 |
+
4.3125,
|
| 1551 |
+
4.55859375,
|
| 1552 |
+
4.3203125,
|
| 1553 |
+
4.359375,
|
| 1554 |
+
4.484375,
|
| 1555 |
+
4.625,
|
| 1556 |
+
4.484375,
|
| 1557 |
+
4.50390625,
|
| 1558 |
+
4.5,
|
| 1559 |
+
4.61328125,
|
| 1560 |
+
4.54296875,
|
| 1561 |
+
4.58984375,
|
| 1562 |
+
4.5078125,
|
| 1563 |
+
4.53125,
|
| 1564 |
+
4.59375,
|
| 1565 |
+
4.54296875,
|
| 1566 |
+
4.6640625,
|
| 1567 |
+
4.53125,
|
| 1568 |
+
4.65234375,
|
| 1569 |
+
4.48046875,
|
| 1570 |
+
4.66796875,
|
| 1571 |
+
4.6640625,
|
| 1572 |
+
4.65625,
|
| 1573 |
+
4.78125,
|
| 1574 |
+
4.671875,
|
| 1575 |
+
4.6640625,
|
| 1576 |
+
4.73828125,
|
| 1577 |
+
4.7578125,
|
| 1578 |
+
4.7734375,
|
| 1579 |
+
4.828125,
|
| 1580 |
+
4.81640625,
|
| 1581 |
+
4.7265625,
|
| 1582 |
+
4.59375,
|
| 1583 |
+
4.75390625,
|
| 1584 |
+
4.72265625,
|
| 1585 |
+
4.62890625,
|
| 1586 |
+
4.8203125,
|
| 1587 |
+
4.74609375,
|
| 1588 |
+
4.94921875,
|
| 1589 |
+
4.8984375,
|
| 1590 |
+
4.79296875,
|
| 1591 |
+
4.58203125,
|
| 1592 |
+
4.76171875,
|
| 1593 |
+
4.9921875,
|
| 1594 |
+
5.09765625,
|
| 1595 |
+
4.796875,
|
| 1596 |
+
4.85546875,
|
| 1597 |
+
4.9140625,
|
| 1598 |
+
4.84765625,
|
| 1599 |
+
5.140625,
|
| 1600 |
+
4.90625,
|
| 1601 |
+
5.0390625,
|
| 1602 |
+
4.9453125,
|
| 1603 |
+
5.13671875,
|
| 1604 |
+
5.1171875,
|
| 1605 |
+
5.3046875,
|
| 1606 |
+
4.94921875,
|
| 1607 |
+
5.02734375,
|
| 1608 |
+
4.875,
|
| 1609 |
+
5.01953125,
|
| 1610 |
+
5.12890625,
|
| 1611 |
+
5.171875,
|
| 1612 |
+
5.18359375,
|
| 1613 |
+
5.234375,
|
| 1614 |
+
5.30859375,
|
| 1615 |
+
5.23828125,
|
| 1616 |
+
5.265625,
|
| 1617 |
+
5.3203125,
|
| 1618 |
+
5.390625,
|
| 1619 |
+
5.4140625,
|
| 1620 |
+
5.29296875,
|
| 1621 |
+
5.3515625,
|
| 1622 |
+
5.24609375,
|
| 1623 |
+
5.39453125,
|
| 1624 |
+
5.28125,
|
| 1625 |
+
5.46875,
|
| 1626 |
+
5.4375,
|
| 1627 |
+
5.41015625,
|
| 1628 |
+
5.4296875,
|
| 1629 |
+
5.37890625,
|
| 1630 |
+
5.44140625,
|
| 1631 |
+
5.36328125,
|
| 1632 |
+
5.5078125,
|
| 1633 |
+
5.34375,
|
| 1634 |
+
5.28515625,
|
| 1635 |
+
5.47265625,
|
| 1636 |
+
5.4921875,
|
| 1637 |
+
5.5390625,
|
| 1638 |
+
5.36328125,
|
| 1639 |
+
5.5078125,
|
| 1640 |
+
5.421875,
|
| 1641 |
+
5.578125,
|
| 1642 |
+
5.484375,
|
| 1643 |
+
5.60546875,
|
| 1644 |
+
5.6953125,
|
| 1645 |
+
5.5234375,
|
| 1646 |
+
5.5390625,
|
| 1647 |
+
5.62890625,
|
| 1648 |
+
5.58203125,
|
| 1649 |
+
5.79296875,
|
| 1650 |
+
5.76171875,
|
| 1651 |
+
5.734375,
|
| 1652 |
+
5.6328125,
|
| 1653 |
+
5.4921875,
|
| 1654 |
+
5.734375,
|
| 1655 |
+
5.87109375,
|
| 1656 |
+
5.54296875,
|
| 1657 |
+
5.73046875,
|
| 1658 |
+
5.7265625,
|
| 1659 |
+
5.72265625,
|
| 1660 |
+
5.8359375,
|
| 1661 |
+
5.6875,
|
| 1662 |
+
5.6953125,
|
| 1663 |
+
5.92578125,
|
| 1664 |
+
5.9296875,
|
| 1665 |
+
5.9296875,
|
| 1666 |
+
5.6328125,
|
| 1667 |
+
5.96484375,
|
| 1668 |
+
5.8125,
|
| 1669 |
+
5.9765625,
|
| 1670 |
+
5.875,
|
| 1671 |
+
5.984375,
|
| 1672 |
+
6.0,
|
| 1673 |
+
5.96875,
|
| 1674 |
+
5.8046875,
|
| 1675 |
+
5.8359375,
|
| 1676 |
+
6.0,
|
| 1677 |
+
6.08984375,
|
| 1678 |
+
6.03125,
|
| 1679 |
+
6.2265625,
|
| 1680 |
+
6.08984375,
|
| 1681 |
+
6.06640625,
|
| 1682 |
+
6.08984375,
|
| 1683 |
+
6.0546875,
|
| 1684 |
+
6.05859375,
|
| 1685 |
+
6.2265625,
|
| 1686 |
+
6.1015625,
|
| 1687 |
+
6.203125,
|
| 1688 |
+
6.1171875,
|
| 1689 |
+
6.1328125,
|
| 1690 |
+
6.296875,
|
| 1691 |
+
6.203125,
|
| 1692 |
+
6.2265625,
|
| 1693 |
+
6.078125,
|
| 1694 |
+
6.20703125,
|
| 1695 |
+
6.28125,
|
| 1696 |
+
6.3671875,
|
| 1697 |
+
6.375,
|
| 1698 |
+
6.24609375,
|
| 1699 |
+
6.3671875,
|
| 1700 |
+
6.34375,
|
| 1701 |
+
6.375,
|
| 1702 |
+
6.30859375,
|
| 1703 |
+
6.3046875,
|
| 1704 |
+
6.44140625,
|
| 1705 |
+
6.5703125,
|
| 1706 |
+
6.51953125,
|
| 1707 |
+
6.43359375,
|
| 1708 |
+
6.5390625,
|
| 1709 |
+
6.57421875,
|
| 1710 |
+
6.89453125,
|
| 1711 |
+
6.6875,
|
| 1712 |
+
6.71875,
|
| 1713 |
+
6.80859375,
|
| 1714 |
+
6.62890625,
|
| 1715 |
+
6.640625,
|
| 1716 |
+
6.640625,
|
| 1717 |
+
6.71875,
|
| 1718 |
+
6.640625,
|
| 1719 |
+
6.75390625,
|
| 1720 |
+
6.63671875,
|
| 1721 |
+
6.59765625,
|
| 1722 |
+
6.5,
|
| 1723 |
+
6.84375,
|
| 1724 |
+
6.8125,
|
| 1725 |
+
6.7734375,
|
| 1726 |
+
6.796875,
|
| 1727 |
+
6.640625,
|
| 1728 |
+
6.875,
|
| 1729 |
+
6.671875,
|
| 1730 |
+
6.75390625,
|
| 1731 |
+
6.73828125,
|
| 1732 |
+
6.921875,
|
| 1733 |
+
6.85546875,
|
| 1734 |
+
6.9921875,
|
| 1735 |
+
6.9296875,
|
| 1736 |
+
6.6796875,
|
| 1737 |
+
7.0859375,
|
| 1738 |
+
7.03125,
|
| 1739 |
+
6.9140625,
|
| 1740 |
+
6.796875,
|
| 1741 |
+
6.9765625,
|
| 1742 |
+
6.87890625,
|
| 1743 |
+
6.9375,
|
| 1744 |
+
7.0234375,
|
| 1745 |
+
6.93359375,
|
| 1746 |
+
7.03515625,
|
| 1747 |
+
6.98828125,
|
| 1748 |
+
7.0546875,
|
| 1749 |
+
7.265625,
|
| 1750 |
+
6.8515625,
|
| 1751 |
+
6.9375,
|
| 1752 |
+
7.2109375,
|
| 1753 |
+
7.05078125,
|
| 1754 |
+
7.0546875,
|
| 1755 |
+
6.98828125,
|
| 1756 |
+
7.171875,
|
| 1757 |
+
7.140625,
|
| 1758 |
+
7.359375,
|
| 1759 |
+
7.109375,
|
| 1760 |
+
7.12109375,
|
| 1761 |
+
7.33984375,
|
| 1762 |
+
7.2890625,
|
| 1763 |
+
7.26171875,
|
| 1764 |
+
7.0703125,
|
| 1765 |
+
7.2421875,
|
| 1766 |
+
6.99609375,
|
| 1767 |
+
7.0625,
|
| 1768 |
+
7.34765625,
|
| 1769 |
+
7.48046875,
|
| 1770 |
+
7.31640625,
|
| 1771 |
+
7.265625,
|
| 1772 |
+
7.3125,
|
| 1773 |
+
7.26171875,
|
| 1774 |
+
7.1640625,
|
| 1775 |
+
7.30078125,
|
| 1776 |
+
7.234375,
|
| 1777 |
+
7.203125,
|
| 1778 |
+
7.5,
|
| 1779 |
+
7.34375,
|
| 1780 |
+
7.515625,
|
| 1781 |
+
7.375,
|
| 1782 |
+
7.4609375,
|
| 1783 |
+
7.48046875,
|
| 1784 |
+
7.359375,
|
| 1785 |
+
7.51171875,
|
| 1786 |
+
7.48828125,
|
| 1787 |
+
7.5,
|
| 1788 |
+
7.59375,
|
| 1789 |
+
7.765625,
|
| 1790 |
+
7.671875,
|
| 1791 |
+
7.6875,
|
| 1792 |
+
7.671875,
|
| 1793 |
+
7.65625,
|
| 1794 |
+
7.609375,
|
| 1795 |
+
7.8359375,
|
| 1796 |
+
7.62109375,
|
| 1797 |
+
7.5078125,
|
| 1798 |
+
7.5234375,
|
| 1799 |
+
7.546875,
|
| 1800 |
+
7.65625,
|
| 1801 |
+
7.75,
|
| 1802 |
+
7.84375,
|
| 1803 |
+
7.7265625,
|
| 1804 |
+
7.6875,
|
| 1805 |
+
7.78125,
|
| 1806 |
+
7.7734375,
|
| 1807 |
+
7.484375,
|
| 1808 |
+
7.81640625,
|
| 1809 |
+
7.5390625,
|
| 1810 |
+
7.9609375,
|
| 1811 |
+
7.69140625,
|
| 1812 |
+
7.72265625,
|
| 1813 |
+
7.765625,
|
| 1814 |
+
7.875,
|
| 1815 |
+
7.7421875,
|
| 1816 |
+
7.88671875,
|
| 1817 |
+
7.9453125,
|
| 1818 |
+
7.73046875,
|
| 1819 |
+
7.99609375,
|
| 1820 |
+
8.046875,
|
| 1821 |
+
7.94921875,
|
| 1822 |
+
7.9375,
|
| 1823 |
+
7.71875,
|
| 1824 |
+
7.7578125,
|
| 1825 |
+
7.8125,
|
| 1826 |
+
8.0234375,
|
| 1827 |
+
8.03125,
|
| 1828 |
+
8.03125,
|
| 1829 |
+
8.0859375,
|
| 1830 |
+
8.0546875,
|
| 1831 |
+
7.97265625,
|
| 1832 |
+
7.92578125,
|
| 1833 |
+
8.140625,
|
| 1834 |
+
8.0703125,
|
| 1835 |
+
8.3203125,
|
| 1836 |
+
8.1640625,
|
| 1837 |
+
8.140625,
|
| 1838 |
+
8.0703125,
|
| 1839 |
+
8.359375,
|
| 1840 |
+
8.3125,
|
| 1841 |
+
8.390625,
|
| 1842 |
+
8.25,
|
| 1843 |
+
8.25,
|
| 1844 |
+
8.2109375,
|
| 1845 |
+
8.34375,
|
| 1846 |
+
8.0625,
|
| 1847 |
+
8.578125,
|
| 1848 |
+
8.2265625,
|
| 1849 |
+
8.453125,
|
| 1850 |
+
8.421875,
|
| 1851 |
+
8.5,
|
| 1852 |
+
8.59375,
|
| 1853 |
+
8.3203125,
|
| 1854 |
+
8.6171875,
|
| 1855 |
+
8.390625,
|
| 1856 |
+
8.4375,
|
| 1857 |
+
8.46875,
|
| 1858 |
+
8.296875,
|
| 1859 |
+
8.4765625,
|
| 1860 |
+
8.578125,
|
| 1861 |
+
8.3359375,
|
| 1862 |
+
8.453125,
|
| 1863 |
+
8.515625,
|
| 1864 |
+
8.59375,
|
| 1865 |
+
8.5859375,
|
| 1866 |
+
8.71875,
|
| 1867 |
+
8.65625,
|
| 1868 |
+
8.7421875,
|
| 1869 |
+
8.5703125,
|
| 1870 |
+
8.4296875,
|
| 1871 |
+
8.765625,
|
| 1872 |
+
8.46875,
|
| 1873 |
+
8.8515625,
|
| 1874 |
+
8.8125,
|
| 1875 |
+
8.953125,
|
| 1876 |
+
8.7265625,
|
| 1877 |
+
8.890625,
|
| 1878 |
+
8.5703125,
|
| 1879 |
+
8.828125,
|
| 1880 |
+
8.734375,
|
| 1881 |
+
8.734375,
|
| 1882 |
+
8.6875,
|
| 1883 |
+
8.7578125,
|
| 1884 |
+
8.9375,
|
| 1885 |
+
8.84375,
|
| 1886 |
+
8.78125,
|
| 1887 |
+
8.75,
|
| 1888 |
+
8.9765625,
|
| 1889 |
+
8.9375,
|
| 1890 |
+
8.8359375,
|
| 1891 |
+
8.921875,
|
| 1892 |
+
8.9921875,
|
| 1893 |
+
8.671875,
|
| 1894 |
+
8.953125,
|
| 1895 |
+
8.8828125,
|
| 1896 |
+
9.0546875,
|
| 1897 |
+
8.9609375,
|
| 1898 |
+
9.1640625,
|
| 1899 |
+
9.140625,
|
| 1900 |
+
8.96875,
|
| 1901 |
+
9.09375,
|
| 1902 |
+
9.2265625,
|
| 1903 |
+
9.2578125,
|
| 1904 |
+
9.125,
|
| 1905 |
+
9.2890625,
|
| 1906 |
+
9.078125,
|
| 1907 |
+
9.3203125,
|
| 1908 |
+
9.109375,
|
| 1909 |
+
9.2890625,
|
| 1910 |
+
8.921875,
|
| 1911 |
+
9.1328125,
|
| 1912 |
+
9.25,
|
| 1913 |
+
9.09375,
|
| 1914 |
+
8.9375,
|
| 1915 |
+
9.1328125,
|
| 1916 |
+
9.234375,
|
| 1917 |
+
9.1875,
|
| 1918 |
+
9.4765625,
|
| 1919 |
+
9.1640625,
|
| 1920 |
+
9.1796875,
|
| 1921 |
+
9.25,
|
| 1922 |
+
8.90625,
|
| 1923 |
+
9.09375,
|
| 1924 |
+
9.46875,
|
| 1925 |
+
9.25,
|
| 1926 |
+
9.1875,
|
| 1927 |
+
9.359375,
|
| 1928 |
+
9.328125,
|
| 1929 |
+
9.46875,
|
| 1930 |
+
9.3125,
|
| 1931 |
+
9.640625,
|
| 1932 |
+
9.34375,
|
| 1933 |
+
9.53125,
|
| 1934 |
+
9.4609375,
|
| 1935 |
+
9.4921875,
|
| 1936 |
+
9.25,
|
| 1937 |
+
9.359375,
|
| 1938 |
+
9.3203125,
|
| 1939 |
+
9.578125,
|
| 1940 |
+
9.46875,
|
| 1941 |
+
9.546875,
|
| 1942 |
+
9.5078125,
|
| 1943 |
+
9.6796875,
|
| 1944 |
+
9.7421875,
|
| 1945 |
+
9.328125,
|
| 1946 |
+
9.6015625,
|
| 1947 |
+
9.703125,
|
| 1948 |
+
9.4375,
|
| 1949 |
+
9.578125,
|
| 1950 |
+
9.625,
|
| 1951 |
+
9.6875,
|
| 1952 |
+
9.5859375,
|
| 1953 |
+
9.640625,
|
| 1954 |
+
9.7578125,
|
| 1955 |
+
9.78125,
|
| 1956 |
+
9.765625,
|
| 1957 |
+
9.8515625,
|
| 1958 |
+
9.65625,
|
| 1959 |
+
9.734375,
|
| 1960 |
+
9.5625,
|
| 1961 |
+
9.8515625,
|
| 1962 |
+
9.6640625,
|
| 1963 |
+
9.5859375,
|
| 1964 |
+
9.828125,
|
| 1965 |
+
9.8203125,
|
| 1966 |
+
9.9375,
|
| 1967 |
+
10.125,
|
| 1968 |
+
9.859375,
|
| 1969 |
+
9.9375,
|
| 1970 |
+
10.28125,
|
| 1971 |
+
10.0078125,
|
| 1972 |
+
10.1953125,
|
| 1973 |
+
10.1875,
|
| 1974 |
+
10.046875,
|
| 1975 |
+
10.0546875,
|
| 1976 |
+
9.875,
|
| 1977 |
+
9.765625,
|
| 1978 |
+
9.875,
|
| 1979 |
+
10.234375,
|
| 1980 |
+
10.125,
|
| 1981 |
+
10.0078125,
|
| 1982 |
+
10.0,
|
| 1983 |
+
10.1953125,
|
| 1984 |
+
10.078125,
|
| 1985 |
+
10.2734375,
|
| 1986 |
+
10.265625,
|
| 1987 |
+
9.9375,
|
| 1988 |
+
10.0234375,
|
| 1989 |
+
10.3046875,
|
| 1990 |
+
10.1640625,
|
| 1991 |
+
10.3125,
|
| 1992 |
+
10.015625,
|
| 1993 |
+
10.09375,
|
| 1994 |
+
10.0703125,
|
| 1995 |
+
10.1171875,
|
| 1996 |
+
10.171875,
|
| 1997 |
+
10.484375,
|
| 1998 |
+
10.2578125,
|
| 1999 |
+
10.0234375,
|
| 2000 |
+
10.359375,
|
| 2001 |
+
10.109375,
|
| 2002 |
+
10.390625,
|
| 2003 |
+
10.3671875,
|
| 2004 |
+
10.4765625,
|
| 2005 |
+
10.28125,
|
| 2006 |
+
10.2890625,
|
| 2007 |
+
10.453125,
|
| 2008 |
+
10.2890625,
|
| 2009 |
+
10.46875,
|
| 2010 |
+
10.40625,
|
| 2011 |
+
10.546875,
|
| 2012 |
+
10.46875,
|
| 2013 |
+
10.1953125,
|
| 2014 |
+
10.390625,
|
| 2015 |
+
10.46875,
|
| 2016 |
+
10.375,
|
| 2017 |
+
10.4375,
|
| 2018 |
+
10.3125,
|
| 2019 |
+
10.4453125,
|
| 2020 |
+
10.40625,
|
| 2021 |
+
10.84375,
|
| 2022 |
+
10.8125,
|
| 2023 |
+
10.6484375,
|
| 2024 |
+
10.8984375,
|
| 2025 |
+
10.65625,
|
| 2026 |
+
10.7578125,
|
| 2027 |
+
10.6875,
|
| 2028 |
+
10.625,
|
| 2029 |
+
10.515625,
|
| 2030 |
+
10.765625,
|
| 2031 |
+
10.4375,
|
| 2032 |
+
10.609375,
|
| 2033 |
+
10.875,
|
| 2034 |
+
10.921875,
|
| 2035 |
+
10.796875,
|
| 2036 |
+
10.8359375,
|
| 2037 |
+
10.765625,
|
| 2038 |
+
10.8984375,
|
| 2039 |
+
10.921875,
|
| 2040 |
+
10.75,
|
| 2041 |
+
10.6875,
|
| 2042 |
+
10.7265625,
|
| 2043 |
+
11.015625,
|
| 2044 |
+
10.8046875,
|
| 2045 |
+
10.640625,
|
| 2046 |
+
10.890625,
|
| 2047 |
+
10.6640625,
|
| 2048 |
+
11.03125,
|
| 2049 |
+
10.8359375,
|
| 2050 |
+
11.0859375,
|
| 2051 |
+
10.96875,
|
| 2052 |
+
10.984375,
|
| 2053 |
+
10.9375,
|
| 2054 |
+
11.265625,
|
| 2055 |
+
10.96875,
|
| 2056 |
+
10.9453125,
|
| 2057 |
+
11.0625,
|
| 2058 |
+
11.1328125,
|
| 2059 |
+
11.0859375,
|
| 2060 |
+
11.0234375,
|
| 2061 |
+
11.0234375,
|
| 2062 |
+
11.09375,
|
| 2063 |
+
11.0234375,
|
| 2064 |
+
11.09375,
|
| 2065 |
+
11.0546875,
|
| 2066 |
+
11.2265625,
|
| 2067 |
+
10.9296875,
|
| 2068 |
+
10.84375,
|
| 2069 |
+
11.203125,
|
| 2070 |
+
10.9375,
|
| 2071 |
+
11.40625,
|
| 2072 |
+
11.046875,
|
| 2073 |
+
11.2265625,
|
| 2074 |
+
11.1328125,
|
| 2075 |
+
11.1171875,
|
| 2076 |
+
11.0546875,
|
| 2077 |
+
11.28125,
|
| 2078 |
+
11.0703125,
|
| 2079 |
+
11.140625,
|
| 2080 |
+
11.515625,
|
| 2081 |
+
10.8359375,
|
| 2082 |
+
11.328125,
|
| 2083 |
+
11.546875,
|
| 2084 |
+
11.3203125,
|
| 2085 |
+
11.109375,
|
| 2086 |
+
11.390625,
|
| 2087 |
+
11.4140625,
|
| 2088 |
+
11.5546875,
|
| 2089 |
+
11.5546875,
|
| 2090 |
+
11.515625,
|
| 2091 |
+
11.390625,
|
| 2092 |
+
11.125,
|
| 2093 |
+
11.7890625,
|
| 2094 |
+
11.4375,
|
| 2095 |
+
11.6015625,
|
| 2096 |
+
11.359375,
|
| 2097 |
+
11.5390625,
|
| 2098 |
+
11.6953125,
|
| 2099 |
+
11.546875,
|
| 2100 |
+
11.2265625,
|
| 2101 |
+
11.3984375,
|
| 2102 |
+
11.984375,
|
| 2103 |
+
11.71875,
|
| 2104 |
+
11.625,
|
| 2105 |
+
11.6171875,
|
| 2106 |
+
11.7734375,
|
| 2107 |
+
11.46875,
|
| 2108 |
+
11.671875,
|
| 2109 |
+
11.5390625,
|
| 2110 |
+
11.71875,
|
| 2111 |
+
11.6484375,
|
| 2112 |
+
11.7265625,
|
| 2113 |
+
11.5859375,
|
| 2114 |
+
11.6875,
|
| 2115 |
+
11.84375,
|
| 2116 |
+
11.9765625,
|
| 2117 |
+
11.640625,
|
| 2118 |
+
11.625,
|
| 2119 |
+
11.765625,
|
| 2120 |
+
11.6796875,
|
| 2121 |
+
11.8125,
|
| 2122 |
+
11.8046875,
|
| 2123 |
+
11.9296875,
|
| 2124 |
+
11.921875,
|
| 2125 |
+
11.59375,
|
| 2126 |
+
11.8515625,
|
| 2127 |
+
11.84375,
|
| 2128 |
+
11.7421875,
|
| 2129 |
+
12.0078125,
|
| 2130 |
+
12.09375,
|
| 2131 |
+
11.9375,
|
| 2132 |
+
11.7890625,
|
| 2133 |
+
11.6953125,
|
| 2134 |
+
11.765625,
|
| 2135 |
+
12.15625,
|
| 2136 |
+
11.9609375,
|
| 2137 |
+
12.03125,
|
| 2138 |
+
12.2421875,
|
| 2139 |
+
12.0,
|
| 2140 |
+
11.828125,
|
| 2141 |
+
12.0625,
|
| 2142 |
+
11.921875,
|
| 2143 |
+
12.15625,
|
| 2144 |
+
12.1484375,
|
| 2145 |
+
11.78125,
|
| 2146 |
+
12.265625,
|
| 2147 |
+
12.328125,
|
| 2148 |
+
11.84375,
|
| 2149 |
+
12.125,
|
| 2150 |
+
12.1484375,
|
| 2151 |
+
12.2109375,
|
| 2152 |
+
12.4375,
|
| 2153 |
+
11.984375,
|
| 2154 |
+
12.1328125,
|
| 2155 |
+
12.2265625,
|
| 2156 |
+
12.09375,
|
| 2157 |
+
12.484375,
|
| 2158 |
+
12.046875,
|
| 2159 |
+
12.53125,
|
| 2160 |
+
12.0078125,
|
| 2161 |
+
12.453125,
|
| 2162 |
+
12.3125,
|
| 2163 |
+
12.3203125,
|
| 2164 |
+
12.0546875,
|
| 2165 |
+
12.25,
|
| 2166 |
+
12.2421875,
|
| 2167 |
+
12.3671875,
|
| 2168 |
+
12.28125,
|
| 2169 |
+
12.3671875,
|
| 2170 |
+
12.1171875,
|
| 2171 |
+
12.296875,
|
| 2172 |
+
12.0390625,
|
| 2173 |
+
12.640625,
|
| 2174 |
+
12.1015625,
|
| 2175 |
+
11.9921875,
|
| 2176 |
+
12.6953125,
|
| 2177 |
+
12.28125,
|
| 2178 |
+
12.4140625,
|
| 2179 |
+
12.203125,
|
| 2180 |
+
12.2421875,
|
| 2181 |
+
12.0703125,
|
| 2182 |
+
12.1640625,
|
| 2183 |
+
12.5234375,
|
| 2184 |
+
12.15625,
|
| 2185 |
+
12.4375,
|
| 2186 |
+
12.90625,
|
| 2187 |
+
12.7109375,
|
| 2188 |
+
12.3515625,
|
| 2189 |
+
12.4375,
|
| 2190 |
+
12.546875,
|
| 2191 |
+
12.703125,
|
| 2192 |
+
12.5078125,
|
| 2193 |
+
12.78125,
|
| 2194 |
+
12.453125,
|
| 2195 |
+
12.59375,
|
| 2196 |
+
12.6328125,
|
| 2197 |
+
12.5625,
|
| 2198 |
+
12.890625,
|
| 2199 |
+
12.7734375,
|
| 2200 |
+
12.59375,
|
| 2201 |
+
12.765625,
|
| 2202 |
+
13.0078125,
|
| 2203 |
+
12.90625,
|
| 2204 |
+
13.109375,
|
| 2205 |
+
13.1875,
|
| 2206 |
+
12.6015625,
|
| 2207 |
+
13.03125,
|
| 2208 |
+
12.9296875,
|
| 2209 |
+
12.875,
|
| 2210 |
+
12.875,
|
| 2211 |
+
12.9375,
|
| 2212 |
+
12.71875,
|
| 2213 |
+
13.0234375,
|
| 2214 |
+
12.8984375,
|
| 2215 |
+
13.046875,
|
| 2216 |
+
13.0546875,
|
| 2217 |
+
13.015625,
|
| 2218 |
+
12.90625,
|
| 2219 |
+
12.7578125,
|
| 2220 |
+
12.828125,
|
| 2221 |
+
13.2890625,
|
| 2222 |
+
13.1953125,
|
| 2223 |
+
13.2734375,
|
| 2224 |
+
13.1796875,
|
| 2225 |
+
12.9375,
|
| 2226 |
+
13.265625,
|
| 2227 |
+
13.109375,
|
| 2228 |
+
13.3671875,
|
| 2229 |
+
12.8671875,
|
| 2230 |
+
13.0625,
|
| 2231 |
+
13.21875,
|
| 2232 |
+
13.0078125,
|
| 2233 |
+
13.109375,
|
| 2234 |
+
12.859375,
|
| 2235 |
+
13.171875,
|
| 2236 |
+
12.6875,
|
| 2237 |
+
13.2890625,
|
| 2238 |
+
13.46875,
|
| 2239 |
+
12.75,
|
| 2240 |
+
13.046875,
|
| 2241 |
+
13.65625,
|
| 2242 |
+
13.46875,
|
| 2243 |
+
13.328125,
|
| 2244 |
+
13.3359375,
|
| 2245 |
+
13.515625,
|
| 2246 |
+
13.40625,
|
| 2247 |
+
13.609375,
|
| 2248 |
+
13.3046875,
|
| 2249 |
+
13.5390625,
|
| 2250 |
+
13.8046875,
|
| 2251 |
+
13.4296875,
|
| 2252 |
+
13.234375,
|
| 2253 |
+
13.625,
|
| 2254 |
+
13.6875,
|
| 2255 |
+
13.46875,
|
| 2256 |
+
13.703125,
|
| 2257 |
+
13.703125,
|
| 2258 |
+
13.359375,
|
| 2259 |
+
13.6015625,
|
| 2260 |
+
13.7734375,
|
| 2261 |
+
13.515625,
|
| 2262 |
+
13.4375,
|
| 2263 |
+
13.875,
|
| 2264 |
+
13.5234375,
|
| 2265 |
+
13.890625,
|
| 2266 |
+
13.7109375,
|
| 2267 |
+
13.5859375,
|
| 2268 |
+
13.578125,
|
| 2269 |
+
13.09375,
|
| 2270 |
+
13.578125,
|
| 2271 |
+
13.9375,
|
| 2272 |
+
13.625,
|
| 2273 |
+
13.7265625,
|
| 2274 |
+
13.9140625,
|
| 2275 |
+
13.28125,
|
| 2276 |
+
13.6484375,
|
| 2277 |
+
13.8046875,
|
| 2278 |
+
13.96875,
|
| 2279 |
+
13.84375,
|
| 2280 |
+
13.6640625,
|
| 2281 |
+
13.53125,
|
| 2282 |
+
13.96875,
|
| 2283 |
+
13.6875,
|
| 2284 |
+
13.9921875,
|
| 2285 |
+
13.7265625,
|
| 2286 |
+
13.75,
|
| 2287 |
+
13.9375,
|
| 2288 |
+
14.0078125,
|
| 2289 |
+
14.0078125,
|
| 2290 |
+
13.8515625,
|
| 2291 |
+
14.1015625,
|
| 2292 |
+
13.78125,
|
| 2293 |
+
13.7734375,
|
| 2294 |
+
14.1015625,
|
| 2295 |
+
13.8046875,
|
| 2296 |
+
14.015625,
|
| 2297 |
+
13.828125,
|
| 2298 |
+
13.671875,
|
| 2299 |
+
14.0625,
|
| 2300 |
+
14.078125,
|
| 2301 |
+
13.515625,
|
| 2302 |
+
13.84375,
|
| 2303 |
+
14.0390625,
|
| 2304 |
+
14.046875,
|
| 2305 |
+
14.0625,
|
| 2306 |
+
13.953125,
|
| 2307 |
+
13.8125,
|
| 2308 |
+
14.1484375,
|
| 2309 |
+
14.09375,
|
| 2310 |
+
14.2109375,
|
| 2311 |
+
14.34375,
|
| 2312 |
+
14.015625,
|
| 2313 |
+
13.9140625,
|
| 2314 |
+
14.03125,
|
| 2315 |
+
14.265625,
|
| 2316 |
+
14.0234375,
|
| 2317 |
+
13.9921875,
|
| 2318 |
+
14.484375,
|
| 2319 |
+
14.359375,
|
| 2320 |
+
14.265625,
|
| 2321 |
+
13.84375,
|
| 2322 |
+
14.21875,
|
| 2323 |
+
14.3671875,
|
| 2324 |
+
13.734375,
|
| 2325 |
+
14.125,
|
| 2326 |
+
13.890625,
|
| 2327 |
+
14.203125,
|
| 2328 |
+
14.3671875,
|
| 2329 |
+
14.2734375,
|
| 2330 |
+
14.0234375,
|
| 2331 |
+
14.59375,
|
| 2332 |
+
14.484375,
|
| 2333 |
+
14.53125,
|
| 2334 |
+
14.6171875,
|
| 2335 |
+
14.1953125,
|
| 2336 |
+
14.296875,
|
| 2337 |
+
14.734375,
|
| 2338 |
+
14.796875,
|
| 2339 |
+
14.5390625,
|
| 2340 |
+
14.6015625,
|
| 2341 |
+
13.8984375,
|
| 2342 |
+
14.46875,
|
| 2343 |
+
14.671875,
|
| 2344 |
+
14.6640625,
|
| 2345 |
+
14.4609375,
|
| 2346 |
+
14.421875,
|
| 2347 |
+
14.53125,
|
| 2348 |
+
14.7734375,
|
| 2349 |
+
14.5703125,
|
| 2350 |
+
14.25,
|
| 2351 |
+
14.609375,
|
| 2352 |
+
14.5859375,
|
| 2353 |
+
14.9375,
|
| 2354 |
+
14.9609375,
|
| 2355 |
+
14.7734375,
|
| 2356 |
+
14.7265625,
|
| 2357 |
+
14.6875,
|
| 2358 |
+
14.765625,
|
| 2359 |
+
14.6875,
|
| 2360 |
+
15.0625,
|
| 2361 |
+
15.1328125,
|
| 2362 |
+
14.96875,
|
| 2363 |
+
14.546875,
|
| 2364 |
+
14.7265625,
|
| 2365 |
+
14.6015625,
|
| 2366 |
+
14.703125,
|
| 2367 |
+
14.546875,
|
| 2368 |
+
15.3515625,
|
| 2369 |
+
15.046875,
|
| 2370 |
+
14.890625,
|
| 2371 |
+
14.890625,
|
| 2372 |
+
14.6875,
|
| 2373 |
+
15.125,
|
| 2374 |
+
14.7890625,
|
| 2375 |
+
14.90625,
|
| 2376 |
+
14.5,
|
| 2377 |
+
15.1328125,
|
| 2378 |
+
15.09375,
|
| 2379 |
+
15.1796875,
|
| 2380 |
+
14.59375,
|
| 2381 |
+
14.7578125,
|
| 2382 |
+
14.9765625,
|
| 2383 |
+
15.125,
|
| 2384 |
+
15.28125,
|
| 2385 |
+
15.0546875,
|
| 2386 |
+
15.0703125,
|
| 2387 |
+
15.21875,
|
| 2388 |
+
14.4609375,
|
| 2389 |
+
15.015625,
|
| 2390 |
+
15.1875,
|
| 2391 |
+
15.390625,
|
| 2392 |
+
15.09375,
|
| 2393 |
+
14.75,
|
| 2394 |
+
15.1328125,
|
| 2395 |
+
15.359375,
|
| 2396 |
+
15.2890625,
|
| 2397 |
+
15.2734375,
|
| 2398 |
+
15.1484375,
|
| 2399 |
+
15.1328125,
|
| 2400 |
+
15.15625,
|
| 2401 |
+
15.1328125,
|
| 2402 |
+
15.421875,
|
| 2403 |
+
15.15625,
|
| 2404 |
+
15.2578125,
|
| 2405 |
+
14.9375,
|
| 2406 |
+
15.015625,
|
| 2407 |
+
15.46875,
|
| 2408 |
+
15.2734375,
|
| 2409 |
+
15.2734375,
|
| 2410 |
+
15.40625,
|
| 2411 |
+
15.6484375,
|
| 2412 |
+
15.8671875,
|
| 2413 |
+
15.3125,
|
| 2414 |
+
15.015625,
|
| 2415 |
+
15.90625,
|
| 2416 |
+
15.203125,
|
| 2417 |
+
15.328125,
|
| 2418 |
+
15.46875,
|
| 2419 |
+
15.0078125,
|
| 2420 |
+
15.546875,
|
| 2421 |
+
15.359375,
|
| 2422 |
+
14.8515625,
|
| 2423 |
+
15.875,
|
| 2424 |
+
15.15625,
|
| 2425 |
+
15.296875,
|
| 2426 |
+
15.671875,
|
| 2427 |
+
15.3984375,
|
| 2428 |
+
15.296875,
|
| 2429 |
+
15.6171875,
|
| 2430 |
+
15.25,
|
| 2431 |
+
15.0625,
|
| 2432 |
+
15.671875,
|
| 2433 |
+
15.7109375,
|
| 2434 |
+
15.5546875,
|
| 2435 |
+
15.640625,
|
| 2436 |
+
15.625,
|
| 2437 |
+
15.984375,
|
| 2438 |
+
15.8125,
|
| 2439 |
+
15.71875,
|
| 2440 |
+
15.6875,
|
| 2441 |
+
15.5,
|
| 2442 |
+
15.3359375,
|
| 2443 |
+
15.15625,
|
| 2444 |
+
15.78125,
|
| 2445 |
+
15.6015625,
|
| 2446 |
+
15.625,
|
| 2447 |
+
15.71875,
|
| 2448 |
+
15.921875,
|
| 2449 |
+
15.6171875,
|
| 2450 |
+
15.5703125,
|
| 2451 |
+
15.34375,
|
| 2452 |
+
15.546875,
|
| 2453 |
+
16.0,
|
| 2454 |
+
15.921875,
|
| 2455 |
+
15.546875,
|
| 2456 |
+
15.7578125,
|
| 2457 |
+
16.0,
|
| 2458 |
+
15.78125,
|
| 2459 |
+
16.390625,
|
| 2460 |
+
16.25,
|
| 2461 |
+
15.953125,
|
| 2462 |
+
15.9765625,
|
| 2463 |
+
16.390625,
|
| 2464 |
+
16.046875,
|
| 2465 |
+
15.8046875,
|
| 2466 |
+
15.9375,
|
| 2467 |
+
15.875,
|
| 2468 |
+
15.9296875,
|
| 2469 |
+
16.21875,
|
| 2470 |
+
16.1875,
|
| 2471 |
+
16.125,
|
| 2472 |
+
15.96875,
|
| 2473 |
+
15.765625,
|
| 2474 |
+
16.3125,
|
| 2475 |
+
15.7890625,
|
| 2476 |
+
16.1875,
|
| 2477 |
+
16.40625,
|
| 2478 |
+
15.5390625,
|
| 2479 |
+
15.84375,
|
| 2480 |
+
16.265625,
|
| 2481 |
+
16.109375,
|
| 2482 |
+
16.03125,
|
| 2483 |
+
16.375,
|
| 2484 |
+
16.40625,
|
| 2485 |
+
15.3828125,
|
| 2486 |
+
15.8046875,
|
| 2487 |
+
16.671875,
|
| 2488 |
+
16.40625,
|
| 2489 |
+
16.34375,
|
| 2490 |
+
16.375,
|
| 2491 |
+
16.078125,
|
| 2492 |
+
16.015625,
|
| 2493 |
+
16.359375,
|
| 2494 |
+
16.515625,
|
| 2495 |
+
15.8828125,
|
| 2496 |
+
16.109375,
|
| 2497 |
+
16.1875,
|
| 2498 |
+
16.359375,
|
| 2499 |
+
16.34375,
|
| 2500 |
+
16.25,
|
| 2501 |
+
16.203125,
|
| 2502 |
+
16.546875,
|
| 2503 |
+
16.28125,
|
| 2504 |
+
16.390625,
|
| 2505 |
+
16.859375,
|
| 2506 |
+
16.125,
|
| 2507 |
+
16.84375,
|
| 2508 |
+
16.453125,
|
| 2509 |
+
15.96875,
|
| 2510 |
+
16.65625,
|
| 2511 |
+
16.296875,
|
| 2512 |
+
16.5,
|
| 2513 |
+
16.546875,
|
| 2514 |
+
16.109375,
|
| 2515 |
+
16.546875,
|
| 2516 |
+
16.53125,
|
| 2517 |
+
16.1875,
|
| 2518 |
+
16.65625,
|
| 2519 |
+
16.3125,
|
| 2520 |
+
16.625,
|
| 2521 |
+
16.53125,
|
| 2522 |
+
16.21875,
|
| 2523 |
+
16.96875,
|
| 2524 |
+
16.84375,
|
| 2525 |
+
16.734375,
|
| 2526 |
+
16.65625,
|
| 2527 |
+
16.671875,
|
| 2528 |
+
16.640625,
|
| 2529 |
+
16.3125,
|
| 2530 |
+
16.90625,
|
| 2531 |
+
16.4375,
|
| 2532 |
+
16.5625,
|
| 2533 |
+
16.671875,
|
| 2534 |
+
16.859375,
|
| 2535 |
+
16.875,
|
| 2536 |
+
16.546875,
|
| 2537 |
+
16.90625,
|
| 2538 |
+
16.796875,
|
| 2539 |
+
16.546875,
|
| 2540 |
+
16.828125,
|
| 2541 |
+
16.65625,
|
| 2542 |
+
16.8125,
|
| 2543 |
+
16.703125,
|
| 2544 |
+
16.71875,
|
| 2545 |
+
16.9375,
|
| 2546 |
+
16.96875,
|
| 2547 |
+
16.40625,
|
| 2548 |
+
16.65625,
|
| 2549 |
+
16.328125,
|
| 2550 |
+
16.53125,
|
| 2551 |
+
17.0625,
|
| 2552 |
+
16.53125,
|
| 2553 |
+
16.625,
|
| 2554 |
+
16.859375,
|
| 2555 |
+
17.09375,
|
| 2556 |
+
16.390625,
|
| 2557 |
+
16.9375,
|
| 2558 |
+
17.015625,
|
| 2559 |
+
17.0,
|
| 2560 |
+
16.53125,
|
| 2561 |
+
16.953125,
|
| 2562 |
+
16.578125,
|
| 2563 |
+
17.4375,
|
| 2564 |
+
16.703125,
|
| 2565 |
+
16.6875,
|
| 2566 |
+
16.78125,
|
| 2567 |
+
17.0,
|
| 2568 |
+
17.15625,
|
| 2569 |
+
16.59375,
|
| 2570 |
+
16.53125,
|
| 2571 |
+
16.84375,
|
| 2572 |
+
16.921875,
|
| 2573 |
+
16.9375,
|
| 2574 |
+
17.125,
|
| 2575 |
+
17.0,
|
| 2576 |
+
17.40625,
|
| 2577 |
+
16.9375,
|
| 2578 |
+
16.90625,
|
| 2579 |
+
17.109375,
|
| 2580 |
+
17.03125,
|
| 2581 |
+
16.671875,
|
| 2582 |
+
17.203125,
|
| 2583 |
+
17.203125,
|
| 2584 |
+
16.90625,
|
| 2585 |
+
16.828125,
|
| 2586 |
+
17.125,
|
| 2587 |
+
17.15625,
|
| 2588 |
+
17.109375,
|
| 2589 |
+
16.953125,
|
| 2590 |
+
16.890625,
|
| 2591 |
+
17.0,
|
| 2592 |
+
17.265625,
|
| 2593 |
+
17.46875,
|
| 2594 |
+
17.515625,
|
| 2595 |
+
17.015625,
|
| 2596 |
+
17.296875,
|
| 2597 |
+
17.109375,
|
| 2598 |
+
17.171875,
|
| 2599 |
+
17.3125,
|
| 2600 |
+
17.1875,
|
| 2601 |
+
17.671875,
|
| 2602 |
+
16.9375,
|
| 2603 |
+
17.265625,
|
| 2604 |
+
17.0625,
|
| 2605 |
+
17.578125,
|
| 2606 |
+
16.828125,
|
| 2607 |
+
17.21875,
|
| 2608 |
+
17.421875,
|
| 2609 |
+
17.265625,
|
| 2610 |
+
17.375,
|
| 2611 |
+
17.203125,
|
| 2612 |
+
17.21875,
|
| 2613 |
+
17.578125,
|
| 2614 |
+
17.1875,
|
| 2615 |
+
17.359375,
|
| 2616 |
+
17.234375,
|
| 2617 |
+
17.96875,
|
| 2618 |
+
17.546875,
|
| 2619 |
+
17.59375,
|
| 2620 |
+
17.34375,
|
| 2621 |
+
16.984375,
|
| 2622 |
+
17.40625,
|
| 2623 |
+
17.234375,
|
| 2624 |
+
17.015625,
|
| 2625 |
+
17.265625,
|
| 2626 |
+
17.25,
|
| 2627 |
+
17.1875,
|
| 2628 |
+
17.625,
|
| 2629 |
+
17.8125,
|
| 2630 |
+
17.59375,
|
| 2631 |
+
17.1875,
|
| 2632 |
+
17.140625,
|
| 2633 |
+
17.328125,
|
| 2634 |
+
17.390625,
|
| 2635 |
+
17.5625,
|
| 2636 |
+
17.3125,
|
| 2637 |
+
17.75,
|
| 2638 |
+
17.484375,
|
| 2639 |
+
16.984375,
|
| 2640 |
+
17.640625,
|
| 2641 |
+
17.75,
|
| 2642 |
+
17.734375,
|
| 2643 |
+
16.65625,
|
| 2644 |
+
17.125,
|
| 2645 |
+
17.78125,
|
| 2646 |
+
17.59375,
|
| 2647 |
+
17.453125,
|
| 2648 |
+
17.5625,
|
| 2649 |
+
17.484375,
|
| 2650 |
+
17.875,
|
| 2651 |
+
17.53125,
|
| 2652 |
+
17.5625,
|
| 2653 |
+
17.5625,
|
| 2654 |
+
17.140625,
|
| 2655 |
+
17.78125,
|
| 2656 |
+
17.5625,
|
| 2657 |
+
17.625,
|
| 2658 |
+
17.671875,
|
| 2659 |
+
17.75,
|
| 2660 |
+
17.75,
|
| 2661 |
+
18.09375,
|
| 2662 |
+
17.8125,
|
| 2663 |
+
17.5,
|
| 2664 |
+
17.96875,
|
| 2665 |
+
17.46875,
|
| 2666 |
+
18.0,
|
| 2667 |
+
17.9375,
|
| 2668 |
+
17.8125,
|
| 2669 |
+
17.46875,
|
| 2670 |
+
17.5,
|
| 2671 |
+
17.4375,
|
| 2672 |
+
17.09375,
|
| 2673 |
+
17.875,
|
| 2674 |
+
17.796875,
|
| 2675 |
+
18.046875,
|
| 2676 |
+
17.90625,
|
| 2677 |
+
17.796875,
|
| 2678 |
+
17.65625,
|
| 2679 |
+
18.0,
|
| 2680 |
+
17.765625,
|
| 2681 |
+
17.859375,
|
| 2682 |
+
17.859375,
|
| 2683 |
+
17.5625,
|
| 2684 |
+
17.796875,
|
| 2685 |
+
17.828125,
|
| 2686 |
+
17.6875,
|
| 2687 |
+
17.515625,
|
| 2688 |
+
17.546875,
|
| 2689 |
+
18.203125,
|
| 2690 |
+
17.828125,
|
| 2691 |
+
17.546875,
|
| 2692 |
+
18.296875,
|
| 2693 |
+
18.28125,
|
| 2694 |
+
17.53125,
|
| 2695 |
+
17.625,
|
| 2696 |
+
17.578125,
|
| 2697 |
+
18.0625,
|
| 2698 |
+
17.984375,
|
| 2699 |
+
18.125,
|
| 2700 |
+
17.984375,
|
| 2701 |
+
17.828125,
|
| 2702 |
+
17.984375,
|
| 2703 |
+
18.140625,
|
| 2704 |
+
17.78125,
|
| 2705 |
+
18.03125,
|
| 2706 |
+
17.765625,
|
| 2707 |
+
18.234375,
|
| 2708 |
+
17.90625,
|
| 2709 |
+
18.21875,
|
| 2710 |
+
18.09375,
|
| 2711 |
+
18.375,
|
| 2712 |
+
17.953125,
|
| 2713 |
+
18.140625,
|
| 2714 |
+
18.3125,
|
| 2715 |
+
17.984375,
|
| 2716 |
+
18.0625,
|
| 2717 |
+
18.359375,
|
| 2718 |
+
18.5,
|
| 2719 |
+
17.90625,
|
| 2720 |
+
18.265625,
|
| 2721 |
+
18.421875,
|
| 2722 |
+
17.890625,
|
| 2723 |
+
18.015625,
|
| 2724 |
+
17.828125,
|
| 2725 |
+
18.21875,
|
| 2726 |
+
18.15625,
|
| 2727 |
+
18.15625,
|
| 2728 |
+
18.484375,
|
| 2729 |
+
18.484375,
|
| 2730 |
+
18.03125,
|
| 2731 |
+
18.078125,
|
| 2732 |
+
17.9375,
|
| 2733 |
+
17.921875,
|
| 2734 |
+
17.96875,
|
| 2735 |
+
18.03125,
|
| 2736 |
+
17.84375,
|
| 2737 |
+
18.375,
|
| 2738 |
+
18.359375,
|
| 2739 |
+
18.46875,
|
| 2740 |
+
17.96875,
|
| 2741 |
+
17.734375,
|
| 2742 |
+
18.421875,
|
| 2743 |
+
17.984375,
|
| 2744 |
+
18.421875,
|
| 2745 |
+
18.203125,
|
| 2746 |
+
18.46875,
|
| 2747 |
+
18.28125,
|
| 2748 |
+
18.40625,
|
| 2749 |
+
17.8125,
|
| 2750 |
+
18.21875,
|
| 2751 |
+
18.59375,
|
| 2752 |
+
18.59375,
|
| 2753 |
+
18.765625,
|
| 2754 |
+
18.40625,
|
| 2755 |
+
18.65625,
|
| 2756 |
+
17.984375,
|
| 2757 |
+
18.375,
|
| 2758 |
+
18.421875,
|
| 2759 |
+
18.25,
|
| 2760 |
+
18.140625,
|
| 2761 |
+
18.203125,
|
| 2762 |
+
18.265625,
|
| 2763 |
+
18.125,
|
| 2764 |
+
18.203125,
|
| 2765 |
+
18.359375,
|
| 2766 |
+
18.5,
|
| 2767 |
+
18.6875,
|
| 2768 |
+
18.125,
|
| 2769 |
+
18.46875,
|
| 2770 |
+
18.359375,
|
| 2771 |
+
18.140625,
|
| 2772 |
+
18.125,
|
| 2773 |
+
18.421875,
|
| 2774 |
+
18.15625,
|
| 2775 |
+
17.734375,
|
| 2776 |
+
18.296875,
|
| 2777 |
+
18.09375,
|
| 2778 |
+
18.375,
|
| 2779 |
+
18.1875,
|
| 2780 |
+
17.734375,
|
| 2781 |
+
18.1875,
|
| 2782 |
+
18.65625,
|
| 2783 |
+
18.234375,
|
| 2784 |
+
17.890625,
|
| 2785 |
+
18.484375,
|
| 2786 |
+
19.03125,
|
| 2787 |
+
18.34375,
|
| 2788 |
+
17.859375,
|
| 2789 |
+
18.5,
|
| 2790 |
+
18.75,
|
| 2791 |
+
18.390625,
|
| 2792 |
+
18.59375,
|
| 2793 |
+
18.15625,
|
| 2794 |
+
18.21875,
|
| 2795 |
+
18.625,
|
| 2796 |
+
18.46875,
|
| 2797 |
+
18.734375,
|
| 2798 |
+
18.625,
|
| 2799 |
+
18.1875,
|
| 2800 |
+
18.046875,
|
| 2801 |
+
18.578125,
|
| 2802 |
+
18.625,
|
| 2803 |
+
18.546875,
|
| 2804 |
+
18.1875,
|
| 2805 |
+
18.78125,
|
| 2806 |
+
18.65625,
|
| 2807 |
+
18.796875,
|
| 2808 |
+
18.65625,
|
| 2809 |
+
18.5,
|
| 2810 |
+
18.40625,
|
| 2811 |
+
18.859375,
|
| 2812 |
+
18.046875,
|
| 2813 |
+
18.65625,
|
| 2814 |
+
18.46875,
|
| 2815 |
+
18.875,
|
| 2816 |
+
18.6875,
|
| 2817 |
+
18.171875,
|
| 2818 |
+
18.5,
|
| 2819 |
+
18.15625,
|
| 2820 |
+
18.28125,
|
| 2821 |
+
18.078125,
|
| 2822 |
+
19.03125,
|
| 2823 |
+
18.375,
|
| 2824 |
+
18.609375,
|
| 2825 |
+
18.96875,
|
| 2826 |
+
18.609375,
|
| 2827 |
+
18.25,
|
| 2828 |
+
18.890625,
|
| 2829 |
+
18.46875,
|
| 2830 |
+
18.703125,
|
| 2831 |
+
18.53125,
|
| 2832 |
+
18.34375,
|
| 2833 |
+
18.6875,
|
| 2834 |
+
18.5,
|
| 2835 |
+
18.53125,
|
| 2836 |
+
18.75,
|
| 2837 |
+
18.453125,
|
| 2838 |
+
18.515625,
|
| 2839 |
+
19.0625,
|
| 2840 |
+
18.46875,
|
| 2841 |
+
18.578125,
|
| 2842 |
+
19.125,
|
| 2843 |
+
18.734375,
|
| 2844 |
+
18.40625,
|
| 2845 |
+
18.875,
|
| 2846 |
+
18.5,
|
| 2847 |
+
18.34375,
|
| 2848 |
+
18.21875,
|
| 2849 |
+
18.40625,
|
| 2850 |
+
18.0,
|
| 2851 |
+
18.234375,
|
| 2852 |
+
18.21875,
|
| 2853 |
+
18.75,
|
| 2854 |
+
18.828125,
|
| 2855 |
+
18.34375,
|
| 2856 |
+
18.40625,
|
| 2857 |
+
18.671875,
|
| 2858 |
+
18.5625,
|
| 2859 |
+
18.875,
|
| 2860 |
+
18.875,
|
| 2861 |
+
18.59375,
|
| 2862 |
+
18.328125,
|
| 2863 |
+
18.59375,
|
| 2864 |
+
18.703125,
|
| 2865 |
+
18.8125,
|
| 2866 |
+
18.8125,
|
| 2867 |
+
18.625,
|
| 2868 |
+
18.8125,
|
| 2869 |
+
18.84375,
|
| 2870 |
+
18.859375,
|
| 2871 |
+
18.734375,
|
| 2872 |
+
18.234375,
|
| 2873 |
+
18.765625,
|
| 2874 |
+
19.09375,
|
| 2875 |
+
18.796875,
|
| 2876 |
+
19.109375,
|
| 2877 |
+
18.484375,
|
| 2878 |
+
18.796875,
|
| 2879 |
+
18.53125,
|
| 2880 |
+
18.953125,
|
| 2881 |
+
18.828125,
|
| 2882 |
+
18.484375,
|
| 2883 |
+
19.0625,
|
| 2884 |
+
18.765625,
|
| 2885 |
+
18.5625,
|
| 2886 |
+
18.21875,
|
| 2887 |
+
18.3125,
|
| 2888 |
+
18.5625,
|
| 2889 |
+
18.640625,
|
| 2890 |
+
18.96875,
|
| 2891 |
+
18.515625,
|
| 2892 |
+
18.46875,
|
| 2893 |
+
18.609375,
|
| 2894 |
+
18.828125,
|
| 2895 |
+
18.46875,
|
| 2896 |
+
18.5,
|
| 2897 |
+
18.5625,
|
| 2898 |
+
18.53125,
|
| 2899 |
+
18.703125,
|
| 2900 |
+
18.296875,
|
| 2901 |
+
18.59375,
|
| 2902 |
+
19.0625,
|
| 2903 |
+
18.125,
|
| 2904 |
+
18.53125,
|
| 2905 |
+
18.890625,
|
| 2906 |
+
18.765625,
|
| 2907 |
+
18.296875,
|
| 2908 |
+
18.84375,
|
| 2909 |
+
18.90625,
|
| 2910 |
+
18.15625,
|
| 2911 |
+
17.8125,
|
| 2912 |
+
18.953125,
|
| 2913 |
+
18.296875,
|
| 2914 |
+
18.703125,
|
| 2915 |
+
18.703125,
|
| 2916 |
+
18.46875,
|
| 2917 |
+
19.03125,
|
| 2918 |
+
18.546875,
|
| 2919 |
+
18.59375,
|
| 2920 |
+
18.875,
|
| 2921 |
+
19.015625,
|
| 2922 |
+
18.609375,
|
| 2923 |
+
18.59375,
|
| 2924 |
+
18.65625,
|
| 2925 |
+
18.828125,
|
| 2926 |
+
18.875,
|
| 2927 |
+
18.359375,
|
| 2928 |
+
18.46875,
|
| 2929 |
+
18.96875,
|
| 2930 |
+
18.8125,
|
| 2931 |
+
17.75,
|
| 2932 |
+
18.390625,
|
| 2933 |
+
18.875,
|
| 2934 |
+
18.125,
|
| 2935 |
+
18.34375,
|
| 2936 |
+
18.703125,
|
| 2937 |
+
18.90625,
|
| 2938 |
+
19.09375,
|
| 2939 |
+
18.765625,
|
| 2940 |
+
18.515625,
|
| 2941 |
+
18.421875,
|
| 2942 |
+
18.828125,
|
| 2943 |
+
18.609375,
|
| 2944 |
+
18.28125,
|
| 2945 |
+
18.875,
|
| 2946 |
+
18.578125,
|
| 2947 |
+
18.90625,
|
| 2948 |
+
18.75,
|
| 2949 |
+
18.609375,
|
| 2950 |
+
19.0,
|
| 2951 |
+
18.84375,
|
| 2952 |
+
18.546875,
|
| 2953 |
+
18.5,
|
| 2954 |
+
18.734375,
|
| 2955 |
+
18.703125,
|
| 2956 |
+
18.5625,
|
| 2957 |
+
18.765625,
|
| 2958 |
+
18.28125,
|
| 2959 |
+
19.078125,
|
| 2960 |
+
18.359375,
|
| 2961 |
+
18.515625,
|
| 2962 |
+
19.0625,
|
| 2963 |
+
18.84375,
|
| 2964 |
+
18.5,
|
| 2965 |
+
18.765625,
|
| 2966 |
+
18.625,
|
| 2967 |
+
18.5,
|
| 2968 |
+
18.953125,
|
| 2969 |
+
18.609375,
|
| 2970 |
+
18.71875,
|
| 2971 |
+
18.453125,
|
| 2972 |
+
18.625,
|
| 2973 |
+
18.546875,
|
| 2974 |
+
18.8125,
|
| 2975 |
+
18.890625,
|
| 2976 |
+
18.84375,
|
| 2977 |
+
19.171875,
|
| 2978 |
+
18.84375,
|
| 2979 |
+
18.96875,
|
| 2980 |
+
18.484375,
|
| 2981 |
+
19.09375,
|
| 2982 |
+
18.890625,
|
| 2983 |
+
19.265625,
|
| 2984 |
+
18.40625,
|
| 2985 |
+
18.9375,
|
| 2986 |
+
18.6875,
|
| 2987 |
+
18.734375,
|
| 2988 |
+
18.578125,
|
| 2989 |
+
18.421875,
|
| 2990 |
+
19.296875,
|
| 2991 |
+
18.84375,
|
| 2992 |
+
19.015625,
|
| 2993 |
+
18.828125,
|
| 2994 |
+
19.09375,
|
| 2995 |
+
18.84375,
|
| 2996 |
+
19.015625,
|
| 2997 |
+
19.0625,
|
| 2998 |
+
18.96875,
|
| 2999 |
+
19.21875,
|
| 3000 |
+
18.421875,
|
| 3001 |
+
18.875,
|
| 3002 |
+
18.546875,
|
| 3003 |
+
18.59375,
|
| 3004 |
+
18.984375
|
| 3005 |
+
],
|
| 3006 |
+
"total_losses": [
|
| 3007 |
+
8.078125,
|
| 3008 |
+
7.6015625,
|
| 3009 |
+
7.625,
|
| 3010 |
+
7.3828125,
|
| 3011 |
+
7.3671875,
|
| 3012 |
+
7.0390625,
|
| 3013 |
+
7.17578125,
|
| 3014 |
+
6.87890625,
|
| 3015 |
+
6.70703125,
|
| 3016 |
+
6.44140625,
|
| 3017 |
+
6.47265625,
|
| 3018 |
+
6.4140625,
|
| 3019 |
+
6.5703125,
|
| 3020 |
+
6.37109375,
|
| 3021 |
+
6.4453125,
|
| 3022 |
+
6.26953125,
|
| 3023 |
+
5.8046875,
|
| 3024 |
+
6.22265625,
|
| 3025 |
+
6.12109375,
|
| 3026 |
+
6.05859375,
|
| 3027 |
+
6.10546875,
|
| 3028 |
+
5.50390625,
|
| 3029 |
+
5.39453125,
|
| 3030 |
+
5.46484375,
|
| 3031 |
+
6.06640625,
|
| 3032 |
+
5.79296875,
|
| 3033 |
+
5.87109375,
|
| 3034 |
+
5.76171875,
|
| 3035 |
+
5.80859375,
|
| 3036 |
+
5.60546875,
|
| 3037 |
+
5.3828125,
|
| 3038 |
+
5.33203125,
|
| 3039 |
+
5.41015625,
|
| 3040 |
+
5.45703125,
|
| 3041 |
+
5.73046875,
|
| 3042 |
+
5.37890625,
|
| 3043 |
+
5.38671875,
|
| 3044 |
+
5.1171875,
|
| 3045 |
+
5.6640625,
|
| 3046 |
+
5.3046875,
|
| 3047 |
+
5.47265625,
|
| 3048 |
+
5.46484375,
|
| 3049 |
+
5.609375,
|
| 3050 |
+
5.140625,
|
| 3051 |
+
5.03125,
|
| 3052 |
+
5.1875,
|
| 3053 |
+
4.84765625,
|
| 3054 |
+
5.0625,
|
| 3055 |
+
4.97265625,
|
| 3056 |
+
5.0546875,
|
| 3057 |
+
4.98046875,
|
| 3058 |
+
5.40625,
|
| 3059 |
+
5.25390625,
|
| 3060 |
+
4.8984375,
|
| 3061 |
+
4.84765625,
|
| 3062 |
+
5.19921875,
|
| 3063 |
+
4.75390625,
|
| 3064 |
+
4.86328125,
|
| 3065 |
+
4.29296875,
|
| 3066 |
+
4.8515625,
|
| 3067 |
+
4.9765625,
|
| 3068 |
+
4.734375,
|
| 3069 |
+
4.6328125,
|
| 3070 |
+
4.60546875,
|
| 3071 |
+
4.80078125,
|
| 3072 |
+
4.3359375,
|
| 3073 |
+
4.8203125,
|
| 3074 |
+
4.73828125,
|
| 3075 |
+
4.5,
|
| 3076 |
+
4.66015625,
|
| 3077 |
+
4.44921875,
|
| 3078 |
+
4.5703125,
|
| 3079 |
+
4.53125,
|
| 3080 |
+
4.1015625,
|
| 3081 |
+
4.484375,
|
| 3082 |
+
4.43359375,
|
| 3083 |
+
4.6171875,
|
| 3084 |
+
4.56640625,
|
| 3085 |
+
4.3984375,
|
| 3086 |
+
4.4453125,
|
| 3087 |
+
4.62109375,
|
| 3088 |
+
4.3984375,
|
| 3089 |
+
4.48046875,
|
| 3090 |
+
4.51171875,
|
| 3091 |
+
4.0546875,
|
| 3092 |
+
4.2109375,
|
| 3093 |
+
4.546875,
|
| 3094 |
+
4.5390625,
|
| 3095 |
+
4.34375,
|
| 3096 |
+
4.296875,
|
| 3097 |
+
4.21484375,
|
| 3098 |
+
4.12109375,
|
| 3099 |
+
4.1875,
|
| 3100 |
+
4.37109375,
|
| 3101 |
+
4.14453125,
|
| 3102 |
+
4.1640625,
|
| 3103 |
+
4.1640625,
|
| 3104 |
+
4.09375,
|
| 3105 |
+
3.87109375,
|
| 3106 |
+
4.18359375,
|
| 3107 |
+
3.74609375,
|
| 3108 |
+
4.31640625,
|
| 3109 |
+
4.26171875,
|
| 3110 |
+
4.20703125,
|
| 3111 |
+
4.11328125,
|
| 3112 |
+
4.15234375,
|
| 3113 |
+
3.671875,
|
| 3114 |
+
4.21875,
|
| 3115 |
+
3.900390625,
|
| 3116 |
+
3.8671875,
|
| 3117 |
+
4.0625,
|
| 3118 |
+
3.6328125,
|
| 3119 |
+
4.16796875,
|
| 3120 |
+
3.849609375,
|
| 3121 |
+
3.9765625,
|
| 3122 |
+
4.09765625,
|
| 3123 |
+
3.712890625,
|
| 3124 |
+
4.36328125,
|
| 3125 |
+
3.986328125,
|
| 3126 |
+
3.900390625,
|
| 3127 |
+
3.712890625,
|
| 3128 |
+
3.748046875,
|
| 3129 |
+
3.88671875,
|
| 3130 |
+
3.779296875,
|
| 3131 |
+
4.11328125,
|
| 3132 |
+
3.75390625,
|
| 3133 |
+
3.845703125,
|
| 3134 |
+
3.646484375,
|
| 3135 |
+
3.8828125,
|
| 3136 |
+
4.02734375,
|
| 3137 |
+
3.728515625,
|
| 3138 |
+
4.14453125,
|
| 3139 |
+
3.900390625,
|
| 3140 |
+
4.07421875,
|
| 3141 |
+
3.859375,
|
| 3142 |
+
3.8671875,
|
| 3143 |
+
4.0,
|
| 3144 |
+
3.810546875,
|
| 3145 |
+
3.669921875,
|
| 3146 |
+
3.599609375,
|
| 3147 |
+
3.474609375,
|
| 3148 |
+
3.892578125,
|
| 3149 |
+
3.791015625,
|
| 3150 |
+
3.53125,
|
| 3151 |
+
3.943359375,
|
| 3152 |
+
3.568359375,
|
| 3153 |
+
3.462890625,
|
| 3154 |
+
3.85546875,
|
| 3155 |
+
3.587890625,
|
| 3156 |
+
3.4140625,
|
| 3157 |
+
3.681640625,
|
| 3158 |
+
3.853515625,
|
| 3159 |
+
3.826171875,
|
| 3160 |
+
3.6875,
|
| 3161 |
+
3.681640625,
|
| 3162 |
+
3.3671875,
|
| 3163 |
+
3.478515625,
|
| 3164 |
+
3.740234375,
|
| 3165 |
+
3.7265625,
|
| 3166 |
+
3.375,
|
| 3167 |
+
3.560546875,
|
| 3168 |
+
3.5625,
|
| 3169 |
+
3.36328125,
|
| 3170 |
+
3.447265625,
|
| 3171 |
+
3.373046875,
|
| 3172 |
+
3.474609375,
|
| 3173 |
+
3.7734375,
|
| 3174 |
+
3.560546875,
|
| 3175 |
+
3.267578125,
|
| 3176 |
+
3.625,
|
| 3177 |
+
3.84375,
|
| 3178 |
+
3.017578125,
|
| 3179 |
+
3.435546875,
|
| 3180 |
+
3.40234375,
|
| 3181 |
+
3.6640625,
|
| 3182 |
+
3.380859375,
|
| 3183 |
+
3.46484375,
|
| 3184 |
+
3.474609375,
|
| 3185 |
+
3.169921875,
|
| 3186 |
+
3.41796875,
|
| 3187 |
+
3.392578125,
|
| 3188 |
+
3.3671875,
|
| 3189 |
+
3.712890625,
|
| 3190 |
+
3.322265625,
|
| 3191 |
+
3.263671875,
|
| 3192 |
+
3.330078125,
|
| 3193 |
+
3.763671875,
|
| 3194 |
+
3.658203125,
|
| 3195 |
+
3.173828125,
|
| 3196 |
+
3.48046875,
|
| 3197 |
+
3.443359375,
|
| 3198 |
+
3.142578125,
|
| 3199 |
+
3.484375,
|
| 3200 |
+
3.36328125,
|
| 3201 |
+
3.181640625,
|
| 3202 |
+
3.357421875,
|
| 3203 |
+
3.330078125,
|
| 3204 |
+
3.1640625,
|
| 3205 |
+
3.650390625,
|
| 3206 |
+
3.80859375,
|
| 3207 |
+
3.201171875,
|
| 3208 |
+
3.091796875,
|
| 3209 |
+
3.48046875,
|
| 3210 |
+
3.21484375,
|
| 3211 |
+
3.123046875,
|
| 3212 |
+
3.3125,
|
| 3213 |
+
3.17578125,
|
| 3214 |
+
3.216796875,
|
| 3215 |
+
3.302734375,
|
| 3216 |
+
3.130859375,
|
| 3217 |
+
3.263671875,
|
| 3218 |
+
3.333984375,
|
| 3219 |
+
3.41796875,
|
| 3220 |
+
3.359375,
|
| 3221 |
+
3.271484375,
|
| 3222 |
+
3.42578125,
|
| 3223 |
+
3.3046875,
|
| 3224 |
+
3.37890625,
|
| 3225 |
+
3.310546875,
|
| 3226 |
+
3.064453125,
|
| 3227 |
+
3.134765625,
|
| 3228 |
+
3.25390625,
|
| 3229 |
+
2.9609375,
|
| 3230 |
+
3.078125,
|
| 3231 |
+
3.0546875,
|
| 3232 |
+
3.369140625,
|
| 3233 |
+
3.1328125,
|
| 3234 |
+
3.0703125,
|
| 3235 |
+
3.248046875,
|
| 3236 |
+
3.060546875,
|
| 3237 |
+
3.1875,
|
| 3238 |
+
2.94921875,
|
| 3239 |
+
3.087890625,
|
| 3240 |
+
3.193359375,
|
| 3241 |
+
2.98828125,
|
| 3242 |
+
3.240234375,
|
| 3243 |
+
3.01953125,
|
| 3244 |
+
3.0,
|
| 3245 |
+
3.02734375,
|
| 3246 |
+
2.7265625,
|
| 3247 |
+
3.140625,
|
| 3248 |
+
2.7578125,
|
| 3249 |
+
3.23828125,
|
| 3250 |
+
3.02734375,
|
| 3251 |
+
2.82421875,
|
| 3252 |
+
3.169921875,
|
| 3253 |
+
3.087890625,
|
| 3254 |
+
3.38671875,
|
| 3255 |
+
3.07421875,
|
| 3256 |
+
3.0390625,
|
| 3257 |
+
3.1953125,
|
| 3258 |
+
2.84765625,
|
| 3259 |
+
2.94921875,
|
| 3260 |
+
2.828125,
|
| 3261 |
+
3.08203125,
|
| 3262 |
+
2.91015625,
|
| 3263 |
+
2.88671875,
|
| 3264 |
+
3.13671875,
|
| 3265 |
+
2.95703125,
|
| 3266 |
+
2.98046875,
|
| 3267 |
+
2.73828125,
|
| 3268 |
+
2.900390625,
|
| 3269 |
+
2.80078125,
|
| 3270 |
+
3.171875,
|
| 3271 |
+
2.982421875,
|
| 3272 |
+
2.689453125,
|
| 3273 |
+
2.98828125,
|
| 3274 |
+
2.8515625,
|
| 3275 |
+
2.998046875,
|
| 3276 |
+
2.63671875,
|
| 3277 |
+
2.9609375,
|
| 3278 |
+
3.0546875,
|
| 3279 |
+
2.970703125,
|
| 3280 |
+
3.134765625,
|
| 3281 |
+
2.89453125,
|
| 3282 |
+
2.984375,
|
| 3283 |
+
2.708984375,
|
| 3284 |
+
2.78125,
|
| 3285 |
+
3.263671875,
|
| 3286 |
+
2.947265625,
|
| 3287 |
+
2.98828125,
|
| 3288 |
+
2.712890625,
|
| 3289 |
+
2.99609375,
|
| 3290 |
+
2.876953125,
|
| 3291 |
+
2.8359375,
|
| 3292 |
+
3.033203125,
|
| 3293 |
+
2.7734375,
|
| 3294 |
+
2.91796875,
|
| 3295 |
+
2.6796875,
|
| 3296 |
+
2.900390625,
|
| 3297 |
+
3.189453125,
|
| 3298 |
+
2.92578125,
|
| 3299 |
+
2.916015625,
|
| 3300 |
+
2.92578125,
|
| 3301 |
+
2.912109375,
|
| 3302 |
+
2.94140625,
|
| 3303 |
+
2.810546875,
|
| 3304 |
+
3.052734375,
|
| 3305 |
+
2.9375,
|
| 3306 |
+
2.9765625,
|
| 3307 |
+
2.8671875,
|
| 3308 |
+
2.8984375,
|
| 3309 |
+
2.9921875,
|
| 3310 |
+
2.734375,
|
| 3311 |
+
2.79296875,
|
| 3312 |
+
2.884765625,
|
| 3313 |
+
2.806640625,
|
| 3314 |
+
2.767578125,
|
| 3315 |
+
2.640625,
|
| 3316 |
+
2.7734375,
|
| 3317 |
+
2.712890625,
|
| 3318 |
+
2.734375,
|
| 3319 |
+
2.775390625,
|
| 3320 |
+
2.96875,
|
| 3321 |
+
2.53125,
|
| 3322 |
+
2.6640625,
|
| 3323 |
+
2.708984375,
|
| 3324 |
+
2.83203125,
|
| 3325 |
+
2.685546875,
|
| 3326 |
+
2.58203125,
|
| 3327 |
+
2.91015625,
|
| 3328 |
+
2.814453125,
|
| 3329 |
+
2.95703125,
|
| 3330 |
+
2.919921875,
|
| 3331 |
+
2.873046875,
|
| 3332 |
+
2.640625,
|
| 3333 |
+
2.794921875,
|
| 3334 |
+
2.642578125,
|
| 3335 |
+
2.67578125,
|
| 3336 |
+
2.708984375,
|
| 3337 |
+
2.82421875,
|
| 3338 |
+
2.654296875,
|
| 3339 |
+
2.73046875,
|
| 3340 |
+
2.59375,
|
| 3341 |
+
2.8125,
|
| 3342 |
+
2.556640625,
|
| 3343 |
+
2.625,
|
| 3344 |
+
2.6015625,
|
| 3345 |
+
3.01953125,
|
| 3346 |
+
2.8359375,
|
| 3347 |
+
2.86328125,
|
| 3348 |
+
2.533203125,
|
| 3349 |
+
2.759765625,
|
| 3350 |
+
2.787109375,
|
| 3351 |
+
2.78515625,
|
| 3352 |
+
2.697265625,
|
| 3353 |
+
2.646484375,
|
| 3354 |
+
2.65234375,
|
| 3355 |
+
2.755859375,
|
| 3356 |
+
2.595703125,
|
| 3357 |
+
2.642578125,
|
| 3358 |
+
2.5703125,
|
| 3359 |
+
2.9453125,
|
| 3360 |
+
2.65234375,
|
| 3361 |
+
2.890625,
|
| 3362 |
+
2.48046875,
|
| 3363 |
+
2.64453125,
|
| 3364 |
+
2.61328125,
|
| 3365 |
+
2.453125,
|
| 3366 |
+
2.6875,
|
| 3367 |
+
2.57421875,
|
| 3368 |
+
2.822265625,
|
| 3369 |
+
2.630859375,
|
| 3370 |
+
2.671875,
|
| 3371 |
+
2.900390625,
|
| 3372 |
+
2.80859375,
|
| 3373 |
+
2.68359375,
|
| 3374 |
+
2.73828125,
|
| 3375 |
+
2.435546875,
|
| 3376 |
+
2.740234375,
|
| 3377 |
+
2.86328125,
|
| 3378 |
+
2.55078125,
|
| 3379 |
+
2.716796875,
|
| 3380 |
+
2.544921875,
|
| 3381 |
+
2.666015625,
|
| 3382 |
+
2.44921875,
|
| 3383 |
+
2.408203125,
|
| 3384 |
+
2.76953125,
|
| 3385 |
+
2.701171875,
|
| 3386 |
+
2.732421875,
|
| 3387 |
+
2.630859375,
|
| 3388 |
+
2.4296875,
|
| 3389 |
+
2.421875,
|
| 3390 |
+
2.68359375,
|
| 3391 |
+
2.525390625,
|
| 3392 |
+
2.498046875,
|
| 3393 |
+
2.69140625,
|
| 3394 |
+
2.416015625,
|
| 3395 |
+
2.640625,
|
| 3396 |
+
2.33203125,
|
| 3397 |
+
2.4921875,
|
| 3398 |
+
2.529296875,
|
| 3399 |
+
2.49609375,
|
| 3400 |
+
2.6796875,
|
| 3401 |
+
2.513671875,
|
| 3402 |
+
2.5859375,
|
| 3403 |
+
2.50390625,
|
| 3404 |
+
2.630859375,
|
| 3405 |
+
2.732421875,
|
| 3406 |
+
2.6796875,
|
| 3407 |
+
2.59765625,
|
| 3408 |
+
2.59765625,
|
| 3409 |
+
2.435546875,
|
| 3410 |
+
2.38671875,
|
| 3411 |
+
2.4765625,
|
| 3412 |
+
2.40625,
|
| 3413 |
+
2.5234375,
|
| 3414 |
+
2.435546875,
|
| 3415 |
+
2.41796875,
|
| 3416 |
+
2.724609375,
|
| 3417 |
+
2.4921875,
|
| 3418 |
+
2.5,
|
| 3419 |
+
2.763671875,
|
| 3420 |
+
2.482421875,
|
| 3421 |
+
2.44921875,
|
| 3422 |
+
2.55078125,
|
| 3423 |
+
2.4765625,
|
| 3424 |
+
2.20703125,
|
| 3425 |
+
2.3046875,
|
| 3426 |
+
2.525390625,
|
| 3427 |
+
2.642578125,
|
| 3428 |
+
2.380859375,
|
| 3429 |
+
2.630859375,
|
| 3430 |
+
2.314453125,
|
| 3431 |
+
2.498046875,
|
| 3432 |
+
2.4453125,
|
| 3433 |
+
2.6953125,
|
| 3434 |
+
2.291015625,
|
| 3435 |
+
2.6015625,
|
| 3436 |
+
2.419921875,
|
| 3437 |
+
2.580078125,
|
| 3438 |
+
2.224609375,
|
| 3439 |
+
2.423828125,
|
| 3440 |
+
2.25390625,
|
| 3441 |
+
2.595703125,
|
| 3442 |
+
2.599609375,
|
| 3443 |
+
2.576171875,
|
| 3444 |
+
2.62109375,
|
| 3445 |
+
2.345703125,
|
| 3446 |
+
2.478515625,
|
| 3447 |
+
2.494140625,
|
| 3448 |
+
2.447265625,
|
| 3449 |
+
2.302734375,
|
| 3450 |
+
2.478515625,
|
| 3451 |
+
2.373046875,
|
| 3452 |
+
2.546875,
|
| 3453 |
+
2.53515625,
|
| 3454 |
+
2.361328125,
|
| 3455 |
+
2.494140625,
|
| 3456 |
+
2.453125,
|
| 3457 |
+
2.431640625,
|
| 3458 |
+
2.447265625,
|
| 3459 |
+
2.474609375,
|
| 3460 |
+
2.6015625,
|
| 3461 |
+
2.3125,
|
| 3462 |
+
2.330078125,
|
| 3463 |
+
2.314453125,
|
| 3464 |
+
2.447265625,
|
| 3465 |
+
2.291015625,
|
| 3466 |
+
2.380859375,
|
| 3467 |
+
2.595703125,
|
| 3468 |
+
2.529296875,
|
| 3469 |
+
2.341796875,
|
| 3470 |
+
2.177734375,
|
| 3471 |
+
2.337890625,
|
| 3472 |
+
2.431640625,
|
| 3473 |
+
2.3984375,
|
| 3474 |
+
2.435546875,
|
| 3475 |
+
2.376953125,
|
| 3476 |
+
2.490234375,
|
| 3477 |
+
2.451171875,
|
| 3478 |
+
2.294921875,
|
| 3479 |
+
2.259765625,
|
| 3480 |
+
2.2890625,
|
| 3481 |
+
2.498046875,
|
| 3482 |
+
2.271484375,
|
| 3483 |
+
2.380859375,
|
| 3484 |
+
2.37890625,
|
| 3485 |
+
2.52734375,
|
| 3486 |
+
2.4296875,
|
| 3487 |
+
2.3125,
|
| 3488 |
+
2.423828125,
|
| 3489 |
+
2.34375,
|
| 3490 |
+
2.26171875,
|
| 3491 |
+
2.35546875,
|
| 3492 |
+
2.26953125,
|
| 3493 |
+
2.3828125,
|
| 3494 |
+
2.396484375,
|
| 3495 |
+
2.4140625,
|
| 3496 |
+
2.31640625,
|
| 3497 |
+
2.482421875,
|
| 3498 |
+
2.138671875,
|
| 3499 |
+
2.3125,
|
| 3500 |
+
2.576171875,
|
| 3501 |
+
2.2890625,
|
| 3502 |
+
2.32421875,
|
| 3503 |
+
2.439453125,
|
| 3504 |
+
2.48046875,
|
| 3505 |
+
2.412109375,
|
| 3506 |
+
2.29296875,
|
| 3507 |
+
2.158203125,
|
| 3508 |
+
2.490234375,
|
| 3509 |
+
2.509765625,
|
| 3510 |
+
2.3671875,
|
| 3511 |
+
2.16796875,
|
| 3512 |
+
2.3671875,
|
| 3513 |
+
2.505859375,
|
| 3514 |
+
2.384765625,
|
| 3515 |
+
2.271484375,
|
| 3516 |
+
2.416015625,
|
| 3517 |
+
2.283203125,
|
| 3518 |
+
2.353515625,
|
| 3519 |
+
2.18359375,
|
| 3520 |
+
2.3359375,
|
| 3521 |
+
2.162109375,
|
| 3522 |
+
2.134765625,
|
| 3523 |
+
2.443359375,
|
| 3524 |
+
2.095703125,
|
| 3525 |
+
2.212890625,
|
| 3526 |
+
2.412109375,
|
| 3527 |
+
2.375,
|
| 3528 |
+
2.3046875,
|
| 3529 |
+
2.0703125,
|
| 3530 |
+
2.16796875,
|
| 3531 |
+
2.16015625,
|
| 3532 |
+
2.2734375,
|
| 3533 |
+
1.9423828125,
|
| 3534 |
+
2.109375,
|
| 3535 |
+
2.33203125,
|
| 3536 |
+
2.22265625,
|
| 3537 |
+
2.248046875,
|
| 3538 |
+
2.494140625,
|
| 3539 |
+
2.21484375,
|
| 3540 |
+
2.22265625,
|
| 3541 |
+
2.271484375,
|
| 3542 |
+
2.1171875,
|
| 3543 |
+
2.205078125,
|
| 3544 |
+
2.169921875,
|
| 3545 |
+
2.515625,
|
| 3546 |
+
2.513671875,
|
| 3547 |
+
2.275390625,
|
| 3548 |
+
2.265625,
|
| 3549 |
+
2.009765625,
|
| 3550 |
+
2.267578125,
|
| 3551 |
+
2.169921875,
|
| 3552 |
+
2.2265625,
|
| 3553 |
+
2.193359375,
|
| 3554 |
+
2.28515625,
|
| 3555 |
+
2.4140625,
|
| 3556 |
+
2.357421875,
|
| 3557 |
+
2.30859375,
|
| 3558 |
+
2.134765625,
|
| 3559 |
+
2.349609375,
|
| 3560 |
+
2.048828125,
|
| 3561 |
+
2.09375,
|
| 3562 |
+
2.29296875,
|
| 3563 |
+
2.03515625,
|
| 3564 |
+
2.181640625,
|
| 3565 |
+
2.248046875,
|
| 3566 |
+
2.259765625,
|
| 3567 |
+
2.552734375,
|
| 3568 |
+
2.41796875,
|
| 3569 |
+
2.123046875,
|
| 3570 |
+
1.98828125,
|
| 3571 |
+
2.1328125,
|
| 3572 |
+
2.193359375,
|
| 3573 |
+
2.189453125,
|
| 3574 |
+
1.986328125,
|
| 3575 |
+
2.1171875,
|
| 3576 |
+
2.005859375,
|
| 3577 |
+
2.056640625,
|
| 3578 |
+
2.126953125,
|
| 3579 |
+
2.404296875,
|
| 3580 |
+
1.8837890625,
|
| 3581 |
+
2.0390625,
|
| 3582 |
+
2.375,
|
| 3583 |
+
2.068359375,
|
| 3584 |
+
2.17578125,
|
| 3585 |
+
2.060546875,
|
| 3586 |
+
2.021484375,
|
| 3587 |
+
2.107421875,
|
| 3588 |
+
2.07421875,
|
| 3589 |
+
2.1171875,
|
| 3590 |
+
2.08203125,
|
| 3591 |
+
2.318359375,
|
| 3592 |
+
2.212890625,
|
| 3593 |
+
1.962890625,
|
| 3594 |
+
1.923828125,
|
| 3595 |
+
2.21875,
|
| 3596 |
+
2.2578125,
|
| 3597 |
+
2.349609375,
|
| 3598 |
+
2.07421875,
|
| 3599 |
+
2.046875,
|
| 3600 |
+
2.271484375,
|
| 3601 |
+
2.166015625,
|
| 3602 |
+
1.9775390625,
|
| 3603 |
+
1.98828125,
|
| 3604 |
+
2.259765625,
|
| 3605 |
+
2.19140625,
|
| 3606 |
+
2.111328125,
|
| 3607 |
+
2.130859375,
|
| 3608 |
+
2.142578125,
|
| 3609 |
+
2.203125,
|
| 3610 |
+
2.01171875,
|
| 3611 |
+
2.28125,
|
| 3612 |
+
2.193359375,
|
| 3613 |
+
2.083984375,
|
| 3614 |
+
2.08203125,
|
| 3615 |
+
1.986328125,
|
| 3616 |
+
2.12890625,
|
| 3617 |
+
2.1171875,
|
| 3618 |
+
2.37890625,
|
| 3619 |
+
2.142578125,
|
| 3620 |
+
2.013671875,
|
| 3621 |
+
2.04296875,
|
| 3622 |
+
2.193359375,
|
| 3623 |
+
1.974609375,
|
| 3624 |
+
2.02734375,
|
| 3625 |
+
2.1171875,
|
| 3626 |
+
2.3359375,
|
| 3627 |
+
2.1015625,
|
| 3628 |
+
2.03515625,
|
| 3629 |
+
2.041015625,
|
| 3630 |
+
1.916015625,
|
| 3631 |
+
2.201171875,
|
| 3632 |
+
2.28515625,
|
| 3633 |
+
2.185546875,
|
| 3634 |
+
2.072265625,
|
| 3635 |
+
2.02734375,
|
| 3636 |
+
2.07421875,
|
| 3637 |
+
2.1484375,
|
| 3638 |
+
1.953125,
|
| 3639 |
+
1.9296875,
|
| 3640 |
+
1.958984375,
|
| 3641 |
+
1.9716796875,
|
| 3642 |
+
2.025390625,
|
| 3643 |
+
2.03515625,
|
| 3644 |
+
1.98046875,
|
| 3645 |
+
1.98828125,
|
| 3646 |
+
2.130859375,
|
| 3647 |
+
2.068359375,
|
| 3648 |
+
2.111328125,
|
| 3649 |
+
2.078125,
|
| 3650 |
+
1.8232421875,
|
| 3651 |
+
2.236328125,
|
| 3652 |
+
1.9306640625,
|
| 3653 |
+
2.09375,
|
| 3654 |
+
2.03125,
|
| 3655 |
+
1.8896484375,
|
| 3656 |
+
1.919921875,
|
| 3657 |
+
2.109375,
|
| 3658 |
+
2.234375,
|
| 3659 |
+
1.9345703125,
|
| 3660 |
+
1.896484375,
|
| 3661 |
+
2.00390625,
|
| 3662 |
+
2.064453125,
|
| 3663 |
+
2.037109375,
|
| 3664 |
+
2.197265625,
|
| 3665 |
+
2.111328125,
|
| 3666 |
+
1.9951171875,
|
| 3667 |
+
2.11328125,
|
| 3668 |
+
2.232421875,
|
| 3669 |
+
2.255859375,
|
| 3670 |
+
2.134765625,
|
| 3671 |
+
1.986328125,
|
| 3672 |
+
1.83984375,
|
| 3673 |
+
1.9951171875,
|
| 3674 |
+
1.8505859375,
|
| 3675 |
+
2.20703125,
|
| 3676 |
+
1.9267578125,
|
| 3677 |
+
1.9111328125,
|
| 3678 |
+
1.9423828125,
|
| 3679 |
+
1.8310546875,
|
| 3680 |
+
2.078125,
|
| 3681 |
+
1.970703125,
|
| 3682 |
+
2.068359375,
|
| 3683 |
+
2.125,
|
| 3684 |
+
1.9541015625,
|
| 3685 |
+
2.28125,
|
| 3686 |
+
1.9921875,
|
| 3687 |
+
1.8193359375,
|
| 3688 |
+
2.142578125,
|
| 3689 |
+
2.087890625,
|
| 3690 |
+
2.0703125,
|
| 3691 |
+
2.10546875,
|
| 3692 |
+
2.15625,
|
| 3693 |
+
2.15625,
|
| 3694 |
+
1.9462890625,
|
| 3695 |
+
2.01171875,
|
| 3696 |
+
1.9130859375,
|
| 3697 |
+
1.896484375,
|
| 3698 |
+
1.8935546875,
|
| 3699 |
+
2.005859375,
|
| 3700 |
+
2.17578125,
|
| 3701 |
+
2.078125,
|
| 3702 |
+
1.873046875,
|
| 3703 |
+
2.107421875,
|
| 3704 |
+
2.115234375,
|
| 3705 |
+
2.044921875,
|
| 3706 |
+
2.16796875,
|
| 3707 |
+
2.13671875,
|
| 3708 |
+
1.8349609375,
|
| 3709 |
+
2.015625,
|
| 3710 |
+
2.056640625,
|
| 3711 |
+
2.0078125,
|
| 3712 |
+
1.98046875,
|
| 3713 |
+
2.16015625,
|
| 3714 |
+
2.07421875,
|
| 3715 |
+
2.017578125,
|
| 3716 |
+
1.9912109375,
|
| 3717 |
+
2.01953125,
|
| 3718 |
+
1.9560546875,
|
| 3719 |
+
1.9501953125,
|
| 3720 |
+
1.8408203125,
|
| 3721 |
+
1.9189453125,
|
| 3722 |
+
1.90625,
|
| 3723 |
+
1.97265625,
|
| 3724 |
+
2.0234375,
|
| 3725 |
+
1.912109375,
|
| 3726 |
+
2.048828125,
|
| 3727 |
+
2.076171875,
|
| 3728 |
+
2.10546875,
|
| 3729 |
+
1.857421875,
|
| 3730 |
+
2.126953125,
|
| 3731 |
+
1.7783203125,
|
| 3732 |
+
1.92578125,
|
| 3733 |
+
1.9111328125,
|
| 3734 |
+
1.8837890625,
|
| 3735 |
+
2.046875,
|
| 3736 |
+
1.9345703125,
|
| 3737 |
+
1.9345703125,
|
| 3738 |
+
1.6279296875,
|
| 3739 |
+
1.9306640625,
|
| 3740 |
+
1.974609375,
|
| 3741 |
+
1.75390625,
|
| 3742 |
+
1.9423828125,
|
| 3743 |
+
1.9072265625,
|
| 3744 |
+
2.123046875,
|
| 3745 |
+
1.8916015625,
|
| 3746 |
+
1.888671875,
|
| 3747 |
+
1.80859375,
|
| 3748 |
+
2.08984375,
|
| 3749 |
+
2.068359375,
|
| 3750 |
+
1.841796875,
|
| 3751 |
+
1.8212890625,
|
| 3752 |
+
1.96484375,
|
| 3753 |
+
1.744140625,
|
| 3754 |
+
1.9619140625,
|
| 3755 |
+
2.123046875,
|
| 3756 |
+
1.9921875,
|
| 3757 |
+
2.18359375,
|
| 3758 |
+
1.95703125,
|
| 3759 |
+
1.9033203125,
|
| 3760 |
+
1.9580078125,
|
| 3761 |
+
2.119140625,
|
| 3762 |
+
2.0,
|
| 3763 |
+
2.01953125,
|
| 3764 |
+
1.947265625,
|
| 3765 |
+
2.091796875,
|
| 3766 |
+
1.8876953125,
|
| 3767 |
+
2.0703125,
|
| 3768 |
+
2.029296875,
|
| 3769 |
+
2.041015625,
|
| 3770 |
+
1.943359375,
|
| 3771 |
+
1.57421875,
|
| 3772 |
+
2.130859375,
|
| 3773 |
+
1.810546875,
|
| 3774 |
+
1.9853515625,
|
| 3775 |
+
1.865234375,
|
| 3776 |
+
2.060546875,
|
| 3777 |
+
1.677734375,
|
| 3778 |
+
1.9697265625,
|
| 3779 |
+
1.8408203125,
|
| 3780 |
+
1.9814453125,
|
| 3781 |
+
2.005859375,
|
| 3782 |
+
1.875,
|
| 3783 |
+
1.94921875,
|
| 3784 |
+
1.955078125,
|
| 3785 |
+
2.025390625,
|
| 3786 |
+
2.1875,
|
| 3787 |
+
2.033203125,
|
| 3788 |
+
1.8837890625,
|
| 3789 |
+
1.841796875,
|
| 3790 |
+
2.080078125,
|
| 3791 |
+
2.03515625,
|
| 3792 |
+
1.9814453125,
|
| 3793 |
+
1.958984375,
|
| 3794 |
+
1.775390625,
|
| 3795 |
+
2.01171875,
|
| 3796 |
+
2.1171875,
|
| 3797 |
+
1.8251953125,
|
| 3798 |
+
2.103515625,
|
| 3799 |
+
1.8876953125,
|
| 3800 |
+
1.80078125,
|
| 3801 |
+
2.095703125,
|
| 3802 |
+
2.03125,
|
| 3803 |
+
1.755859375,
|
| 3804 |
+
1.87890625,
|
| 3805 |
+
2.048828125,
|
| 3806 |
+
1.892578125,
|
| 3807 |
+
1.7880859375,
|
| 3808 |
+
2.025390625,
|
| 3809 |
+
1.79296875,
|
| 3810 |
+
1.984375,
|
| 3811 |
+
1.806640625,
|
| 3812 |
+
1.927734375,
|
| 3813 |
+
1.8955078125,
|
| 3814 |
+
1.828125,
|
| 3815 |
+
1.84765625,
|
| 3816 |
+
1.880859375,
|
| 3817 |
+
1.81640625,
|
| 3818 |
+
1.72265625,
|
| 3819 |
+
1.9833984375,
|
| 3820 |
+
1.927734375,
|
| 3821 |
+
2.046875,
|
| 3822 |
+
1.7431640625,
|
| 3823 |
+
1.7294921875,
|
| 3824 |
+
1.89453125,
|
| 3825 |
+
1.9912109375,
|
| 3826 |
+
1.650390625,
|
| 3827 |
+
1.927734375,
|
| 3828 |
+
1.87109375,
|
| 3829 |
+
1.818359375,
|
| 3830 |
+
1.7646484375,
|
| 3831 |
+
1.8125,
|
| 3832 |
+
1.701171875,
|
| 3833 |
+
1.9013671875,
|
| 3834 |
+
1.8623046875,
|
| 3835 |
+
1.8974609375,
|
| 3836 |
+
1.8662109375,
|
| 3837 |
+
1.8505859375,
|
| 3838 |
+
1.9462890625,
|
| 3839 |
+
1.9541015625,
|
| 3840 |
+
1.9853515625,
|
| 3841 |
+
1.880859375,
|
| 3842 |
+
1.9013671875,
|
| 3843 |
+
1.6962890625,
|
| 3844 |
+
1.8935546875,
|
| 3845 |
+
1.7626953125,
|
| 3846 |
+
1.7939453125,
|
| 3847 |
+
1.953125,
|
| 3848 |
+
1.9013671875,
|
| 3849 |
+
1.830078125,
|
| 3850 |
+
1.7822265625,
|
| 3851 |
+
1.89453125,
|
| 3852 |
+
1.9775390625,
|
| 3853 |
+
1.9091796875,
|
| 3854 |
+
1.9853515625,
|
| 3855 |
+
1.9814453125,
|
| 3856 |
+
1.8779296875,
|
| 3857 |
+
1.9619140625,
|
| 3858 |
+
1.9150390625,
|
| 3859 |
+
1.841796875,
|
| 3860 |
+
1.9599609375,
|
| 3861 |
+
1.8330078125,
|
| 3862 |
+
1.9140625,
|
| 3863 |
+
1.876953125,
|
| 3864 |
+
1.8193359375,
|
| 3865 |
+
1.765625,
|
| 3866 |
+
1.8671875,
|
| 3867 |
+
2.025390625,
|
| 3868 |
+
1.7880859375,
|
| 3869 |
+
1.6982421875,
|
| 3870 |
+
2.02734375,
|
| 3871 |
+
2.015625,
|
| 3872 |
+
1.8955078125,
|
| 3873 |
+
1.7900390625,
|
| 3874 |
+
1.8427734375,
|
| 3875 |
+
1.7314453125,
|
| 3876 |
+
1.8642578125,
|
| 3877 |
+
1.8212890625,
|
| 3878 |
+
1.841796875,
|
| 3879 |
+
1.833984375,
|
| 3880 |
+
1.9697265625,
|
| 3881 |
+
1.986328125,
|
| 3882 |
+
1.767578125,
|
| 3883 |
+
1.6435546875,
|
| 3884 |
+
1.8759765625,
|
| 3885 |
+
1.6845703125,
|
| 3886 |
+
1.904296875,
|
| 3887 |
+
1.8740234375,
|
| 3888 |
+
1.857421875,
|
| 3889 |
+
1.767578125,
|
| 3890 |
+
1.7255859375,
|
| 3891 |
+
1.76171875,
|
| 3892 |
+
1.8251953125,
|
| 3893 |
+
1.86328125,
|
| 3894 |
+
1.7861328125,
|
| 3895 |
+
1.7431640625,
|
| 3896 |
+
1.8330078125,
|
| 3897 |
+
1.7998046875,
|
| 3898 |
+
2.03515625,
|
| 3899 |
+
1.95703125,
|
| 3900 |
+
1.83203125,
|
| 3901 |
+
1.9892578125,
|
| 3902 |
+
1.8681640625,
|
| 3903 |
+
1.7626953125,
|
| 3904 |
+
1.978515625,
|
| 3905 |
+
1.93359375,
|
| 3906 |
+
1.837890625,
|
| 3907 |
+
1.724609375,
|
| 3908 |
+
1.9921875,
|
| 3909 |
+
1.76953125,
|
| 3910 |
+
1.791015625,
|
| 3911 |
+
1.9111328125,
|
| 3912 |
+
1.744140625,
|
| 3913 |
+
1.8525390625,
|
| 3914 |
+
2.083984375,
|
| 3915 |
+
1.90625,
|
| 3916 |
+
1.712890625,
|
| 3917 |
+
1.7666015625,
|
| 3918 |
+
1.828125,
|
| 3919 |
+
1.732421875,
|
| 3920 |
+
1.8701171875,
|
| 3921 |
+
1.677734375,
|
| 3922 |
+
1.662109375,
|
| 3923 |
+
1.7958984375,
|
| 3924 |
+
1.5283203125,
|
| 3925 |
+
1.94140625,
|
| 3926 |
+
1.576171875,
|
| 3927 |
+
1.8369140625,
|
| 3928 |
+
1.7646484375,
|
| 3929 |
+
1.8720703125,
|
| 3930 |
+
1.78125,
|
| 3931 |
+
1.8193359375,
|
| 3932 |
+
1.7626953125,
|
| 3933 |
+
1.6201171875,
|
| 3934 |
+
1.884765625,
|
| 3935 |
+
1.7578125,
|
| 3936 |
+
1.830078125,
|
| 3937 |
+
1.6669921875,
|
| 3938 |
+
1.7509765625,
|
| 3939 |
+
1.8984375,
|
| 3940 |
+
1.7685546875,
|
| 3941 |
+
1.72265625,
|
| 3942 |
+
1.943359375,
|
| 3943 |
+
1.7255859375,
|
| 3944 |
+
1.666015625,
|
| 3945 |
+
1.873046875,
|
| 3946 |
+
1.796875,
|
| 3947 |
+
1.8154296875,
|
| 3948 |
+
1.638671875,
|
| 3949 |
+
1.8154296875,
|
| 3950 |
+
1.9111328125,
|
| 3951 |
+
1.9228515625,
|
| 3952 |
+
1.677734375,
|
| 3953 |
+
1.66015625,
|
| 3954 |
+
1.7275390625,
|
| 3955 |
+
1.87109375,
|
| 3956 |
+
1.8662109375,
|
| 3957 |
+
1.822265625,
|
| 3958 |
+
1.6083984375,
|
| 3959 |
+
1.91796875,
|
| 3960 |
+
1.7783203125,
|
| 3961 |
+
1.91015625,
|
| 3962 |
+
1.828125,
|
| 3963 |
+
1.7265625,
|
| 3964 |
+
2.009765625,
|
| 3965 |
+
1.90625,
|
| 3966 |
+
1.748046875,
|
| 3967 |
+
1.6796875,
|
| 3968 |
+
1.76171875,
|
| 3969 |
+
1.703125,
|
| 3970 |
+
1.734375,
|
| 3971 |
+
1.8173828125,
|
| 3972 |
+
1.9306640625,
|
| 3973 |
+
1.703125,
|
| 3974 |
+
1.662109375,
|
| 3975 |
+
1.7802734375,
|
| 3976 |
+
1.7822265625,
|
| 3977 |
+
1.525390625,
|
| 3978 |
+
1.771484375,
|
| 3979 |
+
1.921875,
|
| 3980 |
+
1.630859375,
|
| 3981 |
+
1.5302734375,
|
| 3982 |
+
1.771484375,
|
| 3983 |
+
1.76171875,
|
| 3984 |
+
1.7783203125,
|
| 3985 |
+
1.751953125,
|
| 3986 |
+
1.77734375,
|
| 3987 |
+
1.685546875,
|
| 3988 |
+
1.7890625,
|
| 3989 |
+
1.806640625,
|
| 3990 |
+
1.7705078125,
|
| 3991 |
+
1.943359375,
|
| 3992 |
+
1.83984375,
|
| 3993 |
+
1.6904296875,
|
| 3994 |
+
1.5927734375,
|
| 3995 |
+
1.7568359375,
|
| 3996 |
+
1.70703125,
|
| 3997 |
+
1.921875,
|
| 3998 |
+
1.609375,
|
| 3999 |
+
1.6943359375,
|
| 4000 |
+
1.642578125,
|
| 4001 |
+
1.6748046875,
|
| 4002 |
+
1.7685546875,
|
| 4003 |
+
1.71484375,
|
| 4004 |
+
1.74609375,
|
| 4005 |
+
1.85546875,
|
| 4006 |
+
1.7470703125,
|
| 4007 |
+
1.7919921875,
|
| 4008 |
+
1.8349609375,
|
| 4009 |
+
1.7900390625,
|
| 4010 |
+
1.7958984375,
|
| 4011 |
+
1.556640625,
|
| 4012 |
+
1.791015625,
|
| 4013 |
+
1.6025390625,
|
| 4014 |
+
1.9130859375,
|
| 4015 |
+
1.646484375,
|
| 4016 |
+
1.712890625,
|
| 4017 |
+
1.6826171875,
|
| 4018 |
+
2.03515625,
|
| 4019 |
+
1.6689453125,
|
| 4020 |
+
1.720703125,
|
| 4021 |
+
1.6435546875,
|
| 4022 |
+
1.6279296875,
|
| 4023 |
+
1.87109375,
|
| 4024 |
+
1.8681640625,
|
| 4025 |
+
1.8828125,
|
| 4026 |
+
1.751953125,
|
| 4027 |
+
1.8037109375,
|
| 4028 |
+
1.8076171875,
|
| 4029 |
+
1.880859375,
|
| 4030 |
+
1.7353515625,
|
| 4031 |
+
1.7197265625,
|
| 4032 |
+
1.6982421875,
|
| 4033 |
+
1.6826171875,
|
| 4034 |
+
1.890625,
|
| 4035 |
+
1.775390625,
|
| 4036 |
+
1.7529296875,
|
| 4037 |
+
1.6162109375,
|
| 4038 |
+
1.6943359375,
|
| 4039 |
+
1.8701171875,
|
| 4040 |
+
1.6845703125,
|
| 4041 |
+
1.5703125,
|
| 4042 |
+
1.8671875,
|
| 4043 |
+
1.8505859375,
|
| 4044 |
+
1.7607421875,
|
| 4045 |
+
1.6611328125,
|
| 4046 |
+
1.7177734375,
|
| 4047 |
+
1.6962890625,
|
| 4048 |
+
1.912109375,
|
| 4049 |
+
1.8671875,
|
| 4050 |
+
1.6767578125,
|
| 4051 |
+
1.849609375,
|
| 4052 |
+
1.6337890625,
|
| 4053 |
+
1.802734375,
|
| 4054 |
+
1.5712890625,
|
| 4055 |
+
1.91796875,
|
| 4056 |
+
1.7802734375,
|
| 4057 |
+
1.6640625,
|
| 4058 |
+
1.5263671875,
|
| 4059 |
+
1.66796875,
|
| 4060 |
+
1.8076171875,
|
| 4061 |
+
1.830078125,
|
| 4062 |
+
1.7236328125,
|
| 4063 |
+
1.7568359375,
|
| 4064 |
+
1.7392578125,
|
| 4065 |
+
1.720703125,
|
| 4066 |
+
1.6533203125,
|
| 4067 |
+
1.5859375,
|
| 4068 |
+
1.6279296875,
|
| 4069 |
+
1.7861328125,
|
| 4070 |
+
1.8359375,
|
| 4071 |
+
1.7490234375,
|
| 4072 |
+
1.51953125,
|
| 4073 |
+
1.6005859375,
|
| 4074 |
+
1.6611328125,
|
| 4075 |
+
1.580078125,
|
| 4076 |
+
1.732421875,
|
| 4077 |
+
1.5888671875,
|
| 4078 |
+
1.638671875,
|
| 4079 |
+
1.5986328125,
|
| 4080 |
+
1.724609375,
|
| 4081 |
+
1.6865234375,
|
| 4082 |
+
1.541015625,
|
| 4083 |
+
1.7490234375,
|
| 4084 |
+
1.662109375,
|
| 4085 |
+
1.693359375,
|
| 4086 |
+
1.6865234375,
|
| 4087 |
+
1.7861328125,
|
| 4088 |
+
1.78515625,
|
| 4089 |
+
1.6728515625,
|
| 4090 |
+
1.671875,
|
| 4091 |
+
1.8837890625,
|
| 4092 |
+
1.595703125,
|
| 4093 |
+
1.5888671875,
|
| 4094 |
+
1.978515625,
|
| 4095 |
+
1.580078125,
|
| 4096 |
+
1.927734375,
|
| 4097 |
+
1.716796875,
|
| 4098 |
+
1.923828125,
|
| 4099 |
+
1.78515625,
|
| 4100 |
+
1.62890625,
|
| 4101 |
+
1.818359375,
|
| 4102 |
+
1.7041015625,
|
| 4103 |
+
1.748046875,
|
| 4104 |
+
1.6806640625,
|
| 4105 |
+
1.62890625,
|
| 4106 |
+
1.5771484375,
|
| 4107 |
+
1.771484375,
|
| 4108 |
+
1.9775390625,
|
| 4109 |
+
1.73828125,
|
| 4110 |
+
1.7353515625,
|
| 4111 |
+
1.70703125,
|
| 4112 |
+
1.76171875,
|
| 4113 |
+
1.740234375,
|
| 4114 |
+
1.611328125,
|
| 4115 |
+
1.65234375,
|
| 4116 |
+
1.6123046875,
|
| 4117 |
+
1.6923828125,
|
| 4118 |
+
1.552734375,
|
| 4119 |
+
1.853515625,
|
| 4120 |
+
1.724609375,
|
| 4121 |
+
1.68359375,
|
| 4122 |
+
1.64453125,
|
| 4123 |
+
1.5771484375,
|
| 4124 |
+
1.6513671875,
|
| 4125 |
+
1.6123046875,
|
| 4126 |
+
1.6435546875,
|
| 4127 |
+
1.712890625,
|
| 4128 |
+
1.439453125,
|
| 4129 |
+
1.822265625,
|
| 4130 |
+
1.806640625,
|
| 4131 |
+
1.67578125,
|
| 4132 |
+
1.697265625,
|
| 4133 |
+
1.7109375,
|
| 4134 |
+
1.513671875,
|
| 4135 |
+
1.7109375,
|
| 4136 |
+
1.66015625,
|
| 4137 |
+
1.55859375,
|
| 4138 |
+
1.9326171875,
|
| 4139 |
+
1.8271484375,
|
| 4140 |
+
1.708984375,
|
| 4141 |
+
1.560546875,
|
| 4142 |
+
1.650390625,
|
| 4143 |
+
1.6650390625,
|
| 4144 |
+
1.681640625,
|
| 4145 |
+
1.4736328125,
|
| 4146 |
+
1.5341796875,
|
| 4147 |
+
1.5703125,
|
| 4148 |
+
1.9296875,
|
| 4149 |
+
1.6640625,
|
| 4150 |
+
1.80859375,
|
| 4151 |
+
1.4658203125,
|
| 4152 |
+
1.771484375,
|
| 4153 |
+
1.58984375,
|
| 4154 |
+
1.63671875,
|
| 4155 |
+
1.6767578125,
|
| 4156 |
+
1.5478515625,
|
| 4157 |
+
1.583984375,
|
| 4158 |
+
1.703125,
|
| 4159 |
+
1.54296875,
|
| 4160 |
+
1.7216796875,
|
| 4161 |
+
1.6962890625,
|
| 4162 |
+
1.7119140625,
|
| 4163 |
+
1.8515625,
|
| 4164 |
+
1.6318359375,
|
| 4165 |
+
1.73828125,
|
| 4166 |
+
1.80859375,
|
| 4167 |
+
1.6181640625,
|
| 4168 |
+
1.5859375,
|
| 4169 |
+
1.6494140625,
|
| 4170 |
+
1.572265625,
|
| 4171 |
+
1.6708984375,
|
| 4172 |
+
1.732421875,
|
| 4173 |
+
1.515625,
|
| 4174 |
+
1.638671875,
|
| 4175 |
+
1.7900390625,
|
| 4176 |
+
1.685546875,
|
| 4177 |
+
1.810546875,
|
| 4178 |
+
1.853515625,
|
| 4179 |
+
1.763671875,
|
| 4180 |
+
1.83984375,
|
| 4181 |
+
1.76953125,
|
| 4182 |
+
1.626953125,
|
| 4183 |
+
1.654296875,
|
| 4184 |
+
1.6552734375,
|
| 4185 |
+
1.517578125,
|
| 4186 |
+
1.779296875,
|
| 4187 |
+
1.76171875,
|
| 4188 |
+
1.74609375,
|
| 4189 |
+
1.6708984375,
|
| 4190 |
+
1.6650390625,
|
| 4191 |
+
1.6875,
|
| 4192 |
+
1.7890625,
|
| 4193 |
+
1.615234375,
|
| 4194 |
+
1.794921875,
|
| 4195 |
+
1.734375,
|
| 4196 |
+
1.4580078125,
|
| 4197 |
+
1.4296875,
|
| 4198 |
+
1.5009765625,
|
| 4199 |
+
1.658203125,
|
| 4200 |
+
1.73046875,
|
| 4201 |
+
1.6357421875,
|
| 4202 |
+
1.474609375,
|
| 4203 |
+
1.802734375,
|
| 4204 |
+
1.6796875,
|
| 4205 |
+
1.8583984375,
|
| 4206 |
+
1.466796875,
|
| 4207 |
+
1.7568359375,
|
| 4208 |
+
1.8037109375,
|
| 4209 |
+
1.6396484375,
|
| 4210 |
+
1.5966796875,
|
| 4211 |
+
1.5458984375,
|
| 4212 |
+
1.7431640625,
|
| 4213 |
+
1.740234375,
|
| 4214 |
+
1.7216796875,
|
| 4215 |
+
1.7490234375,
|
| 4216 |
+
1.755859375,
|
| 4217 |
+
1.57421875,
|
| 4218 |
+
1.75390625,
|
| 4219 |
+
1.828125,
|
| 4220 |
+
1.7197265625,
|
| 4221 |
+
1.498046875,
|
| 4222 |
+
1.755859375,
|
| 4223 |
+
1.6630859375,
|
| 4224 |
+
1.6875,
|
| 4225 |
+
1.6337890625,
|
| 4226 |
+
1.6533203125,
|
| 4227 |
+
1.63671875,
|
| 4228 |
+
1.7451171875,
|
| 4229 |
+
1.7177734375,
|
| 4230 |
+
1.6982421875,
|
| 4231 |
+
1.6953125,
|
| 4232 |
+
1.634765625,
|
| 4233 |
+
1.552734375,
|
| 4234 |
+
1.8212890625,
|
| 4235 |
+
1.6474609375,
|
| 4236 |
+
1.615234375,
|
| 4237 |
+
1.70703125,
|
| 4238 |
+
1.61328125,
|
| 4239 |
+
1.7119140625,
|
| 4240 |
+
1.666015625,
|
| 4241 |
+
1.7802734375,
|
| 4242 |
+
1.744140625,
|
| 4243 |
+
1.37109375,
|
| 4244 |
+
1.5908203125,
|
| 4245 |
+
1.716796875,
|
| 4246 |
+
1.6669921875,
|
| 4247 |
+
1.6552734375,
|
| 4248 |
+
1.5439453125,
|
| 4249 |
+
1.6015625,
|
| 4250 |
+
1.7109375,
|
| 4251 |
+
1.8408203125,
|
| 4252 |
+
1.7041015625,
|
| 4253 |
+
1.5478515625,
|
| 4254 |
+
1.681640625,
|
| 4255 |
+
1.8154296875,
|
| 4256 |
+
1.6513671875,
|
| 4257 |
+
1.6455078125,
|
| 4258 |
+
1.7529296875,
|
| 4259 |
+
1.7294921875,
|
| 4260 |
+
1.640625,
|
| 4261 |
+
1.537109375,
|
| 4262 |
+
1.580078125,
|
| 4263 |
+
1.591796875,
|
| 4264 |
+
1.7958984375,
|
| 4265 |
+
1.6806640625,
|
| 4266 |
+
1.734375,
|
| 4267 |
+
1.7216796875,
|
| 4268 |
+
1.73046875,
|
| 4269 |
+
1.7578125,
|
| 4270 |
+
1.5712890625,
|
| 4271 |
+
1.697265625,
|
| 4272 |
+
1.6123046875,
|
| 4273 |
+
1.5380859375,
|
| 4274 |
+
1.515625,
|
| 4275 |
+
1.720703125,
|
| 4276 |
+
1.5517578125,
|
| 4277 |
+
1.5849609375,
|
| 4278 |
+
1.5625,
|
| 4279 |
+
1.5576171875,
|
| 4280 |
+
1.634765625,
|
| 4281 |
+
1.72265625,
|
| 4282 |
+
1.5205078125,
|
| 4283 |
+
1.6669921875,
|
| 4284 |
+
1.6474609375,
|
| 4285 |
+
1.66796875,
|
| 4286 |
+
1.52734375,
|
| 4287 |
+
1.8486328125,
|
| 4288 |
+
1.703125,
|
| 4289 |
+
1.7021484375,
|
| 4290 |
+
1.34765625,
|
| 4291 |
+
1.6005859375,
|
| 4292 |
+
1.73828125,
|
| 4293 |
+
1.5341796875,
|
| 4294 |
+
1.7255859375,
|
| 4295 |
+
1.6220703125,
|
| 4296 |
+
1.6025390625,
|
| 4297 |
+
1.61328125,
|
| 4298 |
+
1.56640625,
|
| 4299 |
+
1.5908203125,
|
| 4300 |
+
1.5810546875,
|
| 4301 |
+
1.5654296875,
|
| 4302 |
+
1.6005859375,
|
| 4303 |
+
1.8056640625,
|
| 4304 |
+
1.662109375,
|
| 4305 |
+
1.6875,
|
| 4306 |
+
1.6982421875,
|
| 4307 |
+
1.7900390625,
|
| 4308 |
+
1.6494140625,
|
| 4309 |
+
1.7109375,
|
| 4310 |
+
1.7412109375,
|
| 4311 |
+
1.607421875,
|
| 4312 |
+
1.7119140625,
|
| 4313 |
+
1.6962890625,
|
| 4314 |
+
1.5380859375,
|
| 4315 |
+
1.6630859375,
|
| 4316 |
+
1.7666015625,
|
| 4317 |
+
1.8623046875,
|
| 4318 |
+
1.654296875,
|
| 4319 |
+
1.6416015625,
|
| 4320 |
+
1.7236328125,
|
| 4321 |
+
1.7822265625,
|
| 4322 |
+
1.623046875,
|
| 4323 |
+
1.6826171875,
|
| 4324 |
+
1.7607421875,
|
| 4325 |
+
1.6943359375,
|
| 4326 |
+
1.8603515625,
|
| 4327 |
+
1.8115234375,
|
| 4328 |
+
1.8232421875,
|
| 4329 |
+
1.513671875,
|
| 4330 |
+
1.58984375,
|
| 4331 |
+
1.6318359375,
|
| 4332 |
+
1.6201171875,
|
| 4333 |
+
1.6484375,
|
| 4334 |
+
1.7197265625,
|
| 4335 |
+
1.6162109375,
|
| 4336 |
+
1.7333984375,
|
| 4337 |
+
1.58203125,
|
| 4338 |
+
1.6708984375,
|
| 4339 |
+
1.7783203125,
|
| 4340 |
+
1.4453125,
|
| 4341 |
+
1.767578125,
|
| 4342 |
+
1.6474609375,
|
| 4343 |
+
1.5224609375,
|
| 4344 |
+
1.732421875,
|
| 4345 |
+
1.8046875,
|
| 4346 |
+
1.6435546875,
|
| 4347 |
+
1.7177734375,
|
| 4348 |
+
1.6728515625,
|
| 4349 |
+
1.7900390625,
|
| 4350 |
+
1.5234375,
|
| 4351 |
+
1.6845703125,
|
| 4352 |
+
1.447265625,
|
| 4353 |
+
1.8232421875,
|
| 4354 |
+
1.6533203125,
|
| 4355 |
+
1.6669921875,
|
| 4356 |
+
1.693359375,
|
| 4357 |
+
1.7880859375,
|
| 4358 |
+
1.7978515625,
|
| 4359 |
+
1.685546875,
|
| 4360 |
+
1.8330078125,
|
| 4361 |
+
1.6962890625,
|
| 4362 |
+
1.71875,
|
| 4363 |
+
1.759765625,
|
| 4364 |
+
1.9287109375,
|
| 4365 |
+
1.6728515625,
|
| 4366 |
+
1.5888671875,
|
| 4367 |
+
1.62890625,
|
| 4368 |
+
1.6533203125,
|
| 4369 |
+
1.7080078125,
|
| 4370 |
+
1.5400390625,
|
| 4371 |
+
1.693359375,
|
| 4372 |
+
1.6875,
|
| 4373 |
+
1.4736328125,
|
| 4374 |
+
1.767578125,
|
| 4375 |
+
1.53515625,
|
| 4376 |
+
1.6806640625,
|
| 4377 |
+
1.7978515625,
|
| 4378 |
+
1.66015625,
|
| 4379 |
+
1.6884765625,
|
| 4380 |
+
1.7431640625,
|
| 4381 |
+
1.638671875,
|
| 4382 |
+
1.6962890625,
|
| 4383 |
+
1.6728515625,
|
| 4384 |
+
1.791015625,
|
| 4385 |
+
1.6591796875,
|
| 4386 |
+
1.7080078125,
|
| 4387 |
+
1.6005859375,
|
| 4388 |
+
1.474609375,
|
| 4389 |
+
1.4794921875,
|
| 4390 |
+
1.6708984375,
|
| 4391 |
+
1.7783203125,
|
| 4392 |
+
1.8330078125,
|
| 4393 |
+
1.6640625,
|
| 4394 |
+
1.8017578125,
|
| 4395 |
+
1.6083984375,
|
| 4396 |
+
1.7529296875,
|
| 4397 |
+
1.5673828125,
|
| 4398 |
+
1.5380859375,
|
| 4399 |
+
1.8388671875,
|
| 4400 |
+
1.6337890625,
|
| 4401 |
+
1.623046875,
|
| 4402 |
+
1.6923828125,
|
| 4403 |
+
1.7138671875,
|
| 4404 |
+
1.791015625,
|
| 4405 |
+
1.6640625,
|
| 4406 |
+
1.5625,
|
| 4407 |
+
1.7568359375,
|
| 4408 |
+
1.73046875,
|
| 4409 |
+
1.6005859375,
|
| 4410 |
+
1.6318359375,
|
| 4411 |
+
1.7294921875,
|
| 4412 |
+
1.7705078125,
|
| 4413 |
+
1.2666015625,
|
| 4414 |
+
1.5478515625,
|
| 4415 |
+
1.5068359375,
|
| 4416 |
+
1.76171875,
|
| 4417 |
+
1.6171875,
|
| 4418 |
+
1.6982421875,
|
| 4419 |
+
1.697265625,
|
| 4420 |
+
1.5400390625,
|
| 4421 |
+
1.7060546875,
|
| 4422 |
+
1.7353515625,
|
| 4423 |
+
1.5732421875,
|
| 4424 |
+
1.8232421875,
|
| 4425 |
+
1.78125,
|
| 4426 |
+
1.728515625,
|
| 4427 |
+
1.6240234375,
|
| 4428 |
+
1.708984375,
|
| 4429 |
+
1.6708984375,
|
| 4430 |
+
1.587890625,
|
| 4431 |
+
1.8095703125,
|
| 4432 |
+
1.5234375,
|
| 4433 |
+
1.69140625,
|
| 4434 |
+
1.5947265625,
|
| 4435 |
+
1.7822265625,
|
| 4436 |
+
1.6962890625,
|
| 4437 |
+
1.4306640625,
|
| 4438 |
+
1.7451171875,
|
| 4439 |
+
1.6533203125,
|
| 4440 |
+
1.5927734375,
|
| 4441 |
+
1.5888671875,
|
| 4442 |
+
1.603515625,
|
| 4443 |
+
1.6298828125,
|
| 4444 |
+
1.591796875,
|
| 4445 |
+
1.6826171875,
|
| 4446 |
+
1.650390625,
|
| 4447 |
+
1.6337890625,
|
| 4448 |
+
1.6962890625,
|
| 4449 |
+
1.6943359375,
|
| 4450 |
+
1.6328125,
|
| 4451 |
+
1.6474609375,
|
| 4452 |
+
1.6298828125,
|
| 4453 |
+
1.6533203125,
|
| 4454 |
+
1.494140625,
|
| 4455 |
+
1.5556640625,
|
| 4456 |
+
1.6533203125,
|
| 4457 |
+
1.4619140625,
|
| 4458 |
+
1.7392578125,
|
| 4459 |
+
1.5576171875,
|
| 4460 |
+
1.5693359375,
|
| 4461 |
+
1.7080078125,
|
| 4462 |
+
1.443359375,
|
| 4463 |
+
1.638671875,
|
| 4464 |
+
1.701171875,
|
| 4465 |
+
1.6416015625,
|
| 4466 |
+
1.61328125,
|
| 4467 |
+
1.5888671875,
|
| 4468 |
+
1.490234375,
|
| 4469 |
+
1.6171875,
|
| 4470 |
+
1.744140625,
|
| 4471 |
+
1.5244140625,
|
| 4472 |
+
1.716796875,
|
| 4473 |
+
1.6962890625,
|
| 4474 |
+
1.71484375,
|
| 4475 |
+
1.6806640625,
|
| 4476 |
+
1.6962890625,
|
| 4477 |
+
1.623046875,
|
| 4478 |
+
1.7958984375,
|
| 4479 |
+
1.734375,
|
| 4480 |
+
1.6533203125,
|
| 4481 |
+
1.6962890625,
|
| 4482 |
+
1.5791015625,
|
| 4483 |
+
1.7060546875,
|
| 4484 |
+
1.6171875,
|
| 4485 |
+
1.6962890625,
|
| 4486 |
+
1.5830078125,
|
| 4487 |
+
1.732421875,
|
| 4488 |
+
1.7255859375,
|
| 4489 |
+
1.6484375,
|
| 4490 |
+
1.6025390625,
|
| 4491 |
+
1.4814453125,
|
| 4492 |
+
1.701171875,
|
| 4493 |
+
1.5341796875,
|
| 4494 |
+
1.5810546875,
|
| 4495 |
+
1.7255859375,
|
| 4496 |
+
1.765625,
|
| 4497 |
+
1.8017578125,
|
| 4498 |
+
1.67578125,
|
| 4499 |
+
1.671875,
|
| 4500 |
+
1.6240234375,
|
| 4501 |
+
1.6015625,
|
| 4502 |
+
1.505859375,
|
| 4503 |
+
1.6875,
|
| 4504 |
+
1.6376953125,
|
| 4505 |
+
1.6748046875,
|
| 4506 |
+
1.67578125
|
| 4507 |
+
]
|
| 4508 |
+
}
|
circuit_analysis/results/cpr_cmd_results.json
ADDED
|
@@ -0,0 +1,108 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"results": [
|
| 3 |
+
{
|
| 4 |
+
"prompt": "The capital of France is",
|
| 5 |
+
"target_token": " Paris",
|
| 6 |
+
"m_N": -0.153564453125,
|
| 7 |
+
"m_empty": -10.2265625,
|
| 8 |
+
"curve_k": [
|
| 9 |
+
0.0,
|
| 10 |
+
0.0009930096056473734,
|
| 11 |
+
0.002035178102663429,
|
| 12 |
+
0.005083029367521703,
|
| 13 |
+
0.010116899843674726,
|
| 14 |
+
0.020007668787053515,
|
| 15 |
+
0.05001425607849692,
|
| 16 |
+
0.10017598883109988,
|
| 17 |
+
0.20007668787053515,
|
| 18 |
+
0.5000639065587793,
|
| 19 |
+
1.0
|
| 20 |
+
],
|
| 21 |
+
"curve_f": [
|
| 22 |
+
0.0,
|
| 23 |
+
0.020940885624954556,
|
| 24 |
+
0.04188177124990911,
|
| 25 |
+
0.08066118907389903,
|
| 26 |
+
0.11944060689788895,
|
| 27 |
+
0.2322887127656996,
|
| 28 |
+
0.7083060665551758,
|
| 29 |
+
0.8935747352092877,
|
| 30 |
+
0.8023461547783514,
|
| 31 |
+
0.8479119707215396,
|
| 32 |
+
1.0
|
| 33 |
+
],
|
| 34 |
+
"CPR": 0.8509204971370973,
|
| 35 |
+
"CMD": 0.14907950286290272
|
| 36 |
+
},
|
| 37 |
+
{
|
| 38 |
+
"prompt": "def factorial(n):",
|
| 39 |
+
"target_token": " #",
|
| 40 |
+
"m_N": -0.498779296875,
|
| 41 |
+
"m_empty": -1.625,
|
| 42 |
+
"curve_k": [
|
| 43 |
+
0.0,
|
| 44 |
+
0.0010854585147512593,
|
| 45 |
+
0.0021831131925896113,
|
| 46 |
+
0.0050492115180564194,
|
| 47 |
+
0.010086226873025746,
|
| 48 |
+
0.02017245374605149,
|
| 49 |
+
0.05017501494029978,
|
| 50 |
+
0.10000853731416097,
|
| 51 |
+
0.20012684009610576,
|
| 52 |
+
0.5000304904077177,
|
| 53 |
+
1.0
|
| 54 |
+
],
|
| 55 |
+
"curve_f": [
|
| 56 |
+
0.0,
|
| 57 |
+
0.0,
|
| 58 |
+
0.0,
|
| 59 |
+
0.0,
|
| 60 |
+
0.0,
|
| 61 |
+
0.0,
|
| 62 |
+
0.0,
|
| 63 |
+
0.4985909386516367,
|
| 64 |
+
0.716236722306525,
|
| 65 |
+
0.0,
|
| 66 |
+
1.0
|
| 67 |
+
],
|
| 68 |
+
"CPR": 0.43062227169181266,
|
| 69 |
+
"CMD": 0.5693777283081873
|
| 70 |
+
},
|
| 71 |
+
{
|
| 72 |
+
"prompt": "The literary device in the phrase 'The wind whispered through the trees' is",
|
| 73 |
+
"target_token": " person",
|
| 74 |
+
"m_N": -0.452392578125,
|
| 75 |
+
"m_empty": -1.4228515625,
|
| 76 |
+
"curve_k": [
|
| 77 |
+
0.0,
|
| 78 |
+
0.0010364040898343655,
|
| 79 |
+
0.002022097276190597,
|
| 80 |
+
0.005020379444335275,
|
| 81 |
+
0.010018572868398867,
|
| 82 |
+
0.0200054514221239,
|
| 83 |
+
0.050054831164385735,
|
| 84 |
+
0.10001140995328257,
|
| 85 |
+
0.20002915876949992,
|
| 86 |
+
0.5000538803349455,
|
| 87 |
+
1.0
|
| 88 |
+
],
|
| 89 |
+
"curve_f": [
|
| 90 |
+
0.0,
|
| 91 |
+
0.0,
|
| 92 |
+
0.0,
|
| 93 |
+
0.0,
|
| 94 |
+
0.0,
|
| 95 |
+
0.0,
|
| 96 |
+
0.0,
|
| 97 |
+
0.0,
|
| 98 |
+
0.0,
|
| 99 |
+
0.0,
|
| 100 |
+
1.0
|
| 101 |
+
],
|
| 102 |
+
"CPR": 0.24997305983252727,
|
| 103 |
+
"CMD": 0.7500269401674727
|
| 104 |
+
}
|
| 105 |
+
],
|
| 106 |
+
"average_CPR": 0.510505276220479,
|
| 107 |
+
"average_CMD": 0.4894947237795209
|
| 108 |
+
}
|
circuit_analysis/results/feature_interpretations_cache/feature_interpretations.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
circuit_analysis/results/offline_circuit_metrics.json
ADDED
|
@@ -0,0 +1,484 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"prompts_ran": [
|
| 3 |
+
"The capital of France is",
|
| 4 |
+
"def factorial(n):",
|
| 5 |
+
"The literary device in the phrase 'The wind whispered through the trees' is"
|
| 6 |
+
],
|
| 7 |
+
"aggregate_summary": {
|
| 8 |
+
"targeted": {
|
| 9 |
+
"count": 384,
|
| 10 |
+
"avg_probability_change": 0.0037064552307128906,
|
| 11 |
+
"avg_abs_probability_change": 0.017832597096761067,
|
| 12 |
+
"std_probability_change": 0.024085208735490346,
|
| 13 |
+
"avg_logit_change": 0.015276943643887838,
|
| 14 |
+
"avg_abs_logit_change": 0.10990743339061737,
|
| 15 |
+
"std_logit_change": 0.14790562906131785,
|
| 16 |
+
"avg_kl_divergence": 0.0,
|
| 17 |
+
"avg_entropy_change": 0.0,
|
| 18 |
+
"avg_hidden_state_delta_norm": 7.304225921630859,
|
| 19 |
+
"avg_hidden_state_relative_change": 0.07329477507168775,
|
| 20 |
+
"flip_rate": 0.1640625,
|
| 21 |
+
"count_flipped": 63
|
| 22 |
+
},
|
| 23 |
+
"random_baseline": {
|
| 24 |
+
"count": 15,
|
| 25 |
+
"avg_probability_change": 0.00411376953125,
|
| 26 |
+
"avg_abs_probability_change": 0.0045206705729166664,
|
| 27 |
+
"std_probability_change": 0.008715364389044073,
|
| 28 |
+
"avg_logit_change": 0.025040690104166666,
|
| 29 |
+
"avg_abs_logit_change": 0.0387451171875,
|
| 30 |
+
"std_logit_change": 0.056757731194411305,
|
| 31 |
+
"avg_kl_divergence": 0.0,
|
| 32 |
+
"avg_entropy_change": 0.0,
|
| 33 |
+
"avg_hidden_state_delta_norm": 2.098746744791667,
|
| 34 |
+
"avg_hidden_state_relative_change": 0.020347025628623817,
|
| 35 |
+
"flip_rate": 0.13333333333333333,
|
| 36 |
+
"count_flipped": 2
|
| 37 |
+
},
|
| 38 |
+
"path": {
|
| 39 |
+
"count": 9,
|
| 40 |
+
"avg_probability_change": 0.05379909939236111,
|
| 41 |
+
"avg_abs_probability_change": 0.05379909939236111,
|
| 42 |
+
"std_probability_change": 0.03592616044133087,
|
| 43 |
+
"avg_logit_change": 0.1922607421875,
|
| 44 |
+
"avg_abs_logit_change": 0.1967095269097222,
|
| 45 |
+
"std_logit_change": 0.21084131002100345,
|
| 46 |
+
"avg_kl_divergence": 0.0,
|
| 47 |
+
"avg_entropy_change": 0.0,
|
| 48 |
+
"avg_hidden_state_delta_norm": 9.847222222222221,
|
| 49 |
+
"avg_hidden_state_relative_change": 0.09728426029029577,
|
| 50 |
+
"flip_rate": 0.2222222222222222,
|
| 51 |
+
"count_flipped": 2
|
| 52 |
+
},
|
| 53 |
+
"random_path_baseline": {
|
| 54 |
+
"count": 10,
|
| 55 |
+
"avg_probability_change": -0.00140380859375,
|
| 56 |
+
"avg_abs_probability_change": 0.01318359375,
|
| 57 |
+
"std_probability_change": 0.017461901132504055,
|
| 58 |
+
"avg_logit_change": 0.012884521484375,
|
| 59 |
+
"avg_abs_logit_change": 0.075469970703125,
|
| 60 |
+
"std_logit_change": 0.08951169079108637,
|
| 61 |
+
"avg_kl_divergence": 0.0,
|
| 62 |
+
"avg_entropy_change": 0.0,
|
| 63 |
+
"avg_hidden_state_delta_norm": 6.292578125,
|
| 64 |
+
"avg_hidden_state_relative_change": 0.06215471324319018,
|
| 65 |
+
"flip_rate": 0.2,
|
| 66 |
+
"count_flipped": 2
|
| 67 |
+
},
|
| 68 |
+
"target_minus_random_abs_probability_change": 0.0133119265238444,
|
| 69 |
+
"target_flip_rate_minus_random": 0.03072916666666667,
|
| 70 |
+
"path_minus_random_abs_probability_change": 0.04061550564236111,
|
| 71 |
+
"path_flip_rate_minus_random": 0.0222222222222222
|
| 72 |
+
},
|
| 73 |
+
"per_prompt": {
|
| 74 |
+
"prompt_1": {
|
| 75 |
+
"prompt": "The capital of France is",
|
| 76 |
+
"summary_statistics": {
|
| 77 |
+
"targeted": {
|
| 78 |
+
"count": 128,
|
| 79 |
+
"avg_probability_change": 0.004764556884765625,
|
| 80 |
+
"avg_abs_probability_change": 0.025264739990234375,
|
| 81 |
+
"std_probability_change": 0.03195406092166983,
|
| 82 |
+
"avg_logit_change": 0.005943402647972107,
|
| 83 |
+
"avg_abs_logit_change": 0.08632268011569977,
|
| 84 |
+
"std_logit_change": 0.1132019008155424,
|
| 85 |
+
"avg_kl_divergence": 0.0,
|
| 86 |
+
"avg_entropy_change": 0.0,
|
| 87 |
+
"avg_hidden_state_delta_norm": 6.6026153564453125,
|
| 88 |
+
"avg_hidden_state_relative_change": 0.070100760253486,
|
| 89 |
+
"flip_rate": 0.0,
|
| 90 |
+
"count_flipped": 0
|
| 91 |
+
},
|
| 92 |
+
"random_baseline": {
|
| 93 |
+
"count": 5,
|
| 94 |
+
"avg_probability_change": 0.006494140625,
|
| 95 |
+
"avg_abs_probability_change": 0.006884765625,
|
| 96 |
+
"std_probability_change": 0.012063215323471172,
|
| 97 |
+
"avg_logit_change": 0.0234619140625,
|
| 98 |
+
"avg_abs_logit_change": 0.0234619140625,
|
| 99 |
+
"std_logit_change": 0.042261115942436644,
|
| 100 |
+
"avg_kl_divergence": 0.0,
|
| 101 |
+
"avg_entropy_change": 0.0,
|
| 102 |
+
"avg_hidden_state_delta_norm": 1.45966796875,
|
| 103 |
+
"avg_hidden_state_relative_change": 0.015497470139185163,
|
| 104 |
+
"flip_rate": 0.0,
|
| 105 |
+
"count_flipped": 0
|
| 106 |
+
},
|
| 107 |
+
"path": {
|
| 108 |
+
"count": 3,
|
| 109 |
+
"avg_probability_change": 0.07967122395833333,
|
| 110 |
+
"avg_abs_probability_change": 0.07967122395833333,
|
| 111 |
+
"std_probability_change": 0.013695590325716009,
|
| 112 |
+
"avg_logit_change": 0.07661946614583333,
|
| 113 |
+
"avg_abs_logit_change": 0.07661946614583333,
|
| 114 |
+
"std_logit_change": 0.03182210693328131,
|
| 115 |
+
"avg_kl_divergence": 0.0,
|
| 116 |
+
"avg_entropy_change": 0.0,
|
| 117 |
+
"avg_hidden_state_delta_norm": 7.229166666666667,
|
| 118 |
+
"avg_hidden_state_relative_change": 0.0767529307667144,
|
| 119 |
+
"flip_rate": 0.0,
|
| 120 |
+
"count_flipped": 0
|
| 121 |
+
},
|
| 122 |
+
"random_path_baseline": {
|
| 123 |
+
"count": 4,
|
| 124 |
+
"avg_probability_change": -0.00994873046875,
|
| 125 |
+
"avg_abs_probability_change": 0.01690673828125,
|
| 126 |
+
"std_probability_change": 0.02002948420533245,
|
| 127 |
+
"avg_logit_change": 0.0011444091796875,
|
| 128 |
+
"avg_abs_logit_change": 0.0316314697265625,
|
| 129 |
+
"std_logit_change": 0.0377090926751411,
|
| 130 |
+
"avg_kl_divergence": 0.0,
|
| 131 |
+
"avg_entropy_change": 0.0,
|
| 132 |
+
"avg_hidden_state_delta_norm": 4.0029296875,
|
| 133 |
+
"avg_hidden_state_relative_change": 0.04249958526829463,
|
| 134 |
+
"flip_rate": 0.0,
|
| 135 |
+
"count_flipped": 0
|
| 136 |
+
},
|
| 137 |
+
"target_minus_random_abs_probability_change": 0.018379974365234374,
|
| 138 |
+
"target_flip_rate_minus_random": 0.0,
|
| 139 |
+
"path_minus_random_abs_probability_change": 0.06276448567708333,
|
| 140 |
+
"path_flip_rate_minus_random": 0.0
|
| 141 |
+
},
|
| 142 |
+
"counts": {
|
| 143 |
+
"targeted": 128,
|
| 144 |
+
"random": 5,
|
| 145 |
+
"path": 3,
|
| 146 |
+
"random_path": 4
|
| 147 |
+
},
|
| 148 |
+
"top_targeted": [
|
| 149 |
+
{
|
| 150 |
+
"label": "feature_208",
|
| 151 |
+
"probability_change": -0.1171875,
|
| 152 |
+
"logit_change": -0.22528076171875,
|
| 153 |
+
"flip": false
|
| 154 |
+
},
|
| 155 |
+
{
|
| 156 |
+
"label": "feature_378",
|
| 157 |
+
"probability_change": 0.0859375,
|
| 158 |
+
"logit_change": 0.137939453125,
|
| 159 |
+
"flip": false
|
| 160 |
+
},
|
| 161 |
+
{
|
| 162 |
+
"label": "feature_467",
|
| 163 |
+
"probability_change": 0.077880859375,
|
| 164 |
+
"logit_change": 0.0877685546875,
|
| 165 |
+
"flip": false
|
| 166 |
+
},
|
| 167 |
+
{
|
| 168 |
+
"label": "feature_134",
|
| 169 |
+
"probability_change": 0.072021484375,
|
| 170 |
+
"logit_change": 0.224365234375,
|
| 171 |
+
"flip": false
|
| 172 |
+
},
|
| 173 |
+
{
|
| 174 |
+
"label": "feature_142",
|
| 175 |
+
"probability_change": 0.07177734375,
|
| 176 |
+
"logit_change": 0.24560546875,
|
| 177 |
+
"flip": false
|
| 178 |
+
}
|
| 179 |
+
],
|
| 180 |
+
"top_paths": [
|
| 181 |
+
{
|
| 182 |
+
"label": "Token '\u0120capital' \u2192 Feature L0F322 (Word/alphabetic tokens) \u2192 Feature L1F44 (Word/alphabetic tokens) \u2192 Feature L2F320 (Word/alphabetic tokens) \u2192 Feature L3F338 (Word/alphabetic tokens) \u2192 Output",
|
| 183 |
+
"probability_change": 0.08935546875,
|
| 184 |
+
"logit_change": 0.09912109375,
|
| 185 |
+
"flip": false
|
| 186 |
+
},
|
| 187 |
+
{
|
| 188 |
+
"label": "Token 'The' \u2192 Feature L0F322 (Word/alphabetic tokens) \u2192 Feature L1F44 (Word/alphabetic tokens) \u2192 Feature L2F320 (Word/alphabetic tokens) \u2192 Feature L3F338 (Word/alphabetic tokens) \u2192 Output",
|
| 189 |
+
"probability_change": 0.08935546875,
|
| 190 |
+
"logit_change": 0.09912109375,
|
| 191 |
+
"flip": false
|
| 192 |
+
},
|
| 193 |
+
{
|
| 194 |
+
"label": "Token '\u0120capital' \u2192 Feature L0F322 (Word/alphabetic tokens) \u2192 Feature L1F44 (Word/alphabetic tokens) \u2192 Feature L2F320 (Word/alphabetic tokens) \u2192 Feature L3F367 (Word/alphabetic tokens) \u2192 Output",
|
| 195 |
+
"probability_change": 0.060302734375,
|
| 196 |
+
"logit_change": 0.0316162109375,
|
| 197 |
+
"flip": false
|
| 198 |
+
}
|
| 199 |
+
]
|
| 200 |
+
},
|
| 201 |
+
"prompt_2": {
|
| 202 |
+
"prompt": "def factorial(n):",
|
| 203 |
+
"summary_statistics": {
|
| 204 |
+
"targeted": {
|
| 205 |
+
"count": 128,
|
| 206 |
+
"avg_probability_change": 0.003498077392578125,
|
| 207 |
+
"avg_abs_probability_change": 0.016864776611328125,
|
| 208 |
+
"std_probability_change": 0.02251701216357027,
|
| 209 |
+
"avg_logit_change": -0.004237174987792969,
|
| 210 |
+
"avg_abs_logit_change": 0.10776042938232422,
|
| 211 |
+
"std_logit_change": 0.14078546527424055,
|
| 212 |
+
"avg_kl_divergence": 0.0,
|
| 213 |
+
"avg_entropy_change": 0.0,
|
| 214 |
+
"avg_hidden_state_delta_norm": 8.8001708984375,
|
| 215 |
+
"avg_hidden_state_relative_change": 0.08148306387366683,
|
| 216 |
+
"flip_rate": 0.0,
|
| 217 |
+
"count_flipped": 0
|
| 218 |
+
},
|
| 219 |
+
"random_baseline": {
|
| 220 |
+
"count": 5,
|
| 221 |
+
"avg_probability_change": 0.00400390625,
|
| 222 |
+
"avg_abs_probability_change": 0.00478515625,
|
| 223 |
+
"std_probability_change": 0.008191782057277022,
|
| 224 |
+
"avg_logit_change": 0.030224609375,
|
| 225 |
+
"avg_abs_logit_change": 0.071337890625,
|
| 226 |
+
"std_logit_change": 0.08533968984750907,
|
| 227 |
+
"avg_kl_divergence": 0.0,
|
| 228 |
+
"avg_entropy_change": 0.0,
|
| 229 |
+
"avg_hidden_state_delta_norm": 4.21953125,
|
| 230 |
+
"avg_hidden_state_relative_change": 0.039069733795934536,
|
| 231 |
+
"flip_rate": 0.0,
|
| 232 |
+
"count_flipped": 0
|
| 233 |
+
},
|
| 234 |
+
"path": {
|
| 235 |
+
"count": 3,
|
| 236 |
+
"avg_probability_change": 0.07674153645833333,
|
| 237 |
+
"avg_abs_probability_change": 0.07674153645833333,
|
| 238 |
+
"std_probability_change": 0.009609931026867956,
|
| 239 |
+
"avg_logit_change": 0.4598795572916667,
|
| 240 |
+
"avg_abs_logit_change": 0.4598795572916667,
|
| 241 |
+
"std_logit_change": 0.13856714917783255,
|
| 242 |
+
"avg_kl_divergence": 0.0,
|
| 243 |
+
"avg_entropy_change": 0.0,
|
| 244 |
+
"avg_hidden_state_delta_norm": 15.4140625,
|
| 245 |
+
"avg_hidden_state_relative_change": 0.1427228009246044,
|
| 246 |
+
"flip_rate": 0.3333333333333333,
|
| 247 |
+
"count_flipped": 1
|
| 248 |
+
},
|
| 249 |
+
"random_path_baseline": {
|
| 250 |
+
"count": 3,
|
| 251 |
+
"avg_probability_change": 0.00439453125,
|
| 252 |
+
"avg_abs_probability_change": 0.013671875,
|
| 253 |
+
"std_probability_change": 0.016523589485978502,
|
| 254 |
+
"avg_logit_change": -0.005452473958333333,
|
| 255 |
+
"avg_abs_logit_change": 0.11726888020833333,
|
| 256 |
+
"std_logit_change": 0.12771601543377464,
|
| 257 |
+
"avg_kl_divergence": 0.0,
|
| 258 |
+
"avg_entropy_change": 0.0,
|
| 259 |
+
"avg_hidden_state_delta_norm": 10.997395833333334,
|
| 260 |
+
"avg_hidden_state_relative_change": 0.10182773919658801,
|
| 261 |
+
"flip_rate": 0.0,
|
| 262 |
+
"count_flipped": 0
|
| 263 |
+
},
|
| 264 |
+
"target_minus_random_abs_probability_change": 0.012079620361328125,
|
| 265 |
+
"target_flip_rate_minus_random": 0.0,
|
| 266 |
+
"path_minus_random_abs_probability_change": 0.06306966145833333,
|
| 267 |
+
"path_flip_rate_minus_random": 0.3333333333333333
|
| 268 |
+
},
|
| 269 |
+
"counts": {
|
| 270 |
+
"targeted": 128,
|
| 271 |
+
"random": 5,
|
| 272 |
+
"path": 3,
|
| 273 |
+
"random_path": 3
|
| 274 |
+
},
|
| 275 |
+
"top_targeted": [
|
| 276 |
+
{
|
| 277 |
+
"label": "feature_72",
|
| 278 |
+
"probability_change": -0.102294921875,
|
| 279 |
+
"logit_change": -0.555908203125,
|
| 280 |
+
"flip": false
|
| 281 |
+
},
|
| 282 |
+
{
|
| 283 |
+
"label": "feature_240",
|
| 284 |
+
"probability_change": 0.056884765625,
|
| 285 |
+
"logit_change": 0.272216796875,
|
| 286 |
+
"flip": false
|
| 287 |
+
},
|
| 288 |
+
{
|
| 289 |
+
"label": "feature_79",
|
| 290 |
+
"probability_change": -0.0556640625,
|
| 291 |
+
"logit_change": -0.1513671875,
|
| 292 |
+
"flip": false
|
| 293 |
+
},
|
| 294 |
+
{
|
| 295 |
+
"label": "feature_235",
|
| 296 |
+
"probability_change": 0.05224609375,
|
| 297 |
+
"logit_change": 0.252685546875,
|
| 298 |
+
"flip": false
|
| 299 |
+
},
|
| 300 |
+
{
|
| 301 |
+
"label": "feature_439",
|
| 302 |
+
"probability_change": 0.050048828125,
|
| 303 |
+
"logit_change": 0.223388671875,
|
| 304 |
+
"flip": false
|
| 305 |
+
}
|
| 306 |
+
],
|
| 307 |
+
"top_paths": [
|
| 308 |
+
{
|
| 309 |
+
"label": "Token '\u0120factorial' \u2192 Feature L0F246 (Identifying punctuation and articles) \u2192 Feature L1F439 (Word/alphabetic tokens) \u2192 Feature L2F203 (Mixed/polysemantic feature) \u2192 Feature L3F24 (Mixed/polysemantic feature) \u2192 Output",
|
| 310 |
+
"probability_change": 0.09033203125,
|
| 311 |
+
"logit_change": 0.263916015625,
|
| 312 |
+
"flip": true
|
| 313 |
+
},
|
| 314 |
+
{
|
| 315 |
+
"label": "Token '\u0120factorial' \u2192 Feature L0F246 (Identifying punctuation and articles) \u2192 Feature L1F439 (Word/alphabetic tokens) \u2192 Feature L2F203 (Mixed/polysemantic feature) \u2192 Feature L3F187 (Mixed/polysemantic feature) \u2192 Output",
|
| 316 |
+
"probability_change": 0.0699462890625,
|
| 317 |
+
"logit_change": 0.557861328125,
|
| 318 |
+
"flip": false
|
| 319 |
+
},
|
| 320 |
+
{
|
| 321 |
+
"label": "Token 'def' \u2192 Feature L0F246 (Identifying punctuation and articles) \u2192 Feature L1F439 (Word/alphabetic tokens) \u2192 Feature L2F203 (Mixed/polysemantic feature) \u2192 Feature L3F187 (Mixed/polysemantic feature) \u2192 Output",
|
| 322 |
+
"probability_change": 0.0699462890625,
|
| 323 |
+
"logit_change": 0.557861328125,
|
| 324 |
+
"flip": false
|
| 325 |
+
}
|
| 326 |
+
]
|
| 327 |
+
},
|
| 328 |
+
"prompt_3": {
|
| 329 |
+
"prompt": "The literary device in the phrase 'The wind whispered through the trees' is",
|
| 330 |
+
"summary_statistics": {
|
| 331 |
+
"targeted": {
|
| 332 |
+
"count": 128,
|
| 333 |
+
"avg_probability_change": 0.002856731414794922,
|
| 334 |
+
"avg_abs_probability_change": 0.011368274688720703,
|
| 335 |
+
"std_probability_change": 0.014502722583680286,
|
| 336 |
+
"avg_logit_change": 0.044124603271484375,
|
| 337 |
+
"avg_abs_logit_change": 0.13563919067382812,
|
| 338 |
+
"std_logit_change": 0.17802501078177962,
|
| 339 |
+
"avg_kl_divergence": 0.0,
|
| 340 |
+
"avg_entropy_change": 0.0,
|
| 341 |
+
"avg_hidden_state_delta_norm": 6.509891510009766,
|
| 342 |
+
"avg_hidden_state_relative_change": 0.06830050108791044,
|
| 343 |
+
"flip_rate": 0.4921875,
|
| 344 |
+
"count_flipped": 63
|
| 345 |
+
},
|
| 346 |
+
"random_baseline": {
|
| 347 |
+
"count": 5,
|
| 348 |
+
"avg_probability_change": 0.00184326171875,
|
| 349 |
+
"avg_abs_probability_change": 0.00189208984375,
|
| 350 |
+
"std_probability_change": 0.00210067367193147,
|
| 351 |
+
"avg_logit_change": 0.021435546875,
|
| 352 |
+
"avg_abs_logit_change": 0.021435546875,
|
| 353 |
+
"std_logit_change": 0.02351792840670642,
|
| 354 |
+
"avg_kl_divergence": 0.0,
|
| 355 |
+
"avg_entropy_change": 0.0,
|
| 356 |
+
"avg_hidden_state_delta_norm": 0.617041015625,
|
| 357 |
+
"avg_hidden_state_relative_change": 0.006473872950751749,
|
| 358 |
+
"flip_rate": 0.4,
|
| 359 |
+
"count_flipped": 2
|
| 360 |
+
},
|
| 361 |
+
"path": {
|
| 362 |
+
"count": 3,
|
| 363 |
+
"avg_probability_change": 0.004984537760416667,
|
| 364 |
+
"avg_abs_probability_change": 0.004984537760416667,
|
| 365 |
+
"std_probability_change": 0.003682847818679935,
|
| 366 |
+
"avg_logit_change": 0.040283203125,
|
| 367 |
+
"avg_abs_logit_change": 0.053629557291666664,
|
| 368 |
+
"std_logit_change": 0.07112499849825625,
|
| 369 |
+
"avg_kl_divergence": 0.0,
|
| 370 |
+
"avg_entropy_change": 0.0,
|
| 371 |
+
"avg_hidden_state_delta_norm": 6.8984375,
|
| 372 |
+
"avg_hidden_state_relative_change": 0.0723770491795685,
|
| 373 |
+
"flip_rate": 0.3333333333333333,
|
| 374 |
+
"count_flipped": 1
|
| 375 |
+
},
|
| 376 |
+
"random_path_baseline": {
|
| 377 |
+
"count": 3,
|
| 378 |
+
"avg_probability_change": 0.004191080729166667,
|
| 379 |
+
"avg_abs_probability_change": 0.007731119791666667,
|
| 380 |
+
"std_probability_change": 0.0067955519556561735,
|
| 381 |
+
"avg_logit_change": 0.046875,
|
| 382 |
+
"avg_abs_logit_change": 0.09212239583333333,
|
| 383 |
+
"std_logit_change": 0.08261409961169389,
|
| 384 |
+
"avg_kl_divergence": 0.0,
|
| 385 |
+
"avg_entropy_change": 0.0,
|
| 386 |
+
"avg_hidden_state_delta_norm": 4.640625,
|
| 387 |
+
"avg_hidden_state_relative_change": 0.04868852458965311,
|
| 388 |
+
"flip_rate": 0.6666666666666666,
|
| 389 |
+
"count_flipped": 2
|
| 390 |
+
},
|
| 391 |
+
"target_minus_random_abs_probability_change": 0.009476184844970703,
|
| 392 |
+
"target_flip_rate_minus_random": 0.09218749999999998,
|
| 393 |
+
"path_minus_random_abs_probability_change": -0.00274658203125,
|
| 394 |
+
"path_flip_rate_minus_random": -0.3333333333333333
|
| 395 |
+
},
|
| 396 |
+
"counts": {
|
| 397 |
+
"targeted": 128,
|
| 398 |
+
"random": 5,
|
| 399 |
+
"path": 3,
|
| 400 |
+
"random_path": 3
|
| 401 |
+
},
|
| 402 |
+
"top_targeted": [
|
| 403 |
+
{
|
| 404 |
+
"label": "feature_26",
|
| 405 |
+
"probability_change": 0.0413818359375,
|
| 406 |
+
"logit_change": 0.641357421875,
|
| 407 |
+
"flip": true
|
| 408 |
+
},
|
| 409 |
+
{
|
| 410 |
+
"label": "feature_103",
|
| 411 |
+
"probability_change": -0.03997802734375,
|
| 412 |
+
"logit_change": -0.37274169921875,
|
| 413 |
+
"flip": false
|
| 414 |
+
},
|
| 415 |
+
{
|
| 416 |
+
"label": "feature_32",
|
| 417 |
+
"probability_change": 0.03857421875,
|
| 418 |
+
"logit_change": 0.449462890625,
|
| 419 |
+
"flip": true
|
| 420 |
+
},
|
| 421 |
+
{
|
| 422 |
+
"label": "feature_52",
|
| 423 |
+
"probability_change": 0.03411865234375,
|
| 424 |
+
"logit_change": 0.370849609375,
|
| 425 |
+
"flip": true
|
| 426 |
+
},
|
| 427 |
+
{
|
| 428 |
+
"label": "feature_294",
|
| 429 |
+
"probability_change": 0.03387451171875,
|
| 430 |
+
"logit_change": 0.537353515625,
|
| 431 |
+
"flip": true
|
| 432 |
+
}
|
| 433 |
+
],
|
| 434 |
+
"top_paths": [
|
| 435 |
+
{
|
| 436 |
+
"label": "Token '\u0120literary' \u2192 Feature L0F434 (Mixed/polysemantic feature) \u2192 Feature L1F56 (Mixed/polysemantic feature) \u2192 Feature L2F62 (Mixed/polysemantic feature) \u2192 Feature L3F386 (Mixed/polysemantic feature) \u2192 Output",
|
| 437 |
+
"probability_change": 0.01019287109375,
|
| 438 |
+
"logit_change": 0.140869140625,
|
| 439 |
+
"flip": true
|
| 440 |
+
},
|
| 441 |
+
{
|
| 442 |
+
"label": "Token '\u0120literary' \u2192 Feature L0F434 (Mixed/polysemantic feature) \u2192 Feature L1F56 (Mixed/polysemantic feature) \u2192 Feature L2F62 (Mixed/polysemantic feature) \u2192 Feature L3F157 (Mixed/polysemantic feature) \u2192 Output",
|
| 443 |
+
"probability_change": 0.00238037109375,
|
| 444 |
+
"logit_change": -0.010009765625,
|
| 445 |
+
"flip": false
|
| 446 |
+
},
|
| 447 |
+
{
|
| 448 |
+
"label": "Token '\u0120device' \u2192 Feature L0F434 (Mixed/polysemantic feature) \u2192 Feature L1F56 (Mixed/polysemantic feature) \u2192 Feature L2F62 (Mixed/polysemantic feature) \u2192 Feature L3F157 (Mixed/polysemantic feature) \u2192 Output",
|
| 449 |
+
"probability_change": 0.00238037109375,
|
| 450 |
+
"logit_change": -0.010009765625,
|
| 451 |
+
"flip": false
|
| 452 |
+
}
|
| 453 |
+
]
|
| 454 |
+
}
|
| 455 |
+
},
|
| 456 |
+
"config": {
|
| 457 |
+
"model_path": "./models/OLMo-2-1124-7B",
|
| 458 |
+
"max_seq_length": 512,
|
| 459 |
+
"n_features_per_layer": 512,
|
| 460 |
+
"sparsity_lambda": 0.001,
|
| 461 |
+
"reconstruction_loss_weight": 1.0,
|
| 462 |
+
"batch_size": 8,
|
| 463 |
+
"learning_rate": 0.0001,
|
| 464 |
+
"training_steps": 1000,
|
| 465 |
+
"device": "mps",
|
| 466 |
+
"pruning_threshold": 0.8,
|
| 467 |
+
"intervention_strength": 5.0,
|
| 468 |
+
"qwen_api_config": null,
|
| 469 |
+
"max_ablation_experiments": null,
|
| 470 |
+
"ablation_top_k_tokens": 5,
|
| 471 |
+
"ablation_features_per_layer": 4,
|
| 472 |
+
"summary_max_layers": null,
|
| 473 |
+
"summary_features_per_layer": 2,
|
| 474 |
+
"random_baseline_trials": 5,
|
| 475 |
+
"random_baseline_features": 1,
|
| 476 |
+
"random_baseline_seed": 1234,
|
| 477 |
+
"path_ablation_top_k": 3,
|
| 478 |
+
"random_path_baseline_trials": 5,
|
| 479 |
+
"graph_max_features_per_layer": 40,
|
| 480 |
+
"graph_feature_activation_threshold": 0.01,
|
| 481 |
+
"graph_edge_weight_threshold": 0.003,
|
| 482 |
+
"graph_max_edges_per_node": 20
|
| 483 |
+
}
|
| 484 |
+
}
|
circuit_analysis/train_clt_and_plot.py
ADDED
|
@@ -0,0 +1,353 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# This script trains the Cross-Layer Transcoder (CLT) and plots its training loss.
|
| 2 |
+
|
| 3 |
+
import torch
|
| 4 |
+
import torch.nn as nn
|
| 5 |
+
import torch.nn.functional as F
|
| 6 |
+
import numpy as np
|
| 7 |
+
import matplotlib.pyplot as plt
|
| 8 |
+
import json
|
| 9 |
+
import logging
|
| 10 |
+
from pathlib import Path
|
| 11 |
+
from transformers import AutoTokenizer, AutoModelForCausalLM
|
| 12 |
+
from typing import Dict, List, Tuple
|
| 13 |
+
from dataclasses import dataclass
|
| 14 |
+
from tqdm import tqdm
|
| 15 |
+
import os
|
| 16 |
+
import random
|
| 17 |
+
import argparse
|
| 18 |
+
import glob
|
| 19 |
+
import itertools
|
| 20 |
+
from torch.optim.lr_scheduler import CosineAnnealingLR
|
| 21 |
+
|
| 22 |
+
# --- Fix import path ---
|
| 23 |
+
import sys
|
| 24 |
+
sys.path.append(str(Path(__file__).resolve().parent.parent))
|
| 25 |
+
from utilities.utils import set_seed
|
| 26 |
+
|
| 27 |
+
# --- Constants ---
# Repository root (this file lives in circuit_analysis/, one level below it).
PROJECT_ROOT = Path(__file__).resolve().parent.parent
# Output locations: trained CLT weights, training statistics, and the loss plot.
RESULTS_DIR = Path(__file__).parent / "results"
CLT_SAVE_PATH = Path(__file__).parent / "models" / "clt_model.pth"
STATS_SAVE_PATH = RESULTS_DIR / "clt_training_stats.json"
PLOT_SAVE_PATH = RESULTS_DIR / "clt_training_loss.png"
# Directory of JSON-lines Dolma text files used as training data.
DOLMA_DIR = PROJECT_ROOT / "influence_tracer" / "dolma_dataset_sample_1.6v"

# Configure logging.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# Set the device for training: prefer Apple MPS, then CUDA, else fall back to CPU.
if torch.backends.mps.is_available():
    DEVICE = torch.device("mps")
    logger.info("Using MPS (Metal Performance Shaders) for GPU acceleration")
elif torch.cuda.is_available():
    DEVICE = torch.device("cuda")
    logger.info("Using CUDA for GPU acceleration")
else:
    DEVICE = torch.device("cpu")
    logger.info("Using CPU")
|
| 49 |
+
|
| 50 |
+
@dataclass
class AttributionGraphConfig:
    """Hyper-parameters for CLT training (and downstream attribution-graph use)."""
    # Local path to the base OLMo checkpoint, loaded via transformers.
    model_path: str = "./models/OLMo-2-1124-7B"
    # Maximum token length per training sample; longer texts are truncated.
    max_seq_length: int = 128
    # Dictionary size of each per-layer encoder.
    n_features_per_layer: int = 512  # Back to 512 due to memory constraints
    # Weight of the L1 sparsity penalty on feature activations.
    sparsity_lambda: float = 1e-3  # Reduced from 0.01 for L1
    # Weight of the MSE reconstruction term in the total loss.
    reconstruction_loss_weight: float = 1.0
    batch_size: int = 16  # Can be higher with 512 features
    learning_rate: float = 3e-4  # Increased from 1e-4
    training_steps: int = 1500  # Increased from 500
    # Device string ("mps" / "cuda" / "cpu"), resolved once at import time.
    device: str = str(DEVICE)
|
| 62 |
+
|
| 63 |
+
class JumpReLU(nn.Module):
    """JumpReLU activation: ``x * H(x - threshold)`` (Heaviside-gated identity).

    Values strictly above ``threshold`` pass through *unchanged*; everything
    else is zeroed. The previous implementation returned
    ``F.relu(x - threshold)`` — a shifted ReLU, which subtracts the threshold
    from surviving values and therefore is not JumpReLU for any nonzero
    threshold. For the default ``threshold=0.0`` (the only value used in this
    file) the two are numerically identical, so existing checkpoints are
    unaffected.

    Args:
        threshold: Activation cut-off below which outputs are zero.
    """

    def __init__(self, threshold: float = 0.0):
        super().__init__()
        self.threshold = threshold

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # Gate with a hard step: keep x where x > threshold, zero elsewhere.
        return x * (x > self.threshold).to(x.dtype)
|
| 70 |
+
|
| 71 |
+
class CrossLayerTranscoder(nn.Module):
    """Cross-Layer Transcoder (CLT).

    One sparse encoder per transformer layer maps residual activations to a
    feature space; a decoder exists for every (source, target) layer pair with
    source <= target, so each layer's reconstruction is the sum of decoded
    features from that layer and all earlier ones.
    """

    def __init__(self, model_config: Dict, clt_config: AttributionGraphConfig):
        super().__init__()
        self.config = clt_config
        self.model_config = model_config
        self.n_layers = model_config['num_hidden_layers']
        self.hidden_size = model_config['hidden_size']
        self.n_features = clt_config.n_features_per_layer

        # One encoder per layer: residual stream -> feature space.
        self.encoders = nn.ModuleList([
            nn.Linear(self.hidden_size, self.n_features, bias=False)
            for _ in range(self.n_layers)
        ])
        # Decoders keyed "src_to_tgt" for every src <= tgt pair.
        self.decoders = nn.ModuleDict({
            f"{src}_to_{tgt}": nn.Linear(self.n_features, self.hidden_size, bias=False)
            for src in range(self.n_layers)
            for tgt in range(src, self.n_layers)
        })
        self.activation = JumpReLU(threshold=0.0)
        self._init_weights()

    def _init_weights(self):
        # Small-gain Xavier init on every linear layer.
        for module in self.modules():
            if isinstance(module, nn.Linear):
                nn.init.xavier_uniform_(module.weight, gain=0.1)

    def encode(self, layer_idx: int, residual_activations: torch.Tensor) -> torch.Tensor:
        """Project one layer's residual activations into sparse feature space."""
        encoded = self.encoders[layer_idx](residual_activations)
        return self.activation(encoded)

    def decode(self, source_layer: int, target_layer: int, feature_activations: torch.Tensor) -> torch.Tensor:
        """Map source-layer features into the target layer's residual space."""
        return self.decoders[f"{source_layer}_to_{target_layer}"](feature_activations)

    def forward(self, residual_activations: List[torch.Tensor]) -> Tuple[List[torch.Tensor], List[torch.Tensor]]:
        """Encode every layer, then rebuild each layer from all features at or
        before it. Returns (feature activations, reconstructed outputs)."""
        features = [self.encode(idx, resid) for idx, resid in enumerate(residual_activations)]
        reconstructions = []
        for tgt in range(self.n_layers):
            recon = torch.zeros_like(residual_activations[tgt])
            for src in range(tgt + 1):
                recon = recon + self.decode(src, tgt, features[src])
            reconstructions.append(recon)
        return features, reconstructions
|
| 115 |
+
|
| 116 |
+
class TrainingPipeline:
    """Loads the frozen OLMo base model and trains a CrossLayerTranscoder on
    its hidden-state activations, streaming text from the Dolma sample."""

    def __init__(self, config: AttributionGraphConfig):
        self.config = config
        self.device = torch.device(config.device)
        logger.info(f"Loading OLMo model from {config.model_path}")
        self.tokenizer = AutoTokenizer.from_pretrained(config.model_path)

        # Configure model loading based on the device: half precision on
        # GPU-like devices, full precision on CPU; let HF shard across CUDA.
        model_args = {'torch_dtype': torch.float16 if "cpu" not in config.device else torch.float32}
        if "cuda" in config.device:
            model_args['device_map'] = "auto"

        self.model = AutoModelForCausalLM.from_pretrained(config.model_path, **model_args).to(self.device)

        # Some tokenizers ship without a pad token; reuse EOS so batched
        # padding in train_clt works.
        if self.tokenizer.pad_token is None:
            self.tokenizer.pad_token = self.tokenizer.eos_token

        model_config = self.model.config.to_dict()
        self.clt = CrossLayerTranscoder(model_config, config).to(self.device)
        logger.info("Training Pipeline initialized successfully")

    def load_dolma_data(self, buffer_size=10000):
        """Generator that yields text samples from the Dolma dataset with shuffling.

        Reads every *.json file under DOLMA_DIR as JSON-lines, keeps the
        'text' field of docs longer than 100 chars, and yields them in
        shuffled buffers of `buffer_size`. Loops forever (re-shuffling files
        each pass), so callers never see StopIteration in practice.

        Raises:
            FileNotFoundError: if DOLMA_DIR contains no JSON files.
        """
        json_files = glob.glob(str(DOLMA_DIR / "*.json"))
        if not json_files:
            logger.error(f"No JSON files found in {DOLMA_DIR}")
            raise FileNotFoundError(f"No training data found in {DOLMA_DIR}")

        logger.info(f"Found {len(json_files)} training files in {DOLMA_DIR}")
        random.shuffle(json_files)

        buffer = []

        while True:
            for file_path in json_files:
                try:
                    # Use a larger OS read buffer for these large JSONL files.
                    with open(file_path, 'r', buffering=8192*1024) as f:
                        for line in f:
                            try:
                                doc = json.loads(line)
                                text = doc.get('text', '')
                                if len(text) > 100:  # Filter very short texts
                                    buffer.append(text)

                                # Flush a full buffer in shuffled order.
                                if len(buffer) >= buffer_size:
                                    random.shuffle(buffer)
                                    yield from buffer
                                    buffer = []
                            except json.JSONDecodeError:
                                # Skip malformed lines silently (best-effort stream).
                                continue
                except Exception as e:
                    logger.warning(f"Error reading {file_path}: {e}")

            # Yield remaining items in buffer
            if buffer:
                random.shuffle(buffer)
                yield from buffer
                buffer = []

            # Shuffle and restart for next epoch
            random.shuffle(json_files)

    def train_clt(self) -> Dict:
        """Train the Cross-Layer Transcoder and return per-step loss history.

        Returns:
            Dict with lists 'total_losses', 'reconstruction_losses',
            'sparsity_losses', one entry per completed step.
        """
        logger.info("Starting CLT training...")
        optimizer = torch.optim.Adam(self.clt.parameters(), lr=self.config.learning_rate)
        scheduler = CosineAnnealingLR(optimizer, T_max=self.config.training_steps, eta_min=1e-6)

        stats = {
            'reconstruction_losses': [],
            'sparsity_losses': [],
            'total_losses': []
        }

        self.clt.train()
        progress_bar = tqdm(range(self.config.training_steps), desc="Training CLT")

        data_generator = self.load_dolma_data()

        for step in progress_bar:
            # Sample a batch of texts. (The generator loops forever, so the
            # StopIteration branch is defensive only.)
            batch_texts = []
            try:
                for _ in range(self.config.batch_size):
                    batch_texts.append(next(data_generator))
            except StopIteration:
                logger.warning("Data generator ran out of data!")
                break

            # Tokenize all texts at once (true batch processing).
            inputs = self.tokenizer(
                batch_texts,
                return_tensors="pt",
                padding=True,
                truncation=True,
                max_length=self.config.max_seq_length
            ).to(self.device)

            # The base model is frozen: run it under no_grad and take its
            # per-layer hidden states; [1:] skips the first entry (the
            # embedding output in transformers' convention), leaving one
            # state per transformer layer.
            with torch.no_grad():
                outputs = self.model(**inputs, output_hidden_states=True)
                hidden_states = outputs.hidden_states[1:]

            feature_activations, reconstructed_outputs = self.clt(hidden_states)

            # Reconstruction loss: F.mse_loss averages over all elements, so
            # this is the sum over layers of per-layer mean MSE.
            recon_loss = sum(F.mse_loss(pred, target) for target, pred in zip(hidden_states, reconstructed_outputs))

            # L1 sparsity penalty: mean absolute feature activation per layer.
            sparsity_loss = sum(torch.mean(torch.abs(features)) for features in feature_activations)

            loss = (self.config.reconstruction_loss_weight * recon_loss +
                    self.config.sparsity_lambda * sparsity_loss)

            # Backward pass
            optimizer.zero_grad()
            loss.backward()

            # Clip gradients to stabilize training.
            torch.nn.utils.clip_grad_norm_(self.clt.parameters(), max_norm=1.0)

            optimizer.step()
            scheduler.step()  # Cosine learning-rate schedule

            stats['total_losses'].append(loss.item())
            stats['reconstruction_losses'].append(recon_loss.item())
            stats['sparsity_losses'].append(sparsity_loss.item())

            # Refresh the progress-bar readout every 10 steps.
            if step % 10 == 0:
                progress_bar.set_postfix({
                    "Total": f"{loss.item():.4f}",
                    "Recon": f"{recon_loss.item():.4f}",
                    "Sparsity": f"{sparsity_loss.item():.4f}",
                    "LR": f"{scheduler.get_last_lr()[0]:.2e}"
                })

        logger.info("CLT training completed.")
        return stats

    def save_clt(self, path: str):
        """Persist the CLT state dict to `path`, creating parent dirs as needed."""
        os.makedirs(os.path.dirname(path), exist_ok=True)
        torch.save(self.clt.state_dict(), path)
        logger.info(f"CLT model saved to {path}")
|
| 269 |
+
|
| 270 |
+
def plot_training_stats(stats_path: str, save_path: str):
    """Load CLT training statistics from JSON and save a dual-axis loss plot.

    Args:
        stats_path: JSON file produced by ``TrainingPipeline.train_clt`` with
            keys 'total_losses', 'reconstruction_losses', 'sparsity_losses'.
        save_path: Destination PNG path for the rendered figure.
    """
    logger.info(f"Loading training stats from {stats_path}")
    with open(stats_path, 'r') as f:
        stats = json.load(f)

    plt.style.use('seaborn-v0_8-darkgrid')
    fig, ax1 = plt.subplots(figsize=(12, 6))

    steps = range(len(stats['total_losses']))

    # Left axis: total and reconstruction losses (similar scale).
    color = 'tab:red'
    ax1.set_xlabel('Training Steps')
    ax1.set_ylabel('Total & Reconstruction Loss', color=color)
    ax1.plot(steps, stats['total_losses'], color=color, label='Total Loss', alpha=0.9, linewidth=2)
    ax1.plot(steps, stats['reconstruction_losses'], color='tab:blue', linestyle='--', label='Reconstruction Loss', alpha=1.0)
    ax1.tick_params(axis='y', labelcolor=color)
    ax1.grid(True, which='major', linestyle='--', linewidth='0.5', color='grey')

    # Right axis: sparsity loss, which lives on a much smaller scale.
    ax2 = ax1.twinx()
    color2 = 'tab:green'
    ax2.set_ylabel('Sparsity Loss (L1)', color=color2)
    ax2.plot(steps, stats['sparsity_losses'], color=color2, linestyle=':', label='Sparsity Loss')
    ax2.tick_params(axis='y', labelcolor=color2)
    ax2.grid(True, which='major', linestyle=':', linewidth='0.5', color='darkgrey')

    # Combine legends into a single box.
    lines, labels = ax1.get_legend_handles_labels()
    lines2, labels2 = ax2.get_legend_handles_labels()
    ax2.legend(lines + lines2, labels + labels2, loc='center right', frameon=True, facecolor='white', framealpha=0.8, edgecolor='grey')

    # Save first, then log: previously the "saved" message was emitted before
    # plt.savefig, so a failed write would still have claimed success.
    plt.savefig(save_path, dpi=300, bbox_inches='tight')
    logger.info(f"Full training plot saved to {save_path}")
    plt.close()
|
| 304 |
+
|
| 305 |
+
def main():
    """CLI entry point: optionally train the CLT, then plot its training stats.

    With ``--skip-training`` only the plot is regenerated from the existing
    statistics file. All failures are logged rather than raised.
    """
    # --- Argument Parser ---
    parser = argparse.ArgumentParser(description="Train CLT model and/or plot training stats.")
    parser.add_argument(
        '--skip-training',
        action='store_true',
        help="Skip the training process and only generate the plot from existing stats."
    )
    args = parser.parse_args()

    # Set a seed for reproducibility.
    set_seed()

    config = AttributionGraphConfig()

    try:
        pipeline = TrainingPipeline(config)
        logger.info("Training Pipeline initialized successfully")

        if not args.skip_training:
            # Train the Cross-Layer Transcoder using the Dolma dataset,
            # then persist both the stats and the model weights.
            training_stats = pipeline.train_clt()

            os.makedirs(RESULTS_DIR, exist_ok=True)

            with open(STATS_SAVE_PATH, 'w') as f:
                json.dump(training_stats, f, indent=2)
            logger.info(f"Saved training stats to {STATS_SAVE_PATH}")

            pipeline.save_clt(CLT_SAVE_PATH)
        else:
            logger.info("--skip-training flag is set. Loading existing stats for plotting.")

        # Always plot, using either new or existing stats. The success banner
        # is printed only when the plot was actually generated; previously it
        # printed even after the missing-stats error.
        if os.path.exists(STATS_SAVE_PATH):
            plot_training_stats(STATS_SAVE_PATH, PLOT_SAVE_PATH)
            print("\n🎉 CLT training and plotting completed successfully!")
        else:
            logger.error(f"Statistics file not found at {STATS_SAVE_PATH}. Cannot generate plot. Run training first.")

    except Exception as e:
        logger.error(f"❌ Error during execution: {e}", exc_info=True)
|
| 351 |
+
|
| 352 |
+
# Run the CLI entry point only when executed as a script (not on import).
if __name__ == "__main__":
    main()
|
function_vectors/data/multilingual_function_categories.py
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
function_vectors/data/visualizations/de_pca_3d_categories_layer_-1.html
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
function_vectors/data/visualizations/en_pca_3d_categories_layer_-1.html
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
function_vectors/function_vectors_page.py
ADDED
|
@@ -0,0 +1,1845 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import streamlit as st
|
| 2 |
+
import os
|
| 3 |
+
from pathlib import Path
|
| 4 |
+
import base64
|
| 5 |
+
import sys
|
| 6 |
+
import numpy as np
|
| 7 |
+
import matplotlib.pyplot as plt
|
| 8 |
+
import torch
|
| 9 |
+
import pandas as pd
|
| 10 |
+
from utilities.localization import tr
|
| 11 |
+
import plotly.graph_objects as go
|
| 12 |
+
from sklearn.decomposition import PCA
|
| 13 |
+
from transformers import AutoTokenizer, AutoModelForCausalLM
|
| 14 |
+
from typing import Dict, List
|
| 15 |
+
import requests
|
| 16 |
+
import json
|
| 17 |
+
from PIL import Image
|
| 18 |
+
from io import BytesIO
|
| 19 |
+
import base64
|
| 20 |
+
import markdown
|
| 21 |
+
from datetime import datetime
|
| 22 |
+
from utilities.feedback_survey import display_function_vector_feedback
|
| 23 |
+
import gc
|
| 24 |
+
import colorsys
|
| 25 |
+
import re
|
| 26 |
+
from thefuzz import process
|
| 27 |
+
import threading
|
| 28 |
+
|
| 29 |
+
# Directory containing pre-rendered Plotly HTML visualizations shipped with the app.
VIZ_DIR = Path(__file__).parent / "data" / "visualizations"
|
| 31 |
+
|
| 32 |
+
# Add the project root to the path.
|
| 33 |
+
sys.path.append(str(Path(__file__).resolve().parent.parent))
|
| 34 |
+
from function_vectors.data.multilingual_function_categories import FUNCTION_TYPES, FUNCTION_CATEGORIES
|
| 35 |
+
from utilities.utils import init_qwen_api
|
| 36 |
+
|
| 37 |
+
# Define colors and symbols for the plots. The three dicts below must share
# the same task-category keys so color, legend glyph, and Plotly marker stay
# in sync for each category.
FUNCTION_TYPE_COLORS = {
    "abstractive_tasks": "#87CEEB",  # skyblue
    "multiple_choice_qa": "#90EE90",  # lightgreen
    "text_classification": "#FA8072",  # salmon
    "extractive_tasks": "#DA70D6",  # orchid
    "named_entity_recognition": "#FFD700",  # gold
    "text_generation": "#F08080"  # lightcoral
}

# HTML entities for shapes in the legend (mirrors PLOTLY_SYMBOLS visually).
PLOTLY_SYMBOLS_HTML = {
    "abstractive_tasks": "●", "multiple_choice_qa": "◆",
    "text_classification": "■", "extractive_tasks": "✚",
    "named_entity_recognition": "◇", "text_generation": "□"
}

# Plotly marker-symbol names for the scatter plot.
PLOTLY_SYMBOLS = {
    "abstractive_tasks": "circle", "multiple_choice_qa": "diamond",
    "text_classification": "square", "extractive_tasks": "cross",
    "named_entity_recognition": "diamond-open", "text_generation": "square-open"
}
|
| 60 |
+
|
| 61 |
+
# Helper function to format category names.
|
| 62 |
+
def format_category_name(name):
    """Turn a category key such as 'multiple_choice_qa' into a translated,
    human-readable label, keeping the 'QA' acronym capitalized."""
    # Case-insensitive check so 'SOME_QA' style keys are handled too.
    if name.lower().endswith('_qa'):
        # Drop the 3-char '_qa' suffix, title-case the rest, re-append 'QA'.
        prefix = name[:-3].replace('_', ' ').replace('-', ' ').title()
        formatted_name = f"{prefix} QA"
    else:
        # Plain title-casing for every other key.
        formatted_name = name.replace('_', ' ').replace('-', ' ').title()

    return tr(formatted_name)
|
| 74 |
+
|
| 75 |
+
|
| 76 |
+
def show_function_vectors_page():
    """Render the Function Vector Analysis page.

    Layout: Bootstrap-icon CSS, page title/description, dataset overview,
    an illustrated step-by-step explanation of how function vectors are
    extracted, the 3D PCA plot (only before an analysis has run), the
    interactive analysis form, and — after a run — the analysis results.
    """
    # Add CSS for Bootstrap icons.
    st.markdown('<link rel="stylesheet" href="https://cdn.jsdelivr.net/npm/bootstrap-icons@1.10.5/font/bootstrap-icons.css">', unsafe_allow_html=True)

    # Initialize a lock in the session state to prevent concurrent API calls.
    if 'api_lock' not in st.session_state:
        st.session_state.api_lock = threading.Lock()

    st.markdown(f"<h1>{tr('fv_page_title')}</h1>", unsafe_allow_html=True)
    st.markdown(f"""{tr('fv_page_desc')}""", unsafe_allow_html=True)

    # Check if the visualization directory exists; the page cannot render
    # without the pre-computed visualization assets.
    if not VIZ_DIR.exists():
        st.error(tr('viz_dir_not_found_error'))
        return

    # Show examples of the categories.
    st.header(tr('dataset_overview'))
    st.markdown(tr('dataset_overview_desc_long'))
    display_category_examples()

    st.markdown("---")

    # Add a visual explanation of how function vectors are made
    # (a vertical flow diagram: input -> tokenizer -> model -> final layer -> vector).
    st.html(f"""
    <div style='color: #ffffff; margin: 2rem 0;'>
        <h4 style='color: #87CEEB; margin-top: 0; text-align: center; margin-bottom: 1.5rem;'>{tr('how_vectors_are_made_header')}</h4>
        <p style="text-align: center; max-width: 600px; margin: auto; margin-bottom: 2rem;">{tr('how_vectors_are_made_desc')}</p>

        <div style="display: flex; flex-direction: column; align-items: center; font-family: 'SF Mono', 'Consolas', 'Menlo', monospace; gap: 0.2rem;">

            <!-- STEP 1: INPUT -->
            <div style="background-color: #333; padding: 0.8rem; border-radius: 8px; width: 90%; max-width: 600px; text-align: center; border: 1px solid #444;">
                <h5 style="margin: 0 0 0.5rem 0; color: #87CEEB; font-size: 0.9rem; letter-spacing: 1px; font-weight: bold;"><i class="bi bi-keyboard"></i> {tr('how_vectors_are_made_step1_title')}</h5>
                <code style="background: none; color: #EAEAEA; font-size: 1em;">"{tr('how_vectors_are_made_step1_example')}"</code>
            </div>

            <i class="bi bi-arrow-down" style="font-size: 2rem; color: #666; margin: 0.5rem 0;"></i>

            <!-- STEP 2: TOKENIZER -->
            <div style="background-color: #333; padding: 0.8rem; border-radius: 8px; width: 90%; max-width: 600px; text-align: center; border: 1px solid #444;">
                <h5 style="margin: 0 0 0.5rem 0; color: #87CEEB; font-size: 0.9rem; letter-spacing: 1px; font-weight: bold;"><i class="bi bi-segmented-nav"></i> {tr('how_vectors_are_made_step2_title')}</h5>
                <code style="background: none; color: #EAEAEA; font-size: 1em;">{tr('how_vectors_are_made_step2_example')}</code>
            </div>

            <i class="bi bi-arrow-down" style="font-size: 2rem; color: #666; margin: 0.5rem 0;"></i>

            <!-- STEP 3: MODEL -->
            <div style="background-color: #333; padding: 0.8rem; border-radius: 8px; width: 90%; max-width: 600px; text-align: center; border: 1px solid #444;">
                <h5 style="margin: 0 0 0.5rem 0; color: #87CEEB; font-size: 0.9rem; letter-spacing: 1px; font-weight: bold;"><i class="bi bi-cpu-fill"></i> {tr('how_vectors_are_made_step3_title')}</h5>
                <code style="background: none; color: #EAEAEA; font-size: 1em;">{tr('how_vectors_are_made_step3_desc')}</code>
            </div>

            <i class="bi bi-arrow-down" style="font-size: 2rem; color: #666; margin: 0.5rem 0;"></i>

            <!-- STEP 4: FINAL LAYER -->
            <div style="background-color: #333; padding: 0.8rem; border-radius: 8px; width: 90%; max-width: 600px; text-align: center; border: 1px solid #444;">
                <h5 style="margin: 0 0 0.5rem 0; color: #87CEEB; font-size: 0.9rem; letter-spacing: 1px; font-weight: bold;"><i class="bi bi-layer-forward"></i> {tr('how_vectors_are_made_step4_title')}</h5>
                <code style="background: none; color: #EAEAEA; font-size: 1em;">{tr('how_vectors_are_made_step4_desc')}</code>
            </div>

            <i class="bi bi-arrow-down" style="font-size: 2rem; color: #666; margin: 0.5rem 0;"></i>

            <!-- STEP 5: OUTPUT -->
            <div style="background-color: #1e1e1e; padding: 1.2rem; border-radius: 8px; width: 90%; max-width: 600px; text-align: center; border: 2px solid #90EE90;">
                <h5 style="margin: 0 0 0.5rem 0; color: #90EE90; font-size: 1rem; letter-spacing: 1px; font-weight: bold;"><i class="bi bi-check-circle-fill"></i> {tr('how_vectors_are_made_step5_title')}</h5>
                <code style="background: none; color: #90EE90; font-weight: bold; font-size: 1.1em;">[ -0.23, 1.45, -0.89, ... ]</code>
            </div>

        </div>
    </div>
    """)

    st.markdown("---")

    # An analysis counts as "run" only when both the results and the prompt
    # that produced them are present in the session.
    analysis_run = 'analysis_results' in st.session_state and 'user_input' in st.session_state

    # --- Initial Visualization ---
    # Show the 3D PCA plot before an analysis is run.
    if not analysis_run:
        st.markdown(f"<h2>{tr('pca_3d_section_header')}</h2>", unsafe_allow_html=True)
        display_3d_pca_visualization(show_description=True)
        st.markdown("---")

    # The interactive analysis section is always visible.
    st.markdown(f"<h2>{tr('interactive_analysis_section_header')}</h2>", unsafe_allow_html=True)
    display_interactive_analysis()

    # If an analysis was run, show the results.
    if analysis_run:
        st.markdown("---")
        with st.spinner(tr('running_analysis_spinner')):
            display_analysis_results(st.session_state.analysis_results, st.session_state.user_input)

    # Feedback survey is currently disabled.
    #if 'analysis_results' in st.session_state:
    #    display_function_vector_feedback()
|
| 173 |
+
|
| 174 |
+
|
| 175 |
+
def _trigger_and_rerun_analysis(input_text, include_attribution, include_evolution, enable_ai_explanation):
    """Run the analysis for *input_text*, store results in session state, and rerun the app.

    Parameters
    ----------
    input_text : str
        Prompt to analyze; a blank prompt triggers a warning and returns early.
    include_attribution, include_evolution : bool
        Which analysis stages to request from ``run_interactive_analysis``.
    enable_ai_explanation : bool
        Whether LLM-generated explanations should be produced.

    Side effects: writes ``user_input``, ``enable_ai_explanation``,
    ``analysis_results`` and the split explanation parts into
    ``st.session_state``, then calls ``st.rerun()`` on success.
    """
    if not input_text.strip():
        st.warning("Please enter a prompt to analyze.")
        return

    st.session_state.user_input = input_text.strip()
    st.session_state.enable_ai_explanation = enable_ai_explanation

    with st.spinner(tr('running_analysis_spinner')):
        try:
            # Fix: forward the include_* flags instead of hard-coding them to
            # True — previously these two parameters were accepted but ignored.
            results = run_interactive_analysis(input_text.strip(), include_attribution, include_evolution, enable_ai_explanation)

            if results:
                st.session_state.analysis_results = results

                # Process and store AI explanations if enabled.
                if enable_ai_explanation or "pca_explanation" in results:  # Also process if loaded from cache
                    if 'api_error' in results:
                        st.warning(results['api_error'])

                    if 'pca_explanation' in results and results['pca_explanation']:
                        # Split the explanation into parts on '####' headings so
                        # the UI can interleave the parts with the charts.
                        explanation_parts = re.split(r'(?=\n####\s)', results['pca_explanation'].strip())
                        explanation_parts = [p.strip() for p in explanation_parts if p.strip()]
                        st.session_state.explanation_part_1 = explanation_parts[0] if len(explanation_parts) > 0 else ""
                        st.session_state.explanation_part_2 = explanation_parts[1] if len(explanation_parts) > 1 else ""
                        st.session_state.explanation_part_3 = explanation_parts[2] if len(explanation_parts) > 2 else ""

                    if 'evolution_explanation' in results and results['evolution_explanation']:
                        # Split the evolution explanation into parts the same way.
                        evo_parts = re.split(r'(?=\n####\s)', results['evolution_explanation'].strip())
                        evo_parts = [p.strip() for p in evo_parts if p.strip()]
                        st.session_state.evolution_explanation_part_1 = evo_parts[0] if len(evo_parts) > 0 else ""
                        st.session_state.evolution_explanation_part_2 = evo_parts[1] if len(evo_parts) > 1 else ""

                # Clear any stale example selection before rerunning.
                if 'example_text' in st.session_state:
                    del st.session_state['example_text']
                st.rerun()
            else:
                st.error(tr('analysis_failed_error'))
        except Exception as e:
            st.error(tr('analysis_error').format(e=str(e)))
            st.info(tr('ensure_model_and_data_info'))
|
| 219 |
+
|
| 220 |
+
|
| 221 |
+
def display_interactive_analysis():
    """Render the interactive analysis section: example prompt buttons, the
    (disabled) free-text input, the AI-explanation toggle, and the run button."""

    # Show a section with example queries.
    st.markdown(f"**{tr('example_queries_header')}**", unsafe_allow_html=True)
    st.markdown(tr('example_queries_desc'))

    current_lang = st.session_state.get('lang', 'en')
    # One example prompt list per supported UI language; the lists are
    # parallel translations of each other.
    examples = {
        'en': [
            "Summarize the plot of 'Hamlet' in one sentence:",
            "The main ingredient in a Negroni cocktail is",
            "A Python function that calculates the factorial of a number is:",
            "The sentence 'The cake was eaten by the dog' is in the following voice:",
            "A good headline for an article about a new breakthrough in battery technology would be:",
            "The capital of Mongolia is",
            "The literary device in the phrase 'The wind whispered through the trees' is",
            "The French translation of 'I would like to order a coffee, please.' is:",
            "The movie 'The Matrix' can be classified into the following genre:"
        ],
        'de': [
            "Fassen Sie die Handlung von 'Hamlet' in einem Satz zusammen:",
            "Die Hauptzutat in einem Negroni-Cocktail ist",
            "Eine Python-Funktion, die die Fakultät einer Zahl berechnet, lautet:",
            "Der Satz 'Der Kuchen wurde vom Hund gefressen' steht in folgender Form:",
            "Eine gute Überschrift für einen Artikel über einen neuen Durchbruch in der Batterietechnologie wäre:",
            "Die Hauptstadt der Mongolei ist",
            "Das literarische Stilmittel im Satz 'Der Wind flüsterte durch die Bäume' ist",
            "Die französische Übersetzung von 'Ich möchte bitte einen Kaffee bestellen.' lautet:",
            "Der Film 'Die Matrix' lässt sich in folgendes Genre einteilen:"
        ]
    }

    # Display the examples in a 3-column grid.
    example_cols = st.columns(3)
    for i, example in enumerate(examples[current_lang]):
        with example_cols[i % 3]:
            if st.button(example, key=f"fv_example_{i}", use_container_width=True):
                # Trigger an analysis when an example is clicked.
                _trigger_and_rerun_analysis(example, True, True, True)

    # Input section
    # Add some custom CSS to style the text area.
    st.markdown("""
    <style>
    .stTextArea > div > div > textarea {
        background-color: #2b2b2b !important;
        border: 2px solid #4a90e2 !important;
        border-radius: 10px !important;
        color: #ffffff !important;
    }
    .stTextArea > div > div > textarea::placeholder {
        color: #888888 !important;
    }
    .stTextArea > div > div > textarea:focus {
        border-color: #4a90e2 !important;
        box-shadow: 0 0 0 2px rgba(74, 144, 226, 0.2) !important;
    }
    .custom-label {
        font-size: 1.25rem !important;
        font-weight: bold !important;
        margin-bottom: 0.5rem !important;
    }
    </style>
    """, unsafe_allow_html=True)

    # Text input area that uses the session state.
    # Use an example as the default value if one was clicked.
    default_value = st.session_state.get('user_input', '')

    st.markdown(f"<div class='custom-label'>{tr('input_text_label')}</div>", unsafe_allow_html=True)
    # NOTE: the text area is deliberately disabled — this deployment has no
    # GPU, so only the pre-cached example prompts can be analyzed.
    input_text = st.text_area(
        "text_area_for_analysis",
        value=default_value,
        placeholder="Sadly no GPU available. Please select an example above.",
        height=100,
        help=tr('input_text_help'),
        label_visibility="collapsed",
        disabled=True
    )

    # Checkbox for AI explanations.
    enable_ai_explanation = st.checkbox(tr('enable_ai_explanation_checkbox'), value=True, help=tr('enable_ai_explanation_help'))

    # Analysis button.
    if st.button(tr('analyze_button'), type="primary"):
        _trigger_and_rerun_analysis(input_text, True, True, enable_ai_explanation)
|
| 308 |
+
|
| 309 |
+
|
| 310 |
+
def load_model_and_tokenizer():
    """Load the local OLMo-2 checkpoint and its tokenizer.

    Returns ``(model, tokenizer, device)`` where device is 'mps', 'cuda', or
    'cpu' depending on availability.

    NOTE(review): the original comment said "loads and caches", but no caching
    decorator is applied here — every call loads the model afresh. Confirm
    whether a cache (e.g. st.cache_resource) was intended.
    """
    MODEL_PATH = "./models/OLMo-2-1124-7B"

    # Prefer Apple Metal, then CUDA, then fall back to CPU.
    if torch.backends.mps.is_available():
        device = "mps"
    elif torch.cuda.is_available():
        device = "cuda"
    else:
        device = "cpu"

    tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
    # Use EOS as the pad token and pad on the left.
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.padding_side = "left"

    model = AutoModelForCausalLM.from_pretrained(
        MODEL_PATH,
        torch_dtype=torch.float16,
        low_cpu_mem_usage=True,
        device_map="auto",
        output_hidden_states=True
    )
    return model, tokenizer, device
|
| 327 |
+
|
| 328 |
+
@st.cache_data
def _load_precomputed_vectors(lang='en', cache_version="function-vectors-2025-11-09"):
    """Load pre-computed category vectors for *lang* from disk.

    Returns a triple ``(function_type_vectors, category_vectors, error)``:
    per-function-type mean vectors, the raw per-category vectors, and an
    error message (None on success; the first two are None on failure).
    """
    vector_path = Path(__file__).parent / f"data/vectors/{lang}_category_vectors.npz"
    if not vector_path.exists():
        return None, None, f"Vector file not found for language '{lang}': {vector_path}"

    try:
        archive = np.load(vector_path, allow_pickle=True)
        category_vectors = {name: archive[name] for name in archive.files}

        # Average the member-category vectors into one vector per function type.
        function_type_vectors = {}
        for type_key, member_keys in FUNCTION_TYPES.items():
            members = [category_vectors[k] for k in member_keys if k in category_vectors]
            if members:
                function_type_vectors[type_key] = np.mean(members, axis=0)

        return function_type_vectors, category_vectors, None
    except Exception as e:
        return None, None, f"Error loading vectors for language '{lang}': {e}"
|
| 348 |
+
|
| 349 |
+
@st.cache_data(persist=True)
def _perform_analysis(input_text, include_attribution, include_evolution, lang, enable_ai_explanation, cache_version="function-vectors-2025-11-09"):
    """Cached core analysis pipeline for one prompt.

    Returns a dict that may contain the keys 'attribution', 'evolution',
    'pca_explanation', 'evolution_explanation', plus the error keys
    'error' (fatal, returned early), 'evolution_error', and 'api_error'.
    The model is loaded lazily and freed before returning.
    """
    results = {}
    model, tokenizer, device = None, None, None

    # The model is only needed for these two analysis stages.
    if include_attribution or include_evolution:
        model, tokenizer, device = load_model_and_tokenizer()

    if include_attribution:
        function_type_vectors, category_vectors, error = _load_precomputed_vectors(lang)
        if error:
            # Missing/broken vector files are fatal for the whole analysis.
            results['error'] = error
            return results

        def get_input_activation(text):
            # Final-layer hidden state at the last non-padded token, as float64.
            inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
            inputs = {k: v.to(device) for k, v in inputs.items()}
            with torch.no_grad():
                outputs = model(**inputs, output_hidden_states=True)
            # Index of the last real token (attention_mask counts non-pad tokens).
            last_token_pos = inputs['attention_mask'].sum(dim=1) - 1
            last_hidden_state = outputs.hidden_states[-1]
            activation = last_hidden_state[0, last_token_pos[0], :].cpu().numpy()
            return activation.astype(np.float64)

        def calculate_similarity(activation, vectors_dict):
            # Cosine similarity of the activation against every reference
            # vector; 1e-8 guards against division by a zero norm.
            similarities = {}
            norm_activation = activation / (np.linalg.norm(activation) + 1e-8)
            for label, vector in vectors_dict.items():
                norm_vector = vector / (np.linalg.norm(vector) + 1e-8)
                similarity = np.dot(norm_activation, norm_vector)
                similarities[label] = float(similarity)
            return similarities

        input_activation = get_input_activation(input_text)
        function_type_scores = calculate_similarity(input_activation, function_type_vectors)
        category_scores = calculate_similarity(input_activation, category_vectors)

        # Score dicts are stored sorted by similarity (highest first).
        results['attribution'] = {
            'function_type_scores': dict(sorted(function_type_scores.items(), key=lambda x: x[1], reverse=True)),
            'category_scores': dict(sorted(category_scores.items(), key=lambda x: x[1], reverse=True)),
            'function_types_mapping': FUNCTION_TYPES,
            'input_text': input_text,
            'input_activation': input_activation,
            'category_vectors': category_vectors,
            'function_type_vectors': function_type_vectors
        }

    if include_evolution:
        try:
            analyzer = LayerEvolutionAnalyzer(model, tokenizer, device)
            evolution_results = analyzer.analyze_text(input_text)
            results['evolution'] = evolution_results
        except Exception as e:
            # Evolution failure is non-fatal; it is surfaced as a warning later.
            results['evolution_error'] = str(e)

    if enable_ai_explanation:
        with st.spinner(tr('generating_ai_explanation_spinner')):
            api_config = init_qwen_api()
            if api_config:
                if 'attribution' in results:
                    attribution_results = results['attribution']
                    sorted_category_scores = list(attribution_results['category_scores'].items())

                    # Get the top 3 categories (dicts above are already sorted).
                    top_3_cats_data = sorted_category_scores[:3]
                    top_cats_for_prompt = [format_category_name(cat_key) for cat_key, _ in top_3_cats_data]

                    top_types_raw = list(attribution_results['function_type_scores'].keys())[:3]
                    top_types_formatted = [format_category_name(t) for t in top_types_raw]
                    results['pca_explanation'] = explain_pca_with_llm(api_config, input_text, top_types_formatted, top_cats_for_prompt)

                if 'evolution' in results:
                    results['evolution_explanation'] = explain_evolution_with_llm(api_config, input_text, results['evolution'])
            else:
                results['api_error'] = "Qwen API key not configured. Skipping AI explanation."

    # Clean up to free memory (model + per-backend cache).
    if model is not None:
        del model
        del tokenizer
        gc.collect()
        if device == 'mps':
            torch.mps.empty_cache()
        elif device == 'cuda':
            torch.cuda.empty_cache()

    return results
|
| 437 |
+
|
| 438 |
+
class LayerEvolutionAnalyzer:
    """Analyzes how a text's hidden representation evolves across model layers."""

    def __init__(self, model, tokenizer, device):
        """Wrap a pre-loaded model/tokenizer pair and put the model in eval mode."""
        self.model = model
        self.tokenizer = tokenizer
        self.device = device

        # Number of transformer layers. Note that `hidden_states` additionally
        # contains the embedding output at index 0, so the analysis below
        # yields num_layers + 1 vectors.
        self.num_layers = self.model.config.num_hidden_layers

        # Set the model to evaluation mode.
        self.model.eval()

    def extract_layer_vectors(self, text: str) -> Dict[int, np.ndarray]:
        """Return one mean-pooled float64 vector per hidden-states entry.

        Keys run from 0 (embedding output) upward; NaN/inf values (possible
        with fp16 inference) are replaced with finite numbers.
        """
        # Fix: removed redundant in-method `import numpy as np` / `import
        # torch` — both are already imported at module level.
        inputs = self.tokenizer(
            text,
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=512
        ).to(self.device)

        with torch.no_grad():
            outputs = self.model(**inputs, output_hidden_states=True)

        hidden_states = outputs.hidden_states

        layer_vectors = {}
        for i, state in enumerate(hidden_states):
            # Mean-pool over the sequence dimension of the first batch item.
            vec = state[0].mean(dim=0).cpu().numpy().astype(np.float64)
            layer_vectors[i] = np.nan_to_num(vec, nan=0.0, posinf=1.0, neginf=-1.0)

        return layer_vectors

    def compute_layer_similarities(self, layer_vectors: Dict[int, np.ndarray]) -> np.ndarray:
        """Return the (n_layers x n_layers) cosine-similarity matrix.

        Expects `layer_vectors` keyed 0..n-1, as produced by
        `extract_layer_vectors`.
        """
        n_layers = len(layer_vectors)
        vectors = np.array([layer_vectors[i] for i in range(n_layers)])

        # 1e-8 guards against division by a zero norm.
        normalized_vectors = vectors / (np.linalg.norm(vectors, axis=1, keepdims=True) + 1e-8)

        similarity_matrix = np.dot(normalized_vectors, normalized_vectors.T)

        return similarity_matrix

    def calculate_layer_changes(self, layer_vectors: Dict[int, np.ndarray]) -> List[float]:
        """Return the cosine distance (1 - similarity) between each pair of
        consecutive layers; element i covers layers i and i+1."""
        changes = []
        for i in range(1, len(layer_vectors)):
            vec1 = layer_vectors[i - 1]
            vec2 = layer_vectors[i]

            norm1 = np.linalg.norm(vec1)
            norm2 = np.linalg.norm(vec2)

            # A zero vector has no direction; treat its similarity as 0.
            if norm1 == 0 or norm2 == 0:
                sim = 0
            else:
                sim = np.dot(vec1, vec2) / (norm1 * norm2)

            changes.append(1 - sim)

        return changes

    def analyze_text(self, text: str):
        """Run the complete layer-evolution analysis on *text* and return a
        dict with 'layer_vectors', 'similarity_matrix', 'layer_changes'."""
        layer_vectors = self.extract_layer_vectors(text)
        similarity_matrix = self.compute_layer_similarities(layer_vectors)
        layer_changes = self.calculate_layer_changes(layer_vectors)

        return {
            'layer_vectors': layer_vectors,
            'similarity_matrix': similarity_matrix,
            'layer_changes': layer_changes
        }
|
| 522 |
+
|
| 523 |
+
def run_interactive_analysis(input_text, include_attribution=True, include_evolution=True, enable_ai_explanation=True):
    """UI-facing wrapper around `_perform_analysis` with user-friendly error
    reporting. Returns the results dict, or None on failure."""
    # Whether the local model checkout exists decides how a failure is
    # reported: a missing model means we are in a static (pre-cached) deploy.
    model_exists = os.path.exists("./models/OLMo-2-1124-7B")

    current_lang = st.session_state.get('lang', 'en')

    try:
        results = _perform_analysis(input_text, include_attribution, include_evolution, current_lang, enable_ai_explanation)
    except Exception as e:
        if model_exists:
            # The model is present, so this is a genuine analysis failure.
            st.error(f"Analysis failed: {e}")
        else:
            st.info("This live demo is running in a static environment. Only the pre-cached example prompts are available. Please select an example to view its analysis.")
        return None

    # A truthy 'error' entry is fatal.
    if results.get('error'):
        st.error(results['error'])
        return None

    # Non-fatal problems are surfaced but do not abort the run.
    if 'evolution_error' in results:
        st.warning(f"Layer evolution analysis failed: {results['evolution_error']}")

    if 'api_error' in results:
        st.error(results['api_error'])

    if 'attribution' in results:
        st.session_state.user_input_3d_data = results['attribution']

    return results
|
| 563 |
+
|
| 564 |
+
def explain_pca_with_llm(api_config, input_text, top_types, top_cats):
    """Ask the LLM to explain the PCA attribution results.

    Returns the explanation text, or None when the reply looks like a
    failure message (which is shown via st.error)."""
    # Pick the language-specific prompt template.
    lang = st.session_state.get('lang', 'en')
    prompt_key = 'pca_explanation_prompt_de' if lang == 'de' else 'pca_explanation_prompt'

    prompt = tr(prompt_key).format(
        input_text=input_text,
        top_types=", ".join(top_types),
        top_cats=", ".join(top_cats)
    )

    explanation = _explain_with_llm(api_config, prompt)
    # Presumably the helper can return human-readable failure strings —
    # surface those to the user instead of rendering them as an explanation.
    looks_failed = ("API request failed" in explanation
                    or "Failed to generate explanation" in explanation)
    if looks_failed:
        st.error(explanation)
        return None
    return explanation
|
| 579 |
+
|
| 580 |
+
|
| 581 |
+
def explain_evolution_with_llm(api_config, input_text, evolution_results):
    """Generate an LLM explanation of the layer-evolution charts.

    Returns the explanation text, or None when the reply looks like a
    failure message (shown via st.error).
    """
    # Extract data for the prompt.
    # L2 norm of each layer's mean-pooled vector.
    activation_strengths = [float(np.sqrt(np.sum(vec ** 2))) for vec in evolution_results['layer_vectors'].values()]
    layer_changes = evolution_results['layer_changes']

    peak_activation_layer = np.argmax(activation_strengths)
    peak_activation_strength = activation_strengths[peak_activation_layer]

    biggest_change_idx = np.argmax(layer_changes)
    # NOTE(review): layer_changes[i] measures the change between hidden-state
    # entries i and i+1, yet the layers reported below are idx+1 / idx+2.
    # Presumably this is deliberate 1-based display numbering — confirm it
    # matches the chart labels before changing.
    biggest_change_start_layer = biggest_change_idx + 1
    biggest_change_end_layer = biggest_change_idx + 2
    biggest_change_magnitude = layer_changes[biggest_change_idx]

    # Pick the language-specific prompt template.
    lang = st.session_state.get('lang', 'en')
    prompt_key = 'evolution_explanation_prompt_de' if lang == 'de' else 'evolution_explanation_prompt'

    prompt = tr(prompt_key).format(
        input_text=input_text,
        peak_activation_layer=peak_activation_layer,
        peak_activation_strength=peak_activation_strength,
        biggest_change_start_layer=biggest_change_start_layer,
        biggest_change_end_layer=biggest_change_end_layer,
        biggest_change_magnitude=biggest_change_magnitude
    )

    explanation = _explain_with_llm(api_config, prompt)
    # Failure strings are shown to the user instead of being returned.
    if "API request failed" in explanation or "Failed to generate explanation" in explanation:
        st.error(explanation)
        return None
    return explanation
|
| 612 |
+
|
| 613 |
+
|
| 614 |
+
@st.cache_data(persist=True)
def _explain_with_llm(_api_config, prompt, cache_version="function-vectors-2025-11-09"):
    """Make a cached chat-completion call to the Qwen endpoint.

    Returns the first choice's message content (empty string when absent).
    Raises requests.HTTPError on a non-2xx response. The leading underscore
    on `_api_config` keeps it out of Streamlit's cache key.
    """
    # Serialize API calls across concurrent Streamlit sessions.
    with st.session_state.api_lock:
        headers = {
            "Authorization": f"Bearer {_api_config['api_key']}",
            "Content-Type": "application/json"
        }
        payload = {
            "model": "qwen2.5-vl-72b-instruct",
            "messages": [{"role": "user", "content": prompt}]
        }
        response = requests.post(
            f"{_api_config['api_endpoint']}/chat/completions",
            headers=headers,
            json=payload,
            timeout=300
        )
        # Raise an exception if the API call fails.
        response.raise_for_status()
        # Fix: dict.get's default only applies when the key is missing, so an
        # empty 'choices' list used to raise IndexError. `or [{}]` covers
        # both missing and empty.
        choices = response.json().get('choices') or [{}]
        return choices[0].get('message', {}).get('content', '')
|
| 635 |
+
|
| 636 |
+
|
| 637 |
+
# --- Faithfulness Verification for Function Vectors ---
|
| 638 |
+
|
| 639 |
+
def find_closest_match(query, choices):
    """Fuzzy-match *query* against *choices* and return the best candidate,
    or None when the inputs are empty or the match is too weak."""
    if not query or not choices:
        return None
    best, similarity = process.extractOne(query, choices)
    # Using a similarity threshold to reject weak matches.
    return best if similarity > 80 else None
|
| 647 |
+
|
| 648 |
+
@st.cache_data(persist=True)
def _cached_extract_fv_claims(api_config, explanation_text, context, cache_version="function-vectors-2025-11-09"):
    """Extract verifiable claims from an AI explanation via the LLM.

    `context` must be "pca" or "evolution"; any other value returns [].
    Returns the parsed list of claim dicts, or [] when the model's reply
    cannot be parsed as JSON.
    """
    # Serialize API access across concurrent Streamlit sessions.
    with st.session_state.api_lock:
        headers = {
            "Authorization": f"Bearer {api_config['api_key']}",
            "Content-Type": "application/json"
        }

        # The prompt is dynamically adjusted based on the context (PCA or Evolution).
        if context == "pca":
            claim_types_details = tr("fv_claim_extraction_prompt_pca_types_details")
        elif context == "evolution":
            claim_types_details = tr("fv_claim_extraction_prompt_evolution_types_details")
        else:
            return []

        # Dynamically set the few-shot example based on context.
        if context == "pca":
            example_block = f"""{tr('fv_claim_extraction_prompt_pca_example_header')}
{tr('fv_claim_extraction_prompt_pca_example_explanation')}
{tr('fv_claim_extraction_prompt_pca_example_json')}
"""
        elif context == "evolution":
            example_block = f"""{tr('fv_claim_extraction_prompt_evolution_example_header')}
{tr('fv_claim_extraction_prompt_evolution_example_explanation')}
{tr('fv_claim_extraction_prompt_evolution_example_json')}
"""
        else:
            example_block = ""

        # Assemble the full extraction prompt from translated fragments.
        claim_extraction_prompt = f"""{tr('fv_claim_extraction_prompt_header')}

{tr('fv_claim_extraction_prompt_instruction')}

{tr('fv_claim_extraction_prompt_context_header').format(context=context)}

{tr('fv_claim_extraction_prompt_types_header')}
{claim_types_details}

{example_block}

{tr('fv_claim_extraction_prompt_analyze_header')}
"{explanation_text}"

{tr('fv_claim_extraction_prompt_footer')}
"""

        # Deterministic decoding so cached results stay reproducible.
        data = {
            "model": "qwen2.5-vl-72b-instruct",
            "messages": [{"role": "user", "content": claim_extraction_prompt}],
            "max_tokens": 1500,
            "temperature": 0.0,
            "seed": 42
        }

        response = requests.post(
            f"{api_config['api_endpoint']}/chat/completions",
            headers=headers,
            json=data,
            timeout=300
        )
        response.raise_for_status()
        claims_text = response.json()["choices"][0]["message"]["content"]

        try:
            # Strip a Markdown ```json fence if the model wrapped its answer.
            if '```json' in claims_text:
                claims_text = re.search(r'```json\n(.*?)\n```', claims_text, re.DOTALL).group(1)
            return json.loads(claims_text)
        except (AttributeError, json.JSONDecodeError):
            # No fenced block matched, or the payload was not valid JSON.
            return []
|
| 719 |
+
|
| 720 |
+
@st.cache_data(persist=True)
|
| 721 |
+
def _cached_verify_semantic_cluster_claim(api_config, claimed_clusters, actual_top_clusters, cache_version="function-vectors-2025-11-09"):
|
| 722 |
+
# Uses an LLM to verify if a semantic summary of clusters is faithful to the actual top clusters.
|
| 723 |
+
with st.session_state.api_lock:
|
| 724 |
+
headers = {
|
| 725 |
+
"Authorization": f"Bearer {api_config['api_key']}",
|
| 726 |
+
"Content-Type": "application/json"
|
| 727 |
+
}
|
| 728 |
+
|
| 729 |
+
verification_prompt = f"""{tr('fv_semantic_verification_prompt_header')}
|
| 730 |
+
|
| 731 |
+
{tr('fv_semantic_verification_prompt_rule')}
|
| 732 |
+
|
| 733 |
+
{tr('fv_semantic_verification_prompt_actual_header')}
|
| 734 |
+
{actual_top_clusters}
|
| 735 |
+
|
| 736 |
+
{tr('fv_semantic_verification_prompt_claimed_header')}
|
| 737 |
+
"{', '.join(claimed_clusters)}"
|
| 738 |
+
|
| 739 |
+
{tr('fv_semantic_verification_prompt_task_header')}
|
| 740 |
+
{tr('fv_semantic_verification_prompt_task_instruction')}
|
| 741 |
+
|
| 742 |
+
{tr('fv_semantic_verification_prompt_json_instruction')}
|
| 743 |
+
|
| 744 |
+
{tr('fv_semantic_verification_prompt_footer')}
|
| 745 |
+
"""
|
| 746 |
+
|
| 747 |
+
data = {
|
| 748 |
+
"model": "qwen2.5-vl-72b-instruct",
|
| 749 |
+
"messages": [{"role": "user", "content": verification_prompt}],
|
| 750 |
+
"max_tokens": 400,
|
| 751 |
+
"temperature": 0.0,
|
| 752 |
+
"seed": 42,
|
| 753 |
+
"response_format": {"type": "json_object"}
|
| 754 |
+
}
|
| 755 |
+
|
| 756 |
+
response = requests.post(
|
| 757 |
+
f"{api_config['api_endpoint']}/chat/completions",
|
| 758 |
+
headers=headers,
|
| 759 |
+
json=data,
|
| 760 |
+
timeout=300
|
| 761 |
+
)
|
| 762 |
+
response.raise_for_status()
|
| 763 |
+
|
| 764 |
+
try:
|
| 765 |
+
result_json = response.json()["choices"][0]["message"]["content"]
|
| 766 |
+
return json.loads(result_json)
|
| 767 |
+
except (json.JSONDecodeError, KeyError):
|
| 768 |
+
return {"is_verified": False, "reasoning": "Could not parse the semantic verification result."}
|
| 769 |
+
|
| 770 |
+
@st.cache_data(persist=True)
|
| 771 |
+
def _cached_verify_justification_claim(api_config, input_prompt, category_name, justification, cache_version="function-vectors-2025-11-09"):
|
| 772 |
+
# Uses an LLM to verify if a justification for a category's relevance is sound.
|
| 773 |
+
with st.session_state.api_lock:
|
| 774 |
+
headers = {
|
| 775 |
+
"Authorization": f"Bearer {api_config['api_key']}",
|
| 776 |
+
"Content-Type": "application/json"
|
| 777 |
+
}
|
| 778 |
+
|
| 779 |
+
verification_prompt = f"""{tr('fv_justification_verification_prompt_header')}
|
| 780 |
+
|
| 781 |
+
{tr('fv_justification_verification_prompt_rule')}
|
| 782 |
+
|
| 783 |
+
{tr('fv_justification_verification_prompt_input_header')}
|
| 784 |
+
"{input_prompt}"
|
| 785 |
+
|
| 786 |
+
{tr('fv_justification_verification_prompt_category_header')}
|
| 787 |
+
"{category_name}"
|
| 788 |
+
|
| 789 |
+
{tr('fv_justification_verification_prompt_justification_header')}
|
| 790 |
+
"{justification}"
|
| 791 |
+
|
| 792 |
+
{tr('fv_justification_verification_prompt_task_header')}
|
| 793 |
+
{tr('fv_justification_verification_prompt_task_instruction')}
|
| 794 |
+
|
| 795 |
+
{tr('fv_justification_verification_prompt_json_instruction')}
|
| 796 |
+
|
| 797 |
+
{tr('fv_justification_verification_prompt_footer')}
|
| 798 |
+
"""
|
| 799 |
+
|
| 800 |
+
data = {
|
| 801 |
+
"model": "qwen2.5-vl-72b-instruct",
|
| 802 |
+
"messages": [{"role": "user", "content": verification_prompt}],
|
| 803 |
+
"max_tokens": 600,
|
| 804 |
+
"temperature": 0.0,
|
| 805 |
+
"seed": 42,
|
| 806 |
+
"response_format": {"type": "json_object"}
|
| 807 |
+
}
|
| 808 |
+
|
| 809 |
+
response = requests.post(
|
| 810 |
+
f"{api_config['api_endpoint']}/chat/completions",
|
| 811 |
+
headers=headers,
|
| 812 |
+
json=data,
|
| 813 |
+
timeout=300
|
| 814 |
+
)
|
| 815 |
+
response.raise_for_status()
|
| 816 |
+
|
| 817 |
+
try:
|
| 818 |
+
result_json = response.json()["choices"][0]["message"]["content"]
|
| 819 |
+
return json.loads(result_json)
|
| 820 |
+
except (json.JSONDecodeError, KeyError):
|
| 821 |
+
return {"is_verified": False, "reasoning": "Could not parse the semantic justification result."}
|
| 822 |
+
|
| 823 |
+
def verify_fv_claims(claims, analysis_results, context):
|
| 824 |
+
# Verifies claims for the function vector page.
|
| 825 |
+
verification_results = []
|
| 826 |
+
|
| 827 |
+
if not analysis_results:
|
| 828 |
+
return [{"claim_text": c.get('claim_text', 'N/A'), "verified": False, "evidence": "Analysis results not available."} for c in claims]
|
| 829 |
+
|
| 830 |
+
for claim in claims:
|
| 831 |
+
is_verified = False
|
| 832 |
+
evidence = "Could not be verified."
|
| 833 |
+
details = claim.get('details', {})
|
| 834 |
+
|
| 835 |
+
try:
|
| 836 |
+
if context == "pca" and 'attribution' in analysis_results:
|
| 837 |
+
attribution_data = analysis_results['attribution']
|
| 838 |
+
claim_type = claim.get('claim_type')
|
| 839 |
+
|
| 840 |
+
if claim_type == 'top_k_similarity':
|
| 841 |
+
item_type = details.get('item_type')
|
| 842 |
+
items_claimed = details.get('items', [])
|
| 843 |
+
items_claimed_lower = [str(i).lower() for i in items_claimed]
|
| 844 |
+
rank_description = details.get('rank_description')
|
| 845 |
+
|
| 846 |
+
TOP_K = 3
|
| 847 |
+
|
| 848 |
+
if item_type == 'function_type':
|
| 849 |
+
actual_scores_raw = list(attribution_data['function_type_scores'].keys())
|
| 850 |
+
actual_scores_formatted = [tr(i) for i in actual_scores_raw]
|
| 851 |
+
actual_scores_lower = [name.lower() for name in actual_scores_formatted]
|
| 852 |
+
|
| 853 |
+
if rank_description == 'most':
|
| 854 |
+
num_claimed = len(items_claimed_lower)
|
| 855 |
+
top_n_actual_formatted = actual_scores_formatted[:num_claimed]
|
| 856 |
+
top_n_actual_lower = actual_scores_lower[:num_claimed]
|
| 857 |
+
|
| 858 |
+
is_verified = set(items_claimed_lower) == set(top_n_actual_lower)
|
| 859 |
+
evidence = f"The top {num_claimed} function type(s) are: {top_n_actual_formatted}. "
|
| 860 |
+
if is_verified:
|
| 861 |
+
evidence += "The claim correctly identified them."
|
| 862 |
+
else:
|
| 863 |
+
evidence += f"The claimed type(s) {items_claimed} did not match the top {num_claimed}."
|
| 864 |
+
else:
|
| 865 |
+
# Default: check for presence in top K
|
| 866 |
+
top_k_actual_formatted = actual_scores_formatted[:TOP_K]
|
| 867 |
+
top_k_actual_lower = actual_scores_lower[:TOP_K]
|
| 868 |
+
unverified_items = [item for item in items_claimed_lower if item not in top_k_actual_lower]
|
| 869 |
+
is_verified = not unverified_items
|
| 870 |
+
evidence = f"Top {TOP_K} actual function types are: {top_k_actual_formatted}. "
|
| 871 |
+
if not is_verified:
|
| 872 |
+
unverified_items_original_case = [c for c in items_claimed if c.lower() in unverified_items]
|
| 873 |
+
evidence += f"The following claimed types were not found in the top {TOP_K}: {unverified_items_original_case}."
|
| 874 |
+
else:
|
| 875 |
+
evidence += f"The claimed types {items_claimed} were successfully found within the top {TOP_K}."
|
| 876 |
+
|
| 877 |
+
elif item_type == 'category':
|
| 878 |
+
actual_scores_raw = list(attribution_data['category_scores'].keys())
|
| 879 |
+
actual_scores_formatted = [format_category_name(i) for i in actual_scores_raw]
|
| 880 |
+
actual_scores_lower = [name.lower() for name in actual_scores_formatted]
|
| 881 |
+
|
| 882 |
+
if rank_description == 'most':
|
| 883 |
+
num_claimed = len(items_claimed_lower)
|
| 884 |
+
top_n_actual_formatted = actual_scores_formatted[:num_claimed]
|
| 885 |
+
top_n_actual_lower = actual_scores_lower[:num_claimed]
|
| 886 |
+
|
| 887 |
+
is_verified = set(items_claimed_lower) == set(top_n_actual_lower)
|
| 888 |
+
evidence = f"The top {num_claimed} category/categories are: {top_n_actual_formatted}. "
|
| 889 |
+
if is_verified:
|
| 890 |
+
evidence += "The claim correctly identified them."
|
| 891 |
+
else:
|
| 892 |
+
evidence += f"The claimed category/categories {items_claimed} did not match the top {num_claimed}."
|
| 893 |
+
else:
|
| 894 |
+
# Default: check for presence in top K
|
| 895 |
+
top_k_actual_formatted = actual_scores_formatted[:TOP_K]
|
| 896 |
+
top_k_actual_lower = actual_scores_lower[:TOP_K]
|
| 897 |
+
unverified_items = [item for item in items_claimed_lower if item not in top_k_actual_lower]
|
| 898 |
+
is_verified = not unverified_items
|
| 899 |
+
evidence = f"Top {TOP_K} actual categories are: {top_k_actual_formatted}. "
|
| 900 |
+
if not is_verified:
|
| 901 |
+
unverified_items_original_case = [c for c in items_claimed if c.lower() in unverified_items]
|
| 902 |
+
evidence += f"The following claimed categories were not found in the top {TOP_K}: {unverified_items_original_case}."
|
| 903 |
+
else:
|
| 904 |
+
evidence += f"The claimed categories {items_claimed} were successfully found within the top {TOP_K}."
|
| 905 |
+
|
| 906 |
+
elif claim_type == 'positional_claim':
|
| 907 |
+
cluster_names_claimed = details.get('cluster_names', [])
|
| 908 |
+
position = details.get('position')
|
| 909 |
+
|
| 910 |
+
if position == 'near':
|
| 911 |
+
top_3_types_raw = list(attribution_data['function_type_scores'].keys())[:3]
|
| 912 |
+
top_3_types_formatted = [tr(i) for i in top_3_types_raw]
|
| 913 |
+
|
| 914 |
+
api_config = init_qwen_api()
|
| 915 |
+
if api_config:
|
| 916 |
+
verification = _cached_verify_semantic_cluster_claim(api_config, cluster_names_claimed, top_3_types_formatted)
|
| 917 |
+
is_verified = verification.get('is_verified', False)
|
| 918 |
+
evidence = verification.get('reasoning', "Failed to get reasoning.")
|
| 919 |
+
else:
|
| 920 |
+
is_verified = False
|
| 921 |
+
evidence = "API key not configured for semantic verification."
|
| 922 |
+
|
| 923 |
+
elif claim_type == 'category_justification_claim':
|
| 924 |
+
category_name = details.get('category_name')
|
| 925 |
+
justification = details.get('justification')
|
| 926 |
+
input_prompt = analysis_results.get('attribution', {}).get('input_text', '')
|
| 927 |
+
|
| 928 |
+
if not all([category_name, justification, input_prompt]):
|
| 929 |
+
evidence = "Missing data for justification verification (category, justification, or input prompt)."
|
| 930 |
+
else:
|
| 931 |
+
api_config = init_qwen_api()
|
| 932 |
+
if api_config:
|
| 933 |
+
verification = _cached_verify_justification_claim(api_config, input_prompt, category_name, justification)
|
| 934 |
+
is_verified = verification.get('is_verified', False)
|
| 935 |
+
evidence = verification.get('reasoning', "Failed to get semantic reasoning for justification.")
|
| 936 |
+
else:
|
| 937 |
+
is_verified = False
|
| 938 |
+
evidence = "API key not configured for semantic verification."
|
| 939 |
+
|
| 940 |
+
elif context == "evolution" and 'evolution' in analysis_results:
|
| 941 |
+
evolution_data = analysis_results['evolution']
|
| 942 |
+
claim_type = claim.get('claim_type')
|
| 943 |
+
|
| 944 |
+
if claim_type == 'peak_activation':
|
| 945 |
+
claimed_layer = details.get('layer_index')
|
| 946 |
+
activation_strengths = [float(np.sqrt(np.sum(vec ** 2))) for vec in evolution_data['layer_vectors'].values()]
|
| 947 |
+
actual_peak_layer = np.argmax(activation_strengths)
|
| 948 |
+
is_verified = (claimed_layer == actual_peak_layer)
|
| 949 |
+
evidence = f"Claimed peak activation at layer {claimed_layer}. Actual peak is at layer {actual_peak_layer}."
|
| 950 |
+
|
| 951 |
+
elif claim_type == 'biggest_change':
|
| 952 |
+
claimed_start = details.get('start_layer')
|
| 953 |
+
layer_changes = evolution_data['layer_changes']
|
| 954 |
+
actual_biggest_change_idx = np.argmax(layer_changes)
|
| 955 |
+
actual_start_layer = actual_biggest_change_idx + 1
|
| 956 |
+
is_verified = (claimed_start == actual_start_layer)
|
| 957 |
+
evidence = f"Claimed biggest change starts at layer {claimed_start}. Actual biggest change is at layer {actual_start_layer} -> {actual_start_layer + 1}."
|
| 958 |
+
|
| 959 |
+
elif claim_type == 'specific_value_claim':
|
| 960 |
+
metric = details.get('metric')
|
| 961 |
+
layer_index = details.get('layer_index')
|
| 962 |
+
value = details.get('value')
|
| 963 |
+
|
| 964 |
+
if metric == 'activation_strength':
|
| 965 |
+
activation_strengths = [float(np.sqrt(np.sum(vec ** 2))) for vec in evolution_data['layer_vectors'].values()]
|
| 966 |
+
# Check if layer_index is valid
|
| 967 |
+
if layer_index < len(activation_strengths):
|
| 968 |
+
actual_value = activation_strengths[layer_index]
|
| 969 |
+
is_verified = round(actual_value, 2) == round(value, 2)
|
| 970 |
+
evidence = f"Claimed activation strength for layer {layer_index} was {value}. Actual strength is {actual_value:.2f}."
|
| 971 |
+
else:
|
| 972 |
+
evidence = f"Invalid layer index {layer_index} provided."
|
| 973 |
+
|
| 974 |
+
elif metric == 'change_magnitude':
|
| 975 |
+
layer_changes = evolution_data['layer_changes']
|
| 976 |
+
# change between L and L+1 is at index L-1 in the list
|
| 977 |
+
# So for layer_index 1 (1->2), we need list index 0.
|
| 978 |
+
change_index = layer_index - 1
|
| 979 |
+
if 0 <= change_index < len(layer_changes):
|
| 980 |
+
actual_value = layer_changes[change_index]
|
| 981 |
+
is_verified = round(actual_value, 2) == round(value, 2)
|
| 982 |
+
evidence = f"Claimed change magnitude for transition starting at layer {layer_index} was {value}. Actual magnitude is {actual_value:.2f}."
|
| 983 |
+
else:
|
| 984 |
+
evidence = f"Invalid starting layer index {layer_index} for change magnitude."
|
| 985 |
+
|
| 986 |
+
except Exception as e:
|
| 987 |
+
evidence = f"An error occurred during verification: {str(e)}"
|
| 988 |
+
|
| 989 |
+
verification_results.append({
|
| 990 |
+
'claim_text': claim.get('claim_text', 'N/A'),
|
| 991 |
+
'verified': is_verified,
|
| 992 |
+
'evidence': evidence
|
| 993 |
+
})
|
| 994 |
+
|
| 995 |
+
return verification_results
|
| 996 |
+
|
| 997 |
+
# --- End Faithfulness Verification ---
|
| 998 |
+
|
| 999 |
+
|
| 1000 |
+
def display_category_examples():
|
| 1001 |
+
# Displays an explorer for the function category examples.
|
| 1002 |
+
st.markdown(tr('category_examples_desc'))
|
| 1003 |
+
|
| 1004 |
+
# Add an expander with descriptions for each function type.
|
| 1005 |
+
with st.expander(tr('what_is_this_function_type')):
|
| 1006 |
+
for func_type_key in FUNCTION_TYPES.keys():
|
| 1007 |
+
color = FUNCTION_TYPE_COLORS.get(func_type_key, '#CCCCCC')
|
| 1008 |
+
st.markdown(f"""
|
| 1009 |
+
<div style="border-left: 5px solid {color}; padding: 0.5rem 1rem; margin-top: 1rem; background-color: #2b2b2b; border-radius: 5px;">
|
| 1010 |
+
<h5 style="margin: 0; color: {color};">{tr(func_type_key)}</h5>
|
| 1011 |
+
<p style="margin-top: 0.5rem; color: #EAEAEA;">{tr(f"desc_{func_type_key}")}</p>
|
| 1012 |
+
</div>
|
| 1013 |
+
""", unsafe_allow_html=True)
|
| 1014 |
+
|
| 1015 |
+
if 'show_all_states' not in st.session_state:
|
| 1016 |
+
st.session_state.show_all_states = {}
|
| 1017 |
+
|
| 1018 |
+
current_lang = st.session_state.get('lang', 'en')
|
| 1019 |
+
col1, col2 = st.columns([1, 3])
|
| 1020 |
+
|
| 1021 |
+
with col1:
|
| 1022 |
+
st.subheader(tr('function_types_subheader'))
|
| 1023 |
+
|
| 1024 |
+
# --- Restore st.radio and add CSS for highlighting ---
|
| 1025 |
+
func_type_keys = list(FUNCTION_TYPES.keys())
|
| 1026 |
+
display_names = [tr(key) for key in func_type_keys]
|
| 1027 |
+
|
| 1028 |
+
# Set a default selection.
|
| 1029 |
+
if 'selected_func_type_key' not in st.session_state:
|
| 1030 |
+
st.session_state.selected_func_type_key = func_type_keys[0]
|
| 1031 |
+
|
| 1032 |
+
# Find the index of the current selection.
|
| 1033 |
+
try:
|
| 1034 |
+
current_index = func_type_keys.index(st.session_state.selected_func_type_key)
|
| 1035 |
+
except ValueError:
|
| 1036 |
+
current_index = 0
|
| 1037 |
+
|
| 1038 |
+
def on_radio_change():
|
| 1039 |
+
# A callback to update the session state when the radio button changes.
|
| 1040 |
+
selected_display_name = st.session_state.radio_selector
|
| 1041 |
+
if selected_display_name in display_names:
|
| 1042 |
+
idx = display_names.index(selected_display_name)
|
| 1043 |
+
st.session_state.selected_func_type_key = func_type_keys[idx]
|
| 1044 |
+
|
| 1045 |
+
# Create the radio button selector.
|
| 1046 |
+
st.radio(
|
| 1047 |
+
label="Function Types",
|
| 1048 |
+
options=display_names,
|
| 1049 |
+
index=current_index,
|
| 1050 |
+
on_change=on_radio_change,
|
| 1051 |
+
key='radio_selector',
|
| 1052 |
+
label_visibility="collapsed"
|
| 1053 |
+
)
|
| 1054 |
+
|
| 1055 |
+
# Get the key and color for the selected function type.
|
| 1056 |
+
selected_func_type_key = st.session_state.selected_func_type_key
|
| 1057 |
+
selected_color = FUNCTION_TYPE_COLORS.get(selected_func_type_key, 'lightgrey')
|
| 1058 |
+
|
| 1059 |
+
# Add some CSS to highlight the selected radio button.
|
| 1060 |
+
st.markdown(f"""
|
| 1061 |
+
<style>
|
| 1062 |
+
[data-testid="stAppViewBlockContainer"] div[role="radiogroup"] > label:has(input[type="radio"]:checked) {{
|
| 1063 |
+
background-color: {selected_color} !important;
|
| 1064 |
+
border-radius: 10px;
|
| 1065 |
+
padding: 0.5rem 1rem;
|
| 1066 |
+
color: white !important;
|
| 1067 |
+
font-weight: bold;
|
| 1068 |
+
}}
|
| 1069 |
+
/* Ensure the text itself is white for contrast */
|
| 1070 |
+
[data-testid="stAppViewBlockContainer"] div[role="radiogroup"] > label:has(input[type="radio"]:checked) div {{
|
| 1071 |
+
color: white !important;
|
| 1072 |
+
}}
|
| 1073 |
+
</style>
|
| 1074 |
+
""", unsafe_allow_html=True)
|
| 1075 |
+
|
| 1076 |
+
|
| 1077 |
+
with col2:
|
| 1078 |
+
category_keys = FUNCTION_TYPES[selected_func_type_key]
|
| 1079 |
+
available_cats = [
|
| 1080 |
+
cat_key for cat_key in category_keys
|
| 1081 |
+
if cat_key in FUNCTION_CATEGORIES and current_lang in FUNCTION_CATEGORIES[cat_key]
|
| 1082 |
+
]
|
| 1083 |
+
|
| 1084 |
+
if not available_cats:
|
| 1085 |
+
st.warning(tr('no_examples_for_type'))
|
| 1086 |
+
else:
|
| 1087 |
+
# Get the color and symbol for the selected type.
|
| 1088 |
+
selected_display_name = tr(selected_func_type_key)
|
| 1089 |
+
|
| 1090 |
+
# Display the header.
|
| 1091 |
+
st.markdown(f"<h4 style='color: #3498db; font-weight: bold;'>{tr('prompt_examples_for_category_header').format(category=selected_display_name)}</h4>", unsafe_allow_html=True)
|
| 1092 |
+
|
| 1093 |
+
num_to_show_by_default = 9
|
| 1094 |
+
show_all = st.session_state.show_all_states.get(selected_func_type_key, False)
|
| 1095 |
+
|
| 1096 |
+
if len(available_cats) > num_to_show_by_default and not show_all:
|
| 1097 |
+
cats_to_display = available_cats[:num_to_show_by_default]
|
| 1098 |
+
else:
|
| 1099 |
+
cats_to_display = available_cats
|
| 1100 |
+
|
| 1101 |
+
# --- Display Cards ---
|
| 1102 |
+
num_columns = 3
|
| 1103 |
+
example_cols = st.columns(num_columns)
|
| 1104 |
+
for i, cat_key in enumerate(cats_to_display):
|
| 1105 |
+
examples = FUNCTION_CATEGORIES.get(cat_key, {}).get(current_lang, [])
|
| 1106 |
+
if examples:
|
| 1107 |
+
# Use the formatter for the display name.
|
| 1108 |
+
display_name = format_category_name(cat_key)
|
| 1109 |
+
with example_cols[i % num_columns]:
|
| 1110 |
+
with st.container():
|
| 1111 |
+
st.markdown(f"""
|
| 1112 |
+
<div style="border: 1px solid #e0e0e0; border-radius: 10px; padding: 1rem; height: 140px; margin-bottom: 1rem; display: flex; flex-direction: column; justify-content: space-between;">
|
| 1113 |
+
<div>
|
| 1114 |
+
<p style="font-weight: bold; color: #3498db;">{display_name}</p>
|
| 1115 |
+
</div>
|
| 1116 |
+
<div>
|
| 1117 |
+
<p style="font-style: italic; font-size: 0.9em; color: #6c757d;">"{examples[0]}"</p>
|
| 1118 |
+
</div>
|
| 1119 |
+
</div>
|
| 1120 |
+
""", unsafe_allow_html=True)
|
| 1121 |
+
|
| 1122 |
+
# --- "Show More/Less" Buttons ---
|
| 1123 |
+
if len(available_cats) > num_to_show_by_default:
|
| 1124 |
+
if not show_all:
|
| 1125 |
+
if st.button(tr('show_all_button').format(count=len(available_cats)), key=f"show_all_{selected_func_type_key}"):
|
| 1126 |
+
st.session_state.show_all_states[selected_func_type_key] = True
|
| 1127 |
+
st.rerun()
|
| 1128 |
+
else:
|
| 1129 |
+
if st.button(tr('show_less_button'), key=f"show_less_{selected_func_type_key}"):
|
| 1130 |
+
# Set to False or remove the key.
|
| 1131 |
+
st.session_state.show_all_states[selected_func_type_key] = False
|
| 1132 |
+
st.rerun()
|
| 1133 |
+
|
| 1134 |
+
def display_3d_pca_visualization(user_input_data=None, show_description=True):
|
| 1135 |
+
# Displays the interactive 3D PCA plot.
|
| 1136 |
+
import numpy as np
|
| 1137 |
+
current_lang = st.session_state.get('lang', 'en')
|
| 1138 |
+
|
| 1139 |
+
if show_description:
|
| 1140 |
+
if current_lang == 'de':
|
| 1141 |
+
st.markdown("""
|
| 1142 |
+
<div style='background-color: #2b2b2b; color: #ffffff; padding: 1.5rem; border-radius: 10px; margin: 1rem 0; border-left: 5px solid #4a90e2;'>
|
| 1143 |
+
<h4 style='color: #4a90e2; margin-top: 0;'>Interaktive 3D-PCA von Funktionsvektoren</h4>
|
| 1144 |
+
<p>Diese Visualisierung stellt die hochdimensionalen 'Funktionsvektoren' verschiedener Anweisungs-Prompts in einem vereinfachten 3D-Raum mittels Hauptkomponentenanalyse (PCA) dar. Hier ist eine Aufschlüsselung dessen, was Sie sehen:</p>
|
| 1145 |
+
<ul>
|
| 1146 |
+
<li><strong>Was sind Funktionsvektoren?</strong> Jeder Punkt in diesem Diagramm repräsentiert einen 'Funktionsvektor' – einen numerischen Fingerabdruck (ein Embedding), der den zentralen funktionalen Zweck eines bestimmten Prompts erfasst. Diese Vektoren werden aus dem letzten verborgenen Zustand des OLMo-Modells extrahiert, nachdem es einen Prompt verarbeitet hat. Prompts mit ähnlichen Funktionen haben Vektoren, die im hochdimensionalen Raum nahe beieinander liegen.</li>
|
| 1147 |
+
<li><strong>Wie funktioniert PCA?</strong> PCA ist eine Technik zur Dimensionsreduktion, die komplexe, hochdimensionale Daten in ein neues, kleineres Koordinatensystem (in diesem Fall 3D) umwandelt. Dies geschieht durch die Identifizierung der Richtungen (Hauptkomponenten), in denen die Daten am stärksten variieren. Durch die Darstellung der ersten drei Hauptkomponenten können wir die wichtigsten Beziehungen zwischen den Funktionsvektoren auf eine für uns leicht interpretierbare Weise visualisieren.</li>
|
| 1148 |
+
<li><strong>Worauf ist zu achten?</strong> Suchen Sie nach Punktclustern. Diese Cluster repräsentieren Gruppen von Funktionen, die das Modell als ähnlich wahrnimmt. Der Abstand zwischen den Punkten gibt ihre funktionale Ähnlichkeit an – nähere Punkte sind ähnlicher.</li>
|
| 1149 |
+
</ul>
|
| 1150 |
+
</div>
|
| 1151 |
+
""", unsafe_allow_html=True)
|
| 1152 |
+
else:
|
| 1153 |
+
st.markdown("""
|
| 1154 |
+
<div style='background-color: #2b2b2b; color: #ffffff; padding: 1.5rem; border-radius: 10px; margin: 1rem 0; border-left: 5px solid #4a90e2;'>
|
| 1155 |
+
<h4 style='color: #4a90e2; margin-top: 0;'>Interactive 3D PCA of Function Vectors</h4>
|
| 1156 |
+
<p>This visualization plots the high-dimensional 'function vectors' of different instructional prompts in a simplified 3D space using <strong>Principal Component Analysis (PCA)</strong>. Here's a breakdown of what you're seeing:</p>
|
| 1157 |
+
<ul>
|
| 1158 |
+
<li><strong>What are Function Vectors?</strong> Each point on this plot represents a 'function vector'—a numerical fingerprint (an embedding) that captures the core functional purpose of a specific prompt. These vectors are extracted from the final hidden state of the OLMo model after it processes a prompt. Prompts with similar functions will have vectors that are close to each other in the high-dimensional space.</li>
|
| 1159 |
+
<li><strong>How does PCA work?</strong> PCA is a dimensionality reduction technique that transforms the complex, high-dimensional data into a new, smaller coordinate system (in this case, 3D). It does this by identifying the directions (principal components) where the data varies the most. By plotting the first three principal components, we can visualize the most significant relationships between the function vectors in a way that's easy for us to interpret.</li>
|
| 1160 |
+
<li><strong>What to look for:</strong> Look for clusters of points. These clusters represent groups of functions that the model perceives as similar. The distance between points indicates their functional similarity—closer points are more alike.</li>
|
| 1161 |
+
</ul>
|
| 1162 |
+
</div>
|
| 1163 |
+
""", unsafe_allow_html=True)
|
| 1164 |
+
st.markdown(tr('run_analysis_for_viz_info'), unsafe_allow_html=True)
|
| 1165 |
+
|
| 1166 |
+
# --- Load the base vectors for the selected language ---
|
| 1167 |
+
@st.cache_data
|
| 1168 |
+
def load_base_vectors(lang, cache_version="function-vectors-2025-11-09"):
|
| 1169 |
+
import numpy as np
|
| 1170 |
+
vector_path = Path(__file__).parent / f"data/vectors/{lang}_category_vectors.npz"
|
| 1171 |
+
if not vector_path.exists():
|
| 1172 |
+
st.error(f"Could not find vector file for language '{lang}' at {vector_path}")
|
| 1173 |
+
return None
|
| 1174 |
+
try:
|
| 1175 |
+
loaded_data = np.load(vector_path, allow_pickle=True)
|
| 1176 |
+
return {key: loaded_data[key] for key in loaded_data.files}
|
| 1177 |
+
except Exception as e:
|
| 1178 |
+
st.error(f"Error loading vectors: {e}")
|
| 1179 |
+
return None
|
| 1180 |
+
|
| 1181 |
+
category_vectors = load_base_vectors(current_lang)
|
| 1182 |
+
|
| 1183 |
+
if category_vectors is None:
|
| 1184 |
+
return # Stop if we can't load the necessary data
|
| 1185 |
+
|
| 1186 |
+
try:
|
| 1187 |
+
# Prepare data for PCA using the loaded base vectors
|
| 1188 |
+
categories = list(category_vectors.keys())
|
| 1189 |
+
vectors = np.vstack([category_vectors[cat] for cat in categories])
|
| 1190 |
+
|
| 1191 |
+
# If user input exists, add it to the data
|
| 1192 |
+
if user_input_data is not None:
|
| 1193 |
+
input_activation = user_input_data['input_activation']
|
| 1194 |
+
input_text = user_input_data['input_text']
|
| 1195 |
+
all_vectors = np.vstack([vectors, input_activation.reshape(1, -1)])
|
| 1196 |
+
plot_title = tr('pca_3d_with_input_title')
|
| 1197 |
+
else:
|
| 1198 |
+
all_vectors = vectors
|
| 1199 |
+
plot_title = tr('pca_3d_title').format(lang=current_lang.upper())
|
| 1200 |
+
|
| 1201 |
+
# Perform PCA
|
| 1202 |
+
pca = PCA(n_components=3)
|
| 1203 |
+
reduced_vectors = pca.fit_transform(all_vectors)
|
| 1204 |
+
|
| 1205 |
+
# Create plotly figure
|
| 1206 |
+
fig = go.Figure()
|
| 1207 |
+
|
| 1208 |
+
# Add category points grouped by function type
|
| 1209 |
+
category_points = reduced_vectors[:len(categories)]
|
| 1210 |
+
for func_type_key, cats in FUNCTION_TYPES.items():
|
| 1211 |
+
func_categories = [cat for cat in cats if cat in categories]
|
| 1212 |
+
if func_categories:
|
| 1213 |
+
indices = [categories.index(cat) for cat in func_categories]
|
| 1214 |
+
fig.add_trace(go.Scatter3d(
|
| 1215 |
+
x=category_points[indices, 0], y=category_points[indices, 1], z=category_points[indices, 2],
|
| 1216 |
+
mode='markers',
|
| 1217 |
+
marker=dict(size=8, color=FUNCTION_TYPE_COLORS.get(func_type_key, 'gray'), symbol=PLOTLY_SYMBOLS.get(func_type_key, 'circle'), line=dict(width=1, color='black'), opacity=0.7),
|
| 1218 |
+
name=tr(func_type_key),
|
| 1219 |
+
text=[format_category_name(cat) for cat in func_categories],
|
| 1220 |
+
hovertemplate="<b>%{text}</b><br>PC1: %{x:.3f}<br>PC2: %{y:.3f}<br>PC3: %{z:.3f}<extra></extra>"
|
| 1221 |
+
))
|
| 1222 |
+
|
| 1223 |
+
# If user input exists, add it as a special point
|
| 1224 |
+
if user_input_data is not None:
|
| 1225 |
+
user_point = reduced_vectors[-1]
|
| 1226 |
+
fig.add_trace(go.Scatter3d(
|
| 1227 |
+
x=[user_point[0]], y=[user_point[1]], z=[user_point[2]],
|
| 1228 |
+
mode='markers',
|
| 1229 |
+
marker=dict(size=12, color='red', symbol='diamond', line=dict(width=2, color='darkred')),
|
| 1230 |
+
name=tr('your_input_legend'),
|
| 1231 |
+
text=[f"{tr('your_input_legend')}: {input_text[:50]}..."],
|
| 1232 |
+
hovertemplate=f"<b>{tr('your_input_hover_title')}</b><br>%{{text}}<br>PC1: %{{x:.3f}}<br>PC2: %{{y:.3f}}<br>PC3: %{{z:.3f}}<extra></extra>"
|
| 1233 |
+
))
|
| 1234 |
+
|
| 1235 |
+
fig.update_layout(
|
| 1236 |
+
title=plot_title,
|
| 1237 |
+
width=1400, height=900,
|
| 1238 |
+
scene=dict(xaxis_title='PC1', yaxis_title='PC2', zaxis_title='PC3', camera=dict(eye=dict(x=1.5, y=1.5, z=1.5))),
|
| 1239 |
+
legend=dict(orientation="v", yanchor="top", y=1, xanchor="left", x=1.02, font=dict(size=10), title_text=tr('legend_title'))
|
| 1240 |
+
)
|
| 1241 |
+
|
| 1242 |
+
st.plotly_chart(fig, use_container_width=True)
|
| 1243 |
+
|
| 1244 |
+
if user_input_data is not None:
|
| 1245 |
+
st.markdown(tr('your_input_analysis_desc').format(input_text=input_text))
|
| 1246 |
+
else:
|
| 1247 |
+
st.markdown(f"""{tr('pca_key_insights')}""", unsafe_allow_html=True)
|
| 1248 |
+
|
| 1249 |
+
except Exception as e:
|
| 1250 |
+
st.error(tr('error_creating_enhanced_pca').format(e=str(e)))
|
| 1251 |
+
|
| 1252 |
+
def display_analysis_results(results, input_text):
|
| 1253 |
+
# Displays the results of the analysis.
|
| 1254 |
+
|
| 1255 |
+
st.success(tr('analysis_complete_success'))
|
| 1256 |
+
|
| 1257 |
+
st.markdown(f"""
|
| 1258 |
+
<div style='background: linear-gradient(135deg, #2f3f70 0%, #3a4c86 100%); padding: 1rem; border-radius: 10px; color: #f5f7fb; margin: 1rem 0; border-left: 4px solid #dcae36;'>
|
| 1259 |
+
<h4 style='margin: 0; color: #f5f7fb;'>{tr('analyzed_text_header')}</h4>
|
| 1260 |
+
<p style='margin: 0.5rem 0 0 0; font-size: 1.1rem; font-style: italic; color: #e8ecf8;'>"{input_text}"</p>
|
| 1261 |
+
</div>
|
| 1262 |
+
""", unsafe_allow_html=True)
|
| 1263 |
+
|
| 1264 |
+
# --- Show the 3D plot with the user's data first ---
|
| 1265 |
+
st.markdown(f"<h2>{tr('pca_3d_section_header')}</h2>", unsafe_allow_html=True)
|
| 1266 |
+
user_input_data = st.session_state.get('user_input_3d_data')
|
| 1267 |
+
display_3d_pca_visualization(user_input_data, show_description=False)
|
| 1268 |
+
|
| 1269 |
+
# --- AI Explanation for PCA Plot ---
|
| 1270 |
+
if st.session_state.get('enable_ai_explanation') and 'explanation_part_1' in st.session_state:
|
| 1271 |
+
# Display the first part of the explanation.
|
| 1272 |
+
if st.session_state.explanation_part_1:
|
| 1273 |
+
explanation_html = markdown.markdown(st.session_state.explanation_part_1)
|
| 1274 |
+
st.markdown(
|
| 1275 |
+
f"<div style='background-color: #2b2b2b; color: #ffffff; padding: 1.2rem; border-radius: 10px; margin: 1rem 0; border-left: 5px solid #6EE7B7; font-size: 0.9rem;'>{explanation_html}</div>",
|
| 1276 |
+
unsafe_allow_html=True
|
| 1277 |
+
)
|
| 1278 |
+
|
| 1279 |
+
# Faithfulness Check for PCA plot
|
| 1280 |
+
with st.expander(tr('faithfulness_check_expander')):
|
| 1281 |
+
st.markdown(tr('fv_faithfulness_explanation_pca_html'), unsafe_allow_html=True)
|
| 1282 |
+
|
| 1283 |
+
# Check for pre-cached faithfulness results first
|
| 1284 |
+
if 'pca_faithfulness' in st.session_state.analysis_results:
|
| 1285 |
+
verification_results = st.session_state.analysis_results['pca_faithfulness']
|
| 1286 |
+
else:
|
| 1287 |
+
api_config = init_qwen_api()
|
| 1288 |
+
if api_config:
|
| 1289 |
+
with st.spinner(tr('running_faithfulness_check_spinner')):
|
| 1290 |
+
claims = _cached_extract_fv_claims(api_config, st.session_state.explanation_part_1, "pca")
|
| 1291 |
+
verification_results = verify_fv_claims(claims, results, "pca")
|
| 1292 |
+
else:
|
| 1293 |
+
verification_results = []
|
| 1294 |
+
st.warning(tr('api_key_not_configured_warning'))
|
| 1295 |
+
|
| 1296 |
+
if verification_results:
|
| 1297 |
+
for result in verification_results:
|
| 1298 |
+
status_text = tr('verified_status') if result['verified'] else tr('contradicted_status')
|
| 1299 |
+
st.markdown(f"""
|
| 1300 |
+
<div style="margin-bottom: 1rem; padding: 0.8rem; border-radius: 8px; border-left: 5px solid {'#28a745' if result['verified'] else '#dc3545'}; background-color: #1a1a1a;">
|
| 1301 |
+
<p style="margin-bottom: 0.3rem;"><strong>{tr('claim_label')}:</strong> <em>"{result['claim_text']}"</em></p>
|
| 1302 |
+
<p style="margin-bottom: 0.3rem;"><strong>{tr('status_label')}:</strong> {status_text}</p>
|
| 1303 |
+
<p style="margin-bottom: 0;"><strong>{tr('evidence_label')}:</strong> {result['evidence']}</p>
|
| 1304 |
+
</div>
|
| 1305 |
+
""", unsafe_allow_html=True)
|
| 1306 |
+
else:
|
| 1307 |
+
st.info(tr('no_verifiable_claims_info'))
|
| 1308 |
+
|
| 1309 |
+
st.markdown("---")
|
| 1310 |
+
|
| 1311 |
+
# --- Function Type and Category Analysis ---
|
| 1312 |
+
if 'attribution' in results:
|
| 1313 |
+
attribution = results['attribution']
|
| 1314 |
+
|
| 1315 |
+
# --- Section 1: Function Type Attribution ---
|
| 1316 |
+
st.markdown(f"<h2>{tr('function_types_tab')}</h2>", unsafe_allow_html=True)
|
| 1317 |
+
st.markdown(tr('function_type_attribution_header'))
|
| 1318 |
+
|
| 1319 |
+
function_type_scores = attribution['function_type_scores']
|
| 1320 |
+
top_types = list(function_type_scores.items())[:6]
|
| 1321 |
+
|
| 1322 |
+
# Reverse for a horizontal bar chart.
|
| 1323 |
+
top_types.reverse()
|
| 1324 |
+
|
| 1325 |
+
fig = go.Figure()
|
| 1326 |
+
colors = [FUNCTION_TYPE_COLORS.get(name, '#CCCCCC') for name, _ in top_types]
|
| 1327 |
+
|
| 1328 |
+
fig.add_trace(go.Bar(
|
| 1329 |
+
x=[score for _, score in top_types],
|
| 1330 |
+
y=[tr(name) for name, _ in top_types],
|
| 1331 |
+
orientation='h',
|
| 1332 |
+
marker=dict(color=colors),
|
| 1333 |
+
text=[f"{score:.3f}" for _, score in top_types],
|
| 1334 |
+
textposition='outside',
|
| 1335 |
+
hovertemplate='<b>%{y}</b><br>Score: %{x:.3f}<extra></extra>'
|
| 1336 |
+
))
|
| 1337 |
+
|
| 1338 |
+
fig.update_layout(
|
| 1339 |
+
xaxis_title=tr('attribution_score_xaxis'),
|
| 1340 |
+
yaxis=dict(autorange="reversed"), # Ensures y-axis is not reversed
|
| 1341 |
+
height=500,
|
| 1342 |
+
margin=dict(l=200, r=100, t=50, b=50)
|
| 1343 |
+
)
|
| 1344 |
+
|
| 1345 |
+
st.plotly_chart(fig, use_container_width=True)
|
| 1346 |
+
|
| 1347 |
+
# --- AI Explanation for Function Type Plot ---
|
| 1348 |
+
if st.session_state.get('enable_ai_explanation') and 'explanation_part_2' in st.session_state:
|
| 1349 |
+
if st.session_state.explanation_part_2:
|
| 1350 |
+
explanation_html = markdown.markdown(st.session_state.explanation_part_2)
|
| 1351 |
+
st.markdown(
|
| 1352 |
+
f"<div style='background-color: #2b2b2b; color: #ffffff; padding: 1.2rem; border-radius: 10px; margin: 1rem 0; border-left: 5px solid #A78BFA; font-size: 0.9rem;'>{explanation_html}</div>",
|
| 1353 |
+
unsafe_allow_html=True
|
| 1354 |
+
)
|
| 1355 |
+
|
| 1356 |
+
# Faithfulness Check for Function Type plot
|
| 1357 |
+
with st.expander(tr('faithfulness_check_expander')):
|
| 1358 |
+
st.markdown(tr('fv_faithfulness_explanation_pca_html'), unsafe_allow_html=True)
|
| 1359 |
+
|
| 1360 |
+
if 'pca_faithfulness' in st.session_state.analysis_results:
|
| 1361 |
+
verification_results = st.session_state.analysis_results['pca_faithfulness']
|
| 1362 |
+
else:
|
| 1363 |
+
api_config = init_qwen_api()
|
| 1364 |
+
if api_config:
|
| 1365 |
+
with st.spinner(tr('running_faithfulness_check_spinner')):
|
| 1366 |
+
claims = _cached_extract_fv_claims(api_config, st.session_state.explanation_part_2, "pca")
|
| 1367 |
+
verification_results = verify_fv_claims(claims, results, "pca")
|
| 1368 |
+
else:
|
| 1369 |
+
verification_results = []
|
| 1370 |
+
st.warning(tr('api_key_not_configured_warning'))
|
| 1371 |
+
|
| 1372 |
+
if verification_results:
|
| 1373 |
+
for result in verification_results:
|
| 1374 |
+
status_text = tr('verified_status') if result['verified'] else tr('contradicted_status')
|
| 1375 |
+
st.markdown(f"""
|
| 1376 |
+
<div style="margin-bottom: 1rem; padding: 0.8rem; border-radius: 8px; border-left: 5px solid {'#28a745' if result['verified'] else '#dc3545'}; background-color: #1a1a1a;">
|
| 1377 |
+
<p style="margin-bottom: 0.3rem;"><strong>{tr('claim_label')}:</strong> <em>"{result['claim_text']}"</em></p>
|
| 1378 |
+
<p style="margin-bottom: 0.3rem;"><strong>{tr('status_label')}:</strong> {status_text}</p>
|
| 1379 |
+
<p style="margin-bottom: 0;"><strong>{tr('evidence_label')}:</strong> {result['evidence']}</p>
|
| 1380 |
+
</div>
|
| 1381 |
+
""", unsafe_allow_html=True)
|
| 1382 |
+
else:
|
| 1383 |
+
st.info(tr('no_verifiable_claims_info'))
|
| 1384 |
+
|
| 1385 |
+
st.markdown("---")
|
| 1386 |
+
|
| 1387 |
+
# --- Section 2: Category Analysis ---
|
| 1388 |
+
st.markdown(f"<h2>{tr('category_analysis_tab')}</h2>", unsafe_allow_html=True)
|
| 1389 |
+
st.markdown(tr('top_category_attribution_header'))
|
| 1390 |
+
|
| 1391 |
+
category_scores = attribution['category_scores']
|
| 1392 |
+
top_categories = list(category_scores.items())[:20]
|
| 1393 |
+
|
| 1394 |
+
if top_categories:
|
| 1395 |
+
# Get the function type for each category to color the chart.
|
| 1396 |
+
function_type_mapping = attribution.get('function_types_mapping', FUNCTION_TYPES)
|
| 1397 |
+
category_to_func_type = {
|
| 1398 |
+
cat: func_type
|
| 1399 |
+
for func_type, cats in function_type_mapping.items()
|
| 1400 |
+
for cat in cats
|
| 1401 |
+
}
|
| 1402 |
+
|
| 1403 |
+
missing_categories = [cat for cat, _ in top_categories if cat not in category_to_func_type]
|
| 1404 |
+
if missing_categories:
|
| 1405 |
+
st.warning(tr('missing_category_mapping_warning').format(categories=", ".join(missing_categories)))
|
| 1406 |
+
|
| 1407 |
+
filtered_categories = [(cat, score) for cat, score in top_categories if cat in category_to_func_type]
|
| 1408 |
+
|
| 1409 |
+
if not filtered_categories:
|
| 1410 |
+
st.info(tr('no_mapped_categories_info'))
|
| 1411 |
+
else:
|
| 1412 |
+
# Restructure the data for the sunburst chart.
|
| 1413 |
+
leaf_labels = [format_category_name(cat_key) for cat_key, score in filtered_categories]
|
| 1414 |
+
leaf_values = [score for _, score in filtered_categories]
|
| 1415 |
+
|
| 1416 |
+
leaf_parent_keys = [category_to_func_type[cat_key] for cat_key, _ in filtered_categories]
|
| 1417 |
+
function_type_order = {key: idx for idx, key in enumerate(function_type_mapping.keys())}
|
| 1418 |
+
parent_keys = sorted(
|
| 1419 |
+
set(leaf_parent_keys),
|
| 1420 |
+
key=lambda key: function_type_order.get(key, len(function_type_order))
|
| 1421 |
+
)
|
| 1422 |
+
parent_labels_map = {key: tr(key) for key in parent_keys}
|
| 1423 |
+
|
| 1424 |
+
parent_values = [
|
| 1425 |
+
sum(leaf_values[i] for i, parent_key in enumerate(leaf_parent_keys) if parent_key == key)
|
| 1426 |
+
for key in parent_keys
|
| 1427 |
+
]
|
| 1428 |
+
|
| 1429 |
+
sunburst_labels = [parent_labels_map[key] for key in parent_keys] + leaf_labels
|
| 1430 |
+
sunburst_parents = [""] * len(parent_keys) + [parent_labels_map[key] for key in leaf_parent_keys]
|
| 1431 |
+
sunburst_values = parent_values + leaf_values
|
| 1432 |
+
|
| 1433 |
+
# Create a color map for the labels.
|
| 1434 |
+
label_to_color_map = {
|
| 1435 |
+
parent_labels_map[key]: FUNCTION_TYPE_COLORS.get(key, '#CCCCCC')
|
| 1436 |
+
for key in parent_keys
|
| 1437 |
+
}
|
| 1438 |
+
|
| 1439 |
+
# --- Generate gradient colors for leaves based on score ---
|
| 1440 |
+
def hex_to_rgb_float(h):
|
| 1441 |
+
h = h.lstrip('#')
|
| 1442 |
+
return [int(h[i:i+2], 16) / 255.0 for i in (0, 2, 4)]
|
| 1443 |
+
|
| 1444 |
+
def rgb_float_to_hex(rgb):
|
| 1445 |
+
return '#%02x%02x%02x' % tuple(int(c * 255) for c in rgb)
|
| 1446 |
+
|
| 1447 |
+
leaf_scores = leaf_values
|
| 1448 |
+
min_score = min(leaf_scores) if leaf_scores else 0
|
| 1449 |
+
max_score = max(leaf_scores) if leaf_scores else 1
|
| 1450 |
+
score_range = max_score - min_score
|
| 1451 |
+
|
| 1452 |
+
sunburst_marker_colors = []
|
| 1453 |
+
# Add solid colors for the parent categories.
|
| 1454 |
+
for key in parent_keys:
|
| 1455 |
+
parent_label = parent_labels_map[key]
|
| 1456 |
+
sunburst_marker_colors.append(label_to_color_map[parent_label])
|
| 1457 |
+
|
| 1458 |
+
# Add gradient colors for the leaf categories.
|
| 1459 |
+
for i, parent_key in enumerate(leaf_parent_keys):
|
| 1460 |
+
base_color_hex = FUNCTION_TYPE_COLORS.get(parent_key, '#CCCCCC')
|
| 1461 |
+
|
| 1462 |
+
# Normalize the score for this leaf.
|
| 1463 |
+
normalized_score = (leaf_scores[i] - min_score) / score_range if score_range > 0 else 0.5
|
| 1464 |
+
|
| 1465 |
+
# Convert to HLS to get the original lightness.
|
| 1466 |
+
r, g, b = hex_to_rgb_float(base_color_hex)
|
| 1467 |
+
h, base_l, s = colorsys.rgb_to_hls(r, g, b)
|
| 1468 |
+
|
| 1469 |
+
# Define a lightness range.
|
| 1470 |
+
lightest_shade = 0.9
|
| 1471 |
+
lightness_range = lightest_shade - base_l
|
| 1472 |
+
|
| 1473 |
+
# Interpolate the lightness.
|
| 1474 |
+
new_l = lightest_shade - (normalized_score * lightness_range)
|
| 1475 |
+
|
| 1476 |
+
# Convert back to RGB and then to Hex.
|
| 1477 |
+
new_r, new_g, new_b = colorsys.hls_to_rgb(h, new_l, s)
|
| 1478 |
+
new_hex = rgb_float_to_hex((new_r, new_g, new_b))
|
| 1479 |
+
sunburst_marker_colors.append(new_hex)
|
| 1480 |
+
|
| 1481 |
+
# --- Highlight the top match with a stronger visual cue ---
|
| 1482 |
+
top_category_name, _ = filtered_categories[0]
|
| 1483 |
+
formatted_top_category_name = format_category_name(top_category_name)
|
| 1484 |
+
top_parent_key = category_to_func_type.get(top_category_name)
|
| 1485 |
+
top_category_parent_str = parent_labels_map.get(top_parent_key, tr('unmapped_function_type'))
|
| 1486 |
+
|
| 1487 |
+
sunburst_line_widths = [1] * len(sunburst_labels)
|
| 1488 |
+
sunburst_line_colors = ['#333'] * len(sunburst_labels)
|
| 1489 |
+
|
| 1490 |
+
try:
|
| 1491 |
+
top_leaf_index = sunburst_labels.index(formatted_top_category_name)
|
| 1492 |
+
sunburst_line_widths[top_leaf_index] = 5
|
| 1493 |
+
sunburst_line_colors[top_leaf_index] = '#FFFFFF'
|
| 1494 |
+
except ValueError:
|
| 1495 |
+
pass
|
| 1496 |
+
|
| 1497 |
+
try:
|
| 1498 |
+
top_parent_index = sunburst_labels.index(top_category_parent_str)
|
| 1499 |
+
sunburst_line_widths[top_parent_index] = 5
|
| 1500 |
+
sunburst_line_colors[top_parent_index] = '#FFFFFF'
|
| 1501 |
+
except ValueError:
|
| 1502 |
+
pass
|
| 1503 |
+
|
| 1504 |
+
fig = go.Figure(go.Sunburst(
|
| 1505 |
+
labels=sunburst_labels,
|
| 1506 |
+
parents=sunburst_parents,
|
| 1507 |
+
values=sunburst_values,
|
| 1508 |
+
branchvalues="total",
|
| 1509 |
+
hovertemplate='<b>%{label}</b><br>Score: %{value:.3f}<extra></extra>',
|
| 1510 |
+
marker=dict(
|
| 1511 |
+
colors=sunburst_marker_colors,
|
| 1512 |
+
line=dict(color=sunburst_line_colors, width=sunburst_line_widths)
|
| 1513 |
+
),
|
| 1514 |
+
maxdepth=2,
|
| 1515 |
+
textfont=dict(color='black'),
|
| 1516 |
+
leaf=dict(opacity=1)
|
| 1517 |
+
))
|
| 1518 |
+
|
| 1519 |
+
fig.update_layout(
|
| 1520 |
+
title=dict(
|
| 1521 |
+
text=tr('sunburst_chart_title'),
|
| 1522 |
+
font=dict(size=18, family="Arial", color="#EAEAEA"),
|
| 1523 |
+
x=0.5
|
| 1524 |
+
),
|
| 1525 |
+
height=600,
|
| 1526 |
+
font=dict(family='Arial', size=12)
|
| 1527 |
+
)
|
| 1528 |
+
|
| 1529 |
+
st.plotly_chart(fig, use_container_width=True)
|
| 1530 |
+
|
| 1531 |
+
# --- AI Explanation for Category Plot ---
|
| 1532 |
+
if st.session_state.get('enable_ai_explanation') and 'explanation_part_3' in st.session_state:
|
| 1533 |
+
if st.session_state.explanation_part_3:
|
| 1534 |
+
explanation_html = markdown.markdown(st.session_state.explanation_part_3)
|
| 1535 |
+
st.markdown(
|
| 1536 |
+
f"<div style='background-color: #2b2b2b; color: #ffffff; padding: 1.2rem; border-radius: 10px; margin: 1rem 0; border-left: 5px solid #FBBF24; font-size: 0.9rem;'>{explanation_html}</div>",
|
| 1537 |
+
unsafe_allow_html=True
|
| 1538 |
+
)
|
| 1539 |
+
|
| 1540 |
+
# Faithfulness Check for Category Plot
|
| 1541 |
+
with st.expander(tr('faithfulness_check_expander')):
|
| 1542 |
+
st.markdown(tr('fv_faithfulness_explanation_pca_html'), unsafe_allow_html=True)
|
| 1543 |
+
|
| 1544 |
+
if 'pca_faithfulness' in st.session_state.analysis_results:
|
| 1545 |
+
verification_results = st.session_state.analysis_results['pca_faithfulness']
|
| 1546 |
+
else:
|
| 1547 |
+
api_config = init_qwen_api()
|
| 1548 |
+
if api_config:
|
| 1549 |
+
with st.spinner(tr('running_faithfulness_check_spinner')):
|
| 1550 |
+
claims = _cached_extract_fv_claims(api_config, st.session_state.explanation_part_3, "pca")
|
| 1551 |
+
verification_results = verify_fv_claims(claims, results, "pca")
|
| 1552 |
+
else:
|
| 1553 |
+
verification_results = []
|
| 1554 |
+
st.warning(tr('api_key_not_configured_warning'))
|
| 1555 |
+
|
| 1556 |
+
if verification_results:
|
| 1557 |
+
for result in verification_results:
|
| 1558 |
+
status_text = tr('verified_status') if result['verified'] else tr('contradicted_status')
|
| 1559 |
+
st.markdown(f"""
|
| 1560 |
+
<div style="margin-bottom: 1rem; padding: 0.8rem; border-radius: 8px; border-left: 5px solid {'#28a745' if result['verified'] else '#dc3545'}; background-color: #1a1a1a;">
|
| 1561 |
+
<p style="margin-bottom: 0.3rem;"><strong>{tr('claim_label')}:</strong> <em>"{result['claim_text']}"</em></p>
|
| 1562 |
+
<p style="margin-bottom: 0.3rem;"><strong>{tr('status_label')}:</strong> {status_text}</p>
|
| 1563 |
+
<p style="margin-bottom: 0;"><strong>{tr('evidence_label')}:</strong> {result['evidence']}</p>
|
| 1564 |
+
</div>
|
| 1565 |
+
""", unsafe_allow_html=True)
|
| 1566 |
+
else:
|
| 1567 |
+
st.info(tr('no_verifiable_claims_info'))
|
| 1568 |
+
else:
|
| 1569 |
+
st.warning("No category attribution data available to display.")
|
| 1570 |
+
|
| 1571 |
+
st.markdown("---")
|
| 1572 |
+
|
| 1573 |
+
# --- Section 3: Layer Evolution ---
|
| 1574 |
+
st.markdown(f"<h2>{tr('layer_evolution_tab')}</h2>", unsafe_allow_html=True)
|
| 1575 |
+
st.markdown(tr('layer_evolution_header'))
|
| 1576 |
+
if 'evolution' in results and results['evolution']:
|
| 1577 |
+
display_evolution_results(results['evolution'])
|
| 1578 |
+
else:
|
| 1579 |
+
st.info(tr('evolution_not_available_info'))
|
| 1580 |
+
|
| 1581 |
+
|
| 1582 |
+
def display_evolution_results(evolution_results):
|
| 1583 |
+
# Displays the layer evolution analysis results.
|
| 1584 |
+
|
| 1585 |
+
import plotly.graph_objects as go
|
| 1586 |
+
import numpy as np
|
| 1587 |
+
|
| 1588 |
+
# Extract key metrics from the results.
|
| 1589 |
+
layer_vectors = evolution_results['layer_vectors']
|
| 1590 |
+
similarity_matrix = evolution_results['similarity_matrix']
|
| 1591 |
+
layer_changes = evolution_results['layer_changes']
|
| 1592 |
+
|
| 1593 |
+
# Calculate activation strengths.
|
| 1594 |
+
activation_strengths = [float(np.sqrt(np.sum(vec ** 2))) for vec in layer_vectors.values()]
|
| 1595 |
+
|
| 1596 |
+
# Display the key insights.
|
| 1597 |
+
col1, col2, col3 = st.columns(3)
|
| 1598 |
+
|
| 1599 |
+
with col1:
|
| 1600 |
+
max_change_layer = np.argmax(layer_changes) + 1
|
| 1601 |
+
st.metric(
|
| 1602 |
+
"Biggest Change",
|
| 1603 |
+
f"Layer {max_change_layer}→{max_change_layer+1}",
|
| 1604 |
+
f"{layer_changes[max_change_layer-1]:.3f}",
|
| 1605 |
+
help="Layer transition with the largest representational change"
|
| 1606 |
+
)
|
| 1607 |
+
|
| 1608 |
+
with col2:
|
| 1609 |
+
max_activation_layer = np.argmax(activation_strengths)
|
| 1610 |
+
st.metric(
|
| 1611 |
+
"Peak Activation",
|
| 1612 |
+
f"Layer {max_activation_layer}",
|
| 1613 |
+
f"{activation_strengths[max_activation_layer]:.3f}",
|
| 1614 |
+
help="Layer with strongest overall activation"
|
| 1615 |
+
)
|
| 1616 |
+
|
| 1617 |
+
with col3:
|
| 1618 |
+
avg_change = np.mean(layer_changes)
|
| 1619 |
+
st.metric(
|
| 1620 |
+
"Avg Change",
|
| 1621 |
+
f"{avg_change:.3f}",
|
| 1622 |
+
help="Average change magnitude across all layer transitions"
|
| 1623 |
+
)
|
| 1624 |
+
|
| 1625 |
+
# Plot the activation strength.
|
| 1626 |
+
st.markdown("<h3><i class='bi bi-lightning-charge-fill'></i> Activation Strength Across Layers</h3>", unsafe_allow_html=True)
|
| 1627 |
+
|
| 1628 |
+
# Create the line plot.
|
| 1629 |
+
peak_idx = np.argmax(activation_strengths)
|
| 1630 |
+
|
| 1631 |
+
fig = go.Figure()
|
| 1632 |
+
|
| 1633 |
+
# Add the main line with gradient colors.
|
| 1634 |
+
fig.add_trace(go.Scatter(
|
| 1635 |
+
x=list(range(len(activation_strengths))),
|
| 1636 |
+
y=activation_strengths,
|
| 1637 |
+
mode='lines+markers',
|
| 1638 |
+
line=dict(color='#4ECDC4', width=4),
|
| 1639 |
+
marker=dict(size=10, color='#45B7D1', line=dict(color='white', width=2)),
|
| 1640 |
+
name='Activation Strength',
|
| 1641 |
+
hovertemplate='<b>Layer %{x}</b><br>Strength: %{y:.3f}<extra></extra>'
|
| 1642 |
+
))
|
| 1643 |
+
|
| 1644 |
+
# Highlight the peak activation.
|
| 1645 |
+
fig.add_vline(
|
| 1646 |
+
x=peak_idx,
|
| 1647 |
+
line_dash="dash",
|
| 1648 |
+
line_color="#FF6B6B",
|
| 1649 |
+
line_width=3,
|
| 1650 |
+
annotation_text=f"Peak at Layer {peak_idx}",
|
| 1651 |
+
annotation_position="top"
|
| 1652 |
+
)
|
| 1653 |
+
|
| 1654 |
+
# Add a marker for the peak.
|
| 1655 |
+
fig.add_trace(go.Scatter(
|
| 1656 |
+
x=[peak_idx],
|
| 1657 |
+
y=[activation_strengths[peak_idx]],
|
| 1658 |
+
mode='markers',
|
| 1659 |
+
marker=dict(size=15, color='#FF6B6B', symbol='star', line=dict(color='white', width=2)),
|
| 1660 |
+
name=f'Peak Layer {peak_idx}',
|
| 1661 |
+
hovertemplate=f'<b>Peak Layer {peak_idx}</b><br>Strength: {activation_strengths[peak_idx]:.3f}<extra></extra>'
|
| 1662 |
+
))
|
| 1663 |
+
|
| 1664 |
+
fig.update_layout(
|
| 1665 |
+
xaxis=dict(
|
| 1666 |
+
title=dict(text="Layer Index", font=dict(size=16, color='#EAEAEA'), standoff=50),
|
| 1667 |
+
tickfont=dict(size=14, color='#EAEAEA'),
|
| 1668 |
+
gridcolor='rgba(200,200,200,0.3)',
|
| 1669 |
+
showgrid=True,
|
| 1670 |
+
zeroline=False
|
| 1671 |
+
),
|
| 1672 |
+
yaxis=dict(
|
| 1673 |
+
title=dict(text="Activation Strength (L2 norm)", font=dict(size=16, color='#EAEAEA')),
|
| 1674 |
+
tickfont=dict(size=14, color='#EAEAEA'),
|
| 1675 |
+
gridcolor='rgba(200,200,200,0.3)',
|
| 1676 |
+
showgrid=True,
|
| 1677 |
+
zeroline=False
|
| 1678 |
+
),
|
| 1679 |
+
height=500,
|
| 1680 |
+
margin=dict(l=80, r=80, t=100, b=80),
|
| 1681 |
+
legend=dict(
|
| 1682 |
+
orientation="h",
|
| 1683 |
+
yanchor="bottom",
|
| 1684 |
+
y=-0.2,
|
| 1685 |
+
xanchor="center",
|
| 1686 |
+
x=0.5,
|
| 1687 |
+
font=dict(size=12, color='#EAEAEA')
|
| 1688 |
+
),
|
| 1689 |
+
font=dict(family='Arial'),
|
| 1690 |
+
hovermode='x'
|
| 1691 |
+
)
|
| 1692 |
+
|
| 1693 |
+
st.plotly_chart(fig, use_container_width=True)
|
| 1694 |
+
|
| 1695 |
+
# --- AI Explanation for Activation Strength ---
|
| 1696 |
+
if st.session_state.get('enable_ai_explanation') and 'evolution_explanation_part_1' in st.session_state:
|
| 1697 |
+
if st.session_state.evolution_explanation_part_1:
|
| 1698 |
+
explanation_html = markdown.markdown(st.session_state.evolution_explanation_part_1)
|
| 1699 |
+
st.markdown(
|
| 1700 |
+
f"<div style='background-color: #2b2b2b; color: #ffffff; padding: 1.2rem; border-radius: 10px; margin: 1rem 0; border-left: 5px solid #A78BFA; font-size: 0.9rem;'>{explanation_html}</div>",
|
| 1701 |
+
unsafe_allow_html=True
|
| 1702 |
+
)
|
| 1703 |
+
|
| 1704 |
+
# Faithfulness Check for Activation Strength plot
|
| 1705 |
+
with st.expander(tr('faithfulness_check_expander')):
|
| 1706 |
+
st.markdown(tr('fv_faithfulness_explanation_evolution_html'), unsafe_allow_html=True)
|
| 1707 |
+
|
| 1708 |
+
if 'evolution_faithfulness' in st.session_state.analysis_results:
|
| 1709 |
+
verification_results = st.session_state.analysis_results['evolution_faithfulness']
|
| 1710 |
+
else:
|
| 1711 |
+
api_config = init_qwen_api()
|
| 1712 |
+
if api_config:
|
| 1713 |
+
with st.spinner(tr('running_faithfulness_check_spinner')):
|
| 1714 |
+
claims = _cached_extract_fv_claims(api_config, st.session_state.evolution_explanation_part_1, "evolution")
|
| 1715 |
+
verification_results = verify_fv_claims(claims, st.session_state.analysis_results, "evolution")
|
| 1716 |
+
else:
|
| 1717 |
+
verification_results = []
|
| 1718 |
+
st.warning(tr('api_key_not_configured_warning'))
|
| 1719 |
+
|
| 1720 |
+
if verification_results:
|
| 1721 |
+
for result in verification_results:
|
| 1722 |
+
status_text = tr('verified_status') if result['verified'] else tr('contradicted_status')
|
| 1723 |
+
st.markdown(f"""
|
| 1724 |
+
<div style="margin-bottom: 1rem; padding: 0.8rem; border-radius: 8px; border-left: 5px solid {'#28a745' if result['verified'] else '#dc3545'}; background-color: #1a1a1a;">
|
| 1725 |
+
<p style="margin-bottom: 0.3rem;"><strong>{tr('claim_label')}:</strong> <em>"{result['claim_text']}"</em></p>
|
| 1726 |
+
<p style="margin-bottom: 0.3rem;"><strong>{tr('status_label')}:</strong> {status_text}</p>
|
| 1727 |
+
<p style="margin-bottom: 0;"><strong>{tr('evidence_label')}:</strong> {result['evidence']}</p>
|
| 1728 |
+
</div>
|
| 1729 |
+
""", unsafe_allow_html=True)
|
| 1730 |
+
else:
|
| 1731 |
+
st.info(tr('no_verifiable_claims_info'))
|
| 1732 |
+
|
| 1733 |
+
# Plot the layer changes.
|
| 1734 |
+
st.markdown("<h3><i class='bi bi-arrow-repeat'></i> Layer-to-Layer Changes</h3>", unsafe_allow_html=True)
|
| 1735 |
+
|
| 1736 |
+
max_change_idx = np.argmax(layer_changes)
|
| 1737 |
+
|
| 1738 |
+
fig2 = go.Figure()
|
| 1739 |
+
|
| 1740 |
+
# Add the main line with gradient colors.
|
| 1741 |
+
fig2.add_trace(go.Scatter(
|
| 1742 |
+
x=list(range(1, len(layer_changes) + 1)),
|
| 1743 |
+
y=layer_changes,
|
| 1744 |
+
mode='lines+markers',
|
| 1745 |
+
line=dict(color='#FECA57', width=4),
|
| 1746 |
+
marker=dict(size=10, color='#FF9FF3', line=dict(color='white', width=2)),
|
| 1747 |
+
name='Layer Changes',
|
| 1748 |
+
hovertemplate='<b>Layer %{x}→%{customdata}</b><br>Change: %{y:.3f}<extra></extra>',
|
| 1749 |
+
customdata=[i+2 for i in range(len(layer_changes))]
|
| 1750 |
+
))
|
| 1751 |
+
|
| 1752 |
+
# Highlight the biggest change.
|
| 1753 |
+
fig2.add_vline(
|
| 1754 |
+
x=max_change_idx + 1,
|
| 1755 |
+
line_dash="dash",
|
| 1756 |
+
line_color="#FF6B6B",
|
| 1757 |
+
line_width=3,
|
| 1758 |
+
annotation_text=f"Biggest Change: {max_change_idx+1}→{max_change_idx+2}",
|
| 1759 |
+
annotation_position="top"
|
| 1760 |
+
)
|
| 1761 |
+
|
| 1762 |
+
# Add a marker for the peak.
|
| 1763 |
+
fig2.add_trace(go.Scatter(
|
| 1764 |
+
x=[max_change_idx + 1],
|
| 1765 |
+
y=[layer_changes[max_change_idx]],
|
| 1766 |
+
mode='markers',
|
| 1767 |
+
marker=dict(size=15, color='#FF6B6B', symbol='diamond', line=dict(color='white', width=2)),
|
| 1768 |
+
name=f'Max Change: L{max_change_idx+1}→L{max_change_idx+2}',
|
| 1769 |
+
hovertemplate=f'<b>Max Change: Layer {max_change_idx+1}→{max_change_idx+2}</b><br>Change: {layer_changes[max_change_idx]:.3f}<extra></extra>'
|
| 1770 |
+
))
|
| 1771 |
+
|
| 1772 |
+
fig2.update_layout(
|
| 1773 |
+
xaxis=dict(
|
| 1774 |
+
title=dict(text="Layer Transition", font=dict(size=16, color='#EAEAEA'), standoff=50),
|
| 1775 |
+
tickfont=dict(size=14, color='#EAEAEA'),
|
| 1776 |
+
gridcolor='rgba(200,200,200,0.3)',
|
| 1777 |
+
showgrid=True,
|
| 1778 |
+
zeroline=False
|
| 1779 |
+
),
|
| 1780 |
+
yaxis=dict(
|
| 1781 |
+
title=dict(text="Change Magnitude (Cosine Distance)", font=dict(size=16, color='#EAEAEA')),
|
| 1782 |
+
tickfont=dict(size=14, color='#EAEAEA'),
|
| 1783 |
+
gridcolor='rgba(200,200,200,0.3)',
|
| 1784 |
+
showgrid=True,
|
| 1785 |
+
zeroline=False
|
| 1786 |
+
),
|
| 1787 |
+
height=500,
|
| 1788 |
+
margin=dict(l=80, r=80, t=100, b=80),
|
| 1789 |
+
legend=dict(
|
| 1790 |
+
orientation="h",
|
| 1791 |
+
yanchor="bottom",
|
| 1792 |
+
y=-0.2,
|
| 1793 |
+
xanchor="center",
|
| 1794 |
+
x=0.5,
|
| 1795 |
+
font=dict(size=12, color='#EAEAEA')
|
| 1796 |
+
),
|
| 1797 |
+
font=dict(family='Arial'),
|
| 1798 |
+
hovermode='x'
|
| 1799 |
+
)
|
| 1800 |
+
|
| 1801 |
+
st.plotly_chart(fig2, use_container_width=True)
|
| 1802 |
+
|
| 1803 |
+
# --- AI Explanation for Layer Changes ---
|
| 1804 |
+
if st.session_state.get('enable_ai_explanation') and 'evolution_explanation_part_2' in st.session_state:
|
| 1805 |
+
if st.session_state.evolution_explanation_part_2:
|
| 1806 |
+
explanation_html = markdown.markdown(st.session_state.evolution_explanation_part_2)
|
| 1807 |
+
st.markdown(
|
| 1808 |
+
f"<div style='background-color: #2b2b2b; color: #ffffff; padding: 1.2rem; border-radius: 10px; margin: 1rem 0; border-left: 5px solid #6EE7B7; font-size: 0.9rem;'>{explanation_html}</div>",
|
| 1809 |
+
unsafe_allow_html=True
|
| 1810 |
+
)
|
| 1811 |
+
|
| 1812 |
+
# Faithfulness Check for Layer Changes plot
|
| 1813 |
+
with st.expander(tr('faithfulness_check_expander')):
|
| 1814 |
+
st.markdown(tr('fv_faithfulness_explanation_evolution_html'), unsafe_allow_html=True)
|
| 1815 |
+
|
| 1816 |
+
if 'evolution_faithfulness' in st.session_state.analysis_results:
|
| 1817 |
+
verification_results = st.session_state.analysis_results['evolution_faithfulness']
|
| 1818 |
+
else:
|
| 1819 |
+
api_config = init_qwen_api()
|
| 1820 |
+
if api_config:
|
| 1821 |
+
with st.spinner(tr('running_faithfulness_check_spinner')):
|
| 1822 |
+
claims = _cached_extract_fv_claims(api_config, st.session_state.evolution_explanation_part_2, "evolution")
|
| 1823 |
+
verification_results = verify_fv_claims(claims, st.session_state.analysis_results, "evolution")
|
| 1824 |
+
else:
|
| 1825 |
+
verification_results = []
|
| 1826 |
+
st.warning(tr('api_key_not_configured_warning'))
|
| 1827 |
+
|
| 1828 |
+
if verification_results:
|
| 1829 |
+
for result in verification_results:
|
| 1830 |
+
status_text = tr('verified_status') if result['verified'] else tr('contradicted_status')
|
| 1831 |
+
st.markdown(f"""
|
| 1832 |
+
<div style="margin-bottom: 1rem; padding: 0.8rem; border-radius: 8px; border-left: 5px solid {'#28a745' if result['verified'] else '#dc3545'}; background-color: #1a1a1a;">
|
| 1833 |
+
<p style="margin-bottom: 0.3rem;"><strong>{tr('claim_label')}:</strong> <em>"{result['claim_text']}"</em></p>
|
| 1834 |
+
<p style="margin-bottom: 0.3rem;"><strong>{tr('status_label')}:</strong> {status_text}</p>
|
| 1835 |
+
<p style="margin-bottom: 0;"><strong>{tr('evidence_label')}:</strong> {result['evidence']}</p>
|
| 1836 |
+
</div>
|
| 1837 |
+
""", unsafe_allow_html=True)
|
| 1838 |
+
else:
|
| 1839 |
+
st.info(tr('no_verifiable_claims_info'))
|
| 1840 |
+
|
| 1841 |
+
|
| 1842 |
+
if __name__ == "__main__":
|
| 1843 |
+
from utilities.localization import initialize_localization, tr
|
| 1844 |
+
initialize_localization()
|
| 1845 |
+
show_function_vectors_page()
|
function_vectors/generate_function_vectors.py
ADDED
|
@@ -0,0 +1,85 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import sys
|
| 3 |
+
from pathlib import Path
|
| 4 |
+
import numpy as np
|
| 5 |
+
import torch
|
| 6 |
+
from transformers import AutoTokenizer, AutoModelForCausalLM
|
| 7 |
+
from tqdm import tqdm
|
| 8 |
+
|
| 9 |
+
# Adjust path to import from the new 'data' directory
|
| 10 |
+
sys.path.append(str(Path(__file__).resolve().parent.parent))
|
| 11 |
+
from function_vectors.data.multilingual_function_categories import FUNCTION_CATEGORIES
|
| 12 |
+
|
| 13 |
+
def generate_all_vectors():
    """Generate and save mean-activation ("function") vectors for all
    English and German prompt categories.

    For each language and each category in FUNCTION_CATEGORIES, the
    last-layer hidden state of every prompt's final token is extracted
    from OLMo-2-7B and averaged into one vector per category, saved to
    data/vectors/{lang}_category_vectors.npz.
    """
    print("🚀 Starting function vector generation for both English and German...")

    # Load the model and tokenizer.
    print("🔧 Loading OLMo-2-7B model and tokenizer...")
    try:
        model_path = "./models/OLMo-2-1124-7B"
        device = "mps" if torch.backends.mps.is_available() else "cuda" if torch.cuda.is_available() else "cpu"

        tokenizer = AutoTokenizer.from_pretrained(model_path)
        tokenizer.pad_token = tokenizer.eos_token
        tokenizer.padding_side = "left"

        model = AutoModelForCausalLM.from_pretrained(
            model_path,
            torch_dtype=torch.float16,
            low_cpu_mem_usage=True,
            device_map="auto",
            output_hidden_states=True,
        )
        print(f"✅ Model loaded successfully on device: {device}")
    except Exception as e:
        print(f"❌ Error loading model: {e}")
        return

    def get_activation_for_prompt(prompt):
        """Return the last-layer hidden state of the prompt's final token
        as a float64 numpy vector."""
        inputs = tokenizer(prompt, return_tensors="pt", padding=True, truncation=True, max_length=512)
        inputs = {k: v.to(device) for k, v in inputs.items()}

        with torch.no_grad():
            outputs = model(**inputs, output_hidden_states=True)

        # padding_side is "left", so the last real token always sits at the
        # final sequence position. (The previous `attention_mask.sum() - 1`
        # index is only correct for right padding; for a batch-of-1 unpadded
        # prompt both forms agree, so results are unchanged here.)
        last_hidden_state = outputs.hidden_states[-1]
        activation = last_hidden_state[0, -1, :].cpu().numpy()
        return activation.astype(np.float64)

    # Generate and save vectors for both languages.
    output_dir = Path(__file__).parent / "data" / "vectors"
    output_dir.mkdir(parents=True, exist_ok=True)

    for lang in ["en", "de"]:
        print(f"\n🌍 Generating vectors for {lang.upper()} prompts...")
        category_vectors = {}

        for category_key, data in tqdm(FUNCTION_CATEGORIES.items(), desc=f"Processing {lang.upper()} Categories"):
            prompts = data.get(lang, [])

            if not prompts:
                print(f"⚠️ Warning: No {lang.upper()} prompts for '{category_key}'. Skipping.")
                continue

            activations = [get_activation_for_prompt(p) for p in prompts]

            if activations:
                # One vector per category: the mean over all prompt activations.
                category_vectors[category_key] = np.mean(activations, axis=0)

        if not category_vectors:
            print(f"❌ No vectors were generated for {lang.upper()}. Aborting save.")
            continue

        output_path = output_dir / f"{lang}_category_vectors.npz"
        try:
            np.savez_compressed(output_path, **category_vectors)
            print(f"✅ Successfully saved {lang.upper()} vectors to: {output_path}")
        except Exception as e:
            print(f"❌ Error saving {lang.upper()} vectors: {e}")

if __name__ == "__main__":
    generate_all_vectors()
|
function_vectors/generate_german_vectors.py
ADDED
|
@@ -0,0 +1,90 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import sys
|
| 3 |
+
from pathlib import Path
|
| 4 |
+
import numpy as np
|
| 5 |
+
import torch
|
| 6 |
+
from transformers import AutoTokenizer, AutoModelForCausalLM
|
| 7 |
+
from tqdm import tqdm
|
| 8 |
+
|
| 9 |
+
# Add root project dir to path
|
| 10 |
+
sys.path.append(str(Path(__file__).resolve().parent.parent))
|
| 11 |
+
from function_vectors.data.multilingual_function_categories import FUNCTION_CATEGORIES
|
| 12 |
+
|
| 13 |
+
def generate_german_vectors():
    """Generate and save mean-activation ("function") vectors for all
    German prompt categories in FUNCTION_CATEGORIES.

    Each category vector is the average of the final-token, last-layer
    hidden states over that category's German prompts, saved to
    data/vectors/de_category_vectors.npz.
    """
    print("🚀 Starting German function vector generation...")

    # Load the model and tokenizer.
    print("🔧 Loading OLMo-2-7B model and tokenizer... (this may take a moment)")
    try:
        model_path = "./models/OLMo-2-1124-7B"
        device = "mps" if torch.backends.mps.is_available() else "cuda" if torch.cuda.is_available() else "cpu"

        tokenizer = AutoTokenizer.from_pretrained(model_path)
        tokenizer.pad_token = tokenizer.eos_token
        tokenizer.padding_side = "left"

        model = AutoModelForCausalLM.from_pretrained(
            model_path,
            torch_dtype=torch.float16,
            low_cpu_mem_usage=True,
            device_map="auto",
            output_hidden_states=True,
        )
        print(f"✅ Model loaded successfully on device: {device}")
    except Exception as e:
        print(f"❌ Error loading model: {e}")
        # Hint path now matches the actual model_path above ('models', not 'Models').
        print("Please ensure the model exists at './models/OLMo-2-1124-7B'")
        return

    def get_activation_for_prompt(prompt):
        """Return the last-layer hidden state of the prompt's final token
        as a float64 numpy vector."""
        inputs = tokenizer(prompt, return_tensors="pt", padding=True, truncation=True, max_length=512)
        inputs = {k: v.to(device) for k, v in inputs.items()}

        with torch.no_grad():
            outputs = model(**inputs, output_hidden_states=True)

        # padding_side is "left", so the last real token always sits at the
        # final sequence position; `attention_mask.sum() - 1` would point at
        # the wrong index for left-padded batches. Identical for batch-of-1.
        last_hidden_state = outputs.hidden_states[-1]
        activation = last_hidden_state[0, -1, :].cpu().numpy()
        return activation.astype(np.float64)

    # Generate vectors for German prompts.
    print("\n🇩🇪 Generating vectors for German prompts...")
    german_category_vectors = {}

    # Loop over all categories and generate vectors.
    for category_key, data in tqdm(FUNCTION_CATEGORIES.items(), desc="Processing Categories"):
        german_prompts = data.get('de', [])

        if not german_prompts:
            print(f"⚠️ Warning: No German prompts found for category '{category_key}'. Skipping.")
            continue

        # Get activations for all German prompts in the category.
        activations = [get_activation_for_prompt(p) for p in german_prompts]

        if activations:
            # Average the activations to get one vector per category.
            german_category_vectors[category_key] = np.mean(activations, axis=0)

    # Save the generated vectors.
    if not german_category_vectors:
        print("❌ No vectors were generated. Aborting save.")
        return

    output_dir = Path(__file__).parent / "data" / "vectors"
    output_dir.mkdir(parents=True, exist_ok=True)
    output_path = output_dir / "de_category_vectors.npz"

    try:
        np.savez_compressed(output_path, **german_category_vectors)
        print(f"\n✅ Successfully generated and saved German function vectors to:")
        print(f"   {output_path}")
    except Exception as e:
        print(f"❌ Error saving vectors: {e}")

if __name__ == "__main__":
    generate_german_vectors()
|
function_vectors/generate_page_assets.py
ADDED
|
@@ -0,0 +1,132 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import sys
|
| 3 |
+
from pathlib import Path
|
| 4 |
+
import numpy as np
|
| 5 |
+
import torch
|
| 6 |
+
from transformers import AutoTokenizer, AutoModelForCausalLM
|
| 7 |
+
from tqdm import tqdm
|
| 8 |
+
import json
|
| 9 |
+
import plotly.graph_objects as go
|
| 10 |
+
from sklearn.decomposition import PCA
|
| 11 |
+
|
| 12 |
+
# Adjust path to import from the new 'data' directory
|
| 13 |
+
sys.path.append(str(Path(__file__).resolve().parent.parent))
|
| 14 |
+
from function_vectors.data.multilingual_function_categories import FUNCTION_CATEGORIES, FUNCTION_TYPES
|
| 15 |
+
|
| 16 |
+
def generate_all_assets():
    """Generate all pre-computed assets for the Function Vectors page.

    Produces per-language category vectors (.npz under data/vectors/) and
    interactive 3D PCA visualizations (.html under data/visualizations/)
    from OLMo-2-7B final-token activations.
    """
    print("🚀 Starting generation of all page assets...")

    # Load the model and tokenizer.
    print("🔧 Loading OLMo-2-7B model...")
    try:
        model_path = "./models/OLMo-2-1124-7B"
        device = "mps" if torch.backends.mps.is_available() else "cuda" if torch.cuda.is_available() else "cpu"

        tokenizer = AutoTokenizer.from_pretrained(model_path)
        tokenizer.pad_token = tokenizer.eos_token
        tokenizer.padding_side = "left"

        model = AutoModelForCausalLM.from_pretrained(
            model_path,
            torch_dtype=torch.float16,
            low_cpu_mem_usage=True,
            device_map="auto",
            output_hidden_states=True
        )
        print(f"✅ Model loaded successfully on device: {device}")
    except Exception as e:
        print(f"❌ Error loading model: {e}")
        return

    def get_activation_for_prompt(prompt):
        """Return the last-layer hidden state of the prompt's final token
        as a float64 numpy vector."""
        inputs = tokenizer(prompt, return_tensors="pt", padding=True, truncation=True, max_length=512)
        inputs = {k: v.to(device) for k, v in inputs.items()}
        with torch.no_grad():
            outputs = model(**inputs, output_hidden_states=True)
        # padding_side is "left": the last real token is always at the final
        # position. (`attention_mask.sum() - 1` is only valid for right
        # padding; identical for the unpadded batch-of-1 case used here.)
        last_hidden_state = outputs.hidden_states[-1]
        activation = last_hidden_state[0, -1, :].cpu().numpy()
        return activation.astype(np.float64)

    # Generate and save function vectors.
    output_dir = Path(__file__).parent / "data" / "vectors"
    output_dir.mkdir(parents=True, exist_ok=True)
    all_vectors_by_lang = {}

    for lang in ["en", "de"]:
        print(f"\n🌍 Generating vectors for {lang.upper()} prompts...")
        category_vectors = {}
        for category_key, data in tqdm(FUNCTION_CATEGORIES.items(), desc=f"Processing {lang.upper()}"):
            prompts = data.get(lang, [])
            if not prompts:
                continue
            activations = [get_activation_for_prompt(p) for p in prompts]
            if activations:
                category_vectors[category_key] = np.mean(activations, axis=0)

        all_vectors_by_lang[lang] = category_vectors.copy()

        output_path = output_dir / f"{lang}_category_vectors.npz"
        np.savez_compressed(output_path, **category_vectors)
        print(f"✅ Saved {lang.upper()} vectors to: {output_path}")

    # Generate and save 3D PCA visualizations.
    viz_dir = Path(__file__).parent / "data" / "visualizations"
    viz_dir.mkdir(parents=True, exist_ok=True)

    for lang, vectors_to_plot in all_vectors_by_lang.items():
        print(f"\n🎨 Generating 3D PCA visualization for {lang.upper()}...")
        if not vectors_to_plot:
            print(f"⚠️ Skipping PCA for {lang.upper()} as vectors are missing.")
            continue

        try:
            categories = list(vectors_to_plot.keys())
            vectors = np.vstack([vectors_to_plot[cat] for cat in categories])

            # NOTE: 3-component PCA needs at least 3 category vectors; with
            # fewer, fit_transform raises and is reported by the except below.
            pca = PCA(n_components=3)
            reduced_vectors = pca.fit_transform(vectors)

            # Per-function-type colors and marker symbols (cycled if there
            # are more types than entries).
            func_type_keys = list(FUNCTION_TYPES.keys())
            colors = ["skyblue", "lightgreen", "salmon", "orchid", "gold", "lightcoral"]
            symbols = ["circle", "diamond", "square", "cross", "diamond-open", "square-open"]
            function_type_colors = {key: colors[i % len(colors)] for i, key in enumerate(func_type_keys)}
            plotly_symbols = {key: symbols[i % len(symbols)] for i, key in enumerate(func_type_keys)}

            fig = go.Figure()
            for func_type_key, cats in FUNCTION_TYPES.items():
                func_categories = [cat for cat in cats if cat in categories]
                if func_categories:
                    indices = [categories.index(cat) for cat in func_categories]
                    fig.add_trace(go.Scatter3d(
                        x=reduced_vectors[indices, 0], y=reduced_vectors[indices, 1], z=reduced_vectors[indices, 2],
                        mode='markers',
                        marker=dict(size=8, color=function_type_colors.get(func_type_key, 'gray'), symbol=plotly_symbols.get(func_type_key, 'circle'), line=dict(width=1, color='black'), opacity=0.8),
                        name=func_type_key.replace("_", " ").title(),
                        text=[cat.replace("_", " ").title() for cat in func_categories],
                        hovertemplate="<b>%{text}</b><br>PC1: %{x:.3f}<br>PC2: %{y:.3f}<br>PC3: %{z:.3f}<extra></extra>"
                    ))

            fig.update_layout(
                title=f"3D PCA of {lang.upper()} Function Vector Categories",
                width=1400, height=900,
                scene=dict(xaxis_title='PC1', yaxis_title='PC2', zaxis_title='PC3'),
                legend_title_text='Function Types'
            )

            # Save the plot to an HTML file.
            file_suffix = "pca_3d_categories_layer_-1.html"
            viz_path = viz_dir / f"{lang}_{file_suffix}"
            fig.write_html(viz_path)
            print(f"✅ Saved {lang.upper()} 3D PCA visualization to: {viz_path}")
        except Exception as e:
            print(f"❌ Failed to generate PCA plot for {lang.upper()}: {e}")

    # Layer evolution data is handled dynamically in the app.
    print("\n✅ Layer Evolution analysis is handled dynamically in the app. No pre-computation needed.")
    print("\n🎉 All assets generated successfully!")

if __name__ == "__main__":
    generate_all_assets()
|
function_vectors/translate_prompts.py
ADDED
|
@@ -0,0 +1,264 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import sys
|
| 3 |
+
from pathlib import Path
|
| 4 |
+
import requests
|
| 5 |
+
import json
|
| 6 |
+
import time
|
| 7 |
+
from tqdm import tqdm
|
| 8 |
+
|
| 9 |
+
# Add root project dir to path
|
| 10 |
+
sys.path.append(str(Path(__file__).parent.parent))
|
| 11 |
+
from function_vectors.data.multilingual_function_categories import FUNCTION_CATEGORIES, FUNCTION_TYPES
|
| 12 |
+
|
| 13 |
+
# API configuration for Qwen.
# SECURITY: the API key was previously hard-coded here and committed to the
# repository; it is now read from the environment. Set QWEN_API_KEY before
# running this script (and rotate the leaked key).
QWEN_API_CONFIG = {
    "api_key": os.environ.get("QWEN_API_KEY", ""),
    "api_endpoint": "https://chat-ai.academiccloud.de/v1",
    "model": "qwen2.5-vl-72b-instruct",
    "rate_limit_per_minute": 2,
}
|
| 20 |
+
|
| 21 |
+
# --- Translation Logic ---
|
| 22 |
+
|
| 23 |
+
def translate_text(text, target_language="German"):
    """Translate one string via the Qwen chat-completions API.

    Returns the translated string on success, a rate-limit sentinel
    ("RATE_LIMIT_HOURLY:<seconds>" or "RATE_LIMIT_EXCEEDED"), or None on
    any other failure.
    """
    request_headers = {
        "Authorization": f"Bearer {QWEN_API_CONFIG['api_key']}",
        "Content-Type": "application/json"
    }

    instruction = (
        f"Translate the following English text to {target_language}. "
        "Respond with ONLY the translated text, without any introductory phrases, "
        "explanations, or quotation marks. The original text is:\n\n"
        f"'{text}'"
    )

    payload = {
        "model": QWEN_API_CONFIG["model"],
        "messages": [{"role": "user", "content": instruction}],
        "max_tokens": 150,
        "temperature": 0.1,
    }

    try:
        resp = requests.post(
            f"{QWEN_API_CONFIG['api_endpoint']}/chat/completions",
            headers=request_headers,
            json=payload,
            timeout=60
        )

        if resp.status_code == 200:
            translation = resp.json()["choices"][0]["message"]["content"].strip()
            # Strip a single pair of wrapping quotes the model sometimes adds.
            if translation.startswith('"') and translation.endswith('"'):
                translation = translation[1:-1]
            return translation

        if resp.status_code == 429:
            # Rate limited: tell the caller how long to wait via a sentinel.
            reset_header = resp.headers.get('RateLimit-Reset', '0')
            try:
                wait_seconds = int(reset_header)
            except ValueError:
                print("Rate limit exceeded. Waiting 60 seconds...")
                return "RATE_LIMIT_EXCEEDED"
            print(f"Hourly rate limit reached. Waiting {wait_seconds} seconds for reset...")
            return f"RATE_LIMIT_HOURLY:{wait_seconds}"

        print(f"API Error: Status {resp.status_code}, Response: {resp.text}")
        return None

    except requests.RequestException as e:
        print(f"Request failed: {e}")
        return None
|
| 71 |
+
|
| 72 |
+
def translate_batch_texts(texts, target_language="German"):
    """Translate a list of strings with a single Qwen API call.

    Returns a list of translations (same length and order as *texts*), a
    rate-limit sentinel string, or None on error. If the model returns too
    few lines, the tail is padded with "TRANSLATION_ERROR: <original>".
    """
    request_headers = {
        "Authorization": f"Bearer {QWEN_API_CONFIG['api_key']}",
        "Content-Type": "application/json"
    }

    # A stronger prompt to ensure full translation.
    batch_prompt = (
        f"Translate the following English texts to {target_language}. "
        "For each text, translate ALL words and phrases, including any words in quotation marks, into natural German. "
        "Do NOT leave any English words in the translation. Respond with ONLY the German translations, one per line, in the same order.\n\n"
    )
    for position, item in enumerate(texts, 1):
        batch_prompt += f"{position}. {item}\n"
    batch_prompt += "\nProvide the German translations in the same order, one per line:"

    payload = {
        "model": QWEN_API_CONFIG["model"],
        "messages": [{"role": "user", "content": batch_prompt}],
        "max_tokens": 300,  # Increased for batch processing
        "temperature": 0.1,
    }

    try:
        response = requests.post(
            f"{QWEN_API_CONFIG['api_endpoint']}/chat/completions",
            headers=request_headers,
            json=payload,
            timeout=60
        )

        if response.status_code == 200:
            raw = response.json()["choices"][0]["message"]["content"].strip()
            cleaned_translations = []
            # One translation per non-empty line of the response.
            for line in (ln.strip() for ln in raw.split('\n') if ln.strip()):
                # Strip any "N." numbering the model may have echoed back.
                if line and line[0].isdigit() and '.' in line:
                    line = line.split('.', 1)[1].strip()
                # Strip a pair of wrapping quotes.
                if line.startswith('"') and line.endswith('"'):
                    line = line[1:-1]
                if line:
                    cleaned_translations.append(line)

            if len(cleaned_translations) >= len(texts):
                return cleaned_translations[:len(texts)]

            print(f"Warning: Expected {len(texts)} translations, got {len(cleaned_translations)}")
            # Pad with error markers so callers always get len(texts) entries.
            while len(cleaned_translations) < len(texts):
                cleaned_translations.append(f"TRANSLATION_ERROR: {texts[len(cleaned_translations)]}")
            return cleaned_translations

        if response.status_code == 429:
            # Rate limited: tell the caller how long to wait via a sentinel.
            reset_header = response.headers.get('RateLimit-Reset', '0')
            try:
                wait_seconds = int(reset_header)
            except ValueError:
                print("Rate limit exceeded. Waiting 60 seconds...")
                return "RATE_LIMIT_EXCEEDED"
            print(f"Hourly rate limit reached. Waiting {wait_seconds} seconds for reset...")
            return f"RATE_LIMIT_HOURLY:{wait_seconds}"

        print(f"API Error: Status {response.status_code}, Response: {response.text}")
        return None

    except requests.RequestException as e:
        print(f"Request failed: {e}")
        return None
|
| 140 |
+
|
| 141 |
+
def update_multilingual_categories_file(new_categories):
    """Rewrite data/multilingual_function_categories.py from *new_categories*.

    FUNCTION_TYPES is re-emitted from the module constant; the categories
    dict is serialized as JSON-style Python literals.
    """
    file_path = Path(__file__).parent / "data" / "multilingual_function_categories.py"

    # Assemble the file as a list of fragments and join once at the end.
    fragments = [
        "# -*- coding: utf-8 -*-\n",
        '"""\nThis file contains the multilingual prompts for function vector analysis.\n',
        'It is automatically updated by the translate_prompts.py script.\n"""\n\n',
    ]

    # Re-emit the FUNCTION_TYPES dictionary verbatim.
    fragments.append("FUNCTION_TYPES = {\n")
    for type_name, category_names in FUNCTION_TYPES.items():
        fragments.append(f'    "{type_name}": [\n')
        fragments.extend(f'        "{name}",\n' for name in category_names)
        fragments.append("    ],\n")
    fragments.append("}\n\n")

    # Serialize the (possibly partially translated) categories themselves.
    fragments.append(f"FUNCTION_CATEGORIES = {json.dumps(new_categories, indent=4, ensure_ascii=False)}\n")

    with open(file_path, "w", encoding="utf-8") as f:
        f.write("".join(fragments))
    print(f"\n✅ Progress saved to '{file_path}'")
|
| 167 |
+
|
| 168 |
+
|
| 169 |
+
def main():
    """Translate all English prompts to German in resumable batches and
    persist the result into multilingual_function_categories.py.

    Resumability: translations already present under each category's 'de'
    list are counted as done; only the missing tail of each category is
    re-requested. Progress is checkpointed to disk periodically and on
    failure.
    """
    print("🚀 Starting batch translation of prompts to German...")

    # Load existing categories to resume from where we left off.
    # NOTE(review): .copy() is shallow, so the per-category dicts (and their
    # 'de' lists) are shared with FUNCTION_CATEGORIES and mutated in place.
    translated_categories = FUNCTION_CATEGORIES.copy()

    # Total English prompts across all categories — the progress-bar total.
    total_prompts = sum(len(prompts.get('en', [])) for prompts in FUNCTION_CATEGORIES.values())

    # Set up a progress bar.
    with tqdm(total=total_prompts, desc="Translating Prompts") as pbar:
        # Count prompts already translated (and ensure every category has a
        # 'de' list) so the bar starts at the resume point.
        already_translated_count = 0
        for category_key, data in FUNCTION_CATEGORIES.items():
            if 'de' not in translated_categories.get(category_key, {}):
                if category_key not in translated_categories:
                    translated_categories[category_key] = {}
                translated_categories[category_key]['de'] = []

            if 'de' in translated_categories[category_key]:
                already_translated_count += len(translated_categories[category_key]['de'])
        pbar.update(already_translated_count)

        # Collect every prompt still missing a translation, plus a parallel
        # (category, index) mapping so results can be written back in order.
        all_prompts_to_translate = []
        prompt_mapping = []

        for category_key, data in FUNCTION_CATEGORIES.items():
            english_prompts = data.get('en', [])

            # Make sure the 'de' key exists.
            if 'de' not in translated_categories[category_key]:
                translated_categories[category_key]['de'] = []

            german_prompts = translated_categories[category_key]['de']

            # Skip if this category is already fully translated.
            if len(german_prompts) == len(english_prompts):
                continue

            # The 'de' list is an ordered prefix of translations, so only the
            # remaining suffix of English prompts needs translating.
            for i in range(len(german_prompts), len(english_prompts)):
                all_prompts_to_translate.append(english_prompts[i])
                prompt_mapping.append((category_key, i))

        # Process the prompts in batches.
        batch_size = 6
        for i in range(0, len(all_prompts_to_translate), batch_size):
            batch_prompts = all_prompts_to_translate[i:i + batch_size]
            batch_mapping = prompt_mapping[i:i + batch_size]

            # Fixed pause between batches to stay under the per-minute limit.
            time.sleep(30)

            translated_batch = translate_batch_texts(batch_prompts)

            # "RATE_LIMIT_HOURLY:<secs>" sentinel: wait out the hourly window,
            # then retry the same batch once.
            if translated_batch and isinstance(translated_batch, str) and translated_batch.startswith("RATE_LIMIT_HOURLY:"):
                wait_seconds = int(translated_batch.split(":")[1])
                print(f"Waiting {wait_seconds} seconds for hourly rate limit reset...")
                time.sleep(wait_seconds)
                # Retry the batch.
                translated_batch = translate_batch_texts(batch_prompts)

            # "RATE_LIMIT_EXCEEDED" sentinel: retry with exponential backoff
            # (60s, 90s, 135s, ...) until a non-rate-limited response arrives.
            retry_wait = 60
            while translated_batch == "RATE_LIMIT_EXCEEDED":
                print(f"Waiting for {retry_wait} seconds due to rate limit...")
                time.sleep(retry_wait)
                translated_batch = translate_batch_texts(batch_prompts)
                retry_wait *= 1.5

            if translated_batch and isinstance(translated_batch, list):
                # Write each translation back to its category, in order.
                # (prompt_idx is positional: appends preserve prefix order.)
                for j, (category_key, prompt_idx) in enumerate(batch_mapping):
                    if j < len(translated_batch):
                        translated_categories[category_key]['de'].append(translated_batch[j])

                # Periodic checkpoint roughly every 30 completed prompts.
                if (pbar.n + len(batch_prompts)) % 30 == 0:
                    update_multilingual_categories_file(translated_categories)

                pbar.update(len(batch_prompts))
            else:
                # None, or a leftover rate-limit string from the retry above:
                # checkpoint what we have and stop.
                print(f"❌ Failed to translate batch. Stopping.")
                # Save any progress we made before stopping.
                update_multilingual_categories_file(translated_categories)
                return

    # Final save at the end.
    update_multilingual_categories_file(translated_categories)
    print("\n✅ All prompts translated and file updated successfully.")

if __name__ == "__main__":
    main()
|
influence_tracer/build_dolma_index.py
ADDED
|
@@ -0,0 +1,207 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import json
|
| 3 |
+
import numpy as np
|
| 4 |
+
import faiss
|
| 5 |
+
from sentence_transformers import SentenceTransformer
|
| 6 |
+
from tqdm import tqdm as tqdm_iterator
|
| 7 |
+
import sys
|
| 8 |
+
import torch
|
| 9 |
+
|
| 10 |
+
# Configuration for the script.
# Input corpus: a local sample of the Dolma pretraining dataset.
DOLMA_DIR = os.path.join("influence_tracer", "dolma_dataset_sample_1.6v")
# Output directory for the FAISS index and its sidecar files.
INDEX_DIR = os.path.join("influence_tracer", "influence_tracer_data")
# The FAISS vector index itself.
INDEX_PATH = os.path.join(INDEX_DIR, "dolma_index_multi.faiss")
# Maps FAISS vector ids back to their source documents.
MAPPING_PATH = os.path.join(INDEX_DIR, "dolma_mapping_multi.json")
# Checkpoint file so an interrupted build can resume (see build_index).
STATE_PATH = os.path.join(INDEX_DIR, "index_build_state_multi.json")
# Multilingual sentence-embedding model used to embed documents.
MODEL_NAME = 'paraphrase-multilingual-mpnet-base-v2'

# Performance tuning.
# NOTE(review): presumably documents embedded per batch and a save-every-N
# interval — confirm exact units against the build_index loop below.
BATCH_SIZE = 131072
SAVE_INTERVAL = 10
|
| 21 |
+
|
| 22 |
+
def _flush_batch(model, index, texts_batch, batch_doc_info, doc_mapping):
    """Encode one batch of texts, add the vectors to the index, and record metadata.

    Embeddings are L2-normalized so that the Inner Product index computes
    cosine similarity. Returns the number of documents added.
    """
    embeddings = model.encode(
        texts_batch,
        show_progress_bar=False,
        convert_to_numpy=True,
        normalize_embeddings=True,
    )
    index.add(embeddings.astype('float32'))
    for doc in batch_doc_info:
        doc_mapping[str(doc['id'])] = doc['info']
    return len(texts_batch)


def _save_progress(index, doc_mapping, processed_files, doc_id_counter, total_docs_processed):
    """Persist the FAISS index, the id->metadata mapping, and the resume state."""
    faiss.write_index(index, INDEX_PATH)
    with open(MAPPING_PATH, 'w', encoding='utf-8') as f:
        json.dump(doc_mapping, f)

    current_state = {
        'processed_files': processed_files,
        'doc_id_counter': doc_id_counter,
        'total_docs_processed': total_docs_processed
    }
    with open(STATE_PATH, 'w', encoding='utf-8') as f:
        json.dump(current_state, f)


def build_index():
    """Scan the Dolma dataset, create vector embeddings, and build a FAISS index.

    The build is resumable: every SAVE_INTERVAL files the index, the document
    mapping, and a state file are written to disk, and a later run skips files
    already listed in the state. Exits the process (sys.exit) on unrecoverable
    errors such as a missing dataset directory or a model-load failure.
    """
    print("--- Starting Influence Tracer Index Build (Optimized for Speed) ---")

    if not os.path.exists(DOLMA_DIR):
        print(f"Error: Dolma directory not found at '{DOLMA_DIR}'")
        print("Please ensure the dolma_dataset_sample_1.6v directory is in your project root.")
        sys.exit(1)

    os.makedirs(INDEX_DIR, exist_ok=True)

    # Load or initialize the state to allow resuming.
    processed_files = []
    doc_id_counter = 0
    total_docs_processed = 0
    doc_mapping = {}

    if os.path.exists(STATE_PATH):
        print("Found existing state. Attempting to resume...")
        try:
            with open(STATE_PATH, 'r', encoding='utf-8') as f:
                state = json.load(f)
            processed_files = state.get('processed_files', [])
            doc_id_counter = state.get('doc_id_counter', 0)
            total_docs_processed = state.get('total_docs_processed', 0)

            with open(MAPPING_PATH, 'r', encoding='utf-8') as f:
                doc_mapping = json.load(f)

            print(f"Reading existing index from {INDEX_PATH}...")
            index = faiss.read_index(INDEX_PATH)
            print(f"Resumed from state: {len(processed_files)} files processed, {total_docs_processed} documents indexed.")
        # faiss raises RuntimeError on a corrupt/missing index file.
        except (IOError, json.JSONDecodeError, RuntimeError) as e:
            print(f"Error resuming from state: {e}. Starting fresh.")
            processed_files = []
            doc_id_counter = 0
            total_docs_processed = 0
            doc_mapping = {}
            index = None  # Will be re-initialized below.
    else:
        print("No existing state found. Starting fresh.")
        index = None

    # Detect the best device to use (MPS, CUDA, or CPU).
    device = "mps" if torch.backends.mps.is_available() else "cuda" if torch.cuda.is_available() else "cpu"
    print(f"Using device: {device.upper()}")

    # Load the sentence transformer model.
    print(f"Loading sentence transformer model: '{MODEL_NAME}'...")
    try:
        model = SentenceTransformer(MODEL_NAME, device=device)
    except Exception as e:
        # Broad catch is intentional at this boundary: download/driver errors
        # from the transformers stack are heterogeneous.
        print(f"Error loading model: {e}")
        print("Please ensure you have an internet connection and the required libraries are installed.")
        print("Try running: pip install sentence-transformers faiss-cpu numpy tqdm")
        sys.exit(1)
    print("Model loaded successfully.")

    # Initialize the FAISS index if it wasn't loaded.
    if index is None:
        embedding_dim = model.get_sentence_embedding_dimension()
        # Use Inner Product for cosine similarity (embeddings are normalized).
        index = faiss.IndexFlatIP(embedding_dim)
        print(f"FAISS index initialized with dimension {embedding_dim} using Inner Product (IP) for similarity.")

    # Get a list of all files to process.
    print(f"Scanning for documents in '{DOLMA_DIR}'...")
    all_files = sorted([os.path.join(DOLMA_DIR, f) for f in os.listdir(DOLMA_DIR) if f.endswith('.json')])
    files_to_process = [f for f in all_files if os.path.basename(f) not in processed_files]

    if not files_to_process:
        if processed_files:
            print("✅ All files have been processed. Index is up to date.")
            print("--- Index Build Complete ---")
            return
        else:
            print(f"Error: No JSON files found in '{DOLMA_DIR}'.")
            sys.exit(1)

    print(f"Found {len(all_files)} total files, {len(files_to_process)} remaining to process.")

    # Process each file.
    print(f"Processing remaining files with batch size {BATCH_SIZE}...")

    files_processed_since_save = 0
    for file_idx, path in enumerate(tqdm_iterator(files_to_process, desc="Processing files")):
        texts_batch = []
        batch_doc_info = []

        try:
            with open(path, 'r', encoding='utf-8') as f:
                for line in f:
                    try:
                        data = json.loads(line)
                        text = data.get('text', '')
                        if text:
                            texts_batch.append(text)
                            batch_doc_info.append({
                                'id': doc_id_counter,
                                'info': {
                                    'source': data.get('source', 'Unknown'),
                                    'file': os.path.basename(path),
                                    # Only truncate (and mark truncation) when
                                    # the text actually exceeds the snippet size.
                                    'text_snippet': text[:200] + '...' if len(text) > 200 else text
                                }
                            })
                            doc_id_counter += 1

                            # Process the batch when it's full.
                            if len(texts_batch) >= BATCH_SIZE:
                                total_docs_processed += _flush_batch(
                                    model, index, texts_batch, batch_doc_info, doc_mapping
                                )
                                texts_batch = []
                                batch_doc_info = []
                    except json.JSONDecodeError:
                        # Skip malformed JSON lines; the rest of the file is
                        # still usable.
                        continue

            # Process any remaining documents in the last batch.
            if texts_batch:
                total_docs_processed += _flush_batch(
                    model, index, texts_batch, batch_doc_info, doc_mapping
                )

            # Save progress periodically.
            processed_files.append(os.path.basename(path))
            files_processed_since_save += 1

            if files_processed_since_save >= SAVE_INTERVAL or file_idx == len(files_to_process) - 1:
                print(f"\nSaving progress ({total_docs_processed} docs processed)...")
                _save_progress(index, doc_mapping, processed_files,
                               doc_id_counter, total_docs_processed)
                files_processed_since_save = 0
                print("Progress saved.")

        except OSError as e:
            # NOTE(review): if the error strikes mid-file after some batches
            # were already added to the in-memory index, a later checkpoint
            # will persist those vectors while the file is NOT marked as
            # processed, so a resumed run can index them again — confirm
            # whether duplicate vectors are acceptable for this use case.
            print(f"Warning: Could not read or parse {path}. Skipping. Error: {e}")
            continue

    if index.ntotal == 0:
        print("Error: No text could be extracted from the documents. Cannot build index.")
        sys.exit(1)

    print(f"\n🎉 Total documents processed: {total_docs_processed}")
    print("✅ --- Index Build Complete ---")
    print(f"Created index for {index.ntotal} documents.")
| 188 |
+
|
| 189 |
+
if __name__ == "__main__":
    # Command-line entry point: warn the user about runtime, verify that the
    # heavyweight dependencies are importable, then kick off the build.
    print(
        "This script will build a searchable index from your Dolma dataset.\n"
        "It needs to download a model and process all documents, so it may take some time."
    )

    # Check for required libraries before doing any work.
    try:
        import sentence_transformers
        import faiss
        import numpy
        import tqdm
    except ImportError:
        print(
            "\n--- Missing Required Libraries ---\n"
            "To run this script, please install the necessary packages by running:\n"
            "pip install sentence-transformers faiss-cpu numpy tqdm\n"
            "---------------------------------\n"
        )
        sys.exit(1)

    build_index()
|
locales/de/attribution_analysis_page.json
ADDED
|
@@ -0,0 +1,174 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"desc_integrated_gradients": "Dies bietet ein zuverlässigeres Maß für die Wichtigkeit, indem nicht nur die endgültige Eingabe, sondern der gesamte Pfad von einem neutralen 'leeren' Zustand zu Ihrer spezifischen Anfrage berücksichtigt wird. Es summiert sorgfältig den Beitrag jedes Wortes, verhindert irreführende Ergebnisse und gibt ein wahreres Bild vom Einfluss jedes Tokens.",
|
| 3 |
+
"desc_occlusion": "Diese Methode testet die Notwendigkeit jedes Wortes, indem sie fragt: 'Was passiert, wenn dieses Wort fehlt?' Sie verbirgt vorübergehend jedes Token der Eingabe und misst, wie stark sich die Ausgabe des Modells ändert. Eine hohe Punktzahl bedeutet, dass das Wort für das Ergebnis entscheidend war.",
|
| 4 |
+
"desc_saliency": "Diese Methode zeigt die anfängliche 'Bauchreaktion' des Modells auf jedes Eingabe-Token. Sie hebt hervor, welche Wörter das Modell am interessantesten oder überraschendsten fand, basierend auf einer direkten und schnellen Berechnung der Wichtigkeit. Betrachten Sie es als einen schnellen Blick auf den Fokus des Modells.",
|
| 5 |
+
"unsupported_method_desc": "Für diese Methode ist keine Beschreibung verfügbar.",
|
| 6 |
+
"ai_expert_intro": "Sie sind ein erstklassiger Experte für KI-Interpretierbarkeit. Ihre Aufgabe ist es, eine Attributions-Heatmap zu analysieren und eine umfassende, leicht verständliche Erklärung für ein nicht-technisches Publikum zu liefern.",
|
| 7 |
+
"analysis_details": "Analyse-Details",
|
| 8 |
+
"method_being_used": "Verwendete Methode:",
|
| 9 |
+
"prompt_analyzed": "Analysierte Eingabeaufforderung:",
|
| 10 |
+
"full_generated_text": "Vollständig generierter Text:",
|
| 11 |
+
"method_specific_context": "Methodenspezifischer Kontext",
|
| 12 |
+
"instructions_for_analysis": "Anweisungen für die Analyse",
|
| 13 |
+
"instruction_part_1_header": "### Visueller Überblick auf hoher Ebene",
|
| 14 |
+
"instruction_part_1_desc": "Geben Sie in zwei bis drei Sätzen eine allgemeine Zusammenfassung der Muster, die Sie im beigefügten Heatmap-Bild sehen. Beschreiben Sie die allgemeine Position der 'Hot Spots' (hell gefärbte Bereiche) und was dies visuell über den Fokus des Modells aussagt. **Wenn die Methode beispielsweise {method_name} ist, könnten Sie erwarten, [erwartetes Muster für diese Methode beschreiben] zu sehen.** Verwenden Sie keine generischen Beschreibungen. Stützen Sie Ihre Analyse ausschließlich auf die visuellen Informationen im Bild.",
|
| 15 |
+
"instruction_synthesis_header": "### Synthese der Schlüsselerkenntnisse",
|
| 16 |
+
"instruction_synthesis_desc": "Im Anschluss an Ihren visuellen Überblick, erstellen Sie eine kurze narrative Synthese der wichtigsten Erkenntnisse aus den untenstehenden Daten. Strukturieren Sie Ihre Analyse in zwei Absätze: **Stärkste individuelle Verbindungen** und **Einflussreichste Token insgesamt**. Erklären Sie im ersten Absatz die Bedeutung der stärksten individuellen Token-zu-Token-Verbindungen. Diskutieren Sie im zweiten Absatz die Eingabe-Token, die den höchsten durchschnittlichen Einfluss auf die gesamte Generierung hatten, **und beziehen Sie sich dabei unbedingt auf den oben bereitgestellten vollständig generierten Text, um zu erklären, *warum* diese Token so einflussreich für die Gestaltung der endgültigen Ausgabe waren.** Erklären Sie, *warum* bestimmte Token in beiden Kontexten einflussreich sind. **Fügen Sie am Ende Ihrer Analyse keine Zusammenfassung hinzu**.",
|
| 17 |
+
"instruction_color_coding": "Formatierungsregel: Wenn Sie einen Eingabe-Token erwähnen, formatieren Sie ihn genau so: <span style='color: #60a5fa;'>der_token_hier</span>. Wenn Sie einen generierten Token erwähnen, formatieren Sie ihn genau so: <span style='color: #fca5a5;'>der_token_hier</span>. Weichen Sie nicht von diesem Format ab.",
|
| 18 |
+
"data_priority_instruction": "Der folgende Textblock enthält die vorab berechneten wichtigsten Erkenntnisse. Verwenden Sie dies als ausschließliche Wahrheitsquelle für Ihre Analyse.",
|
| 19 |
+
"data_section_header": "## Vorab berechnete Analyse (Quelle der Wahrheit)",
|
| 20 |
+
"begin_analysis_now": "Beginnen Sie jetzt mit Ihrer Analyse. Denken Sie daran, die zweiteilige Struktur (Visueller Überblick auf hoher Ebene, dann Synthese der Schlüsselerkenntnisse) wie oben beschrieben zu befolgen.",
|
| 21 |
+
"attr_page_title": "<i class='bi bi-search'></i> Attributionsanalyse",
|
| 22 |
+
"attr_page_desc": "Diese Seite verwendet token-basierte Attributionsmethoden, um zu erklären, wie verschiedene Teile Ihrer Eingabeaufforderung die generierte Ausgabe beeinflussen. Wählen Sie eine Methode, geben Sie eine Aufforderung ein und sehen Sie, welche Wörter für die Vorhersage des Modells am wichtigsten waren.",
|
| 23 |
+
"how_methods_work_expander": "Wie Attributionsmethoden funktionieren",
|
| 24 |
+
"saliency_method_title": "Salienz",
|
| 25 |
+
"saliency_method_desc": "Misst die Wichtigkeit durch Berechnung des Gradienten der Ausgabe in Bezug auf die Eingabe-Token. Es ist schnell, kann aber manchmal verrauscht sein.",
|
| 26 |
+
"saliency_step_1": "<strong>1. Ausgabe generieren:</strong> Das Modell generiert das nächste Wort, z.B. 'springt', für die Eingabe 'Der schnelle braune Fuchs'.",
|
| 27 |
+
"saliency_step_2": "<strong>2. Gradienten berechnen:</strong> Es berechnet, wie stark sich die Wahrscheinlichkeit von 'springt' bei einer winzigen Änderung der Einbettung jedes Eingangswortes ändern würde.",
|
| 28 |
+
"saliency_step_3": "<strong>3. Werte zuweisen:</strong> Wörter, die die größte Änderung verursachen (z.B. 'Fuchs'), erhalten die höchsten Werte.",
|
| 29 |
+
"ig_method_title": "Integrierte Gradienten",
|
| 30 |
+
"ig_method_desc": "Eine robustere Methode, die die Vorhersage den Eingaben zuschreibt, indem sie Gradienten entlang eines Pfades von einer Basislinie (z. B. Null-Einbettung) zur Eingabe integriert.",
|
| 31 |
+
"ig_step_1": "<strong>1. Pfad erstellen:</strong> Es wird ein glatter Pfad von einer 'leeren' Eingabe zur vollständigen Eingabe 'Der schnelle braune Fuchs' erstellt.",
|
| 32 |
+
"ig_step_2": "<strong>2. Gradienten entlang des Pfades berechnen:</strong> Es berechnet Gradienten für die Ausgabe 'springt' in vielen kleinen Schritten entlang dieses Pfades.",
|
| 33 |
+
"ig_step_3": "<strong>3. Gradienten summieren:</strong> Es summiert all diese kleinen Gradientenwerte, um einen zuverlässigen Wert dafür zu erhalten, wie stark jedes Wort zur Ausgabe beigetragen hat.",
|
| 34 |
+
"occlusion_method_title": "Okklusion",
|
| 35 |
+
"occlusion_method_desc": "Eine einfache, intuitive Methode, die die Wichtigkeit misst, indem jedes Eingabe-Token ersetzt wird und beobachtet wird, wie stark sich die Ausgabewahrscheinlichkeit ändert.",
|
| 36 |
+
"occlusion_step_1": "<strong>1. Ursprüngliche Wahrscheinlichkeit ermitteln:</strong> Das Modell generiert 'springt' mit einer bestimmten Wahrscheinlichkeit.",
|
| 37 |
+
"occlusion_step_2": "<strong>2. Wörter ersetzen:</strong> Es ersetzt systematisch jedes Wort (z.B. 'Fuchs') durch ein neutrales Token und führt das Modell erneut aus.",
|
| 38 |
+
"occlusion_step_3": "<strong>3. Auswirkung messen:</strong> Wenn das Ersetzen von 'Fuchs' dazu führt, dass die Wahrscheinlichkeit von 'springt' erheblich sinkt, wird 'Fuchs' als sehr wichtig eingestuft.",
|
| 39 |
+
"input_header": "<i class='bi bi-pencil-square'></i> Eingabe & Einstellungen",
|
| 40 |
+
"enter_prompt": "Geben Sie Ihre Eingabeaufforderung ein:",
|
| 41 |
+
"enter_prompt_help": "Geben Sie den Text ein, den das Modell fortsetzen soll",
|
| 42 |
+
"enable_ai_explanations": "KI-Erklärungen aktivieren",
|
| 43 |
+
"enable_ai_explanations_help": "Erklärungen für Visualisierungen mit Qwen 2.5 VL 72B generieren (erfordert API-Zugang)",
|
| 44 |
+
"generate_and_analyze_button": "Alle Methoden generieren & analysieren",
|
| 45 |
+
"max_new_tokens_slider": "Anzahl der zu generierenden Token",
|
| 46 |
+
"max_new_tokens_slider_help": "Steuert die Länge des generierten Textes.",
|
| 47 |
+
"loading_models_spinner": "Lade OLMo-Modell mit allen Attributionsmethoden...",
|
| 48 |
+
"generating_attributions_spinner": "Generiere Text und Attributionen...",
|
| 49 |
+
"analysis_complete_success": "Alle Attributionsanalysen abgeschlossen!",
|
| 50 |
+
"failed_to_generate_analysis_error": "Analyse konnte nicht generiert werden",
|
| 51 |
+
"failed_to_load_models_error": "Modelle konnten nicht geladen werden",
|
| 52 |
+
"please_enter_prompt_warning": "Bitte geben Sie eine Eingabeaufforderung ein",
|
| 53 |
+
"output_header": "<i class='bi bi-display'></i> Ausgabe",
|
| 54 |
+
"generated_text_subheader": "Generierter Text",
|
| 55 |
+
"input_label": "Eingabe:",
|
| 56 |
+
"generated_label": "Generiert:",
|
| 57 |
+
"attribution_analysis_results_header": "Ergebnisse der Attributionsanalyse",
|
| 58 |
+
"attr_tab": "Integrierte Gradienten",
|
| 59 |
+
"occlusion_tab": "Okklusion",
|
| 60 |
+
"saliency_tab": "Salienz",
|
| 61 |
+
"attr_title": "Analyse der integrierten Gradienten",
|
| 62 |
+
"occlusion_title": "Okklusionsanalyse",
|
| 63 |
+
"saliency_title": "Salienzanalyse",
|
| 64 |
+
"attr_viz_desc": "**Wie man diese Heatmap der integrierten Gradienten liest:**\\n- **X-Achse**: Generierte Token (was das Modell erzeugt hat)\\n- **Y-Achse**: Eingabe-Token (Ihre ursprüngliche Eingabeaufforderung)\\n- **Farbintensität**: Mathematische gradientenbasierte Wichtigkeitswerte\\n- **Interpretation**: Wie stark jedes Eingabe-Token jedes generierte Token mathematisch beeinflusst",
|
| 65 |
+
"occlusion_viz_desc": "Die Okklusionsanalyse hebt wichtige Token hervor, indem sie vorübergehend maskiert (okkludiert) werden und die Auswirkung auf die Ausgabe gemessen wird. Ein höherer Attributionswert bedeutet, dass das Token kritischer war.",
|
| 66 |
+
"saliency_viz_desc": "Diese Visualisierung hebt die salientesten Token in der Eingabe hervor, die zur Generierung beigetragen haben.",
|
| 67 |
+
"how_to_read_heatmap": "Wie man diese Heatmap liest:",
|
| 68 |
+
"xaxis_label": "X-Achse",
|
| 69 |
+
"xaxis_desc": "Generierte Token (was das Modell erzeugt hat)",
|
| 70 |
+
"yaxis_label": "Y-Achse",
|
| 71 |
+
"yaxis_desc": "Eingabe-Token (Ihre ursprüngliche Eingabeaufforderung)",
|
| 72 |
+
"color_intensity_label": "Farbintensität",
|
| 73 |
+
"color_intensity_desc": "Mathematische Wichtigkeitswerte",
|
| 74 |
+
"interpretation_label": "Interpretation",
|
| 75 |
+
"interpretation_desc": "Wie stark jedes Eingabe-Token jedes generierte Token beeinflusst.",
|
| 76 |
+
"special_tokens_label": "Spezielle Token (z.B., `Ġ`, `Ċ`)",
|
| 77 |
+
"special_tokens_desc": "Dies sind Artefakte des Tokenizers. Häufige sind:<ul><li>`Ġ`: Ein Leerzeichen, das ein neues Wort markiert.</li><li>`Ċ`: Ein Zeilenumbruchzeichen.</li><li>`<|endoftext|>`: Ein spezielles Token, das das Ende einer Sequenz markiert.</li></ul>",
|
| 78 |
+
"creating_viz_spinner": "Erstelle {method_title} Visualisierung...",
|
| 79 |
+
"generating_ai_explanation_spinner": "Generiere KI-Erklärung für {method_title}...",
|
| 80 |
+
"what_this_method_shows": "Was diese Methode zeigt:",
|
| 81 |
+
"ai_generated_analysis": "KI-generierte Analyse",
|
| 82 |
+
"download_results_subheader": "Ergebnisse herunterladen",
|
| 83 |
+
"download_html_button": "{method_title} HTML herunterladen",
|
| 84 |
+
"download_csv_button": "Werte herunterladen (CSV)",
|
| 85 |
+
"download_png_button": "{method_title} PNG herunterladen",
|
| 86 |
+
"heatmap_title": "Attributions-Heatmap",
|
| 87 |
+
"heatmap_xaxis": "Generierte Token",
|
| 88 |
+
"heatmap_yaxis": "Eingabe-Token",
|
| 89 |
+
"feedback_survey_header": "Feedback & Verständnisumfrage",
|
| 90 |
+
"feedback_survey_desc": "Ihr Feedback ist wertvoll für die Verbesserung dieses Tools. Bitte nehmen Sie sich einen Moment Zeit, um diese Fragen zu beantworten.",
|
| 91 |
+
"ux_feedback_subheader": "User Experience Feedback",
|
| 92 |
+
"q_visual_clarity": "1. Wie bewerten Sie die Klarheit der Heatmap-Visualisierungen?",
|
| 93 |
+
"q_visual_clarity_help": "1 = Sehr verwirrend, 5 = Sehr klar",
|
| 94 |
+
"q_cognitive_load": "2. Wie anspruchsvoll fanden Sie es, die Ergebnisse zu interpretieren?",
|
| 95 |
+
"q_cognitive_load_help": "1 = Überhaupt nicht anspruchsvoll, 5 = Sehr anspruchsvoll",
|
| 96 |
+
"q_influential_docs_plausibility": "3. Wie plausibel sind die 3 einflussreichsten Dokumente, die vom Influence Tracer identifiziert wurden?",
|
| 97 |
+
"q_influential_docs_plausibility_help": "1 = Überhaupt nicht plausibel, 5 = Sehr plausibel",
|
| 98 |
+
"comprehension_qs_subheader": "Kurze Verständnisprüfung",
|
| 99 |
+
"comprehension_qs_desc": "Basierend auf den Visualisierungen, die Sie gerade gesehen haben, welche Methode beantwortet die folgenden Fragen am besten?",
|
| 100 |
+
"q_options_ig": "Integrierte Gradienten",
|
| 101 |
+
"q_options_occlusion": "Okklusion",
|
| 102 |
+
"q_options_saliency": "Salienz",
|
| 103 |
+
"q_s1": "Welche Methode zeigt die anfängliche 'Bauchreaktion' des Modells auf jedes Wort und dessen direkten und unmittelbaren Fokus?",
|
| 104 |
+
"q_s2": "Welche Methode würden Sie verwenden, um die Auswirkung des Entfernens eines bestimmten Wortes zu verstehen?",
|
| 105 |
+
"q_s3": "Welche Methode erstellt ein zuverlässigeres Bild der Wichtigkeit, indem der gesamte Pfad von einer leeren Eingabe bis zu Ihrer endgültigen Eingabeaufforderung analysiert wird?",
|
| 106 |
+
"submit_feedback_button": "Feedback absenden",
|
| 107 |
+
"feedback_success_message": "Vielen Dank für Ihr Feedback!",
|
| 108 |
+
"feedback_error_message": "Entschuldigung, beim Senden Ihres Feedbacks ist ein Fehler aufgetreten: {e}",
|
| 109 |
+
"feedback_please_answer_all_qs": "Bitte beantworten Sie alle Verständnisfragen, bevor Sie absenden.",
|
| 110 |
+
"error_creating_heatmap": "Fehler beim Erstellen der Heatmap aus HTML: {e}",
|
| 111 |
+
"error_inseq_no_html": "Inseq konnte keine HTML-Ausgabe für {method_name} generieren.",
|
| 112 |
+
"error_no_table_in_html": "Konnte keine Datentabelle in der HTML-Ausgabe von inseq für {method_name} finden.",
|
| 113 |
+
"error_table_no_rows": "Tabelle in der HTML-Ausgabe enthält keine Zeilen für {method_name}.",
|
| 114 |
+
"error_failed_to_parse_rows": "Fehler beim Parsen von Datenzeilen aus dem HTML für {method_name}.",
|
| 115 |
+
"running_influence_trace_spinner": "Einflüsse in den Trainingsdaten werden verfolgt...",
|
| 116 |
+
"influence_index_not_found_warning": "Influence Tracer-Index nicht gefunden. Dieser Schritt wird übersprungen. Bitte führen Sie `build_dolma_index.py` aus, um ihn zu aktivieren.",
|
| 117 |
+
"influence_tracer_title": "Einfluss-Tracer",
|
| 118 |
+
"influence_tracer_desc": "Dieses Tool identifiziert Trainingsdokumente aus einer Stichprobe des <b>Dolma v1.6-Datensatzes</b>, die den größten Einfluss auf die Ausgabe des Modells hatten. Dolma v1.6 ist ein offener Datensatz mit 3 Billionen Token, der aus einer vielfältigen Mischung von Webinhalten (Common Crawl), wissenschaftlichen Veröffentlichungen (C4, arXiv), Code (The Stack), Büchern (Project Gutenberg) und enzyklopädischen Daten (Wikipedia) besteht. Indem wir die Generierung des Modells auf seine Trainingsdaten zurückführen, können wir seine Argumentation und Wissensquellen besser verstehen.",
|
| 119 |
+
"top_influential_docs_header": "Top {num_docs} einflussreichste Trainingsdokumente",
|
| 120 |
+
"no_influential_docs_found": "Für diese Generierung wurden keine einflussreichen Dokumente gefunden.",
|
| 121 |
+
"file_label": "Datei",
|
| 122 |
+
"source_label": "Quelle",
|
| 123 |
+
"similarity_label": "Ähnlichkeit",
|
| 124 |
+
"run_analysis_for_influence_info": "Führen Sie eine Analyse durch, um hier einflussreiche Trainingsdokumente zu sehen.",
|
| 125 |
+
"prompt_placeholder_text": "z.B., 'Die Hauptstadt von Frankreich ist' oder 'Sein oder Nichtsein, das ist hier die'",
|
| 126 |
+
"running_attribution_analysis_spinner": "Generiere Attributions-Heatmaps...",
|
| 127 |
+
"generating_ai_explanations_spinner": "Generiere KI-Erklärungen...",
|
| 128 |
+
"how_influence_is_found_header": "Wie Einfluss gefunden wird: Ein Blick auf die Kosinus-Ähnlichkeit",
|
| 129 |
+
"how_influence_is_found_desc": "Der Influence Tracer sucht nicht nur nach Schlüsselwörtern, sondern nach Bedeutung. Dazu wandelt er sowohl Ihre Eingabeaufforderung als auch jeden Satz in den Trainingsdaten in hochdimensionale Vektoren um. Anschließend verwendet er eine Technik namens <strong>Kosinus-Ähnlichkeit</strong>, um die engsten Übereinstimmungen zu finden.",
|
| 130 |
+
"influence_step_1_title": "<strong>1. Vektorumwandlung</strong>",
|
| 131 |
+
"influence_step_1_desc": "Ihre Eingabeaufforderung und jeder Satz aus den Trainingsdaten werden in numerische Vektoren umgewandelt.",
|
| 132 |
+
"influence_step_2_title": "<strong>2. Winkelberechnung</strong>",
|
| 133 |
+
"influence_step_2_desc": "Das System berechnet den Winkel (θ) zwischen dem Vektor Ihrer Eingabeaufforderung und jedem anderen Satzvektor.",
|
| 134 |
+
"influence_step_3_title": "<strong>3. Ähnlichkeitswert</strong>",
|
| 135 |
+
"influence_step_3_desc": "Ein kleinerer Winkel bedeutet eine höhere Ähnlichkeit. Ein Wert von 1 bedeutet, dass die Sätze in ihrer Bedeutung identisch sind, während ein Wert von 0 bedeutet, dass sie völlig unabhängig voneinander sind.",
|
| 136 |
+
"influence_example_sentence_a": "Ihre Eingabe",
|
| 137 |
+
"influence_example_sentence_b": "Trainingssatz",
|
| 138 |
+
"generating_all_visualizations_spinner": "Generiere alle Visualisierungen und KI-Erklärungen...",
|
| 139 |
+
"searching_influential_docs_progress": "Suche nach einflussreichen Dokumenten...",
|
| 140 |
+
"processing_doc_progress": "Verarbeite Dokument {i} von {k}...",
|
| 141 |
+
"search_complete_progress": "Suche abgeschlossen!",
|
| 142 |
+
"faithfulness_check_expander": "Überprüfung der Faktentreue",
|
| 143 |
+
"running_faithfulness_check_spinner": "Führe Überprüfung der Faktentreue aus...",
|
| 144 |
+
"verified_status": "Verifiziert",
|
| 145 |
+
"contradicted_status": "Widersprochen",
|
| 146 |
+
"claim_label": "Aussage",
|
| 147 |
+
"status_label": "Status",
|
| 148 |
+
"evidence_label": "Beweis",
|
| 149 |
+
"no_verifiable_claims_info": "Aus der Erklärung konnten keine überprüfbaren Aussagen extrahiert werden.",
|
| 150 |
+
"faithfulness_check_error": "Bei der Überprüfung der Faktentreue ist ein Fehler aufgetreten: {e}",
|
| 151 |
+
"faithfulness_check_results_header": "Ergebnisse der Überprüfung der Faktentreue:",
|
| 152 |
+
"faithfulness_check_explanation_html": "<div style='font-size: 0.9rem; color: #DCDCDC; margin-bottom: 1rem;'><p style='margin-bottom: 0.5rem;'><strong>Wie das funktioniert:</strong> Der Faktentreue-Prüfer verifiziert zwei Arten von Behauptungen aus der Erklärung der KI:</p><ul style='margin-left: 1.5rem; padding-left: 0; list-style-type: disc;'><li style='margin-bottom: 0.3rem;'><strong>Numerische Behauptungen:</strong> Überprüft, ob der Attributionswert eines Tokens (entweder sein Spitzenwert 'Hotspot' oder sein Durchschnittswert) einen dynamischen Schwellenwert erreicht.<ul style='margin-left: 1.5rem; padding-left: 0; list-style-type: circle;'><li>Eine <strong>\\\"hohe\\\"</strong> Behauptung (z.B. \\\"höchste\\\", \\\"stärkste\\\") muss über <strong>70%</strong> des Maximalwerts in der Analyse liegen.</li><li>Eine <strong>\\\"signifikante\\\"</strong> Behauptung (z.B. \\\"bemerkenswert\\\") muss über <strong>50%</strong> des Maximalwerts liegen.</li></ul></li><li style='margin-bottom: 0.3rem;'><strong>Begründungsbehauptungen:</strong> Verwendet eine weitere KI, um semantisch zu analysieren, ob die <strong>Begründung</strong> für die Wichtigkeit eines Tokens plausibel und logisch konsistent ist.</li></ul></div>",
|
| 153 |
+
"claim_extraction_prompt_header": "Sie sind ein Experten-System zur Extraktion von Behauptungen. Ihre Aufgabe ist es, eine Erklärung einer Text-Attributionsanalyse zu lesen und alle überprüfbaren, sachlichen Behauptungen in eine strukturierte JSON-Liste zu extrahieren. Ein einzelner Satz kann mehrere unterschiedliche Behauptungen enthalten.",
|
| 154 |
+
"claim_extraction_prompt_instruction": "Jedes Objekt in der Liste MUSS die folgenden Schlüssel haben:\n1. `claim_text`: Der exakte Satz oder die exakte Phrase aus der Erklärung, die die Behauptung aufstellt.\n2. `claim_type`: Einer der verfügbaren Behauptungstypen.\n3. `details`: Ein Objekt, das die spezifischen Parameter für die Überprüfung enthält.",
|
| 155 |
+
"claim_extraction_prompt_context_header": "**Kontext der Analysemethode:** {analysis_method}",
|
| 156 |
+
"claim_extraction_prompt_types_header": "**Verfügbare Behauptungstypen:**",
|
| 157 |
+
"claim_extraction_prompt_types_details": "- `attribution_claim`: Ein Anspruch, der behauptet, dass ein oder mehrere Token hohe oder signifikante Attributionswerte haben, entweder basierend auf ihrem Spitzenwert (Hotspot) oder ihrem durchschnittlichen Einfluss.\n - `details`: {{ \"tokens\": [\"...\"], \"qualifier\": \"hoch\" | \"signifikant\", \"score_type\": \"spitze\" | \"durchschnitt\" }}\n- `token_begruendung_anspruch`: Ein Anspruch, der einen spezifischen Grund für die Wichtigkeit oder den Attributionswert eines oder mehrerer Tokens liefert.\n - `details`: {{ \"tokens\": [\"...\"], \"begruendung\": \"...\" }}",
|
| 158 |
+
"claim_extraction_prompt_example_header": "**Beispiel:**",
|
| 159 |
+
"claim_extraction_prompt_example_explanation": "- **Satz der Erklärung:** \"Insgesamt hat 'Frankreich' den höchsten durchschnittlichen Einfluss, während '.' einen signifikanten Spitzenwert hat.\"",
|
| 160 |
+
"claim_extraction_prompt_example_json": "- **Ergebnis-JSON-Objekt:**\n ```json\n [\n {{\n \"claim_text\": \"Insgesamt hat 'Frankreich' den höchsten durchschnittlichen Einfluss...\",\n \"claim_type\": \"attribution_claim\",\n \"details\": {{ \"tokens\": [\"Frankreich\"], \"qualifier\": \"hoch\", \"score_type\": \"durchschnitt\" }}\n }},\n {{\n \"claim_text\": \"...während '.' einen signifikanten Spitzenwert hat.\",\n \"claim_type\": \"attribution_claim\",\n \"details\": {{ \"tokens\": [\".\"], \"qualifier\": \"signifikant\", \"score_type\": \"spitze\" }}\n }}\n ]\n ```",
|
| 161 |
+
"claim_extraction_prompt_analyze_header": "**Zu analysierende Erklärung:**",
|
| 162 |
+
"claim_extraction_prompt_instruction_footer": "Antworten Sie NUR mit der JSON-Liste der Ansprüche.",
|
| 163 |
+
"justification_verification_prompt_collective_reasoning": "**Kollektive Begründung:** Die Begründung kann sich auf mehrere Token gleichzeitig beziehen (z.B. 'diese Token gemeinsam...'). Bei der Bewertung einer solchen Behauptung betrachten Sie die Gruppe von Token als eine einzige Einheit und beurteilen Sie, ob die Begründung für sie als Ganzes plausibel ist, auch wenn sie nicht auf jedes Token einzeln perfekt zutrifft.",
|
| 164 |
+
"justification_verification_prompt_header": "Sie sind ein KI-Faktenprüfer, der auf NLP und semantisches Denken spezialisiert ist. Ihre Aufgabe ist es zu bestimmen, ob eine Begründung für die Wichtigkeit eines Tokens plausibel und logisch konsistent ist, wenn der gesamte Kontext gegeben ist.",
|
| 165 |
+
"justification_verification_prompt_crucial_rule": "**Entscheidende Regel:** Eine Begründung ist plausibel, wenn sie eine vernünftige, kreative oder kontextuell relevante Verbindung darstellt. Widersprechen Sie nur, wenn die Argumentation völlig unlogisch, sachlich falsch oder inkonsistent mit dem gegebenen Eingabe- oder Ausgabetext ist.",
|
| 166 |
+
"justification_verification_prompt_token_location": "**Token-Standort:** Das „fragliche Token“ kann entweder aus der „Eingabeaufforderung“ oder dem „generierten Text“ stammen. Ein Token aus der Eingabe kann immer noch einen entscheidenden Einfluss auf die generierte Ausgabe haben. Widersprechen Sie einer Behauptung nicht einfach deshalb, weil das Token im generierten Text nicht vorhanden ist.",
|
| 167 |
+
"justification_verification_prompt_special_tokens": "**Spezielle Token:** Das 'fragliche Token' kann Sonderzeichen vom Tokenizer enthalten. `Ġ` steht für ein führendes Leerzeichen (z. B. ist `Ġof` ` of`), und Suffixe wie ` (1)` dienen der Eindeutigkeit (z. B. ist `. (1)` einfach `.`). Sie MÜSSEN dies berücksichtigen, wenn Sie prüfen, ob ein Token im Text vorhanden ist.",
|
| 168 |
+
"justification_verification_prompt_evaluating_justifications": "**Bewertung von Begründungen:** Eine Begründung sollte als plausibel angesehen werden, wenn sie eine vernünftige Verbindung aufzeigt, auch wenn es sich nicht um eine direkte oder einfache kausale Verknüpfung handelt. Dies schließt Beziehungen ein, die auf dem breiteren Kontext des Textes oder der grammatikalischen Struktur der Sprache basieren. Achten Sie besonders auf Token, die gebräuchliche Kollokationen, Entitäten oder Abkürzungen bilden; Verbindungen zwischen solchen Token sollten als plausibel angesehen werden, da sie vom Modell oft als eine einzige semantische Einheit verarbeitet werden.",
|
| 169 |
+
"justification_verification_prompt_linguistic_context": "**Linguistischer Kontext für autoregressive Modelle:** Es ist entscheidend, sich daran zu erinnern, dass in autoregressiven Modellen wie diesem JEDES Token die Wahrscheinlichkeit des nächsten Tokens direkt beeinflusst. Daher sind Begründungen, die auf grammatikalischer Struktur, Zeichensetzung oder syntaktischen Rollen basieren, nicht nur gültig, sondern stellen einen Kernbestandteil des Entscheidungsprozesses des Modells dar. Die strukturelle Rolle eines Tokens (wie eine Präposition oder ein Punkt) ist ein direkter und wichtiger Beitrag zur Inhaltsgenerierung. Weisen Sie diese Begründungen nicht als 'bloße Grammatik' zurück.",
|
| 170 |
+
"justification_verification_prompt_task_header": "**Ihre Aufgabe:**",
|
| 171 |
+
"justification_verification_prompt_task_instruction": "Ist die Begründung auf der Grundlage der obigen Regel plausibel?",
|
| 172 |
+
"justification_verification_prompt_json_instruction": "Antworten Sie mit einem JSON-Objekt mit zwei Schlüsseln:\n1. `is_verified`: boolean (wahr, wenn die Begründung plausibel ist, falsch, wenn sie unlogisch oder falsch ist).\n2. `reasoning`: Eine kurze, einzeilige Erklärung für Ihre Entscheidung.",
|
| 173 |
+
"justification_verification_prompt_footer": "Antworten Sie NUR mit dem JSON-Objekt."
|
| 174 |
+
}
|
locales/de/circuit_trace_page.json
ADDED
|
@@ -0,0 +1,215 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"circuit_trace_page_title": "Circuit-Trace-Analyse",
|
| 3 |
+
"circuit_trace_page_desc": "Erkunden Sie die internen Pfade des OLMo-Modells. Diese Seite visualisiert, wie Informationen von Eingabe-Token über verschiedene Schichten und Merkmale fließen, um die endgültige Ausgabe zu erzeugen, basierend auf einer neuartigen Cross-Layer-Transcoder-Methode.",
|
| 4 |
+
"how_circuit_tracing_works_header": "Wie das funktioniert: Ein dreistufiger Prozess",
|
| 5 |
+
"how_circuit_tracing_works_desc": "Anstatt das gesamte Modell auf einmal zu betrachten, vereinfacht diese Technik die Analyse, indem sie sich auf 'Merkmale' konzentriert – spezifische, gelernte Muster von Neuronenaktivierungen. Durch das Training kleiner 'Transcoder'-Modelle können wir identifizieren, welche Merkmale in einer Schicht Merkmale in der nächsten aktivieren, was es uns ermöglicht, einen Schaltkreis des Informationsflusses zu verfolgen.",
|
| 6 |
+
"circuit_tracing_step1_title": "1. Merkmalsextraktion",
|
| 7 |
+
"circuit_tracing_step1_desc": "Kleine <strong>Autoencoder</strong>-Modelle (stellen Sie sie sich als Komprimierungswerkzeuge vor, die wichtige Informationen zusammenfassen) werden auf jeder Schicht des Haupt-OLMo-Modells trainiert, um wiederkehrende Muster von Neuronenaktivierungen zu entdecken, die wir 'Merkmale' nennen.",
|
| 8 |
+
"circuit_tracing_step2_title": "2. Schichtübergreifendes Mapping",
|
| 9 |
+
"circuit_tracing_step2_desc": "Winzige <strong>Transcoder</strong>-Modelle (die wie Übersetzer zwischen den Schichten agieren) werden trainiert, um die Aktivierung eines Merkmals in einer späteren Schicht basierend auf den Aktivierungen von Merkmalen in einer früheren Schicht vorherzusagen.",
|
| 10 |
+
"circuit_tracing_step3_title": "3. Graph-Konstruktion",
|
| 11 |
+
"circuit_tracing_step3_desc": "Indem wir die vorhersagekräftigsten Merkmalspaare aus den Transcoder-Modellen verbinden, erstellen wir einen gerichteten Graphen, der die wichtigsten Pfade des Informationsflusses für eine gegebene Eingabe darstellt.",
|
| 12 |
+
"enable_ai_explanations_circuit": "KI-Erklärungen aktivieren",
|
| 13 |
+
"enable_ai_explanations_circuit_help": "Generieren Sie detaillierte Erklärungen für Schaltungsvisualisierungen mit Qwen 2.5 VL 72B.",
|
| 14 |
+
"no_results_warning": "Ergebnisse der Attributionsgraphen nicht gefunden.",
|
| 15 |
+
"run_analysis_info": "Bitte führen Sie zuerst das Analyse-Skript aus: `python3 circuit_analysis/attribution_graphs_olmo_de.py --prompt-index 0 --force-retrain-clt`",
|
| 16 |
+
"config_header": "Konfiguration",
|
| 17 |
+
"model_label": "Modell:",
|
| 18 |
+
"device_label": "Gerät:",
|
| 19 |
+
"features_per_layer_label": "Merkmale pro Schicht:",
|
| 20 |
+
"training_steps_label": "Trainingsschritte:",
|
| 21 |
+
"batch_size_label": "Batch-Größe:",
|
| 22 |
+
"learning_rate_label": "Lernrate:",
|
| 23 |
+
"interactive_analysis_header": "Interaktive Analyse",
|
| 24 |
+
"select_prompt_label": "Wählen Sie einen Prompt zur Analyse aus:",
|
| 25 |
+
"select_prompt_help": "Diese Auswahl steuert sowohl den interaktiven Schaltungsgraphen als auch den Merkmals-Explorer unten.",
|
| 26 |
+
"graph_stats_header": "Graph-Statistiken",
|
| 27 |
+
"full_graph_nodes_label": "Knoten (vollständiger Graph)",
|
| 28 |
+
"full_graph_edges_label": "Kanten (vollständiger Graph)",
|
| 29 |
+
"pruned_graph_nodes_label": "Knoten (reduzierter Graph)",
|
| 30 |
+
"pruned_graph_edges_label": "Kanten (reduzierter Graph)",
|
| 31 |
+
"feature_explorer_header": "Merkmals-Explorer",
|
| 32 |
+
"token_analysis_header": "Token-Analyse",
|
| 33 |
+
"input_tokens_label": "Eingabe-Token:",
|
| 34 |
+
"feature_explorer_title": "Merkmals-Explorer: {prompt}",
|
| 35 |
+
"select_layer_label": "Wählen Sie eine Schicht zum Erkunden aus:",
|
| 36 |
+
"layer_label_format": "Schicht {layer_num}",
|
| 37 |
+
"no_feature_viz_warning": "Für diesen Prompt sind keine Merkmalsvisualisierungen verfügbar.",
|
| 38 |
+
"no_features_in_layer_warning": "In {selected_layer} wurden keine Merkmale gefunden.",
|
| 39 |
+
"active_features_label": "**Aktive Merkmale:**",
|
| 40 |
+
"choose_feature_label": "Wählen Sie ein Merkmal aus:",
|
| 41 |
+
"max_activation_label": "Maximale Aktivierung",
|
| 42 |
+
"mean_activation_label": "Mittlere Aktivierung",
|
| 43 |
+
"sparsity_label": "Sparsität",
|
| 44 |
+
"interpretation_label": "Interpretation",
|
| 45 |
+
"top_activating_tokens_title": "Top aktivierende Token für {selected_feature}",
|
| 46 |
+
"xaxis_token_label": "Token",
|
| 47 |
+
"yaxis_activation_label": "Aktivierungsstärke",
|
| 48 |
+
"generating_feature_explanation_spinner": "Generiere KI-Erklärung für Merkmalsaktivierung...",
|
| 49 |
+
"feature_explanation_error": "Konnte keine Merkmalserklärung generieren: {e}",
|
| 50 |
+
"ai_feature_analysis_header": "KI-Merkmalsanalyse",
|
| 51 |
+
"node_size_label": "Knotengröße",
|
| 52 |
+
"edge_threshold_label": "Kantenschwellenwert",
|
| 53 |
+
"tip_scroll_horizontally": "Tipp: Verwenden Sie das Mausrad + Shift, um horizontal zu scrollen und alle 32 Schichten zu sehen.",
|
| 54 |
+
"colorbar_title": "Aktivierung",
|
| 55 |
+
"path_highlight_label": "Schaltkreis-Pfad",
|
| 56 |
+
"connections_legend": "Verbindungen",
|
| 57 |
+
"embedding_legend": "Einbettung",
|
| 58 |
+
"feature_legend": "Merkmal",
|
| 59 |
+
"layer_nav_header": "Schichtnavigation",
|
| 60 |
+
"layer_nav_desc": "Dieser Graph zeigt <strong>{num_layers} Schichten</strong> mit Merkmalen. Verwenden Sie den Bereichsregler unter dem Graphen, um durch alle Schichten zu navigieren, oder verwenden Sie <strong>Shift + Mausrad</strong>, um horizontal zu scrollen.",
|
| 61 |
+
"generating_circuit_explanation_spinner": "Generiere KI-Erklärung für den Schaltungsgraphen...",
|
| 62 |
+
"circuit_explanation_error": "Konnte keine Schaltungserklärung generieren: {e}",
|
| 63 |
+
"ai_circuit_analysis_header": "KI-Schaltungsanalyse",
|
| 64 |
+
"layer_stats_header": "Schichtstatistiken",
|
| 65 |
+
"total_layers_label": "Gesamtzahl der Schichten mit Merkmalen",
|
| 66 |
+
"total_features_label": "Gesamtzahl der Merkmale",
|
| 67 |
+
"avg_features_per_layer_label": "Durchschnittliche Merkmale pro Schicht",
|
| 68 |
+
"features_by_layer_header": "Merkmale nach Schicht",
|
| 69 |
+
"feature_dist_title": "Merkmalsverteilung über die Schichten",
|
| 70 |
+
"feature_count_label": "Anzahl der Merkmale",
|
| 71 |
+
"subnetwork_explorer_title": "Subnetzwerk-Explorer",
|
| 72 |
+
"subnetwork_explorer_desc": "Wählen Sie ein zentrales Merkmal aus, um dessen lokale Nachbarschaft zu visualisieren, die sowohl seine vorgeschalteten Ursachen als auch seine nachgeschalteten Effekte innerhalb einer bestimmten Verbindungstiefe anzeigt.",
|
| 73 |
+
"subnetwork_graph_empty_info": "Der Hauptschaltungsgraph wurde noch nicht generiert. Bitte warten Sie, bis er geladen ist.",
|
| 74 |
+
"no_features_in_graph_warning": "In der aktuellen Graphansicht sind keine Merkmale verfügbar, um ein Subnetzwerk zu erstellen.",
|
| 75 |
+
"select_layer_label_subnetwork": "1. Wählen Sie eine Schicht",
|
| 76 |
+
"no_features_in_layer_subnetwork_warning": "Keine Merkmale in {selected_layer} zum Auswählen.",
|
| 77 |
+
"select_feature_label_subnetwork": "2. Wählen Sie ein zentrales Merkmal",
|
| 78 |
+
"traversal_depth_label": "3. Verbindungstiefe einstellen",
|
| 79 |
+
"subnetwork_graph_title": "Subnetzwerk zentriert auf Merkmal: {feature}",
|
| 80 |
+
"subnetwork_no_connections_info": "Dieses Merkmal hat keine Verbindungen innerhalb der ausgewählten Tiefe.",
|
| 81 |
+
"generating_subnetwork_explanation_spinner": "Analysiere Subnetzwerk mit KI...",
|
| 82 |
+
"ai_subnetwork_analysis_header": "KI-Subnetzwerkanalyse",
|
| 83 |
+
"subnetwork_analysis_title": "Token-Aktivierungsanalyse",
|
| 84 |
+
"subnetwork_no_features_info": "In diesem Subnetzwerk wurden keine Merkmale zur Analyse gefunden.",
|
| 85 |
+
"subnetwork_no_token_info": "Für die Merkmale in diesem Subnetzwerk sind keine Token-Aktivierungsdaten verfügbar.",
|
| 86 |
+
"subnetwork_top_tokens_desc": "Die folgenden Eingabe-Token haben die Merkmale in diesem Subnetzwerk am stärksten aktiviert:",
|
| 87 |
+
"subnetwork_token_interpretation_info": "Dies zeigt, auf welche Teile des Prompts das Subnetzwerk 'achtet'.",
|
| 88 |
+
"what_is_a_feature_header": "Schlüsselkonzept: Was ist ein 'Merkmal'?",
|
| 89 |
+
"what_is_a_feature_title": "Ein Merkmal ist ein erlerntes, interpretierbares Muster von Neuronenaktivität.",
|
| 90 |
+
"what_is_a_feature_desc": "Stellen Sie es sich wie einen Konzept-Detektor vor. Ein Merkmal könnte zum Beispiel stark auf Wörter reagieren, die sich auf 'Programmierung' beziehen, während ein anderes 'Fragen zur Geschichte' erkennt. Diese Merkmale sind die Bausteine, die das Modell verwendet, um Eingaben zu verstehen und eine Antwort zu erstellen. Indem wir sie verfolgen, können wir den Denkprozess des Modells nachzeichnen.",
|
| 91 |
+
"faithfulness_check_expander": "Überprüfung der Faktentreue",
|
| 92 |
+
"running_faithfulness_check_spinner": "Führe Überprüfung der Faktentreue aus...",
|
| 93 |
+
"verified_status": "Verifiziert",
|
| 94 |
+
"contradicted_status": "Widersprochen",
|
| 95 |
+
"claim_label": "Aussage",
|
| 96 |
+
"status_label": "Status",
|
| 97 |
+
"evidence_label": "Beweis",
|
| 98 |
+
"no_verifiable_claims_info": "Aus der Erklärung konnten keine überprüfbaren Aussagen extrahiert werden.",
|
| 99 |
+
"faithfulness_explanation_circuit_graph_html": "<div style='font-size: 0.9rem; margin-bottom: 1rem;'><strong>Wie das funktioniert:</strong> Der Faithfulness Checker überprüft zwei Arten von Behauptungen aus der KI-Erklärung:<ul><li><strong>Behauptungen zur Merkmalsinterpretation:</strong> Überprüft mittels Fuzzy-String-Matching, ob eine behauptete Interpretation für ein Merkmal in einer bestimmten Schicht (z.B. 'Erkennung des grammatikalischen Modus') eng mit der tatsächlichen Interpretation eines Merkmals in dieser Schicht übereinstimmt.</li><li><strong>Behauptungen zur Schichtrolle:</strong> Überprüft semantisch, ob die Zusammenfassung der Rolle eines Schichtabschnitts durch die KI (z.B. 'frühe Schichten behandeln die Syntax') eine plausible Verallgemeinerung der tatsächlichen Top-Merkmalsinterpretationen in diesem Abschnitt ist.</li></ul></div>",
|
| 100 |
+
"faithfulness_explanation_feature_explorer_html": "<div style='font-size: 0.9rem; margin-bottom: 1rem;'><strong>Wie das funktioniert:</strong> Der Faithfulness Checker überprüft zwei Arten von Behauptungen aus der KI-Erklärung:<ul><li><strong>Behauptungen zu Top-Token:</strong> Überprüft, ob ein Token, das als Top-Aktivator für ein Merkmal beansprucht wird, tatsächlich in der Liste der Top-aktivierenden Token aus den Analysedaten vorhanden ist.</li><li><strong>Behauptungen zur Merkmalsrolle:</strong> Überprüft mittels Fuzzy-String-Matching, ob die zusammengefasste Interpretation der Rolle eines Merkmals durch die KI eng mit der detaillierten Interpretation aus den Analysedaten übereinstimmt.</li></ul></div>",
|
| 101 |
+
"faithfulness_explanation_subnetwork_graph_html": "<div style='font-size: 0.9rem; margin-bottom: 1rem;'><strong>Wie das funktioniert:</strong> Der Faithfulness Checker überprüft drei Arten von Behauptungen aus der KI-Erklärung:<ul><li><strong>Kausale Behauptungen:</strong> Überprüft, ob eine behauptete kausale Verbindung (vorgelagert oder nachgelagert) gültig ist, indem mittels Fuzzy-String-Matching bestätigt wird, dass die Interpretation des behaupteten Merkmals in der tatsächlichen Liste der vor- oder nachgelagerten Nachbarn existiert.</li><li><strong>Behauptungen zum Token-Einfluss:</strong> Überprüft, ob als vorgelagerte Einflüsse beanspruchte Token in der tatsächlichen Liste der direkten vorgelagerten Token für das zentrale Merkmal vorhanden sind.</li><li><strong>Behauptungen zur Rolle des zentralen Merkmals:</strong> Überprüft mittels Fuzzy-String-Matching, ob die Interpretation der Rolle des zentralen Merkmals durch die KI eng mit der Interpretation aus den Analysedaten übereinstimmt.</li></ul></div>",
|
| 102 |
+
"claim_extraction_prompt_header": "Sie sind ein Experten-System zur Extraktion von Behauptungen. Ihre Aufgabe ist es, eine Erklärung einer Circuit-Trace-Visualisierung zu lesen und überprüfbare Behauptungen in eine strukturierte JSON-Liste zu extrahieren.",
|
| 103 |
+
"claim_extraction_prompt_instruction": "Jedes Objekt in der Liste MUSS enthalten: `claim_text`, `claim_type`, und `details`. Der `claim_text` sollte der vollständige, ursprüngliche Satz aus der Erklärung sein.",
|
| 104 |
+
"claim_extraction_prompt_rule": "**Extraktionsregeln:**\n1. **Behalten Sie die ursprüngliche Reihenfolge bei.** Die Behauptungen in der endgültigen JSON-Liste müssen in derselben Reihenfolge erscheinen wie im Quelltext.\n2. **Ignorieren Sie legendenartige Beschreibungen.** Extrahieren Sie keine Behauptungen aus Sätzen, die nur erklären, was die visuellen Elemente des Graphen darstellen (z. B. 'Jeder Knoten ist ein Merkmal', 'Farbe zeigt Aktivierung an'). Extrahieren Sie nur Behauptungen, die eine spezifische Aussage darüber machen, *was das Modell tut* für den aktuellen Prompt (z. B. 'Schicht 10 zeigt hohe Aktivierung für 'Syntax'-Merkmale').\n3. **Halten Sie Behauptungen kurz.** Eine einzelne Behauptung sollte keinen ganzen Absatz umfassen. Gliedern Sie lange Absätze in mehrere, kleinere Behauptungen, in der Regel eine für jeden Hauptpunkt oder eine kleine Gruppe verwandter Punkte.\n4. Extrahieren Sie für jede `interpretation_summary` oder `role_summary` nur das Kernkonzept, das sich normalerweise in einfachen Anführungszeichen befindet (z. B. aus „bemerkenswerte Aktivität für 'Satzstruktur'“ extrahieren Sie nur „Satzstruktur“).\n5. **Entscheidend: Wenn ein einzelner Satz mehrere Behauptungen aufstellt, MÜSSEN Sie diese in einem einzigen Behauptungsobjekt zusammenfassen.**\n - Für `feature_interpretation_claim` sollte `details` eine Liste von Objekten sein, die jeweils `layer` und `interpretation_summary` enthalten.\n - Für `layer_role_claim`, wenn die Behauptung mehrere Abschnitte (früh, mittel, spät) umfasst, sollte `details` eine Liste von Objekten sein, die jeweils `layer_section` und `role_summary` enthalten.",
|
| 105 |
+
"claim_extraction_prompt_context_header": "**Kontext:** {context}",
|
| 106 |
+
"claim_extraction_prompt_types_header": "**Verfügbare Behauptungstypen:**",
|
| 107 |
+
"claim_extraction_prompt_analyze_header": "**Zu analysierende Erklärung:**",
|
| 108 |
+
"claim_extraction_prompt_footer": "Antworten Sie NUR mit der JSON-Liste der Behauptungen.",
|
| 109 |
+
"circuit_graph_claim_types": "- `feature_interpretation_claim`: Eine Behauptung über die interpretierte(n) Rolle(n) von Merkmalen in einer oder mehreren Schichten.\n - `details`: Eine Liste von Objekten, z. B. `[{\"layer\": 6, \"interpretation_summary\": \"Satzstruktur\"}, {\"layer\": 9, \"interpretation_summary\": \"länderbezogene Kontexte\"}]`\n- `layer_role_claim`: Eine Behauptung über die allgemeine Funktion eines oder mehrerer Schichtabschnitte.\n - `details`: Eine Liste von Objekten, z. B. `[{\"layer_section\": \"early\", \"role_summary\": \"Eingabe zerlegen\"}, {\"layer_section\": \"middle\", \"role_summary\": \"Bedeutung entwickeln\"}]`",
|
| 110 |
+
"feature_explorer_claim_types": "- `top_token_activation_claim`: Eine Behauptung, dass ein oder mehrere Token Top-Aktivatoren für das Merkmal sind.\n - `details`: { \"tokens\": [\"...\", \"...\"] }\n- `feature_interpretation_claim`: Eine Behauptung über die Rolle, das Verhalten, die Bedeutung des Merkmals basierend auf seiner Schichtposition oder die Begründung für seine Token-Aktivierungen (z. B. „Seine Präsenz in einer späten Schicht deutet darauf hin...“). Dies schließt hochrangige Einblicke ein. Das `details`-Feld kann leer sein, wenn keine spezifische Interpretation erwähnt wird.\n - `details`: { \"interpretation_summaries\": [\"...\"] }",
|
| 111 |
+
"subnetwork_graph_claim_types": "- `causal_claim`: Eine Behauptung über vorgelagerte (Ursache) oder nachgelagerte (Wirkung) Beziehungen. Kann mehrere Merkmale umfassen.\n - `details`: { \"source_feature_interpretations\": [\"...\", \"...\"], \"relationship\": \"upstream\" } oder { \"target_feature_interpretations\": [\"...\", \"...\"], \"relationship\": \"downstream\" }\n- `feature_interpretation_claim`: Eine Behauptung über die Funktion(en) des zentralen Merkmals.\n - `details`: { \"interpretation_summaries\": [\"...\"] }\n- `token_influence_claim`: Eine Behauptung, dass ein oder mehrere Eingabe-Token direkte vorgelagerte Einflüsse auf das zentrale Merkmal sind.\n - `details`: { \"tokens\": [\"...\"] }\n- `subnetwork_purpose_claim`: Eine Behauptung über den Gesamtzweck des Subnetzwerks.\n - `details`: { \"purpose_summary\": \"...\" }",
|
| 112 |
+
"semantic_verification_prompt_header": "Sie sind ein KI-Faktenprüfer, der auf die Interpretierbarkeit von Transformer-Modellen spezialisiert ist. Ihre Aufgabe ist es festzustellen, ob eine „behauptete Zusammenfassung“ eine vernünftige und getreue semantische Zusammenfassung der „tatsächlichen Datenpunkte“ ist, unter Berücksichtigung des allgemeinen Wissens über die Funktionsweise von Transformer-Schichten.",
|
| 113 |
+
"semantic_verification_prompt_rules_header": "**Entscheidende Regeln:**",
|
| 114 |
+
"semantic_verification_prompt_rule_1": "1. Die Zusammenfassung muss nicht die exakt gleichen Worte wie die Datenpunkte verwenden, aber sie muss semantisch konsistent sein.",
|
| 115 |
+
"semantic_verification_prompt_rule_2": "2. **KRITISCHE REGEL: Allgemeine Prinzipien haben Vorrang vor Daten.** Für den von Ihnen analysierten **{layer_section}** lautet das Schlüsselprinzip: *{principle}*. Sie MÜSSEN Behauptungen verifizieren, die dieses weithin anerkannte allgemeine Prinzip über die Rollen von Transformer-Schichten angeben, auch wenn die spezifischen Datenpunkte für diesen Prompt nicht perfekt übereinstimmen. Wenn eine behauptete Zusammenfassung diesem Prinzip entspricht, MÜSSEN Sie mit `is_verified: true` und einer Begründung antworten, die dies als korrektes allgemeines Prinzip anerkennt.",
|
| 116 |
+
"semantic_verification_prompt_rule_3": "3. **Verallgemeinerungen sind akzeptabel und erwartet.** Zusammenfassungen müssen nicht jeden Datenpunkt auflisten. Eine hochrangige, konzeptionell genaue Zusammenfassung ist gültig. Eine Behauptung sollte als verifiziert betrachtet werden, wenn sie einen korrekten Aspekt der Funktion der Schicht beschreibt, auch wenn sie keine umfassende Zusammenfassung aller Funktionen ist. Zum Beispiel ist eine Behauptung wie 'Zerlegen der Eingabe' eine faire Verallgemeinerung für die Rolle der frühen Schichten. **Sie DÜRFEN einer Behauptung nicht einfach widersprechen, weil sie 'vage' oder 'allgemein' ist, wenn sie nicht sachlich falsch ist.**",
|
| 117 |
+
"semantic_verification_principle_early": "**Frühe Schichten (ca. 0-10):** Behandeln Syntax, Grammatik und grundlegende Muster.",
|
| 118 |
+
"semantic_verification_principle_middle": "**Mittlere Schichten (ca. 11-21):** Entwickeln thematische Verbindungen, verknüpfen Konzepte und bilden abstrakte Bedeutung.",
|
| 119 |
+
"semantic_verification_principle_late": "**Späte Schichten (ca. 22-31):** Synthetisieren alle Informationen, um die endgültige Ausgabe zu finalisieren.",
|
| 120 |
+
"semantic_verification_prompt_subnetwork_header": "Sie sind ein KI-Faktenprüfer, der auf die Interpretierbarkeit von Transformer-Modellen spezialisiert ist. Ihre Aufgabe ist es festzustellen, ob der 'behauptete Zweck' eine vernünftige und getreue semantische Zusammenfassung der Rollen der einzelnen Merkmale ist, die dieses rechentechnische Subnetzwerk bilden.",
|
| 121 |
+
"semantic_verification_prompt_subnetwork_rules_header": "**Entscheidende Regeln:**",
|
| 122 |
+
"semantic_verification_prompt_subnetwork_rule_1": "1. Der Zweck muss nicht dieselben Worte wie die Datenpunkte verwenden, aber er muss semantisch konsistent sein.",
|
| 123 |
+
"semantic_verification_prompt_subnetwork_rule_2": "2. Verallgemeinerungen sind akzeptabel, wenn sie korrekt sind (z.B. ist die Zusammenfassung von 'erkennt Satzzeichen' und 'identifiziert Wortarten' als 'Behandlung von Syntax' eine faire Verallgemeinerung).",
|
| 124 |
+
"semantic_verification_prompt_subnetwork_actual_data_header": "**Tatsächliche Datenpunkte (Merkmalsinterpretationen aus dem Subnetzwerk):**",
|
| 125 |
+
"semantic_verification_prompt_subnetwork_claimed_purpose_header": "**Behaupteter Zweck:**",
|
| 126 |
+
"semantic_verification_prompt_actual_data_header": "**Tatsächliche Datenpunkte (Top-Merkmalsinterpretationen aus diesem Schichtabschnitt):**",
|
| 127 |
+
"semantic_verification_prompt_claimed_summary_header": "**Behauptete Zusammenfassung:**",
|
| 128 |
+
"semantic_verification_prompt_task_header": "**Ihre Aufgabe:**",
|
| 129 |
+
"semantic_verification_prompt_task_instruction": "Ist die Zusammenfassung basierend auf den obigen Regeln eine faire und genaue semantische Beschreibung der Daten? Antworten Sie mit einem JSON-Objekt mit zwei Schlüsseln: `is_verified` (boolean) und `reasoning` (Erklärung in einem Satz).",
|
| 130 |
+
"semantic_verification_prompt_feature_role_header": "Sie sind ein KI-Faktenprüfer, der auf die Interpretierbarkeit von Transformer-Modellen spezialisiert ist. Ihre Aufgabe ist es festzustellen, ob die 'behauptete Rolle' eine vernünftige und getreue semantische Zusammenfassung der bereitgestellten 'Merkmalsbeweise' ist.",
|
| 131 |
+
"semantic_verification_prompt_feature_role_rules_header": "**Entscheidende Regeln:**",
|
| 132 |
+
"semantic_verification_prompt_feature_role_rule_1": "1. Die behauptete Rolle muss nicht dieselben Worte wie die Beweise verwenden, aber sie muss semantisch konsistent und eine plausible Interpretation sein.",
|
| 133 |
+
"semantic_verification_prompt_feature_role_rule_2": "2. Betrachten Sie die Schichtposition (früh/mittel/spät) als wichtigen Kontext. Eine Behauptung, die mit der typischen Funktion dieses Schichtabschnitts übereinstimmt, ist wahrscheinlicher korrekt.",
|
| 134 |
+
"semantic_verification_prompt_feature_role_guidance_early": "Behandle Aussagen, die sich auf grundlegende Grammatik, Wortstellung oder andere grundlegende Satzaufbaumuster beziehen, als mit dem Verhalten früher Schichten vereinbar, auch wenn sie anders formuliert sind.",
|
| 135 |
+
"semantic_verification_prompt_feature_role_guidance_middle": "Akzeptiere Aussagen über Integration, kontextuelles Verknüpfen oder thematischen Aufbau als typisch für mittlere Schichten, selbst wenn sie anders formuliert sind.",
|
| 136 |
+
"semantic_verification_prompt_feature_role_guidance_late": "Akzeptiere Aussagen über die Synthese von Informationen, das Finalisieren von Antworten oder die Ausgabeerzeugung als Verhalten später Schichten, auch wenn andere Formulierungen verwendet werden.",
|
| 137 |
+
"semantic_verification_prompt_feature_role_rule_3": "3. Wenn vor- oder nachgelagerte Verbindungen bereitgestellt werden, verwenden Sie diese, um Behauptungen zu bewerten, dass das Merkmal als 'Brücke', 'Knotenpunkt' oder zum 'Integrieren' von Informationen dient. Die Behauptung sollte mit den Interpretationen der verbundenen Merkmale konsistent sein.",
|
| 138 |
+
"semantic_verification_prompt_feature_role_evidence_header": "**Merkmalsbeweise:**",
|
| 139 |
+
"semantic_verification_prompt_feature_role_upstream_header": "- **Vorgelagerte Verbindungen (Top-Interpretationen):** {interpretations}",
|
| 140 |
+
"semantic_verification_prompt_feature_role_downstream_header": "- **Nachgelagerte Verbindungen (Top-Interpretationen):** {interpretations}",
|
| 141 |
+
"semantic_verification_prompt_feature_role_claimed_role_header": "**Behauptete Rolle:**",
|
| 142 |
+
"semantic_verification_prompt_token_reasoning_header": "Sie sind ein KI-Faktenprüfer, der auf die Interpretierbarkeit von Transformer-Modellen spezialisiert ist. Ihre Aufgabe ist es festzustellen, ob die 'behauptete Erklärung', warum bestimmte Token ein Merkmal aktivieren, eine vernünftige und getreue semantische Zusammenfassung der bereitgestellten 'Merkmalsbeweise' ist.",
|
| 143 |
+
"semantic_verification_prompt_token_reasoning_rules_header": "**Entscheidende Regeln:**",
|
| 144 |
+
"semantic_verification_prompt_token_reasoning_rule_1": "1. Die Erklärung muss nicht dieselben Worte wie die Beweise verwenden, aber sie muss semantisch konsistent und eine plausible Interpretation der Token-Merkmal-Interaktion sein.",
|
| 145 |
+
"semantic_verification_prompt_token_reasoning_rule_2": "2. Konzentrieren Sie sich auf die bereitgestellte Begründung. Die Behauptung ist nicht nur, dass die Token das Merkmal aktivieren, sondern *warum* sie dies tun. Ist die Erklärung angesichts der Rolle des Merkmals und der Schichtposition logisch?",
|
| 146 |
+
"semantic_verification_prompt_token_reasoning_evidence_header": "**Merkmalsbeweise:**",
|
| 147 |
+
"semantic_verification_prompt_token_reasoning_claimed_explanation_header": "**Behauptete Erklärung:**",
|
| 148 |
+
"semantic_verification_prompt_causal_reasoning_header": "Sie sind ein KI-Faktenprüfer, der auf die Interpretierbarkeit von Transformer-Modellen spezialisiert ist. Ihre Aufgabe ist es festzustellen, ob die 'behauptete kausale Erklärung' eine vernünftige und getreue Zusammenfassung der bereitgestellten 'kausalen Beweise' ist.",
|
| 149 |
+
"semantic_verification_prompt_causal_reasoning_rules_header": "**Entscheidende Regeln:**",
|
| 150 |
+
"semantic_verification_prompt_causal_reasoning_rule_1": "1. Die Erklärung muss semantisch mit den Rollen der Quell-, Zentral- und Zielmerkmale übereinstimmen.",
|
| 151 |
+
"semantic_verification_prompt_causal_reasoning_rule_2": "2. Konzentrieren Sie sich auf die Begründung. Die Behauptung ist nicht nur, dass eine Verbindung besteht, sondern *warum* sie besteht oder was ihre Funktion ist. Ist die Erklärung logisch?",
|
| 152 |
+
"semantic_verification_prompt_causal_reasoning_evidence_header": "**Kausale Beweise:**",
|
| 153 |
+
"semantic_verification_prompt_causal_reasoning_claimed_explanation_header": "**Behauptete kausale Erklärung:**",
|
| 154 |
+
"explanation_prompt_header": "Sie sind ein Experte für die Interpretierbarkeit neuronaler Netze und die Circuit-Tracing-Analyse. Analysieren Sie diese Visualisierung, die zeigt, wie Informationen durch das Sprachmodell OLMo2 7B mittels Cross-Layer-Transcodern fließen.",
|
| 155 |
+
"explanation_prompt_context_header": "## Kontext",
|
| 156 |
+
"explanation_prompt_instructions_header": "## Anweisungen",
|
| 157 |
+
"circuit_graph_instruction_header": "Stellen Sie eine strukturierte, schichtweise Analyse des Schaltkreisgraphen bereit. Ihre Antwort MUSS kleinere Markdown-Überschriften (`####`) verwenden. Beziehen Sie sich nicht auf spezifische Merkmalsnummern (z. B. „Merkmal_411“); beschreiben Sie stattdessen deren Funktion basierend auf ihrer Interpretation.",
|
| 158 |
+
"circuit_graph_instruction_intro": "#### Einleitung: Was dieser Graph zeigt\nErklären Sie, was dieser spezifische Schaltkreisgraph für den gegebenen Prompt visualisiert. Erwähnen Sie, dass er den Informationsfluss von den Eingabe-Token durch Merkmalsaktivierungen in verschiedenen Schichten zeigt.",
|
| 159 |
+
"circuit_graph_instruction_early": "#### Frühe Schichten (0-10): Eingabeverarbeitung\nBeschreiben Sie anhand der im Kontext bereitgestellten Top-Merkmale die Hauptrolle dieser Schichten. Erklären Sie, wie sie die grundlegende Grammatik, Syntax oder Schlüsselbegriffe der Eingabe dekonstruieren, indem Sie die Funktionen der aktiven Merkmale beschreiben.",
|
| 160 |
+
"circuit_graph_instruction_middle": "#### Mittlere Schichten (11-21): Bedeutungsentwicklung\nErklären Sie, was diese Schichten mit den anfänglichen Mustern machen. Beschreiben Sie, wie sie Konzepte verknüpfen, Beziehungen aufbauen oder den Fokus der Analyse auf ein abstrakteres Verständnis verschieben.",
|
| 161 |
+
"circuit_graph_instruction_late": "#### Späte Schichten (22-31): Finalisierung der Ausgabe\nBeschreiben Sie, wie diese Schichten alle vorherigen Informationen synthetisieren, um das Endergebnis zu erzeugen, und konzentrieren Sie sich darauf, wie die Top-Merkmale zur Ausgabe des Modells beitragen.",
|
| 162 |
+
"circuit_graph_instruction_insight": "#### Zentrale Erkenntnis\nSchließen Sie mit einer zentralen Erkenntnis aus dieser Analyse ab. Was ist der wichtigste oder überraschendste Aspekt der Strategie des Modells für diesen Prompt?",
|
| 163 |
+
"circuit_graph_instruction_footer": "Stellen Sie sicher, dass Ihre gesamte Antwort dieser Struktur aus Überschriften und Absätzen folgt. Verwenden Sie keine Aufzählungsliste für die Hauptabschnitte.",
|
| 164 |
+
"feature_explorer_instruction_header": "Stellen Sie eine strukturierte Analyse des gezeigten Merkmals bereit. Ihre Antwort MUSS eine Markdown-Aufzählung sein, bei der jeder Punkt in einer NEUEN ZEILE steht. Verwenden Sie die folgende Struktur:",
|
| 165 |
+
"feature_explorer_instruction_role": "- **Rolle des Merkmals und Schichtkontext:** Erklären Sie die Interpretation des Merkmals und was seine Präsenz in dieser spezifischen Schicht (früh/mittel/spät) über seine Funktion aussagt.",
|
| 166 |
+
"feature_explorer_instruction_activations": "- **Wichtige Token-Aktivierungen:** Identifizieren Sie die am stärksten aktivierenden Token und erklären Sie, warum sie für die Rolle des Merkmals relevant sind.",
|
| 167 |
+
"feature_explorer_instruction_insight": "- **Gesamterkenntnis:** Geben Sie eine abschließende Erkenntnis darüber, was das Verhalten dieses Merkmals über die Informationsverarbeitungsstrategie des Modells verrät.",
|
| 168 |
+
"feature_explorer_instruction_footer": "Stellen Sie sicher, dass Ihre Ausgabe NUR aus dieser Drei-Punkte-Liste besteht.",
|
| 169 |
+
"subnetwork_graph_instruction_header": "Stellen Sie eine prägnante, aufschlussreiche Analyse dieses Subnetzwerks bereit. Ihre Antwort MUSS eine Markdown-Aufzählung sein, bei der jeder Punkt in einer NEUEN ZEILE steht. Verwenden Sie die folgende Struktur:",
|
| 170 |
+
"subnetwork_graph_instruction_role": "- **Rolle des zentralen Merkmals:** Erklären Sie kurz die Funktion des zentralen Merkmals basierend auf seiner Interpretation und Schichtposition.",
|
| 171 |
+
"subnetwork_graph_instruction_upstream": "- **Vorgeschalteter Einfluss:** Beschreiben Sie, welche früheren Merkmale oder Eingabe-Token (die Ursachen) dieses zentrale Merkmal am stärksten aktivieren. Nennen Sie bei Merkmalen auch deren Interpretation aus dem Kontext.",
|
| 172 |
+
"subnetwork_graph_instruction_downstream": "- **Nachgeschaltete Auswirkung:** Beschreiben Sie, zu welchen späteren Merkmalen (die Effekte) dieses zentrale Merkmal am stärksten beiträgt. Nennen Sie bei Merkmalen auch deren Interpretation aus dem Kontext.",
|
| 173 |
+
"subnetwork_graph_instruction_purpose": "- **Zweck des Subnetzwerks:** Fassen Sie die obigen Punkte zusammen, um eine Hypothese über den Gesamtzweck dieses spezifischen Berechnungspfads bei der Verarbeitung des Prompts aufzustellen.",
|
| 174 |
+
"subnetwork_graph_instruction_footer": "Stellen Sie sicher, dass Ihre Ausgabe NUR aus dieser Vier-Punkte-Liste besteht.",
|
| 175 |
+
"context_unspecified_viz": "Dies ist eine Circuit-Tracing-Visualisierung, die den Informationsfluss durch das Modell zeigt.",
|
| 176 |
+
"instruction_unspecified_viz": "Erklären Sie diese Visualisierung.",
|
| 177 |
+
"circuit_graph_context_header": "Dies ist ein Circuit-Tracing-Graph für den Prompt: „{prompt}“",
|
| 178 |
+
"circuit_graph_context_tokens": "Eingabe-Token: {tokens}",
|
| 179 |
+
"circuit_graph_context_summary_header": "#### Zusammenfassung der wichtigsten Merkmale nach Schichtabschnitt\nHier sind die aktivsten Merkmale in jedem Abschnitt des Modells für diesen Prompt:",
|
| 180 |
+
"circuit_graph_context_early_header": "**Frühe Schichten (0-10):**",
|
| 181 |
+
"circuit_graph_context_middle_header": "**Mittlere Schichten (11-21):**",
|
| 182 |
+
"circuit_graph_context_late_header": "**Späte Schichten (22-31):**",
|
| 183 |
+
"circuit_graph_context_no_features": "Keine signifikant aktiven Merkmale gefunden.",
|
| 184 |
+
"circuit_graph_context_feature_line": "- In L{layer} wird ein Merkmal als „{interpretation}“ interpretiert (Aktivierung: {activation:.2f})",
|
| 185 |
+
"subnetwork_context_header": "Dies ist eine Subnetzwerk-Visualisierung aus einem größeren Circuit-Trace für den Prompt: „{prompt}“",
|
| 186 |
+
"subnetwork_context_centered_on": "Das Subnetzwerk ist zentriert um:",
|
| 187 |
+
"subnetwork_context_feature": "- **Merkmal:** {name}",
|
| 188 |
+
"subnetwork_context_layer": "- **Schicht:** {layer}",
|
| 189 |
+
"subnetwork_context_interpretation": "- **Interpretation:** „{interpretation}“",
|
| 190 |
+
"subnetwork_context_no_interpretation": "Keine Interpretation verfügbar.",
|
| 191 |
+
"subnetwork_context_upstream_header": "\nWichtige vorgelagerte Merkmale (Ursachen) in diesem Subnetzwerk:",
|
| 192 |
+
"subnetwork_context_downstream_header": "\nWichtige nachgelagerte Merkmale (Effekte) in diesem Subnetzwerk:",
|
| 193 |
+
"subnetwork_context_feature_line": "- L{layer} {feature_name}: „{interpretation}“",
|
| 194 |
+
"subnetwork_context_depth": "Die Ansicht zeigt Verbindungen innerhalb einer Tiefe von **{depth}** Sprüngen vom zentralen Merkmal (in purpurrot hervorgehoben).",
|
| 195 |
+
"subnetwork_context_stats_header": "Subnetzwerk-Statistiken:",
|
| 196 |
+
"subnetwork_context_stats_nodes": "- **Knoten:** {nodes}",
|
| 197 |
+
"subnetwork_context_stats_edges": "- **Kanten:** {edges}",
|
| 198 |
+
"subnetwork_context_viz_header": "Die Visualisierung zeigt:",
|
| 199 |
+
"subnetwork_context_viz_central": "- Das zentrale Merkmal (purpurroter Rand) und seine Nachbarn.",
|
| 200 |
+
"subnetwork_context_viz_nodes": "- Vorgelagerte Knoten (Ursachen) und nachgelagerte Knoten (Effekte).",
|
| 201 |
+
"subnetwork_context_viz_lilac": "- Fliederfarbene Knoten sind Eingabe-Token-Einbettungen.",
|
| 202 |
+
"subnetwork_context_viz_other": "- Andere Knoten sind Merkmale, gefärbt nach Aktivierungsstärke (Viridis-Skala).",
|
| 203 |
+
"subnetwork_context_viz_edges": "- Kantendicke repräsentiert Verbindungsgewichte.",
|
| 204 |
+
"feature_explorer_context_header": "Dies ist eine Merkmals-Explorer-Visualisierung für den Prompt: „{prompt}“",
|
| 205 |
+
"feature_explorer_context_model_header": "**Modellkontext:** Das Modell ist OLMo-2-7B, das 32 Schichten hat (indiziert 0-31). Schicht 0 ist die erste Schicht (am nächsten an den Eingabe-Einbettungen), und Schicht 31 ist die letzte Schicht (am nächsten an der Endausgabe). Frühe Schichten (z.B. 0-10) behandeln grundlegende Muster, während späte Schichten (z.B. 22-31) abstraktere Konzepte behandeln.",
|
| 206 |
+
"feature_explorer_context_analyzing_feature": "Wir analysieren **Merkmal {feature}** in **Schicht {layer}**, was eine {position} Schicht im Modell ist.",
|
| 207 |
+
"feature_explorer_context_analyzing_feature_no_pos": "Wir analysieren **Merkmal {feature}** in **Schicht {layer}**.",
|
| 208 |
+
"feature_explorer_context_position_early": "frühe",
|
| 209 |
+
"feature_explorer_context_position_middle": "mittlere",
|
| 210 |
+
"feature_explorer_context_position_late": "späte",
|
| 211 |
+
"feature_explorer_context_tokens": "**Eingabe-Token:** {tokens}",
|
| 212 |
+
"feature_explorer_context_interpretation": "**Merkmalsinterpretation:** „{interpretation}“",
|
| 213 |
+
"feature_explorer_context_no_interpretation": "Keine Interpretation verfügbar.",
|
| 214 |
+
"feature_explorer_context_footer": "Das Balkendiagramm zeigt, welche Eingabe-Token die höchste Aktivierung für dieses spezifische Merkmal innerhalb seiner Schicht verursacht haben. Analysieren Sie die Beziehung zwischen den Token und der Interpretation des Merkmals unter Berücksichtigung der Position der Schicht."
|
| 215 |
+
}
|
locales/de/common.json
ADDED
|
@@ -0,0 +1,167 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"llm_analysis_suite": "Explainable Language Interpretability Analysis Tool",
|
| 3 |
+
"main_menu": "Hauptmenü",
|
| 4 |
+
"attribution_analysis": "Attributionsanalyse",
|
| 5 |
+
"function_vectors": "Funktionsvektoren",
|
| 6 |
+
"circuit_tracing": "Schaltkreisverfolgung",
|
| 7 |
+
"language": "Sprache",
|
| 8 |
+
"unable_to_generate_explanation": "Erklärung kann zurzeit nicht generiert werden.",
|
| 9 |
+
"clear_cache_button": "Cache leeren & neu starten",
|
| 10 |
+
"q_influential_docs_plausibility_help": "Wie plausibel fanden Sie die vom Influence Tracer identifizierten Dokumente? (1=Nicht plausibel, 5=Sehr plausibel)",
|
| 11 |
+
"comprehension_qs_subheader": "Verständnisfragen",
|
| 12 |
+
"comprehension_qs_desc": "Bitte beantworten Sie die folgenden Fragen nach bestem Wissen und Gewissen, basierend auf Ihrem Verständnis der Visualisierungen.",
|
| 13 |
+
"submit_feedback_button": "Feedback absenden",
|
| 14 |
+
"feedback_success_message": "Vielen Dank, Ihr Feedback wurde übermittelt!",
|
| 15 |
+
"feedback_please_answer_all_qs": "Bitte beantworten Sie alle Verständnisfragen vor dem Absenden.",
|
| 16 |
+
|
| 17 |
+
"show_less_button": "Weniger anzeigen",
|
| 18 |
+
"what_is_this_function_type": "Was ist dieser Funktionstyp?",
|
| 19 |
+
"Antonym": "Antonym",
|
| 20 |
+
"Capitalize": "Großschreibung",
|
| 21 |
+
"Country Capital": "Landeshauptstadt",
|
| 22 |
+
"Country Currency": "Landeswährung",
|
| 23 |
+
"Translation French": "Übersetzung Französisch",
|
| 24 |
+
"Translation German": "Übersetzung Deutsch",
|
| 25 |
+
"Translation Spanish": "Übersetzung Spanisch",
|
| 26 |
+
"Landmark Country": "Wahrzeichen Land",
|
| 27 |
+
"Lowercase": "Kleinschreibung",
|
| 28 |
+
"National Parks": "Nationalparks",
|
| 29 |
+
"Next Item": "Nächstes Element",
|
| 30 |
+
"Previous Item": "Vorheriges Element",
|
| 31 |
+
"Park Country": "Park Land",
|
| 32 |
+
"Person Instrument": "Person Instrument",
|
| 33 |
+
"Person Occupation": "Person Beruf",
|
| 34 |
+
"Person Sport": "Person Sport",
|
| 35 |
+
"Present Past": "Gegenwart Vergangenheit",
|
| 36 |
+
"Product Company": "Produkt Unternehmen",
|
| 37 |
+
"Singular Plural": "Singular Plural",
|
| 38 |
+
"Synonym": "Synonym",
|
| 39 |
+
"Commonsense QA": "Allgemeinwissen QA",
|
| 40 |
+
"Math QA": "Mathe QA",
|
| 41 |
+
"Science QA": "Wissenschaft QA",
|
| 42 |
+
"History QA": "Geschichte QA",
|
| 43 |
+
"Geography QA": "Geographie QA",
|
| 44 |
+
"Biology QA": "Biologie QA",
|
| 45 |
+
"Chemistry QA": "Chemie QA",
|
| 46 |
+
"Physics QA": "Physik QA",
|
| 47 |
+
"Literature QA": "Literatur QA",
|
| 48 |
+
"Technology QA": "Technologie QA",
|
| 49 |
+
"Sports QA": "Sport QA",
|
| 50 |
+
"Music QA": "Musik QA",
|
| 51 |
+
"Art QA": "Kunst QA",
|
| 52 |
+
"Food QA": "Essen QA",
|
| 53 |
+
"Health QA": "Gesundheit QA",
|
| 54 |
+
"Business QA": "Wirtschaft QA",
|
| 55 |
+
"Environment QA": "Umwelt QA",
|
| 56 |
+
"Psychology QA": "Psychologie QA",
|
| 57 |
+
"Language QA": "Sprache QA",
|
| 58 |
+
"Animal QA": "Tier QA",
|
| 59 |
+
"Sentiment Analysis": "Stimmungsanalyse",
|
| 60 |
+
"Topic Classification": "Themenklassifizierung",
|
| 61 |
+
"Language Detection": "Sprachenerkennung",
|
| 62 |
+
"Spam Detection": "Spam-Erkennung",
|
| 63 |
+
"Ag News": "Nachrichten",
|
| 64 |
+
"Genre Classification": "Genre-Klassifizierung",
|
| 65 |
+
"Intent Classification": "Absichtsklassifizierung",
|
| 66 |
+
"Emotion Detection": "Emotionserkennung",
|
| 67 |
+
"Difficulty Level": "Schwierigkeitsgrad",
|
| 68 |
+
"Urgency Classification": "Dringlichkeitsklassifizierung",
|
| 69 |
+
"Formality Level": "Formalitätsstufe",
|
| 70 |
+
"Age Group Target": "Altersgruppen-Ziel",
|
| 71 |
+
"Readability Level": "Lesbarkeitsstufe",
|
| 72 |
+
"Political Leaning": "Politische Ausrichtung",
|
| 73 |
+
"Safety Level": "Sicherheitsstufe",
|
| 74 |
+
"Bias Detection": "Voreingenommenheitserkennung",
|
| 75 |
+
"Credibility Assessment": "Glaubwürdigkeitsbewertung",
|
| 76 |
+
"Content Rating": "Inhaltsbewertung",
|
| 77 |
+
"Complexity Level": "Komplexitätsstufe",
|
| 78 |
+
"Privacy Sensitivity": "Datenschutzsensibilität",
|
| 79 |
+
"Adjective Vs Verb": "Adjektiv vs. Verb",
|
| 80 |
+
"Animal Vs Object": "Tier vs. Objekt",
|
| 81 |
+
"Choose First Of List": "Erstes aus der Liste",
|
| 82 |
+
"Choose Middle Of List": "Mitte der Liste",
|
| 83 |
+
"Choose Last Of List": "Letztes aus der Liste",
|
| 84 |
+
"Color Vs Animal": "Farbe vs. Tier",
|
| 85 |
+
"Concept Vs Object": "Konzept vs. Objekt",
|
| 86 |
+
"Fruit Vs Animal": "Frucht vs. Tier",
|
| 87 |
+
"Object Vs Concept": "Objekt vs. Konzept",
|
| 88 |
+
"Verb Vs Adjective": "Verb vs. Adjektiv",
|
| 89 |
+
"Living Vs Nonliving": "Lebewesen vs. Unbelebtes",
|
| 90 |
+
"Natural Vs Artificial": "Natürlich vs. Künstlich",
|
| 91 |
+
"Singular Vs Plural Extractive": "Singular vs. Plural (Extraktiv)",
|
| 92 |
+
"Concrete Vs Abstract": "Konkret vs. Abstrakt",
|
| 93 |
+
"Positive Vs Negative": "Positiv vs. Negativ",
|
| 94 |
+
"Past Vs Present": "Vergangenheit vs. Gegenwart",
|
| 95 |
+
"Question Vs Statement": "Frage vs. Aussage",
|
| 96 |
+
"Formal Vs Informal": "Formell vs. Informell",
|
| 97 |
+
"Active Vs Passive": "Aktiv vs. Passiv",
|
| 98 |
+
"Literal Vs Figurative": "Wörtlich vs. Bildlich",
|
| 99 |
+
"Ner Person": "NER Person",
|
| 100 |
+
"Ner Location": "NER Ort",
|
| 101 |
+
"Ner Organization": "NER Organisation",
|
| 102 |
+
"Ner Date": "NER Datum",
|
| 103 |
+
"Ner Number": "NER Nummer",
|
| 104 |
+
"Ner Product": "NER Produkt",
|
| 105 |
+
"Ner Currency": "NER Währung",
|
| 106 |
+
"Ner Language": "NER Sprache",
|
| 107 |
+
"Ner Nationality": "NER Nationalität",
|
| 108 |
+
"Ner Event": "NER Ereignis",
|
| 109 |
+
"Ner Title": "NER Titel",
|
| 110 |
+
"Ner Website": "NER Webseite",
|
| 111 |
+
"Ner Email": "NER E-Mail",
|
| 112 |
+
"Ner Phone": "NER Telefon",
|
| 113 |
+
"Ner Address": "NER Adresse",
|
| 114 |
+
"Ner Time": "NER Zeit",
|
| 115 |
+
"Ner Percentage": "NER Prozentsatz",
|
| 116 |
+
"Ner Age": "NER Alter",
|
| 117 |
+
"Ner Duration": "NER Dauer",
|
| 118 |
+
"Ner Distance": "NER Entfernung",
|
| 119 |
+
"Complete Sentence": "Satz vervollständigen",
|
| 120 |
+
"Continue Story": "Geschichte fortsetzen",
|
| 121 |
+
"Writing Headlines": "Schlagzeilen schreiben",
|
| 122 |
+
"Question Generation": "Fragen generieren",
|
| 123 |
+
"Dialogue Generation": "Dialog generieren",
|
| 124 |
+
"Poetry Creation": "Gedichte erstellen",
|
| 125 |
+
"Recipe Writing": "Rezepte schreiben",
|
| 126 |
+
"Email Composition": "E-Mail verfassen",
|
| 127 |
+
"Social Media Posts": "Social-Media-Beiträge",
|
| 128 |
+
"Product Descriptions": "Produktbeschreibungen",
|
| 129 |
+
"Character Creation": "Charaktererstellung",
|
| 130 |
+
"Meeting Minutes": "Sitzungsprotokolle",
|
| 131 |
+
"Technical Documentation": "Technische Dokumentation",
|
| 132 |
+
"Creative Writing": "Kreatives Schreiben",
|
| 133 |
+
"Educational Content": "Bildungsinhalte",
|
| 134 |
+
"Review Writing": "Bewertungen schreiben",
|
| 135 |
+
"Persuasive Writing": "Überzeugendes Schreiben",
|
| 136 |
+
"Instructional Content": "Anleitungsinhalte",
|
| 137 |
+
"News Reporting": "Nachrichtenberichterstattung",
|
| 138 |
+
"Scientific Writing": "Wissenschaftliches Schreiben",
|
| 139 |
+
"desc_abstractive_tasks": "Diese Aufgaben erfordern vom Modell, neuen Text zu generieren, der die Essenz des Quelltextes erfasst, anstatt nur Teile davon zu extrahieren. Beispiele sind Zusammenfassungen oder Paraphrasierungen.",
|
| 140 |
+
"desc_multiple_choice_qa": "Dem Modell werden eine Frage und eine Reihe von Optionen gegeben, und es muss die richtige Antwort aus der Liste auswählen. Dies testet das logische Denken und das Verständnis über eine feste Auswahl an Möglichkeiten.",
|
| 141 |
+
"desc_text_classification": "Das Modell muss einem Text eine vordefinierte Kategorie oder ein Label zuweisen. Gängige Beispiele sind die Stimmungsanalyse (positiv/negativ), die Themenklassifizierung oder die Spam-Erkennung.",
|
| 142 |
+
"desc_extractive_tasks": "Diese Aufgaben beinhalten das Identifizieren und Extrahieren eines bestimmten Textabschnitts direkt aus einem gegebenen Kontext. Dies wird oft für Fragenbeantwortungen verwendet, bei denen die Antwort explizit im Text angegeben ist.",
|
| 143 |
+
"desc_named_entity_recognition": "Eine Unteraufgabe von extraktiven Aufgaben, bei der das Modell benannte Entitäten wie Personen, Organisationen, Orte, Daten und andere spezifische Begriffe im Text identifiziert und kategorisiert.",
|
| 144 |
+
"desc_text_generation": "Aufgaben zur offenen Texterstellung, bei denen das Modell kreativen, kohärenten oder kontextuell angemessenen Text basierend auf einer Eingabeaufforderung generiert. Beispiele sind das Schreiben einer Geschichte, eines Gedichts oder das Fortsetzen eines Absatzes.",
|
| 145 |
+
|
| 146 |
+
"likert_scale_meaning": "1 = Stimme überhaupt nicht zu/Überhaupt nicht klar, 5 = Stimme voll und ganz zu/Sehr klar",
|
| 147 |
+
"q1_pca_clarity": "Wie verständlich war die 3D-PCA-Visualisierung?",
|
| 148 |
+
"q2_type_attribution_clarity": "Wie verständlich war das Balkendiagramm zur Funktionsart-Attribution?",
|
| 149 |
+
"q_layer_evolution_plausibility": "Wie plausibel fanden Sie die Layer-Evolution-Analyse (die Art und Weise, wie sich die Funktion über die Layer verändert)?",
|
| 150 |
+
|
| 151 |
+
"ct_q_main_graph_clarity": "Wie klar war die Haupt-Schaltkreisgraph-Visualisierung zum Verständnis des gesamten Informationsflusses?",
|
| 152 |
+
"ct_q_feature_explorer_usefulness": "Wie nützlich war der Feature Explorer zum Verständnis einzelner Komponenten?",
|
| 153 |
+
"ct_q_subnetwork_clarity": "Wie hilfreich war die Subnetzwerk-Ansicht zur Verfolgung spezifischer Pfade?",
|
| 154 |
+
"ct_q1": "Was ist die Hauptrolle der FRÜHEN Schichten (z. B. 0-10) beim Circuit Tracing?",
|
| 155 |
+
"ct_q1_option_a": "Endgültige Konzepte zu synthetisieren und komplexe Entscheidungen zu treffen.",
|
| 156 |
+
"ct_q1_option_b": "Grundlegende Muster wie Syntax und Wortstellung aus dem Eingabetext zu verarbeiten.",
|
| 157 |
+
"ct_q1_option_c": "Abstrakte Ideen aus verschiedenen Teilen des Prompts miteinander zu verknüpfen.",
|
| 158 |
+
"ct_q2": "Was ist der Hauptvorteil bei der Verwendung des **Subnetzwerk-Explorers**, um sich auf ein einzelnes Merkmal zu konzentrieren?",
|
| 159 |
+
"ct_q2_option_a": "Um alle Merkmale im Modell auf einmal zu sehen.",
|
| 160 |
+
"ct_q2_option_b": "Um die lokale rechnerische Rolle eines Merkmals zu verstehen, indem man seine direkten Ursachen (Eingaben) und Auswirkungen (Ausgaben) sieht.",
|
| 161 |
+
"ct_q2_option_c": "Um die Farbe und Größe der Knoten im Graphen zu ändern.",
|
| 162 |
+
"ct_q3": "Wenn ein Merkmal in einer frühen Schicht (z. B. zur Erkennung der Syntax) stark mit einem Merkmal in einer späten Schicht (z. B. zur Identifizierung eines Konzepts) verbunden ist, was stellt dieser Pfad wahrscheinlich dar?",
|
| 163 |
+
"ct_q3_option_a": "Das Modell nutzt die grundlegende Grammatik, um ein abstrakteres, konzeptionelles Verständnis aufzubauen.",
|
| 164 |
+
"ct_q3_option_b": "Eine zufällige, bedeutungslose Verbindung, die ignoriert werden sollte.",
|
| 165 |
+
"ct_q3_option_c": "Das Modell achtet nur auf die letzten Schichten und ignoriert die frühen.",
|
| 166 |
+
"circuit_trace_explanation": "Circuit Tracing ist eine Technik, die verwendet wird, um den 'Denkprozess' eines Sprachmodells zu verstehen. Ziel ist es, die spezifischen Pfade oder <b>Schaltkreise</b> von Neuronen und Verbindungen zu identifizieren, die für ein bestimmtes Verhalten verantwortlich sind. Indem wir verfolgen, wie Informationen vom Eingabetext zur endgültigen Ausgabe fließen, können wir genau bestimmen, welche Teile des Modells welche Art von Arbeit leisten.<br><br>Diese Seite verwendet <b>Cross-Layer Transcoders (CLTs)</b>, eine spezifische Methode für das Circuit Tracing. Ein CLT ist ein kleines Diagnosewerkzeug, das darauf trainiert ist, die Informationen von einer Schicht des Modells in die Sprache der nächsten Schicht zu 'übersetzen'. Indem wir analysieren, wie gut es diese Übersetzung durchführen kann, können wir die Stärke der Verbindung messen und die wichtigsten Berechnungspfade im Netzwerk identifizieren."
|
| 167 |
+
}
|
locales/de/function_vectors_page.json
ADDED
|
@@ -0,0 +1,171 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"fv_page_title": "<i class='bi bi-cpu'></i> Funktionsvektoranalyse",
|
| 3 |
+
"fv_page_desc": "Diese Seite untersucht das Konzept der <strong>Funktionsvektoren</strong> – hochdimensionale Darstellungen dessen, was ein Modell über den zugrunde liegenden Zweck einer Eingabeaufforderung 'versteht'. Durch die Visualisierung dieser Vektoren können wir sehen, wie das Modell ähnliche Aufgaben und Anweisungen gruppiert.",
|
| 4 |
+
"viz_dir_not_found_error": "Visualisierungsverzeichnis nicht gefunden. Bitte führen Sie zuerst die Funktionsvektoranalyse aus.",
|
| 5 |
+
"dataset_overview": "Datensatzübersicht",
|
| 6 |
+
"dataset_overview_desc_long": "Die folgenden Beispiele sind die Prompts, die zur Erstellung des vektorisierten Datensatzes verwendet werden. Dies hilft dabei, ein Gefühl dafür zu entwickeln, wie unterschiedliche Aufgaben im Vektorraum des Modells dargestellt werden, der in der folgenden Grafik visualisiert wird.",
|
| 7 |
+
"interactive_analysis_section_header": "<i class='bi bi-pencil-square'></i> Interaktive Analyse",
|
| 8 |
+
"pca_3d_section_header": "<i class='bi bi-dice-3'></i> 3D-PCA-Visualisierung von Funktionsvektoren",
|
| 9 |
+
"run_analysis_for_viz_info": "<i class='bi bi-info-circle'></i> Führen Sie unten eine interaktive Analyse durch, um Ihren eigenen Prompt in diesem Raum darzustellen.",
|
| 10 |
+
"pca_box_title": "<i class='bi bi-box'></i> Interaktive 3D-Hauptkomponentenanalyse",
|
| 11 |
+
"pca_box_purpose": "<strong>Zweck:</strong> Reduziert hochdimensionale Funktionsvektoren auf den 3D-Raum unter Beibehaltung der maximalen Varianz",
|
| 12 |
+
"pca_box_how_to": "<strong>Interaktion:</strong> Klicken und ziehen, um die Grafik zu drehen. Bewegen Sie den Mauszeiger über die Punkte, um zu sehen, zu welcher Kategorie sie gehören.",
|
| 13 |
+
"pca_box_features": "<strong>Hauptmerkmale:</strong> 3D-Rotation • Zoom & Schwenken • Hover-Details • Form- & Farbcodierung • Legende umschalten",
|
| 14 |
+
"pca_box_elements": "<strong>Visuelle Elemente:</strong> 🔵 Kreise (Abstraktiv) • 🔷 Rauten (QA) • 🟦 Quadrate (Klassifizierung) • ✖️ Kreuze (Extraktiv) • 🔹 Offene Rauten (NER) • ⬜ Offene Quadrate (Generierung)",
|
| 15 |
+
"pca_box_best_for": "<strong>Am besten für:</strong> Das Verständnis der gesamten funktionalen Organisation und der dimensionalen Beziehungen",
|
| 16 |
+
"generating_enhanced_pca_info": "🎯 Erstelle erweiterte 3D-PCA mit Ihrer Eingabe!",
|
| 17 |
+
"error_creating_enhanced_pca": "Fehler beim Erstellen der erweiterten PCA-Visualisierung: {e}",
|
| 18 |
+
"pca_3d_with_input_title": "3D-PCA mit Ihrer Eingabe",
|
| 19 |
+
"your_input_legend": "Ihre Eingabe",
|
| 20 |
+
"your_input_hover_title": "Ihre Eingabe",
|
| 21 |
+
"your_input_analysis_desc": "🔍 **Ihre Input-Analyse:** Der rote Stern zeigt, wo **\\\"{input_text}\\\"** im 3D-Funktionsraum liegt. Beachten Sie, welchen Funktionstypen er am nächsten ist - das verrät, welche linguistischen Fähigkeiten Ihr Text am stärksten aktiviert!",
|
| 22 |
+
"pca_3d_standard_title": "3D-PCA der Funktionskategorien<br><sub>Interaktive Visualisierung funktionaler Beziehungen</sub>",
|
| 23 |
+
"standard_view_desc": "🔍 **Standardansicht:** Dies zeigt alle 120 Funktionskategorien im 3D-Raum unter Verwendung tatsächlich berechneter Vektoren. Führen Sie oben eine interaktive Analyse durch, um Ihre Eingabe als rote Raute in dieser Visualisierung zu sehen!",
|
| 24 |
+
"error_creating_standard_pca": "Fehler beim Erstellen der Standard-PCA-Visualisierung: {e}",
|
| 25 |
+
"pca_viz_not_found_warning": "3D-PCA-Visualisierung nicht gefunden. Bitte generieren Sie sie mit dem Analyse-Skript.",
|
| 26 |
+
"pca_key_insights": "<strong>Wichtige Einblicke:</strong> Beachten Sie, wie sich englische Übersetzungsaufgaben (Englisch-Deutsch, Englisch-Spanisch usw.) zusammenballen und wie verschiedene Funktionstypen unterschiedliche Bereiche des 3D-Raums einnehmen, was die interne funktionale Organisation des Modells offenbart.",
|
| 27 |
+
"error_loading_pca_viz": "Fehler beim Laden der 3D-PCA-Visualisierung: {e}",
|
| 28 |
+
"interactive_analysis_box_title": "🔬 Interaktive Funktionsvektor- & Schichtevolutionsanalyse",
|
| 29 |
+
"interactive_analysis_box_purpose": "<strong>Zweck:</strong> Analysieren, wie Ihr Eingabetext verschiedene linguistische Funktionen aus unserem ausgewogenen Datensatz von 120 Kategorien aktiviert",
|
| 30 |
+
"interactive_analysis_box_features": "<strong>Merkmale:</strong> Echtzeitanalyse • Funktionsattribution über 6 Typen • Schichtevolution • Token-Level-Analyse • Visuelle Ausgaben",
|
| 31 |
+
"interactive_analysis_box_model": "<strong>Modell:</strong> OLMo-2-1124-7B analysiert anhand ausgewogener Funktionsvektoren (20 Kategorien pro Funktionstyp)",
|
| 32 |
+
"interactive_analysis_box_best_for": "<strong>Am besten für:</strong> Das Verständnis, wie spezifische Texteingaben funktionale Darstellungen über verschiedene linguistische Aufgaben hinweg aktivieren und entwickeln",
|
| 33 |
+
"input_text_header": "",
|
| 34 |
+
"input_text_label": "Geben Sie Ihre Eingabeaufforderung ein",
|
| 35 |
+
"input_text_placeholder": "Z.B. 'Übersetze „Guten Morgen“ ins Deutsche' oder 'Was ist die Hauptstadt von Frankreich?'",
|
| 36 |
+
"input_text_help": "Geben Sie einen beliebigen Text ein, den Sie analysieren möchten. Das System zeigt an, welche linguistischen Funktionen aktiviert werden und wie sie sich durch die Modellschichten entwickeln.",
|
| 37 |
+
"about_dataset_expander": "Über den Funktionsvektor-Datensatz",
|
| 38 |
+
"balanced_dataset_title": "Datensatzzusammensetzung",
|
| 39 |
+
"balanced_dataset_body": "Der Vergleichsdatensatz enthält 600 Prompts, die 120 Kategorien in 6 Hauptfunktionstypen abdecken.",
|
| 40 |
+
"analyze_button": "Text analysieren",
|
| 41 |
+
"running_analysis_spinner": "Analyse wird ausgeführt...",
|
| 42 |
+
"analysis_failed_error": "Analyse fehlgeschlagen. Bitte stellen Sie sicher, dass die Funktionsvektordaten generiert wurden.",
|
| 43 |
+
"analysis_error": "Fehler bei der Analyse: {e}",
|
| 44 |
+
"ensure_model_and_data_info": "Bitte stellen Sie sicher, dass das OLMo-2-1124-7B-Modell und die Funktionsvektordaten verfügbar sind.",
|
| 45 |
+
"example_queries_header": "<i class='bi bi-lightbulb'></i> Beispielabfragen zum Ausprobieren",
|
| 46 |
+
"example_queries_desc": "*Diese Beispiele zeigen verschiedene Funktionstypen aus unserem ausgewogenen Datensatz:*",
|
| 47 |
+
"example_query_help": "Klicken zum Analysieren: {example}",
|
| 48 |
+
"analysis_complete_success": "Analyse abgeschlossen!",
|
| 49 |
+
"analyzed_text_header": "Analysierter Text",
|
| 50 |
+
"function_types_tab": "<i class='bi bi-bar-chart-line'></i> Funktionstyp-Attribution",
|
| 51 |
+
"category_analysis_tab": "<i class='bi bi-pie-chart'></i> Kategorie-Analyse",
|
| 52 |
+
"layer_evolution_tab": "<i class='bi bi-layers'></i> Schicht-Evolutions-Analyse",
|
| 53 |
+
"ai_explanation_header": "<i class='bi bi-robot'></i> KI-gestützte Erklärung",
|
| 54 |
+
"generating_ai_explanation_spinner": "KI-gestützte Erklärung wird generiert...",
|
| 55 |
+
"enable_ai_explanation_checkbox": "KI-Erklärung aktivieren",
|
| 56 |
+
"enable_ai_explanation_help": "Generieren Sie eine natürlichsprachliche Erklärung der Analyseergebnisse mit dem Qwen-72B-VL-Modell.",
|
| 57 |
+
"pca_explanation_prompt_de": "Sie sind ein Experte für KI-Analysen. Ihre Aufgabe ist es, die Positionierung des Prompts eines Benutzers in einem 3D-PCA-Plot von Funktionsvektoren zu erklären. Der Plot visualisiert, wie ein Sprachmodell Prompts basierend auf ihrer zugrunde liegenden Funktion kategorisiert, wobei ähnliche Funktionen zusammengefasst werden.\\n\\n**Benutzer-Prompt:** \"{input_text}\"\\n\\n**Analysedaten (Top 3 Übereinstimmungen):**\\n- **Funktionstypen:** {top_types}\\n- **Spezifische Kategorien:** {top_cats}\\n\\nBasierend auf diesen Daten geben Sie bitte eine prägnante, analytische Erklärung in drei separaten Teilen. **Entscheidend ist, dass Sie für die Überschriften jedes Teils Markdown-Überschriften (`####`) verwenden und die angeforderte Struktur genau einhalten.**\\n\\n#### Gesamtplatzierung\\nBeginnen Sie mit einer allgemeinen Zusammenfassung, wo sich der Prompt im PCA-Plot befindet. Erwähnen Sie, in welche allgemeine funktionale Nachbarschaft er fällt.\\n\\n#### Top-Funktionstyp-Zuschreibungen\\nAnalysieren Sie die drei dominantesten Funktionstypen. Erklären Sie für jeden der drei Typen kurz, warum der Prompt des Benutzers damit übereinstimmt, und beziehen Sie sich dabei auf den Inhalt des Prompts und die Art des Funktionstyps.\\n\\n#### Top-spezifische Kategorie-Zuschreibung\\nDiskutieren Sie die drei spezifischsten Kategorien. Erklären Sie für jede Kategorie kurz die Verbindung und warum sie als enger Nachbar des Benutzer-Prompts sinnvoll ist.\\n\\nStrukturieren Sie Ihre Antwort mit klaren Überschriften für jeden der drei Teile. Stützen Sie Ihre gesamte Erklärung auf die bereitgestellten Daten.",
|
| 58 |
+
"function_type_attribution_header": "Dieses Diagramm zeigt, wie stark Ihre Eingabe mit den sechs Hauptfunktionstypen übereinstimmt, die in den Trainingsdaten des Modells definiert sind. Eine höhere Punktzahl bedeutet eine stärkere Übereinstimmung.",
|
| 59 |
+
"attribution_score_xaxis": "Zuschreibungspunktzahl (Kosinus-Ähnlichkeit)",
|
| 60 |
+
"running_layer_evolution_spinner": "Schichtentwicklungsanalyse wird ausgeführt...",
|
| 61 |
+
"evolution_not_available_info": "Die Schichtentwicklungsanalyse wurde nicht ausgeführt oder ist fehlgeschlagen. Bitte aktivieren Sie sie in den Optionen und versuchen Sie es erneut.",
|
| 62 |
+
"pca_3d_title": "3D PCA der {lang} Funktionskategorien",
|
| 63 |
+
"legend_title": "Funktionstypen",
|
| 64 |
+
"category_examples_desc": "",
|
| 65 |
+
"no_examples_for_type": "Für diesen Funktionstyp sind keine Beispiele in der ausgewählten Sprache verfügbar.",
|
| 66 |
+
"prompt_examples_for_category": "Beispiel-Prompts für {category}",
|
| 67 |
+
"no_examples_for_category_specific": "Für diese spezifische Kategorie sind keine Beispiele verfügbar.",
|
| 68 |
+
"function_types_subheader": "Funktionstypen",
|
| 69 |
+
"select_function_type_label": "Wählen Sie einen Funktionstyp zum Erkunden aus",
|
| 70 |
+
"prompt_examples_for_category_header": "Verwendete Prompts für {category}",
|
| 71 |
+
"show_all_button": "Alle {count} Kategorien anzeigen",
|
| 72 |
+
"show_less_button": "Weniger anzeigen",
|
| 73 |
+
"abstractive_tasks": "Abstraktive Aufgaben",
|
| 74 |
+
"multiple_choice_qa": "Multiple-Choice-Fragen",
|
| 75 |
+
"text_classification": "Textklassifizierung",
|
| 76 |
+
"extractive_tasks": "Extraktive Aufgaben",
|
| 77 |
+
"named_entity_recognition": "Named-Entity-Erkennung",
|
| 78 |
+
"text_generation": "Textgenerierung",
|
| 79 |
+
"feedback_survey_header": "Feedback & Verständnisumfrage",
|
| 80 |
+
"feedback_survey_desc": "Ihr Feedback ist wertvoll für die Verbesserung dieses Tools. Bitte nehmen Sie sich einen Moment Zeit, um diese Fragen zu beantworten.",
|
| 81 |
+
"ux_feedback_subheader": "User Experience Feedback",
|
| 82 |
+
"comprehension_subheader": "Verständnisfragen",
|
| 83 |
+
"likert_scale_meaning": "Bewerten Sie auf einer Skala von 1 (Überhaupt nicht klar) bis 5 (Sehr klar).",
|
| 84 |
+
"q1_pca_clarity": "Wie klar war die 3D-PCA-Visualisierung, um zu zeigen, wo Ihre Eingabe unter anderen Funktionen einzuordnen ist?",
|
| 85 |
+
"q2_cognitive_load": "Wie anspruchsvoll fanden Sie die Interpretation der Analyseergebnisse insgesamt?",
|
| 86 |
+
"submit_feedback_button": "Feedback absenden",
|
| 87 |
+
"feedback_success_message": "Vielen Dank für Ihr Feedback!",
|
| 88 |
+
"feedback_error_message": "Entschuldigung, beim Senden Ihres Feedbacks ist ein Fehler aufgetreten: {e}",
|
| 89 |
+
"feedback_please_answer_all_qs": "Bitte beantworten Sie alle Verständnisfragen, bevor Sie absenden.",
|
| 90 |
+
"sunburst_chart_title": "Top 20 Kategoriezuschreibungen",
|
| 91 |
+
"missing_category_mapping_warning": "Einige Kategorien konnten keinem Funktionstyp zugeordnet werden und wurden im Diagramm ausgelassen: {categories}",
|
| 92 |
+
"no_mapped_categories_info": "Es standen keine Kategorien mit gültigen Funktionstyp-Zuordnungen zur Anzeige zur Verfügung.",
|
| 93 |
+
"unmapped_function_type": "Nicht zugewiesener Funktionstyp",
|
| 94 |
+
"layer_evolution_tab": "Schichtentwicklung",
|
| 95 |
+
"layer_evolution_header": "Ein Sprachmodell ist keine einzelne Einheit; es besteht aus vielen aufeinanderfolgenden Schichten, ähnlich wie eine Fließbandfertigung in einer Fabrik. Wenn Sie eine Anweisung geben, durchläuft die Information jede Schicht und wird schrittweise verfeinert. Frühe Schichten befassen sich mit grundlegender Syntax und Wortbedeutungen, mittlere Schichten bauen komplexere Beziehungen auf, und die letzten Schichten synthetisieren diese Informationen, um eine Ausgabe zu erzeugen. Diese Analyse visualisiert diese Reise und zeigt, wie sich das 'Verständnis' des Modells für Ihre Eingabe entwickelt. Die folgenden Diagramme zeigen, welche Teile dieses 'Fließbands' für Ihren spezifischen Text am aktivsten sind, und geben Aufschluss über den Denkprozess des Modells.",
|
| 96 |
+
"evolution_not_available_info": "Die Schichtentwicklungsanalyse wurde nicht ausgeführt oder ist fehlgeschlagen. Bitte aktivieren Sie sie in den Optionen und versuchen Sie es erneut.",
|
| 97 |
+
"evolution_explanation_prompt_de": "Sie sind ein Experte für KI-Analysen. Ihre Aufgabe ist es, zwei Diagramme zur Schichtentwicklung für den Prompt eines Benutzers zu erklären.\\n\\n**Benutzer-Prompt:** \"{input_text}\"\\n\\n**Analysedaten:**\\n- **Spitzenaktivierung:** Schicht {peak_activation_layer} (Stärke: {peak_activation_strength:.2f})\\n- **Größte Veränderung:** Zwischen Schicht {biggest_change_start_layer} und {biggest_change_end_layer} (Veränderungsgröße: {biggest_change_magnitude:.2f})\\n\\nBasierend auf diesen Daten geben Sie bitte eine detaillierte (2-3 Sätze pro Teil) Erklärung in zwei Teilen. **Sie MÜSSEN für jeden Teil Markdown-Überschriften (`####`) verwenden.**\\n\\n#### Analyse der Aktivierungsstärke\\nErklären Sie die Bedeutung der Spitzenaktivierung in Schicht {peak_activation_layer}. Was deutet dies auf die Verarbeitungsstufe des Modells hin (z. B. frühe Merkmalsextraktion, mittlere Abstraktion oder späte Entscheidungsfindung)?\\n\\n#### Analyse der Schicht-zu-Schicht-Veränderung\\nErklären Sie die Bedeutung der größten Veränderung zwischen den Schichten {biggest_change_start_layer} und {biggest_change_end_layer}. Was impliziert diese Verschiebung über die Verarbeitung des Modells?\\n\\nStützen Sie Ihre Erklärung auf die bereitgestellten Daten.",
|
| 98 |
+
"top_category_attribution_header": "Dieses Sunburst-Diagramm schlüsselt die Zuordnung in granularere Kategorien auf und zeigt die 20 ähnlichsten Funktionen zu Ihrer Eingabe.",
|
| 99 |
+
"activation_strength_plot_title": "Aktivierungsstärke über die Schichten",
|
| 100 |
+
"layer_changes_plot_title": "Repräsentative Veränderung zwischen den Schichten",
|
| 101 |
+
"fv_faithfulness_explanation_pca_html": "<div style='font-size: 0.9rem; margin-bottom: 1rem;'><strong>Wie das funktioniert:</strong> Der Faithfulness Checker überprüft drei Arten von Behauptungen aus der KI-Erklärung:<ul><li><strong>Ranking-Behauptungen:</strong> Überprüft, ob ein behaupteter 'ähnlichster' Funktionstyp oder eine Kategorie tatsächlich unter den Top-3-Übereinstimmungen basierend auf den Kosinus-Ähnlichkeitswerten liegt.</li><li><strong>Positionsbezogene Behauptungen:</strong> Überprüft semantisch, ob die Beschreibung der Position der Eingabe durch die KI (z.B. 'in der Nähe von Textklassifikation') eine plausible Zusammenfassung der tatsächlich am besten bewerteten Funktionen ist.</li><li><strong>Begründungsbehauptungen:</strong> Analysiert semantisch, ob die Begründung für die Relevanz einer Kategorie plausibel und logisch mit der Eingabeaufforderung übereinstimmt.</li></ul></div>",
|
| 102 |
+
"fv_faithfulness_explanation_evolution_html": "<div style='font-size: 0.9rem; margin-bottom: 1rem;'><strong>Wie das funktioniert:</strong> Der Faithfulness Checker überprüft drei Arten von Behauptungen aus der KI-Erklärung:<ul><li><strong>Spitzen-/Tiefpunkt-Behauptungen:</strong> Überprüft, ob eine Behauptung über ein Spitzenereignis die Schicht, in der das Ereignis aufgetreten ist, korrekt identifiziert.</li><li><strong>Numerische Behauptungen:</strong> Überprüft, ob ein genannter spezifischer Wert mit dem tatsächlich berechneten Wert übereinstimmt.</li><li><strong>Schicht-Behauptungen:</strong> Überprüft, ob eine Behauptung den Schichtindex für eine bestimmte Metrik korrekt angibt.</li></ul></div>",
|
| 103 |
+
|
| 104 |
+
"desc_named_entity_recognition": "Identifizierung von benannten Entitäten wie Personen, Orten und Organisationen im Text.",
|
| 105 |
+
"desc_text_generation": "Offene Textgenerierung, einschließlich kreativem Schreiben oder dem Fortsetzen einer Geschichte.",
|
| 106 |
+
|
| 107 |
+
"how_vectors_are_made_header": "Wie werden diese Vektoren erstellt?",
|
| 108 |
+
"how_vectors_are_made_desc": "Die Erstellung eines Funktionsvektors ist ein mehrstufiger Prozess, der Rohtext in eine aussagekräftige numerische Darstellung umwandelt. Das nachstehende Diagramm veranschaulicht diese Umwandlung und zeigt, wie ein einfacher Prompt vom Modell verarbeitet wird, um einen Vektor zu erzeugen, der seine Kernfunktion zusammenfasst.",
|
| 109 |
+
"how_vectors_are_made_step1_title": "SCHRITT 1: EINGABEAUFFORDERUNG",
|
| 110 |
+
"how_vectors_are_made_step2_title": "SCHRITT 2: TOKENIZER",
|
| 111 |
+
"how_vectors_are_made_step3_title": "SCHRITT 3: OLMo-2-7B MODELL",
|
| 112 |
+
"how_vectors_are_made_step3_desc": "Verborgene Zustände aus allen 32 Schichten",
|
| 113 |
+
"how_vectors_are_made_step4_title": "SCHRITT 4: EXTRAKTION DER LETZTEN SCHICHT",
|
| 114 |
+
"how_vectors_are_made_step4_desc": "Vektor mit 4096 Zahlen",
|
| 115 |
+
"how_vectors_are_made_step5_title": "SCHRITT 5: FUNKTIONSVEKTOR",
|
| 116 |
+
"how_vectors_are_made_step1_example": "Übersetze 'Guten Morgen' ins Deutsche",
|
| 117 |
+
"how_vectors_are_made_step2_example": "[\"Übersetze\", \"'\", \"Guten\", ..., \"Deutsche\"]",
|
| 118 |
+
|
| 119 |
+
"comprehension_qs_subheader": "Verständnisfragen",
|
| 120 |
+
"comprehension_qs_desc": "Bitte beantworten Sie die folgenden Fragen nach bestem Wissen und Gewissen. Ihre Antworten helfen uns, die Klarheit der Visualisierungen zu bewerten.",
|
| 121 |
+
|
| 122 |
+
"fv_q1": "Was stellt ein 'Funktionsvektor' in diesem Kontext dar?",
|
| 123 |
+
"fv_q1_option_a": "Ein einzelnes Wort aus der Eingabeaufforderung.",
|
| 124 |
+
"fv_q1_option_b": "Die grammatikalische Struktur der Eingabeaufforderung.",
|
| 125 |
+
"fv_q1_option_c": "Ein numerischer Fingerabdruck des Kernzwecks der Eingabeaufforderung.",
|
| 126 |
+
|
| 127 |
+
"fv_q2": "Was ist der Hauptzweck der Verwendung der Hauptkomponentenanalyse (PCA) für die 3D-Visualisierung?",
|
| 128 |
+
"fv_q2_option_a": "Um das Diagramm bunter aussehen zu lassen.",
|
| 129 |
+
"fv_q2_option_b": "Um hochdimensionale Vektordaten zur Visualisierung in einen 3D-Raum zu reduzieren.",
|
| 130 |
+
"fv_q2_option_c": "Um die Verarbeitungszeit des Modells zu beschleunigen.",
|
| 131 |
+
|
| 132 |
+
"fv_q3": "Was gibt der Abstand zwischen zwei Punkten im 3D-PCA-Diagramm an?",
|
| 133 |
+
"fv_q3_option_a": "Der Längenunterschied zwischen zwei Eingabeaufforderungen.",
|
| 134 |
+
"fv_q3_option_c": "Die funktionale Ähnlichkeit zwischen den Eingabeaufforderungen (nähere Punkte sind ähnlicher).",
|
| 135 |
+
"fv_q3_option_d": "Die Anzahl der von jeder Eingabeaufforderung aktivierten Schichten.",
|
| 136 |
+
"pca_explanation_prompt_de": "Sie sind ein Experte für KI-Analysen. Ihre Aufgabe ist es, die Positionierung des Prompts eines Benutzers in einem 3D-PCA-Plot von Funktionsvektoren zu erklären. Der Plot visualisiert, wie ein Sprachmodell Prompts basierend auf ihrer zugrunde liegenden Funktion kategorisiert, wobei ähnliche Funktionen zusammengefasst werden.\\n\\n**Benutzer-Prompt:** \"{input_text}\"\\n\\n**Analysedaten (Top 3 Übereinstimmungen):**\\n- **Funktionstypen:** {top_types}\\n- **Spezifische Kategorien:** {top_cats}\\n\\nBasierend auf diesen Daten geben Sie bitte eine prägnante, analytische Erklärung in drei separaten Teilen. **Entscheidend ist, dass Sie für die Überschriften jedes Teils Markdown-Überschriften (`####`) verwenden und die angeforderte Struktur genau einhalten.**\\n\\n#### Gesamtplatzierung\\nBeginnen Sie mit einer allgemeinen Zusammenfassung, wo sich der Prompt im PCA-Plot befindet. Erwähnen Sie, in welche allgemeine funktionale Nachbarschaft er fällt.\\n\\n#### Top-Funktionstyp-Zuschreibungen\\nAnalysieren Sie die drei dominantesten Funktionstypen. Erklären Sie für jeden der drei Typen kurz, warum der Prompt des Benutzers damit übereinstimmt, und beziehen Sie sich dabei auf den Inhalt des Prompts und die Art des Funktionstyps.\\n\\n#### Top-spezifische Kategorie-Zuschreibung\\nDiskutieren Sie die drei spezifischsten Kategorien. Erklären Sie für jede Kategorie kurz die Verbindung und warum sie als enger Nachbar des Benutzer-Prompts sinnvoll ist.\\n\\nStrukturieren Sie Ihre Antwort mit klaren Überschriften für jeden der drei Teile. Stützen Sie Ihre gesamte Erklärung auf die bereitgestellten Daten.",
|
| 137 |
+
"evolution_explanation_prompt_de": "Sie sind ein Experte für KI-Analysen. Ihre Aufgabe ist es, zwei Diagramme zur Schichtentwicklung für den Prompt eines Benutzers zu erklären.\\n\\n**Benutzer-Prompt:** \"{input_text}\"\\n\\n**Analysedaten:**\\n- **Spitzenaktivierung:** Schicht {peak_activation_layer} (Stärke: {peak_activation_strength:.2f})\\n- **Größte Veränderung:** Zwischen Schicht {biggest_change_start_layer} und {biggest_change_end_layer} (Veränderungsgröße: {biggest_change_magnitude:.2f})\\n\\nBasierend auf diesen Daten geben Sie bitte eine detaillierte (2-3 Sätze pro Teil) Erklärung in zwei Teilen. **Sie MÜSSEN für jeden Teil Markdown-Überschriften (`####`) verwenden.**\\n\\n#### Analyse der Aktivierungsstärke\\nErklären Sie die Bedeutung der Spitzenaktivierung in Schicht {peak_activation_layer}. Was deutet dies auf die Verarbeitungsstufe des Modells hin (z. B. frühe Merkmalsextraktion, mittlere Abstraktion oder späte Entscheidungsfindung)?\\n\\n#### Analyse der Schicht-zu-Schicht-Veränderung\\nErklären Sie die Bedeutung der größten Veränderung zwischen den Schichten {biggest_change_start_layer} und {biggest_change_end_layer}. Was impliziert diese Verschiebung über die Verarbeitung des Modells?\\n\\nStützen Sie Ihre Erklärung auf die bereitgestellten Daten.",
|
| 138 |
+
"fv_claim_extraction_prompt_header": "Sie sind ein Experten-System zur Extraktion von Behauptungen. Ihre Aufgabe ist es, eine Erklärung einer Datenvisualisierung zu lesen und alle überprüfbaren, faktischen Behauptungen in eine strukturierte JSON-Liste zu extrahieren. Ein einzelner Satz kann mehrere Behauptungen enthalten.",
|
| 139 |
+
"fv_claim_extraction_prompt_instruction": "Jedes Objekt in der Liste MUSS die folgenden Schlüssel haben:\n1. `claim_text`: Der exakte Satz oder die Phrase aus der Erklärung, die die Behauptung aufstellt.\n2. `claim_type`: Einer der verfügbaren Behauptungstypen für den gegebenen Kontext.\n3. `details`: Ein Objekt, das die spezifischen Parameter für die Überprüfung enthält.",
|
| 140 |
+
"fv_claim_extraction_prompt_context_header": "**Kontext dieser Erklärung:** {context}",
|
| 141 |
+
"fv_claim_extraction_prompt_types_header": "**Verfügbare Behauptungstypen:**",
|
| 142 |
+
"fv_claim_extraction_prompt_pca_types_details": "- `top_k_similarity`: Eine Behauptung, dass ein oder mehrere Funktionstypen/Kategorien dem Input am ähnlichsten sind.\n - `details`: {{ \"item_type\": \"function_type\" oder \"category\", \"items\": [\"...\"], \"rank_description\": \"most/least\" }}\n- `positional_claim`: Eine Behauptung über die Position des Inputs im Verhältnis zu einem oder mehreren Clustern im PCA-Plot.\n - `details`: {{ \"cluster_names\": [\"...\"], \"position\": \"near/far/between\" }}\n- `category_justification_claim`: Eine Behauptung, die einen bestimmten Grund für die Relevanz einer Kategorie für den Input-Prompt liefert.\n - `details`: {{ \"category_name\": \"...\", \"justification\": \"...\" }}",
|
| 143 |
+
"fv_claim_extraction_prompt_evolution_types_details": "- `peak_activation`: Eine Behauptung darüber, welche Schicht die höchste Aktivierungsstärke hatte.\n - `details`: {{ \"layer_index\": 12 }}\n- `biggest_change`: Eine Behauptung darüber, welcher Schichtübergang die größte Veränderung hatte.\n - `details`: {{ \"start_layer\": 10, \"end_layer\": 11 }}\n- `specific_value_claim`: Eine Behauptung über einen spezifischen numerischen Wert.\n - `details`: {{ \"metric\": \"activation_strength\" oder \"change_magnitude\", \"layer_index\": 12, \"value\": 65.91 }}\n - **Hinweis:** Bei \"change_magnitude\" bezieht sich `layer_index` auf die **Startschicht** des Übergangs (z. B. für Schicht 1->2 ist `layer_index` 1).",
|
| 144 |
+
"fv_claim_extraction_prompt_pca_example_header": "**Beispiel für einen 'pca'-Kontext:**",
|
| 145 |
+
"fv_claim_extraction_prompt_pca_example_explanation": "- **Erklärungssatz:** \"Insbesondere fällt es in eine Region, die durch abstrakte Aufgaben, Textklassifizierung und Textgenerierung gekennzeichnet ist.\"",
|
| 146 |
+
"fv_claim_extraction_prompt_pca_example_json": "- **Ergebnis-JSON-Objekt:**\n ```json\n [\n {{\n \"claim_text\": \"Insbesondere fällt es in eine Region, die durch abstrakte Aufgaben, Textklassifizierung und Textgenerierung gekennzeichnet ist.\",\n \"claim_type\": \"positional_claim\",\n \"details\": {{\n \"cluster_names\": [\"abstrakte Aufgaben\", \"Textklassifizierung\", \"Textgenerierung\"],\n \"position\": \"near\"\n }}\n }},\n {{\n \"claim_text\": \"Der Prompt ist eng mit Language QA verbunden, da er die Beantwortung einer Frage zu einem literarischen Werk beinhaltet.\",\n \"claim_type\": \"category_justification_claim\",\n \"details\": {{\n \"category_name\": \"Language QA\",\n \"justification\": \"er beinhaltet die Beantwortung einer Frage zu einem literarischen Werk\"\n }}\n }}\n ]\n ```",
|
| 147 |
+
"fv_claim_extraction_prompt_evolution_example_header": "**Beispiel für einen 'evolution'-Kontext:**",
|
| 148 |
+
"fv_claim_extraction_prompt_evolution_example_explanation": "- **Erklärungssatz:** \"Die größte Veränderung tritt zwischen Schicht 1 und 2 auf, mit einer Größenordnung von 0,40...\"",
|
| 149 |
+
"fv_claim_extraction_prompt_evolution_example_json": "- **Ergebnis-JSON-Objekt:**\n ```json\n [\n {{\n \"claim_text\": \"Die größte Veränderung tritt zwischen Schicht 1 und 2 auf, mit einer Größenordnung von 0,40...\",\n \"claim_type\": \"biggest_change\",\n \"details\": {{ \"start_layer\": 1, \"end_layer\": 2 }}\n }},\n {{\n \"claim_text\": \"Die größte Veränderung tritt zwischen Schicht 1 und 2 auf, mit einer Größenordnung von 0,40...\",\n \"claim_type\": \"specific_value_claim\",\n \"details\": {{ \"metric\": \"change_magnitude\", \"layer_index\": 1, \"value\": 0.40 }}\n }}\n ]\n ```",
|
| 150 |
+
"fv_claim_extraction_prompt_analyze_header": "**Zu analysierende Erklärung:**",
|
| 151 |
+
"fv_claim_extraction_prompt_footer": "Antworten Sie NUR mit der JSON-Liste der Behauptungen. Wenn keine überprüfbaren Behauptungen gefunden werden, geben Sie eine leere Liste `[]` zurück.",
|
| 152 |
+
"fv_semantic_verification_prompt_header": "Sie sind ein KI-Faktenchecker, der auf semantische Analyse spezialisiert ist. Ihre Aufgabe ist es, festzustellen, ob eine behauptete „funktionale Nachbarschaft“ plausibel mit den tatsächlich am höchsten eingestuften Funktionen für einen bestimmten Prompt zusammenhängt.",
|
| 153 |
+
"fv_semantic_verification_prompt_rule": "**Entscheidende Regel:** Die behauptete Nachbarschaft muss keine direkte Zusammenfassung der Top-Funktionen sein. Sie sollte als „verifiziert“ betrachtet werden, wenn sie ein plausibles, kontextuell relevantes oder semantisch benachbartes Konzept darstellt. Kennzeichnen Sie sie als „nicht verifiziert“, wenn die behauptete Nachbarschaft nicht zusammenhängt oder logisch inkonsistent mit den Top-Funktionen ist.",
|
| 154 |
+
"fv_semantic_verification_prompt_actual_header": "**Tatsächliche Top-bewertete Funktionen:**",
|
| 155 |
+
"fv_semantic_verification_prompt_claimed_header": "**Behauptete funktionale Nachbarschaft:**",
|
| 156 |
+
"fv_semantic_verification_prompt_task_header": "**Ihre Aufgabe:**",
|
| 157 |
+
"fv_semantic_verification_prompt_task_instruction": "Ist die „behauptete funktionale Nachbarschaft“ basierend auf der obigen Regel plausibel mit den „tatsächlichen Top-bewerteten Funktionen“ verknüpft? Geben Sie eine eindeutige Entscheidung und stützen Sie diese auf konkrete Hinweise.",
|
| 158 |
+
"fv_semantic_verification_prompt_json_instruction": "Antworten Sie mit einem JSON-Objekt mit zwei Schlüsseln:\n1. `is_verified`: boolean (true, wenn plausibel verknüpft, sonst false).\n2. `reasoning`: Eine ausführliche Begründung in 2-3 Sätzen, die mindestens einen Eintrag aus der tatsächlichen Liste erwähnt, erläutert, warum die Behauptung passt oder widerspricht, und den ursprünglichen Wortlaut nicht einfach wiederholt.",
|
| 159 |
+
"fv_semantic_verification_prompt_footer": "Antworten Sie NUR mit dem JSON-Objekt und nichts anderem.",
|
| 160 |
+
"fv_justification_verification_prompt_header": "Sie sind ein KI-Faktenchecker, der auf semantisches Denken spezialisiert ist. Ihre Aufgabe ist es zu bestimmen, ob eine Begründung für die Relevanz einer funktionalen Kategorie für eine Eingabeaufforderung plausibel und logisch konsistent ist.",
|
| 161 |
+
"fv_justification_verification_prompt_rule": "**Entscheidende Regel:** Die Begründung muss nicht das stärkste mögliche Argument sein. Sie sollte als „verifiziert“ betrachtet werden, wenn sie eine plausible, kreative oder kontextuell relevante Verbindung darstellt, auch wenn sie weit hergeholt erscheint. Kennzeichnen Sie sie nur dann als „nicht verifiziert“, wenn die Argumentation völlig unlogisch, sachlich falsch ist oder der Eingabeaufforderung direkt widerspricht.",
|
| 162 |
+
"fv_justification_verification_prompt_input_header": "**Eingabeaufforderung:**",
|
| 163 |
+
"fv_justification_verification_prompt_category_header": "**Funktionale Kategorie:**",
|
| 164 |
+
"fv_justification_verification_prompt_justification_header": "**Gegebene Begründung:**",
|
| 165 |
+
"fv_justification_verification_prompt_task_header": "**Ihre Aufgabe:**",
|
| 166 |
+
"fv_justification_verification_prompt_task_instruction": "Ist die Begründung basierend auf der obigen Regel plausibel? Beziehen Sie Ihre Entscheidung ausdrücklich auf den Eingabeprompt und die Kategorie.",
|
| 167 |
+
"fv_justification_verification_prompt_json_instruction": "Antworten Sie mit einem JSON-Objekt mit zwei Schlüsseln:\n1. `is_verified`: boolean (true, wenn die Begründung plausibel ist, false, wenn sie unlogisch oder falsch ist).\n2. `reasoning`: Eine Begründung in 2-3 Sätzen, die klar auf den Eingabeprompt und die Kategorie eingeht und erläutert, weshalb die Begründung stimmig oder unstimmig ist, ohne den ursprünglichen Wortlaut einfach zu wiederholen.",
|
| 168 |
+
"fv_justification_verification_prompt_footer": "Antworten Sie NUR mit dem JSON-Objekt und nichts anderem."
|
| 169 |
+
}
|
| 170 |
+
|
| 171 |
+
|
locales/de/welcome_page.json
ADDED
|
@@ -0,0 +1,39 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"welcome_page_title": "Willkommen & Einrichtung",
|
| 3 |
+
"welcome_page_header": "Bevor Sie beginnen...",
|
| 4 |
+
"welcome_page_intro": "Um unsere Forschung zur Benutzerfreundlichkeit dieses Tools zu unterstützen, geben Sie bitte einige anonyme Informationen an. Diese werden sicher gespeichert und nur für akademische Zwecke verwendet.",
|
| 5 |
+
"research_tool_intro": "Ein fortschrittliches Forschungswerkzeug zur Erkundung der inneren Funktionsweise von großen Sprachmodellen.",
|
| 6 |
+
"about_this_tool": "Über dieses Tool",
|
| 7 |
+
"research_study_info": "Diese Anwendung ist Teil einer Forschungsstudie, die darauf abzielt zu verstehen, wie Benutzer mit komplexen KI-Modellen interagieren und diese interpretieren. Durch die Nutzung dieses Tools nehmen Sie an dieser Studie teil.",
|
| 8 |
+
"your_role": "Ihre Rolle als Teilnehmer:",
|
| 9 |
+
"role_1": "Sie werden die verschiedenen Analysewerkzeuge verwenden, um das Verhalten eines Sprachmodells zu untersuchen.",
|
| 10 |
+
"role_2": "Sie werden gebeten, Feedback zur Benutzerfreundlichkeit und Klarheit der Visualisierungen zu geben.",
|
| 11 |
+
"role_3": "Ihre Interaktionen und Ihr Feedback helfen uns, bessere und transparentere KI-Werkzeuge zu entwickeln.",
|
| 12 |
+
"data_privacy": "Datenschutz & Einwilligung:",
|
| 13 |
+
"privacy_1": "Ihre Antworten und Interaktionen sind anonym. Wir speichern nur Ihr Alter, Ihr Fachwissen und Ihr Feedback.",
|
| 14 |
+
"privacy_2": "Alle gesammelten Daten werden ausschließlich für akademische Forschungszwecke verwendet.",
|
| 15 |
+
"privacy_3": "Indem Sie fortfahren, stimmen Sie der Erhebung und Nutzung dieser anonymen Daten zu.",
|
| 16 |
+
"tell_us_about_yourself": "Erzählen Sie uns von sich",
|
| 17 |
+
"what_is_your_age_group": "Was ist Ihre Altersgruppe?",
|
| 18 |
+
"under_18": "Unter 18",
|
| 19 |
+
"18_24": "18-24",
|
| 20 |
+
"25_34": "25-34",
|
| 21 |
+
"35_44": "35-44",
|
| 22 |
+
"45_54": "45-54",
|
| 23 |
+
"55_64": "55-64",
|
| 24 |
+
"65_or_over": "65 oder älter",
|
| 25 |
+
"prefer_not_to_say": "Keine Angabe",
|
| 26 |
+
"rate_your_expertise": "Wie würden Sie Ihre Erfahrung mit KI und Sprachmodellen bewerten?",
|
| 27 |
+
"novice": "Anfänger (Wenig bis keine Erfahrung mit KI-Tools)",
|
| 28 |
+
"intermediate": "Fortgeschritten (Sicherer Umgang mit KI für alltägliche Aufgaben)",
|
| 29 |
+
"expert": "Experte (Tiefes technisches Wissen oder Forschung im Bereich KI)",
|
| 30 |
+
"start_analysis_button": "Analyse starten",
|
| 31 |
+
"form_submitted": "formular_abgesendet",
|
| 32 |
+
"thank_you_proceed": "Vielen Dank! Sie können nun mit der Analyse fortfahren.",
|
| 33 |
+
"thank_you_main_suite": "Vielen Dank! Die Hauptanalyse-Suite wird geladen...",
|
| 34 |
+
"welcome_to_llm_analysis_suite": "Willkommen beim Explainable Language Interpretability Analysis Tool!",
|
| 35 |
+
"toolkit_description": "Dieses Toolkit bietet eine Sammlung fortschrittlicher Methoden zur Interpretation und zum Verständnis der inneren Funktionsweise von Sprachmodellen. Wählen Sie eine Analyse aus der Seitenleiste, um zu beginnen.",
|
| 36 |
+
"attribution_analysis_description": "<strong>Attributionsanalyse:</strong> Verstehen Sie, welche Teile des Eingabetextes die Ausgabe des Modells beeinflussen, mithilfe von Methoden wie Integrierte Gradienten, Okklusion und Salienz.",
|
| 37 |
+
"function_vectors_description": "<strong>Funktionsvektoren:</strong> Analysieren Sie, wie Text verschiedene funktionale Fähigkeiten innerhalb des Modells aktiviert.",
|
| 38 |
+
"circuit_tracing_description": "<strong>Schaltkreisverfolgung:</strong> Erforschen Sie die Rechenpfade innerhalb des Modells, um zu sehen, wie Informationen fließen."
|
| 39 |
+
}
|
locales/en/attribution_analysis_page.json
ADDED
|
@@ -0,0 +1,174 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"desc_integrated_gradients": "This provides a more reliable measure of importance by considering not just the final input, but the entire path from a neutral 'blank' state to your specific prompt. It carefully adds up the contribution of each word, preventing misleading results and giving a truer picture of each token's influence.",
|
| 3 |
+
"desc_occlusion": "This method tests each word's necessity by asking, 'What happens if this word is missing?' It temporarily hides (or 'occludes') each token from the input and measures how much the model's output changes. A high score means the word was critical for the result.",
|
| 4 |
+
"desc_saliency": "This method reveals the model's initial 'gut reaction' to each input token. It highlights which words the model found most interesting or surprising, based on a direct and fast calculation of importance. Think of it as a quick look at the model's focus.",
|
| 5 |
+
"unsupported_method_desc": "Description not available for this method.",
|
| 6 |
+
"ai_expert_intro": "You are a world-class AI interpretability expert. Your task is to analyze an attribution heatmap and provide a comprehensive, easy-to-understand explanation for a non-technical audience.",
|
| 7 |
+
"analysis_details": "Analysis Details",
|
| 8 |
+
"method_being_used": "Method Used:",
|
| 9 |
+
"prompt_analyzed": "Prompt Analyzed:",
|
| 10 |
+
"full_generated_text": "Full Generated Text:",
|
| 11 |
+
"method_specific_context": "Method-Specific Context",
|
| 12 |
+
"instructions_for_analysis": "Instructions for Analysis",
|
| 13 |
+
"instruction_part_1_header": "### High-Level Visual Overview",
|
| 14 |
+
"instruction_part_1_desc": "In two to three sentences, provide a general summary of the patterns you see in the attached heatmap image. Describe the general location of the 'hot spots' (brightly colored areas) and what this visually implies about the model's focus. **For example, if the method is {method_name}, you might expect to see [describe expected pattern for this method].** Do not use generic descriptions. Base your analysis exclusively on the visual information in the image.",
|
| 15 |
+
"instruction_synthesis_header": "### Synthesis of Key Findings",
|
| 16 |
+
"instruction_synthesis_desc": "Following your high-level visual overview, create a brief narrative synthesis of the key findings from the data provided below. Structure your analysis into two paragraphs: **Strongest Individual Connections** and **Most Influential Tokens Overall**. In the first paragraph, explain the significance of the strongest individual token-to-token connections. In the second, discuss the input tokens that had the highest average influence on the entire generation, **making sure to refer to the full generated text provided above to explain *why* these tokens were so influential in shaping the final output.** Explain *why* certain tokens are influential in both contexts. **Do not add a summary at the end of your analysis**.",
|
| 17 |
+
"instruction_color_coding": "Formatting Rule: When you mention an input token, format it exactly like this: <span style='color: #60a5fa;'>the_token_here</span>. When you mention a generated token, format it exactly like this: <span style='color: #fca5a5;'>the_token_here</span>. Do not deviate from this format.",
|
| 18 |
+
"data_priority_instruction": "The following text block contains the pre-calculated key findings. Use this as the exclusive source of truth for your analysis.",
|
| 19 |
+
"data_section_header": "## Pre-Calculated Analysis (Source of Truth)",
|
| 20 |
+
"begin_analysis_now": "Begin your analysis now. Remember to follow the two-part structure (High-Level Visual Overview, then Synthesis of Key Findings) as described above.",
|
| 21 |
+
"attr_page_title": "<i class='bi bi-search'></i> Attribution Analysis",
|
| 22 |
+
"attr_page_desc": "This page uses token-level attribution methods to explain how different parts of your input prompt influence the generated output. Select a method, enter a prompt, and see which words were most important for the model's prediction.",
|
| 23 |
+
"how_methods_work_expander": "How Attribution Methods Work",
|
| 24 |
+
"saliency_method_title": "Saliency",
|
| 25 |
+
"saliency_method_desc": "Measures importance by calculating the gradient of the output with respect to the input tokens. It's fast but can sometimes be noisy.",
|
| 26 |
+
"saliency_step_1": "<strong>1. Generate Output:</strong> The model generates the next word, e.g., 'over', for the prompt 'The quick brown fox jumps'.",
|
| 27 |
+
"saliency_step_2": "<strong>2. Calculate Gradients:</strong> It computes how much the probability of 'over' would change with a tiny nudge to each input word's embedding.",
|
| 28 |
+
"saliency_step_3": "<strong>3. Assign Scores:</strong> Words that cause the biggest change (e.g., 'jumps') get the highest scores.",
|
| 29 |
+
"ig_method_title": "Integrated Gradients",
|
| 30 |
+
"ig_method_desc": "A more robust method that attributes the prediction to the inputs by integrating gradients along a path from a baseline (e.g., zero embedding) to the input.",
|
| 31 |
+
"ig_step_1": "<strong>1. Create Path:</strong> It creates a smooth path from a 'blank' input to the full prompt, 'The quick brown fox jumps'.",
|
| 32 |
+
"ig_step_2": "<strong>2. Compute Gradients Along Path:</strong> It calculates gradients for the output 'over' at many small steps along this path.",
|
| 33 |
+
"ig_step_3": "<strong>3. Sum Gradients:</strong> It sums up all these small gradient values to get a reliable score for how much each word contributed to the output.",
|
| 34 |
+
"occlusion_method_title": "Occlusion",
|
| 35 |
+
"occlusion_method_desc": "A simple, intuitive method that measures importance by replacing each input token and seeing how much the output probability changes.",
|
| 36 |
+
"occlusion_step_1": "<strong>1. Get Original Probability:</strong> The model generates 'over' with a certain probability.",
|
| 37 |
+
"occlusion_step_2": "<strong>2. Replace Words:</strong> It systematically replaces each word (e.g., 'jumps') with a neutral token and re-runs the model.",
|
| 38 |
+
"occlusion_step_3": "<strong>3. Measure Impact:</strong> If replacing 'jumps' causes the probability of 'over' to drop significantly, 'jumps' is considered very important.",
|
| 39 |
+
"input_header": "<i class='bi bi-pencil-square'></i> Input & Settings",
|
| 40 |
+
"enter_prompt": "Enter your prompt:",
|
| 41 |
+
"enter_prompt_help": "Enter the text for the model to continue",
|
| 42 |
+
"enable_ai_explanations": "Enable AI Explanations",
|
| 43 |
+
"enable_ai_explanations_help": "Generate explanations for visualizations using Qwen 2.5 VL 72B (requires API access)",
|
| 44 |
+
"generate_and_analyze_button": "Generate & Analyze All Methods",
|
| 45 |
+
"max_new_tokens_slider": "Number of Tokens to Generate",
|
| 46 |
+
"max_new_tokens_slider_help": "Controls the length of the generated text.",
|
| 47 |
+
"loading_models_spinner": "Loading OLMo model with all attribution methods...",
|
| 48 |
+
"generating_attributions_spinner": "Generating text and attributions...",
|
| 49 |
+
"analysis_complete_success": "All attribution analyses complete!",
|
| 50 |
+
"failed_to_generate_analysis_error": "Failed to generate analysis",
|
| 51 |
+
"failed_to_load_models_error": "Failed to load models",
|
| 52 |
+
"please_enter_prompt_warning": "Please enter a prompt",
|
| 53 |
+
"output_header": "<i class='bi bi-display'></i> Output",
|
| 54 |
+
"generated_text_subheader": "Generated Text",
|
| 55 |
+
"input_label": "Input:",
|
| 56 |
+
"generated_label": "Generated:",
|
| 57 |
+
"attribution_analysis_results_header": "Attribution Analysis Results",
|
| 58 |
+
"attr_tab": "Integrated Gradients",
|
| 59 |
+
"occlusion_tab": "Occlusion",
|
| 60 |
+
"saliency_tab": "Saliency",
|
| 61 |
+
"attr_title": "Integrated Gradients Analysis",
|
| 62 |
+
"occlusion_title": "Occlusion Analysis",
|
| 63 |
+
"saliency_title": "Saliency Analysis",
|
| 64 |
+
"attr_viz_desc": "**How to read this Integrated Gradients heatmap:**\\n- **X-axis**: Generated tokens (what the model produced)\\n- **Y-axis**: Input tokens (your original prompt)\\n- **Color intensity**: Mathematical gradient-based importance scores\\n- **Interpretation**: How much each input token mathematically influences each generated token",
|
| 65 |
+
"occlusion_viz_desc": "Occlusion analysis highlights important tokens by temporarily masking (occluding) them and measuring the impact on the output. A larger attribution score means the token was more critical.",
|
| 66 |
+
"saliency_viz_desc": "This visualization highlights the most salient tokens in the input that contributed to the generation.",
|
| 67 |
+
"how_to_read_heatmap": "How to read this heatmap:",
|
| 68 |
+
"xaxis_label": "X-axis",
|
| 69 |
+
"xaxis_desc": "Generated tokens (what the model produced)",
|
| 70 |
+
"yaxis_label": "Y-axis",
|
| 71 |
+
"yaxis_desc": "Input tokens (your original prompt)",
|
| 72 |
+
"color_intensity_label": "Color Intensity",
|
| 73 |
+
"color_intensity_desc": "Mathematical importance scores",
|
| 74 |
+
"interpretation_label": "Interpretation",
|
| 75 |
+
"interpretation_desc": "How much each input token influences each generated token.",
|
| 76 |
+
"special_tokens_label": "Special Tokens (e.g., `Ġ`, `Ċ`)",
|
| 77 |
+
"special_tokens_desc": "These are artifacts from the tokenizer. Common ones include:<ul><li>`Ġ`: A space, marking a new word.</li><li>`Ċ`: A newline character.</li><li>`<|endoftext|>`: A special token marking the end of a sequence.</li></ul>",
|
| 78 |
+
"creating_viz_spinner": "Creating {method_title} visualization...",
|
| 79 |
+
"generating_ai_explanation_spinner": "Generating AI explanation for {method_title}...",
|
| 80 |
+
"what_this_method_shows": "What this method shows:",
|
| 81 |
+
"ai_generated_analysis": "AI Generated Analysis",
|
| 82 |
+
"download_results_subheader": "Download Results",
|
| 83 |
+
"download_html_button": "Download {method_title} HTML",
|
| 84 |
+
"download_csv_button": "Download Scores (CSV)",
|
| 85 |
+
"download_png_button": "Download {method_title} PNG",
|
| 86 |
+
"heatmap_title": "Attribution Heatmap",
|
| 87 |
+
"heatmap_xaxis": "Generated Tokens",
|
| 88 |
+
"heatmap_yaxis": "Input Tokens",
|
| 89 |
+
"feedback_survey_header": "Feedback & Comprehension Survey",
|
| 90 |
+
"feedback_survey_desc": "Your feedback is valuable for improving this tool. Please take a moment to answer these questions.",
|
| 91 |
+
"ux_feedback_subheader": "User Experience Feedback",
|
| 92 |
+
"q_visual_clarity": "1. How would you rate the clarity of the heatmap visualizations?",
|
| 93 |
+
"q_visual_clarity_help": "1 = Very Confusing, 5 = Very Clear",
|
| 94 |
+
"q_cognitive_load": "2. How mentally demanding did you find it to interpret the results?",
|
| 95 |
+
"q_cognitive_load_help": "1 = Not Demanding at all, 5 = Very Demanding",
|
| 96 |
+
"q_influential_docs_plausibility": "3. How plausible are the 3 most influential documents identified by the Influence Tracer?",
|
| 97 |
+
"q_influential_docs_plausibility_help": "1 = Not Plausible at all, 5 = Very Plausible",
|
| 98 |
+
"comprehension_qs_subheader": "Quick Comprehension Check",
|
| 99 |
+
"comprehension_qs_desc": "Based on the visualizations you just saw, which method best answers the following questions?",
|
| 100 |
+
"q_options_ig": "Integrated Gradients",
|
| 101 |
+
"q_options_occlusion": "Occlusion",
|
| 102 |
+
"q_options_saliency": "Saliency",
|
| 103 |
+
"q_s1": "Which method reveals the model's initial 'gut reaction' to each word, showing its most direct and immediate focus?",
|
| 104 |
+
"q_s2": "Which method would you use to understand the impact of removing a specific word?",
|
| 105 |
+
"q_s3": "Which method builds a more reliable picture of importance by analyzing the entire path from a blank input to your final prompt?",
|
| 106 |
+
"submit_feedback_button": "Submit Feedback",
|
| 107 |
+
"feedback_success_message": "Thank you for your feedback!",
|
| 108 |
+
"feedback_error_message": "Sorry, there was an error submitting your feedback: {e}",
|
| 109 |
+
"feedback_please_answer_all_qs": "Please answer all comprehension questions before submitting.",
|
| 110 |
+
"error_creating_heatmap": "Error creating heatmap from HTML: {e}",
|
| 111 |
+
"error_inseq_no_html": "Inseq failed to generate HTML output for {method_name}.",
|
| 112 |
+
"error_no_table_in_html": "Could not find data table in inseq's HTML output for {method_name}.",
|
| 113 |
+
"error_table_no_rows": "Table in HTML output contains no rows for {method_name}.",
|
| 114 |
+
"error_failed_to_parse_rows": "Failed to parse any data rows from the HTML for {method_name}.",
|
| 115 |
+
"running_influence_trace_spinner": "Tracing influences in the training data...",
|
| 116 |
+
"influence_index_not_found_warning": "Influence tracer index not found. Skipping this step. Please run `build_dolma_index.py` to enable it.",
|
| 117 |
+
"influence_tracer_title": "Influence Tracer",
|
| 118 |
+
"influence_tracer_desc": "This tool identifies training documents from a sample of the <b>Dolma v1.6 dataset</b> that were most influential on the model's output. Dolma v1.6 is a 3-trillion-token open dataset composed of a diverse mix of web content (Common Crawl), academic publications (C4, arXiv), code (The Stack), books (Project Gutenberg), and encyclopedic data (Wikipedia). By tracing the model's generation back to its training data, we can better understand its reasoning and knowledge sources.",
|
| 119 |
+
"top_influential_docs_header": "Top {num_docs} Most Influential Training Documents",
|
| 120 |
+
"no_influential_docs_found": "No influential documents were found for this generation.",
|
| 121 |
+
"file_label": "File",
|
| 122 |
+
"source_label": "Source",
|
| 123 |
+
"similarity_label": "Similarity",
|
| 124 |
+
"run_analysis_for_influence_info": "Run an analysis to see influential training documents here.",
|
| 125 |
+
"prompt_placeholder_text": "e.g., 'The capital of France is' or 'To be or not to be, that is the'",
|
| 126 |
+
"running_attribution_analysis_spinner": "Generating attribution heatmaps...",
|
| 127 |
+
"generating_ai_explanations_spinner": "Generating AI explanations...",
|
| 128 |
+
"how_influence_is_found_header": "How Influence is Found: A Look at Cosine Similarity",
|
| 129 |
+
"how_influence_is_found_desc": "The Influence Tracer doesn't just search for keywords; it searches for meaning. It does this by converting both your prompt and every sentence in the training data into high-dimensional vectors. It then uses a technique called <strong>Cosine Similarity</strong> to find the closest matches.",
|
| 130 |
+
"influence_step_1_title": "<strong>1. Vector Conversion</strong>",
|
| 131 |
+
"influence_step_1_desc": "Your prompt and each sentence from the training data are transformed into numerical vectors.",
|
| 132 |
+
"influence_step_2_title": "<strong>2. Angle Calculation</strong>",
|
| 133 |
+
"influence_step_2_desc": "The system calculates the angle (θ) between your prompt's vector and every other sentence vector.",
|
| 134 |
+
"influence_step_3_title": "<strong>3. Similarity Score</strong>",
|
| 135 |
+
"influence_step_3_desc": "A smaller angle means a higher similarity. A score of 1 means the sentences are identical in meaning, while a score of 0 means they are completely unrelated.",
|
| 136 |
+
"influence_example_sentence_a": "Your Prompt",
|
| 137 |
+
"influence_example_sentence_b": "Training Sentence",
|
| 138 |
+
"generating_all_visualizations_spinner": "Generating all visualizations and AI explanations...",
|
| 139 |
+
"searching_influential_docs_progress": "Searching for influential documents...",
|
| 140 |
+
"processing_doc_progress": "Processing document {i} of {k}...",
|
| 141 |
+
"search_complete_progress": "Search complete!",
|
| 142 |
+
"faithfulness_check_expander": "Faithfulness Check",
|
| 143 |
+
"running_faithfulness_check_spinner": "Running faithfulness check...",
|
| 144 |
+
"verified_status": "Verified",
|
| 145 |
+
"contradicted_status": "Contradicted",
|
| 146 |
+
"claim_label": "Claim",
|
| 147 |
+
"status_label": "Status",
|
| 148 |
+
"evidence_label": "Evidence",
|
| 149 |
+
"no_verifiable_claims_info": "No verifiable claims were extracted from the explanation.",
|
| 150 |
+
"faithfulness_check_error": "An error occurred during the faithfulness check: {e}",
|
| 151 |
+
"faithfulness_check_results_header": "Faithfulness Check Results:",
|
| 152 |
+
"faithfulness_check_explanation_html": "<div style='font-size: 0.9rem; color: #DCDCDC; margin-bottom: 1rem;'><p style='margin-bottom: 0.5rem;'><strong>How This Works:</strong> The faithfulness checker verifies two types of claims from the AI's explanation:</p><ul style='margin-left: 1.5rem; padding-left: 0; list-style-type: disc;'><li style='margin-bottom: 0.3rem;'><strong>Numerical Claims:</strong> Checks if a token's attribution score (either its peak 'hotspot' or its average score) meets a dynamic threshold.<ul style='margin-left: 1.5rem; padding-left: 0; list-style-type: circle;'><li>A <strong>\"high\"</strong> claim (e.g., \"highest,\" \"strongest\") must be above <strong>70%</strong> of the maximum score in the analysis.</li><li>A <strong>\"significant\"</strong> claim (e.g., \"notable\") must be above <strong>50%</strong> of the maximum score.</li></ul></li><li style='margin-bottom: 0.3rem;'><strong>Justification Claims:</strong> Uses another AI to semantically analyze whether the <strong>reasoning</strong> provided for a token's importance is plausible and logically consistent.</li></ul></div>",
|
| 153 |
+
"claim_extraction_prompt_header": "You are an expert claim extraction system. Your task is to read an explanation of a text attribution analysis and extract all verifiable, factual claims into a structured JSON list. A single sentence may contain multiple distinct claims.",
|
| 154 |
+
"claim_extraction_prompt_instruction": "Each object in the list MUST have the following keys:\n1. `claim_text`: The exact sentence or phrase from the explanation that makes the claim.\n2. `claim_type`: One of the available claim types.\n3. `details`: An object containing the specific parameters for verification.",
|
| 155 |
+
"claim_extraction_prompt_context_header": "**Analysis Method Context:** {analysis_method}",
|
| 156 |
+
"claim_extraction_prompt_types_header": "**Available Claim Types:**",
|
| 157 |
+
"claim_extraction_prompt_types_details": "- `attribution_claim`: A claim that one or more tokens have high or significant attribution scores, either based on their peak (hotspot) or average influence.\n - `details`: {{ \"tokens\": [\"...\"], \"qualifier\": \"high\" | \"significant\", \"score_type\": \"peak\" | \"average\" }}\n - **Note:** Use \"peak\" for claims about hotspots or specific connections. Use \"average\" for claims about overall or average influence.\n- `token_justification_claim`: A claim that provides a specific reason for one or more tokens' importance or attribution score.\n - `details`: {{ \"tokens\": [\"...\"], \"justification\": \"...\" }}",
|
| 158 |
+
"claim_extraction_prompt_example_header": "**Example:**",
|
| 159 |
+
"claim_extraction_prompt_example_explanation": "- **Explanation sentence:** \"Overall, 'France' has the highest average influence, while '.' has a significant peak score.\"",
|
| 160 |
+
"claim_extraction_prompt_example_json": "- **Resulting JSON object:**\n ```json\n [\n {{\n \"claim_text\": \"Overall, 'France' has the highest average influence...\",\n \"claim_type\": \"attribution_claim\",\n \"details\": {{ \"tokens\": [\"France\"], \"qualifier\": \"high\", \"score_type\": \"average\" }}\n }},\n {{\n \"claim_text\": \"...while '.' has a significant peak score.\",\n \"claim_type\": \"attribution_claim\",\n \"details\": {{ \"tokens\": [\".\"], \"qualifier\": \"significant\", \"score_type\": \"peak\" }}\n }}\n ]\n ```",
|
| 161 |
+
"claim_extraction_prompt_analyze_header": "**Explanation to Analyze:**",
|
| 162 |
+
"claim_extraction_prompt_instruction_footer": "Respond with ONLY the JSON list of claims.",
|
| 163 |
+
"justification_verification_prompt_collective_reasoning": "**Collective Reasoning:** The justification may refer to multiple tokens at once (e.g., 'these tokens collectively...'). When evaluating such a claim, consider the group of tokens as a single unit and assess if the justification is plausible for them as a whole, even if it doesn't apply perfectly to each token individually.",
|
| 164 |
+
"justification_verification_prompt_header": "You are an AI fact-checker specializing in NLP and semantic reasoning. Your task is to determine if a justification for a token's importance is plausible and logically consistent, given the full context.",
|
| 165 |
+
"justification_verification_prompt_crucial_rule": "**Crucial Rule:** A justification is plausible if it presents a reasonable, creative, or contextually relevant connection. Only contradict if the reasoning is completely illogical, factually incorrect, or inconsistent with the given input or output text.",
|
| 166 |
+
"justification_verification_prompt_token_location": "**Token Location:** The \"Token in Question\" can be from either the \"Input Prompt\" or the \"Generated Text\". A token from the input can still have a crucial influence on the generated output. Do not contradict a claim simply because the token is not present in the generated text.",
|
| 167 |
+
"justification_verification_prompt_special_tokens": "**Special Tokens:** The 'Token in Question' may contain special characters from the tokenizer. `Ġ` represents a leading space (e.g., `Ġof` is ` of`), and suffixes like ` (1)` are for uniqueness (e.g., `. (1)` is just `.`). You MUST account for these when checking if a token exists in the text.",
|
| 168 |
+
"justification_verification_prompt_evaluating_justifications": "**Evaluating Justifications:** A justification should be considered plausible if it identifies a reasonable connection, even if it is not a direct or simple causal link. This includes relationships based on the broader context of the text or the grammatical structure of the language. Pay special attention to tokens that form common collocations, entities, or abbreviations; connections between such tokens should be considered plausible as they are often processed as a single semantic unit by the model.",
|
| 169 |
+
"justification_verification_prompt_linguistic_context": "**Linguistic Context for Autoregressive Models:** It is crucial to remember that in autoregressive models like this one, EVERY token directly influences the probability of the next token. Therefore, justifications based on grammatical structure, punctuation, or syntactic roles are not just valid, but represent a core part of the model's decision-making process. A token's structural role (like a preposition or a period) is a direct and important contributor to content generation. Do not dismiss these justifications as 'mere grammar'.",
|
| 170 |
+
"justification_verification_prompt_task_header": "**Your Task:**",
|
| 171 |
+
"justification_verification_prompt_task_instruction": "Based on the rule above, is the justification plausible?",
|
| 172 |
+
"justification_verification_prompt_json_instruction": "Respond with a JSON object with two keys:\n1. `is_verified`: boolean (true if the justification is plausible, false if it is illogical or incorrect).\n2. `reasoning`: A brief, one-sentence explanation for your decision.",
|
| 173 |
+
"justification_verification_prompt_footer": "Respond with ONLY the JSON object."
|
| 174 |
+
}
|
locales/en/circuit_trace_page.json
ADDED
|
@@ -0,0 +1,217 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"circuit_trace_page_title": "Circuit Trace Analysis",
|
| 3 |
+
"circuit_trace_page_desc": "Explore the internal pathways of the OLMo model. This page visualizes how information flows from input tokens through different layers and features to produce the final output, based on a novel Cross-Layer Transcoder method.",
|
| 4 |
+
"how_circuit_tracing_works_header": "How This Works: A Three-Step Process",
|
| 5 |
+
"how_circuit_tracing_works_desc": "Instead of looking at the entire model at once, this technique simplifies the analysis by focusing on 'features'—specific, learned patterns of neuron activations. By training small 'transcoder' models, we can identify which features in one layer activate features in the next, allowing us to trace a circuit of information flow.",
|
| 6 |
+
"circuit_tracing_step1_title": "1. Feature Extraction",
|
| 7 |
+
"circuit_tracing_step1_desc": "Small <strong>autoencoder</strong> models (think of them as compression tools that summarize important information) are trained on each layer of the main OLMo model to discover recurring patterns of neuron activations, which we call 'features'.",
|
| 8 |
+
"circuit_tracing_step2_title": "2. Cross-Layer Mapping",
|
| 9 |
+
"circuit_tracing_step2_desc": "Tiny <strong>transcoder</strong> models (which act like translators between layers) are trained to predict the activation of a feature in a later layer based on the activations of features in an earlier layer.",
|
| 10 |
+
"circuit_tracing_step3_title": "3. Graph Construction",
|
| 11 |
+
"circuit_tracing_step3_desc": "By connecting the most predictive feature pairs from the transcoder models, we construct a directed graph that represents the most important pathways of information flow for a given prompt.",
|
| 12 |
+
"enable_ai_explanations_circuit": "Enable AI Explanations",
|
| 13 |
+
"enable_ai_explanations_circuit_help": "Generate detailed explanations for circuit visualizations using Qwen 2.5 VL 72B",
|
| 14 |
+
"about_circuit_tracing_header": "About Circuit Tracing",
|
| 15 |
+
"about_circuit_tracing_body": "Circuit tracing is a technique to understand a model's decision-making by mapping the information flow through its internal components, much like following wires on a circuit board. This is achieved by identifying key 'features' in each layer and then building a graph showing how they influence each other from input to output. This page provides tools like the Interactive Circuit Graph, Feature Explorer, and Subnetwork Explorer to visualize and analyze these computational pathways.",
|
| 16 |
+
"no_results_warning": "Attribution graph results not found.",
|
| 17 |
+
"run_analysis_info": "Please run the analysis script first: `python3 circuit_analysis/attribution_graphs_olmo.py --prompt-index 0 --force-retrain-clt`",
|
| 18 |
+
"config_header": "Configuration",
|
| 19 |
+
"model_label": "Model:",
|
| 20 |
+
"device_label": "Device:",
|
| 21 |
+
"features_per_layer_label": "Features per layer:",
|
| 22 |
+
"training_steps_label": "Training steps:",
|
| 23 |
+
"batch_size_label": "Batch size:",
|
| 24 |
+
"learning_rate_label": "Learning rate:",
|
| 25 |
+
"interactive_analysis_header": "Interactive Analysis",
|
| 26 |
+
"select_prompt_label": "Select a Prompt to Analyze:",
|
| 27 |
+
"select_prompt_help": "This selection controls both the Interactive Circuit Graph and the Feature Explorer below.",
|
| 28 |
+
"graph_stats_header": "Graph Statistics",
|
| 29 |
+
"full_graph_nodes_label": "Full Graph Nodes",
|
| 30 |
+
"full_graph_edges_label": "Full Graph Edges",
|
| 31 |
+
"pruned_graph_nodes_label": "Pruned Graph Nodes",
|
| 32 |
+
"pruned_graph_edges_label": "Pruned Graph Edges",
|
| 33 |
+
"feature_explorer_header": "Feature Explorer",
|
| 34 |
+
"token_analysis_header": "Token Analysis",
|
| 35 |
+
"input_tokens_label": "Input tokens:",
|
| 36 |
+
"feature_explorer_title": "Feature Explorer: {prompt}",
|
| 37 |
+
"select_layer_label": "Select Layer to Explore:",
|
| 38 |
+
"layer_label_format": "Layer {layer_num}",
|
| 39 |
+
"no_feature_viz_warning": "No feature visualizations available for this prompt.",
|
| 40 |
+
"no_features_in_layer_warning": "No features found in {selected_layer}",
|
| 41 |
+
"active_features_label": "**Active Features:**",
|
| 42 |
+
"choose_feature_label": "Choose a feature:",
|
| 43 |
+
"max_activation_label": "Max Activation",
|
| 44 |
+
"mean_activation_label": "Mean Activation",
|
| 45 |
+
"sparsity_label": "Sparsity",
|
| 46 |
+
"interpretation_label": "Interpretation",
|
| 47 |
+
"top_activating_tokens_title": "Top Activating Tokens for {selected_feature}",
|
| 48 |
+
"xaxis_token_label": "Token",
|
| 49 |
+
"yaxis_activation_label": "Activation Strength",
|
| 50 |
+
"generating_feature_explanation_spinner": "Generating AI explanation for feature activation...",
|
| 51 |
+
"feature_explanation_error": "Could not generate feature explanation: {e}",
|
| 52 |
+
"ai_feature_analysis_header": "AI Feature Analysis",
|
| 53 |
+
"node_size_label": "Node Size",
|
| 54 |
+
"edge_threshold_label": "Edge Threshold",
|
| 55 |
+
"tip_scroll_horizontally": "Tip: Use mouse wheel + Shift to scroll horizontally and see all 32 layers",
|
| 56 |
+
"colorbar_title": "Activation",
|
| 57 |
+
"path_highlight_label": "Circuit path",
|
| 58 |
+
"connections_legend": "Connections",
|
| 59 |
+
"embedding_legend": "Embedding",
|
| 60 |
+
"feature_legend": "Feature",
|
| 61 |
+
"layer_nav_header": "Layer Navigation",
|
| 62 |
+
"layer_nav_desc": "This graph shows <strong>{num_layers} layers</strong> with features. Use the range slider below the graph to navigate through all layers, or use <strong>Shift + Mouse Wheel</strong> to scroll horizontally.",
|
| 63 |
+
"generating_circuit_explanation_spinner": "Generating AI explanation for circuit graph...",
|
| 64 |
+
"circuit_explanation_error": "Could not generate circuit explanation: {e}",
|
| 65 |
+
"ai_circuit_analysis_header": "AI Circuit Analysis",
|
| 66 |
+
"layer_stats_header": "Layer Statistics",
|
| 67 |
+
"total_layers_label": "Total Layers with Features",
|
| 68 |
+
"total_features_label": "Total Features",
|
| 69 |
+
"avg_features_per_layer_label": "Avg Features per Layer",
|
| 70 |
+
"features_by_layer_header": "Features by Layer",
|
| 71 |
+
"feature_dist_title": "Feature Distribution Across Layers",
|
| 72 |
+
"feature_count_label": "Feature Count",
|
| 73 |
+
"subnetwork_explorer_title": "Subnetwork Explorer",
|
| 74 |
+
"subnetwork_explorer_desc": "Select a central feature to visualize its local neighborhood, showing both its upstream causes and downstream effects within a specific connection depth.",
|
| 75 |
+
"subnetwork_graph_empty_info": "The main circuit graph has not been generated yet. Please wait for it to load.",
|
| 76 |
+
"no_features_in_graph_warning": "No features are available in the current graph view to build a subnetwork from.",
|
| 77 |
+
"select_layer_label_subnetwork": "1. Select a Layer",
|
| 78 |
+
"no_features_in_layer_subnetwork_warning": "No features to select in {selected_layer}.",
|
| 79 |
+
"select_feature_label_subnetwork": "2. Select a Central Feature",
|
| 80 |
+
"traversal_depth_label": "3. Set Connection Depth",
|
| 81 |
+
"subnetwork_graph_title": "Subnetwork Centered on Feature: {feature}",
|
| 82 |
+
"subnetwork_no_connections_info": "This feature has no connections within the selected depth.",
|
| 83 |
+
"generating_subnetwork_explanation_spinner": "Analyzing subnetwork with AI...",
|
| 84 |
+
"ai_subnetwork_analysis_header": "AI Subnetwork Analysis",
|
| 85 |
+
"subnetwork_analysis_title": "Token Activation Analysis",
|
| 86 |
+
"subnetwork_no_features_info": "No features were found in this subnetwork to analyze.",
|
| 87 |
+
"subnetwork_no_token_info": "No token activation data is available for the features in this subnetwork.",
|
| 88 |
+
"subnetwork_top_tokens_desc": "The following input tokens most strongly activated the features in this subnetwork:",
|
| 89 |
+
"subnetwork_token_interpretation_info": "This shows what parts of the prompt the subnetwork is 'paying attention to.'",
|
| 90 |
+
"what_is_a_feature_header": "Key Concept: What is a 'Feature'?",
|
| 91 |
+
"what_is_a_feature_title": "A feature is a learned, interpretable pattern of neuron activity.",
|
| 92 |
+
"what_is_a_feature_desc": "Think of it as a concept detector. For example, one feature might activate strongly for words related to 'programming,' while another might detect 'questions about history.' These features are the building blocks the model uses to understand input and construct a response. By tracing them, we can map out the model's reasoning process.",
|
| 93 |
+
"faithfulness_check_expander": "Faithfulness Check",
|
| 94 |
+
"running_faithfulness_check_spinner": "Running faithfulness check...",
|
| 95 |
+
"verified_status": "Verified",
|
| 96 |
+
"contradicted_status": "Contradicted",
|
| 97 |
+
"claim_label": "Claim",
|
| 98 |
+
"status_label": "Status",
|
| 99 |
+
"evidence_label": "Evidence",
|
| 100 |
+
"no_verifiable_claims_info": "No verifiable claims were extracted from the explanation.",
|
| 101 |
+
"faithfulness_explanation_circuit_graph_html": "<div style='font-size: 0.9rem; margin-bottom: 1rem;'><strong>How This Works:</strong> The faithfulness checker verifies two types of claims from the AI's explanation:<ul><li><strong>Feature Interpretation Claims:</strong> Checks if a claimed interpretation for a feature in a specific layer (e.g., 'detecting grammatical mood') closely matches an actual feature's interpretation in that layer using fuzzy string matching.</li><li><strong>Layer Role Claims:</strong> Semantically verifies if the AI's summary of a layer section's role (e.g., 'early layers handle syntax') is a plausible generalization of the actual top feature interpretations within that section.</li></ul></div>",
|
| 102 |
+
"faithfulness_explanation_feature_explorer_html": "<div style='font-size: 0.9rem; margin-bottom: 1rem;'><strong>How This Works:</strong> The faithfulness checker verifies two types of claims from the AI's explanation:<ul><li><strong>Top Token Claims:</strong> Checks if a token claimed to be a top activator for a feature is actually present in the list of top activating tokens from the analysis data.</li><li><strong>Feature Role Claims:</strong> Checks if the AI's summarized interpretation of a feature's role closely matches the detailed interpretation from the analysis data using fuzzy string matching.</li></ul></div>",
|
| 103 |
+
"faithfulness_explanation_subnetwork_graph_html": "<div style='font-size: 0.9rem; margin-bottom: 1rem;'><strong>How This Works:</strong> The faithfulness checker verifies three types of claims from the AI's explanation:<ul><li><strong>Causal Claims:</strong> Checks if a claimed causal link (upstream or downstream) is valid by using fuzzy string matching to confirm that the claimed feature's interpretation exists in the actual list of upstream or downstream neighbors.</li><li><strong>Token Influence Claims:</strong> Checks if tokens claimed to be upstream influences are present in the actual list of direct upstream tokens for the central feature.</li><li><strong>Central Feature Role Claims:</strong> Checks if the AI's interpretation of the central feature's role closely matches the interpretation from the analysis data using fuzzy string matching.</li></ul></div>",
|
| 104 |
+
"claim_extraction_prompt_header": "You are an expert claim extraction system. Your task is to read an explanation of a circuit trace visualization and extract verifiable claims into a structured JSON list.",
|
| 105 |
+
"claim_extraction_prompt_instruction": "Each object in the list MUST have: `claim_text`, `claim_type`, and `details`. The `claim_text` should be the full, original sentence from the explanation.",
|
| 106 |
+
"claim_extraction_prompt_rule": "**Extraction Rules:**\n1. **Maintain Original Order.** The claims in the final JSON list must appear in the same order as they do in the source text.\n2. **Ignore legend-like descriptions.** Do not extract claims from sentences that only explain what the visual elements of the graph represent (e.g., 'Each node is a feature', 'Color indicates activation'). Only extract claims that make a specific point about *what the model is doing* for the current prompt (e.g., 'Layer 10 shows high activation for 'syntax' features').\n3. **Keep claims concise.** A single claim should not span an entire paragraph. Break down long paragraphs into multiple, smaller claims, generally one for each main point or a small group of related points.\n4. For each `interpretation_summary` or `role_summary`, extract only the core concept, usually found within single quotes (e.g., from \"notable activity for 'sentence structure'\", extract just \"sentence structure\").\n5. **Crucially, if a single sentence makes multiple claims, you MUST group them into a single claim object.**\n - For `feature_interpretation_claim`, `details` should be a list of objects, each containing `layer` and `interpretation_summary`.\n - For `layer_role_claim`, if the claim spans multiple sections (early, middle, late), `details` should be a list of objects, each with `layer_section` and `role_summary`.",
|
| 107 |
+
"claim_extraction_prompt_context_header": "**Context:** {context}",
|
| 108 |
+
"claim_extraction_prompt_types_header": "**Available Claim Types:**",
|
| 109 |
+
"claim_extraction_prompt_analyze_header": "**Explanation to Analyze:**",
|
| 110 |
+
"claim_extraction_prompt_footer": "Respond with ONLY the JSON list of claims.",
|
| 111 |
+
"circuit_graph_claim_types": "- `feature_interpretation_claim`: A claim about the interpreted role(s) of features in one or more layers. \n - `details`: A list of objects, e.g., `[{\"layer\": 6, \"interpretation_summary\": \"sentence structure\"}, {\"layer\": 9, \"interpretation_summary\": \"country-related contexts\"}]`\n- `layer_role_claim`: A claim about the general function of one or more layer sections.\n - `details`: A list of objects, e.g., `[{\"layer_section\": \"early\", \"role_summary\": \"dissect the input\"}, {\"layer_section\": \"middle\", \"role_summary\": \"develop meaning\"}]`",
|
| 112 |
+
"feature_explorer_claim_types": "- `top_token_activation_claim`: A claim that one or more tokens are top activators for the feature.\n - `details`: { \"tokens\": [\"...\", \"...\"] }\n- `feature_interpretation_claim`: A claim about the feature's role, behavior, significance based on its layer position, or the reasoning for its token activations (e.g., \"Its presence in a late layer indicates...\"). This includes high-level insights. The `details` can be empty if no specific interpretation is mentioned.\n - `details`: { \"interpretation_summaries\": [\"...\"] }",
|
| 113 |
+
"subnetwork_graph_claim_types": "- `causal_claim`: A claim about upstream (cause) or downstream (effect) relationships. Can involve multiple features.\n - `details`: { \"source_feature_interpretations\": [\"...\", \"...\"], \"relationship\": \"upstream\" } or { \"target_feature_interpretations\": [\"...\", \"...\"], \"relationship\": \"downstream\" }\n- `feature_interpretation_claim`: A claim about the central feature's function(s).\n - `details`: { \"interpretation_summaries\": [\"...\"] }\n- `token_influence_claim`: A claim that one or more input tokens are direct upstream influences.\n - `details`: { \"tokens\": [\"...\"] }\n- `subnetwork_purpose_claim`: A claim about the overall purpose of the subnetwork.\n - `details`: { \"purpose_summary\": \"...\" }",
|
| 114 |
+
"semantic_verification_prompt_header": "You are an AI fact-checker specializing in transformer model interpretability. Your task is to determine if a \"Claimed Summary\" is a reasonable and faithful semantic summary of the \"Actual Data Points,\" taking into account general knowledge of how transformer layers function.",
|
| 115 |
+
"semantic_verification_prompt_rules_header": "**Crucial Rules:**",
|
| 116 |
+
"semantic_verification_prompt_rule_1": "1. The summary does not need to use the exact same words as the data points, but it must be semantically consistent.",
|
| 117 |
+
"semantic_verification_prompt_rule_2": "2. **CRITICAL RULE: General Principles Override Data.** For the **{layer_section}** you are analyzing, the key principle is: *{principle}*. You MUST verify claims that state this widely accepted general principle, even if the specific data points for this prompt do not perfectly align. If a claimed summary matches this principle, you MUST respond with `is_verified: true` and a reasoning that acknowledges it as a correct general principle.",
|
| 118 |
+
"semantic_verification_prompt_rule_3": "3. **Generalizations are acceptable and expected.** Summaries do not need to list every data point. A high-level, conceptually accurate summary is valid. A claim should be considered verified if it describes a correct aspect of the layer's function, even if it is not a comprehensive summary of all functions. For instance, a claim like 'dissecting the input' is a fair generalization for the early layers' role. **You MUST NOT contradict a claim simply for being 'vague' or 'general' if it is not factually incorrect.**",
|
| 119 |
+
"semantic_verification_principle_early": "**Early layers (approx. 0-10):** Handle syntax, grammar, and basic patterns.",
|
| 120 |
+
"semantic_verification_principle_middle": "**Middle layers (approx. 11-21):** Develop thematic connections, link concepts, and build abstract meaning.",
|
| 121 |
+
"semantic_verification_principle_late": "**Late layers (approx. 22-31):** Synthesize all information to finalize the output.",
|
| 122 |
+
"semantic_verification_prompt_subnetwork_header": "You are an AI fact-checker specializing in transformer model interpretability. Your task is to determine if the 'Claimed Purpose' is a reasonable and faithful semantic summary of the roles of the individual features that make up this computational subnetwork.",
|
| 123 |
+
"semantic_verification_prompt_subnetwork_rules_header": "**Crucial Rules:**",
|
| 124 |
+
"semantic_verification_prompt_subnetwork_rule_1": "1. The purpose does not need to use the exact same words as the data points, but it must be semantically consistent.",
|
| 125 |
+
"semantic_verification_prompt_subnetwork_rule_2": "2. Generalizations are acceptable if they are accurate (e.g., summarizing 'detects punctuation' and 'identifies parts of speech' as 'handling syntax' is a fair generalization).",
|
| 126 |
+
"semantic_verification_prompt_subnetwork_actual_data_header": "**Actual Data Points (Feature interpretations from the subnetwork):**",
|
| 127 |
+
"semantic_verification_prompt_subnetwork_claimed_purpose_header": "**Claimed Purpose:**",
|
| 128 |
+
"semantic_verification_prompt_actual_data_header": "**Actual Data Points (Top feature interpretations from this layer section):**",
|
| 129 |
+
"semantic_verification_prompt_claimed_summary_header": "**Claimed Summary:**",
|
| 130 |
+
"semantic_verification_prompt_task_header": "**Your Task:**",
|
| 131 |
+
"semantic_verification_prompt_task_instruction": "Based on the rules above, is the summary a fair and accurate semantic description of the data? Respond with a JSON object with two keys: `is_verified` (boolean) and `reasoning` (one-sentence explanation).",
|
| 132 |
+
"semantic_verification_prompt_feature_role_header": "You are an AI fact-checker specializing in transformer model interpretability. Your task is to determine if the 'Claimed Role' is a reasonable and faithful semantic summary of the provided 'Feature Evidence.'",
|
| 133 |
+
"semantic_verification_prompt_feature_role_rules_header": "**Crucial Rules:**",
|
| 134 |
+
"semantic_verification_prompt_feature_role_rule_1": "1. The Claimed Role does not need to use the exact same words as the evidence, but it must be semantically consistent and a plausible interpretation.",
|
| 135 |
+
"semantic_verification_prompt_feature_role_rule_2": "2. Consider the layer position (early/middle/late) as important context. A claim that aligns with the typical function of that layer section is more likely to be correct.",
|
| 136 |
+
"semantic_verification_prompt_feature_role_guidance_early": "Treat claims mentioning foundational grammar, basic sentence structure, or token order as consistent with early-layer behavior even if the exact wording differs from the evidence.",
|
| 137 |
+
"semantic_verification_prompt_feature_role_guidance_middle": "Treat claims about integrating context, linking concepts, or building thematic meaning as consistent with middle-layer behavior even when phrased differently.",
|
| 138 |
+
"semantic_verification_prompt_feature_role_guidance_late": "Treat claims about synthesizing information, finalizing answers, or generating outputs as consistent with late-layer behavior even if the exact words differ from the evidence.",
|
| 139 |
+
"semantic_verification_prompt_feature_role_rule_3": "3. If Upstream or Downstream connections are provided, use them to evaluate claims about the feature acting as a 'bridge', 'hub', or 'integrating' information. The claim should be consistent with the interpretations of the connected features.",
|
| 140 |
+
"semantic_verification_prompt_feature_role_evidence_header": "**Feature Evidence:**",
|
| 141 |
+
"semantic_verification_prompt_feature_role_upstream_header": "- **Upstream Connections (Top Interpretations):** {interpretations}",
|
| 142 |
+
"semantic_verification_prompt_feature_role_downstream_header": "- **Downstream Connections (Top Interpretations):** {interpretations}",
|
| 143 |
+
"semantic_verification_prompt_feature_role_claimed_role_header": "**Claimed Role:**",
|
| 144 |
+
"semantic_verification_prompt_token_reasoning_header": "You are an AI fact-checker specializing in transformer model interpretability. Your task is to determine if the 'Claimed Explanation' for why certain tokens activate a feature is a reasonable and faithful semantic summary of the provided 'Feature Evidence.'",
|
| 145 |
+
"semantic_verification_prompt_token_reasoning_rules_header": "**Crucial Rules:**",
|
| 146 |
+
"semantic_verification_prompt_token_reasoning_rule_1": "1. The explanation does not need to use the exact same words as the evidence, but it must be semantically consistent and a plausible interpretation of the token-feature interaction.",
|
| 147 |
+
"semantic_verification_prompt_token_reasoning_rule_2": "2. Focus on the reasoning provided. The claim is not just that the tokens activate the feature, but *why* they do. Is the explanation logical given the feature's role and layer position?",
|
| 148 |
+
"semantic_verification_prompt_token_reasoning_evidence_header": "**Feature Evidence:**",
|
| 149 |
+
"semantic_verification_prompt_token_reasoning_claimed_explanation_header": "**Claimed Explanation:**",
|
| 150 |
+
"semantic_verification_prompt_causal_reasoning_header": "You are an AI fact-checker specializing in transformer model interpretability. Your task is to determine if the 'Claimed Causal Explanation' is a reasonable and faithful summary of the provided 'Causal Evidence.'",
|
| 151 |
+
"semantic_verification_prompt_causal_reasoning_rules_header": "**Crucial Rules:**",
|
| 152 |
+
"semantic_verification_prompt_causal_reasoning_rule_1": "1. The explanation must be semantically consistent with the roles of the source, central, and target features.",
|
| 153 |
+
"semantic_verification_prompt_causal_reasoning_rule_2": "2. Focus on the reasoning. The claim is not just that a connection exists, but *why* it exists or what its function is. Is the explanation logical?",
|
| 154 |
+
"semantic_verification_prompt_causal_reasoning_evidence_header": "**Causal Evidence:**",
|
| 155 |
+
"semantic_verification_prompt_causal_reasoning_claimed_explanation_header": "**Claimed Causal Explanation:**",
|
| 156 |
+
"explanation_prompt_header": "You are an expert in neural network interpretability and circuit tracing analysis. Analyze this visualization that shows how information flows through the OLMo2 7B language model using Cross-Layer Transcoders.",
|
| 157 |
+
"explanation_prompt_context_header": "## Context",
|
| 158 |
+
"explanation_prompt_instructions_header": "## Instructions",
|
| 159 |
+
"circuit_graph_instruction_header": "Provide a structured, layer-by-layer analysis of the circuit graph. Your response MUST use smaller Markdown headings (`####`). Do not refer to specific feature numbers (e.g., \"feature_411\"); instead, describe their function based on their interpretation.",
|
| 160 |
+
"circuit_graph_instruction_intro": "#### Introduction: What This Graph Shows\nExplain what this specific circuit graph visualizes for the given prompt. Mention that it shows information flow from input tokens through feature activations in different layers.",
|
| 161 |
+
"circuit_graph_instruction_early": "#### Early Layers (0-10): Input Processing\nBased on the top features provided in the context, describe the primary role of these layers. Explain how they deconstruct the input's basic grammar, syntax, or key terms by describing the functions of the active features.",
|
| 162 |
+
"circuit_graph_instruction_middle": "#### Middle Layers (11-21): Developing Meaning\nExplain what these layers do with the initial patterns. Describe how they link concepts, build relationships, or shift the focus of the analysis toward a more abstract understanding.",
|
| 163 |
+
"circuit_graph_instruction_late": "#### Late Layers (22-31): Finalizing the Output\nDescribe how these layers synthesize all the previous information to produce the final result, focusing on how the top features contribute to the model's output.",
|
| 164 |
+
"circuit_graph_instruction_insight": "#### Primary Insight\nConclude with a key takeaway from this analysis. What is the most important or surprising aspect of the model's strategy for this prompt?",
|
| 165 |
+
"circuit_graph_instruction_footer": "Ensure your entire response follows this structure of headings and paragraphs. Do not use bullet points for the main sections.",
|
| 166 |
+
"feature_explorer_instruction_header": "Provide a structured analysis of the feature shown. Your response MUST be a Markdown bulleted list, with each bullet point on a NEW LINE. Use the following structure:",
|
| 167 |
+
"feature_explorer_instruction_role": "- **Feature Role and Layer Context:** Explain the feature's interpretation and what its presence in this specific layer (early/middle/late) implies about its function.",
|
| 168 |
+
"feature_explorer_instruction_activations": "- **Key Token Activations:** Identify the top activating tokens and explain why they are relevant to the feature's role.",
|
| 169 |
+
"feature_explorer_instruction_insight": "- **Overall Insight:** Provide a concluding insight about what this feature's behavior reveals about the model's information processing strategy.",
|
| 170 |
+
"feature_explorer_instruction_footer": "Ensure your output is ONLY this three-bullet list.",
|
| 171 |
+
"subnetwork_graph_instruction_header": "Provide a concise, insightful analysis of this subnetwork. Your response MUST be a Markdown bulleted list, with each bullet point on a NEW LINE. Use the following structure:",
|
| 172 |
+
"subnetwork_graph_instruction_role": "- **Central Feature's Role:** Briefly explain the function of the central feature based on its interpretation and layer position.",
|
| 173 |
+
"subnetwork_graph_instruction_upstream": "- **Upstream Influence:** Describe which earlier features or input tokens (the causes) are most strongly activating this central feature. When mentioning features, refer to their interpretation provided in the context.",
|
| 174 |
+
"subnetwork_graph_instruction_downstream": "- **Downstream Impact:** Describe what later features (the effects) this central feature contributes to most strongly. When mentioning features, refer to their interpretation provided in the context.",
|
| 175 |
+
"subnetwork_graph_instruction_purpose": "- **Subnetwork's Purpose:** Synthesize the above points to hypothesize the overall purpose of this specific computational pathway in processing the prompt.",
|
| 176 |
+
"subnetwork_graph_instruction_footer": "Ensure your output is ONLY this four-bullet list.",
|
| 177 |
+
"context_unspecified_viz": "This is a circuit tracing visualization showing information flow through the model.",
|
| 178 |
+
"instruction_unspecified_viz": "Explain this visualization.",
|
| 179 |
+
"circuit_graph_context_header": "This is a circuit tracing graph for the prompt: \"{prompt}\"",
|
| 180 |
+
"circuit_graph_context_tokens": "Input tokens: {tokens}",
|
| 181 |
+
"circuit_graph_context_summary_header": "#### Key Feature Summary by Layer Section\nHere are the most active features in each section of the model for this prompt:",
|
| 182 |
+
"circuit_graph_context_early_header": "**Early Layers (0-10):**",
|
| 183 |
+
"circuit_graph_context_middle_header": "**Middle Layers (11-21):**",
|
| 184 |
+
"circuit_graph_context_late_header": "**Late Layers (22-31):**",
|
| 185 |
+
"circuit_graph_context_no_features": "No significantly active features found.",
|
| 186 |
+
"circuit_graph_context_feature_line": "- In L{layer}, a feature interpreted as \"{interpretation}\" (Activation: {activation:.2f})",
|
| 187 |
+
"subnetwork_context_header": "This is a subnetwork visualization from a larger circuit trace for the prompt: \"{prompt}\"",
|
| 188 |
+
"subnetwork_context_centered_on": "The subnetwork is centered around:",
|
| 189 |
+
"subnetwork_context_feature": "- **Feature:** {name}",
|
| 190 |
+
"subnetwork_context_layer": "- **Layer:** {layer}",
|
| 191 |
+
"subnetwork_context_interpretation": "- **Interpretation:** \"{interpretation}\"",
|
| 192 |
+
"subnetwork_context_no_interpretation": "No interpretation available.",
|
| 193 |
+
"subnetwork_context_upstream_header": "\nKey Upstream Features (Causes) in this Subgraph:",
|
| 194 |
+
"subnetwork_context_downstream_header": "\nKey Downstream Features (Effects) in this Subgraph:",
|
| 195 |
+
"subnetwork_context_feature_line": "- L{layer} {feature_name}: \"{interpretation}\"",
|
| 196 |
+
"subnetwork_context_depth": "The view shows connections within a depth of **{depth}** hops from the central feature (highlighted in crimson).",
|
| 197 |
+
"subnetwork_context_stats_header": "Subnetwork Statistics:",
|
| 198 |
+
"subnetwork_context_stats_nodes": "- **Nodes:** {nodes}",
|
| 199 |
+
"subnetwork_context_stats_edges": "- **Edges:** {edges}",
|
| 200 |
+
"subnetwork_context_viz_header": "The visualization shows:",
|
| 201 |
+
"subnetwork_context_viz_central": "- The central feature (crimson border) and its neighbors.",
|
| 202 |
+
"subnetwork_context_viz_nodes": "- Upstream nodes (causes) and downstream nodes (effects).",
|
| 203 |
+
"subnetwork_context_viz_lilac": "- Lilac nodes are input token embeddings.",
|
| 204 |
+
"subnetwork_context_viz_other": "- Other nodes are features, colored by activation strength (viridis scale).",
|
| 205 |
+
"subnetwork_context_viz_edges": "- Edge thickness represents connection weights.",
|
| 206 |
+
"feature_explorer_context_header": "This is a feature explorer visualization for the prompt: \"{prompt}\"",
|
| 207 |
+
"feature_explorer_context_model_header": "**Model Context:** The model is OLMo-2-7B, which has 32 layers (indexed 0-31). Layer 0 is the first layer (closest to input embeddings), and Layer 31 is the last layer (closest to the final output). Early layers (e.g., 0-10) handle basic patterns, while late layers (e.g., 22-31) handle more abstract concepts.",
|
| 208 |
+
"feature_explorer_context_analyzing_feature": "We are analyzing **Feature {feature}** in **Layer {layer}**, which is {position} layer in the model.",
|
| 209 |
+
"feature_explorer_context_analyzing_feature_no_pos": "We are analyzing **Feature {feature}** in **Layer {layer}**.",
|
| 210 |
+
"feature_explorer_context_position_early": "an early",
|
| 211 |
+
"feature_explorer_context_position_middle": "a middle",
|
| 212 |
+
"feature_explorer_context_position_late": "a late",
|
| 213 |
+
"feature_explorer_context_tokens": "**Input tokens:** {tokens}",
|
| 214 |
+
"feature_explorer_context_interpretation": "**Feature Interpretation:** \"{interpretation}\"",
|
| 215 |
+
"feature_explorer_context_no_interpretation": "No interpretation available.",
|
| 216 |
+
"feature_explorer_context_footer": "The bar chart shows which input tokens caused the highest activation for this specific feature within its layer. Analyze the relationship between the tokens and the feature's interpretation, keeping the layer's position in mind."
|
| 217 |
+
}
|
locales/en/common.json
ADDED
|
@@ -0,0 +1,45 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"llm_analysis_suite": "Explainable Language Interpretability Analysis Tool",
|
| 3 |
+
"main_menu": "Main Menu",
|
| 4 |
+
"attribution_analysis": "Attribution Analysis",
|
| 5 |
+
"function_vectors": "Function Vectors",
|
| 6 |
+
"circuit_tracing": "Circuit Tracing",
|
| 7 |
+
"language": "Language",
|
| 8 |
+
"unable_to_generate_explanation": "Unable to generate explanation at this time.",
|
| 9 |
+
"clear_cache_button": "Clear Cache & Rerun",
|
| 10 |
+
"q_influential_docs_plausibility_help": "How plausible did you find the documents identified by the Influence Tracer? (1=Not plausible, 5=Very plausible)",
|
| 11 |
+
"comprehension_qs_subheader": "Comprehension Questions",
|
| 12 |
+
"comprehension_qs_desc": "Please answer the following questions to the best of your ability based on your understanding of the visualizations.",
|
| 13 |
+
"submit_feedback_button": "Submit Feedback",
|
| 14 |
+
"feedback_success_message": "Thank you, your feedback has been submitted!",
|
| 15 |
+
"feedback_please_answer_all_qs": "Please answer all comprehension questions before submitting.",
|
| 16 |
+
|
| 17 |
+
"what_is_this_function_type": "What is this function type?",
|
| 18 |
+
"desc_abstractive_tasks": "These tasks require the model to generate new text that captures the essence of the source text, rather than just extracting parts of it. Examples include summarization or paraphrasing.",
|
| 19 |
+
"desc_multiple_choice_qa": "The model is given a question and a set of options, and it must choose the correct answer from the list. This tests reasoning and comprehension over a fixed set of choices.",
|
| 20 |
+
"desc_text_classification": "The model must assign a predefined category or label to a piece of text. Common examples include sentiment analysis (positive/negative), topic classification, or spam detection.",
|
| 21 |
+
"desc_extractive_tasks": "These tasks involve identifying and extracting a specific span of text directly from a given context. This is often used for question answering where the answer is explicitly stated in the text.",
|
| 22 |
+
"desc_named_entity_recognition": "A sub-task of extractive tasks where the model identifies and categorizes named entities such as people, organizations, locations, dates, and other specific terms in text.",
|
| 23 |
+
"desc_text_generation": "Open-ended text creation tasks where the model generates creative, coherent, or contextually appropriate text based on a prompt. Examples include writing a story, a poem, or continuing a paragraph.",
|
| 24 |
+
|
| 25 |
+
"likert_scale_meaning": "1 = Strongly Disagree/Not at all Clear, 5 = Strongly Agree/Very Clear",
|
| 26 |
+
"q1_pca_clarity": "How clear was the 3D PCA visualization?",
|
| 27 |
+
"q2_type_attribution_clarity": "How clear was the Function Type Attribution bar chart?",
|
| 28 |
+
"q_layer_evolution_plausibility": "How plausible did you find the Layer Evolution analysis (the way function changes across layers)?",
|
| 29 |
+
|
| 30 |
+
"ct_q_main_graph_clarity": "How clear was the main circuit graph visualization for understanding the overall information flow?",
|
| 31 |
+
"ct_q_feature_explorer_usefulness": "How useful was the Feature Explorer for understanding individual components?",
|
| 32 |
+
"ct_q_subnetwork_clarity": "How helpful was the Subnetwork view for tracing specific pathways?",
|
| 33 |
+
"ct_q1": "What is the primary role of the EARLY layers (e.g., 0-10) in circuit tracing?",
|
| 34 |
+
"ct_q1_option_a": "To synthesize final concepts and make complex decisions.",
|
| 35 |
+
"ct_q1_option_b": "To process basic patterns like syntax and word order from the input text.",
|
| 36 |
+
"ct_q1_option_c": "To link abstract ideas from different parts of the prompt together.",
|
| 37 |
+
"ct_q2": "What is the primary benefit of using the **Subnetwork Explorer** to focus on a single feature?",
|
| 38 |
+
"ct_q2_option_a": "To see all features in the model at once.",
|
| 39 |
+
"ct_q2_option_b": "To understand the local computational role of a feature by seeing its direct causes (inputs) and effects (outputs).",
|
| 40 |
+
"ct_q2_option_c": "To change the color and size of the nodes in the graph.",
|
| 41 |
+
"ct_q3": "If an early-layer feature (e.g., detecting syntax) strongly connects to a late-layer feature (e.g., identifying a concept), what does this pathway likely represent?",
|
| 42 |
+
"ct_q3_option_a": "The model using foundational grammar to build a more abstract, conceptual understanding.",
|
| 43 |
+
"ct_q3_option_b": "A random, meaningless connection that should be ignored.",
|
| 44 |
+
"ct_q3_option_c": "The model only paying attention to the final layers and ignoring early ones."
|
| 45 |
+
}
|
locales/en/function_vectors_page.json
ADDED
|
@@ -0,0 +1,164 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"fv_page_title": "<i class='bi bi-cpu'></i> Function Vector Analysis",
|
| 3 |
+
"fv_page_desc": "This page explores the concept of <strong>function vectors</strong>—high-dimensional representations of what a model 'understands' about a prompt's underlying purpose. By visualizing these vectors, we can see how the model groups similar tasks and instructions.",
|
| 4 |
+
"viz_dir_not_found_error": "Visualizations directory not found. Please run the function vector analysis first.",
|
| 5 |
+
"dataset_overview": "Dataset Overview",
|
| 6 |
+
"interactive_analysis_section_header": "<i class='bi bi-pencil-square'></i> Interactive Analysis",
|
| 7 |
+
"pca_3d_section_header": "<i class='bi bi-dice-3'></i> 3D PCA Visualization of Function Vectors",
|
| 8 |
+
"run_analysis_for_viz_info": "<i class='bi bi-info-circle'></i> Run an interactive analysis below to see your own prompt plotted in this space.",
|
| 9 |
+
"dataset_overview_desc_long": "The following examples are the prompts used to generate the vectorized dataset. This helps build intuition for how different tasks are represented in the model's vector space, which is visualized in the plot below.",
|
| 10 |
+
"try_your_own_subheader": "Or try one of these examples:",
|
| 11 |
+
"pca_box_title": "<i class='bi bi-box'></i> Interactive 3D Principal Component Analysis",
|
| 12 |
+
"pca_box_purpose": "<strong>Purpose:</strong> Reduces high-dimensional function vectors to 3D space while preserving maximum variance",
|
| 13 |
+
"pca_box_how_to": "<strong>How to interact:</strong> Click and drag to rotate the plot. Hover over points to see which category they belong to.",
|
| 14 |
+
"pca_box_features": "<strong>Key Features:</strong> 3D rotation • Zoom & pan • Hover details • Shape & color coding • Legend toggle",
|
| 15 |
+
"pca_box_elements": "<strong>Visual Elements:</strong> 🔵 Circles (Abstractive) • 🔷 Diamonds (QA) • 🟦 Squares (Classification) • ✖️ Crosses (Extractive) • 🔹 Open Diamonds (NER) • ⬜ Open Squares (Generation)",
|
| 16 |
+
"pca_box_best_for": "<strong>Best For:</strong> Understanding overall functional organization and dimensional relationships",
|
| 17 |
+
"generating_enhanced_pca_info": "🎯 Generating enhanced 3D PCA with your input!",
|
| 18 |
+
"error_creating_enhanced_pca": "Error creating enhanced PCA visualization: {e}",
|
| 19 |
+
"pca_3d_with_input_title": "3D PCA with Your Input<br><sub>Red star shows where your text sits in function space</sub>",
|
| 20 |
+
"your_input_legend": "Your Input",
|
| 21 |
+
"your_input_hover_title": "Your Input Text",
|
| 22 |
+
"your_input_analysis_desc": "🔍 **Your Input Analysis:** The red star shows where **\\\"{input_text}\\\"** sits in the 3D function space. Notice which function types it's closest to - this reveals what linguistic capabilities your text most strongly activates!",
|
| 23 |
+
"pca_3d_standard_title": "3D PCA of Function Categories<br><sub>Interactive visualization of functional relationships</sub>",
|
| 24 |
+
"standard_view_desc": "🔍 **Standard View:** This shows all 120 function categories in 3D space using actual computed vectors. Run an interactive analysis above to see your input as a red diamond in this visualization!",
|
| 25 |
+
"error_creating_standard_pca": "Error creating standard PCA visualization: {e}",
|
| 26 |
+
"pca_viz_not_found_warning": "3D PCA visualization not found. Please generate it using the analysis script.",
|
| 27 |
+
"pca_key_insights": "<strong>Key Insights:</strong> Notice how English translation tasks (English-German, English-Spanish, etc.) cluster together, and how different function types occupy distinct regions of the 3D space, revealing the model's internal functional organization.",
|
| 28 |
+
"error_loading_pca_viz": "Error loading 3D PCA visualization: {e}",
|
| 29 |
+
"interactive_analysis_box_title": "🔬 Interactive Function Vector & Layer Evolution Analysis",
|
| 30 |
+
"interactive_analysis_box_purpose": "<strong>Purpose:</strong> Analyze how your input text activates different linguistic functions from our balanced dataset of 120 categories",
|
| 31 |
+
"interactive_analysis_box_features": "<strong>Features:</strong> Real-time analysis • Function attribution across 6 types • Layer evolution • Token-level analysis • Visual outputs",
|
| 32 |
+
"interactive_analysis_box_model": "<strong>Model:</strong> OLMo-2-1124-7B analyzing against balanced function vectors (20 categories per function type)",
|
| 33 |
+
"interactive_analysis_box_best_for": "<strong>Best For:</strong> Understanding how specific text inputs activate and evolve functional representations across diverse linguistic tasks",
|
| 34 |
+
"input_text_header": "",
|
| 35 |
+
"input_text_label": "Enter your prompt",
|
| 36 |
+
"input_text_placeholder": "E.g., 'Translate 'Good morning' to German' or 'What is the capital of France?'",
|
| 37 |
+
"input_text_help": "Enter any text you want to analyze. The system will show which linguistic functions are activated and how they evolve through the model layers.",
|
| 38 |
+
"about_dataset_expander": "About the function vector dataset",
|
| 39 |
+
"balanced_dataset_title": "Dataset Composition",
|
| 40 |
+
"balanced_dataset_body": "The comparison dataset contains 600 prompts, covering 120 categories across 6 main function types.",
|
| 41 |
+
"analyze_button": "Analyze Text",
|
| 42 |
+
"running_analysis_spinner": "Running analysis...",
|
| 43 |
+
"analysis_failed_error": "Analysis failed. Please ensure the function vector data has been generated.",
|
| 44 |
+
"analysis_error": "Error during analysis: {e}",
|
| 45 |
+
"ensure_model_and_data_info": "Please ensure the OLMo-2-1124-7B model and function vector data are available.",
|
| 46 |
+
"example_queries_header": "<i class='bi bi-lightbulb'></i> Example Queries to Try",
|
| 47 |
+
"example_queries_desc": "*These examples showcase different function types from our balanced dataset:*",
|
| 48 |
+
"example_query_help": "Click to analyze: {example}",
|
| 49 |
+
"analysis_complete_success": "Analysis completed!",
|
| 50 |
+
"analyzed_text_header": "Analyzed Text",
|
| 51 |
+
"function_types_tab": "<i class='bi bi-bar-chart-line'></i> Function Type Attribution",
|
| 52 |
+
"category_analysis_tab": "<i class='bi bi-pie-chart'></i> Category Analysis",
|
| 53 |
+
"layer_evolution_tab": "<i class='bi bi-layers'></i> Layer Evolution Analysis",
|
| 54 |
+
"ai_explanation_header": "<i class='bi bi-robot'></i> AI-Powered Explanation",
|
| 55 |
+
"generating_ai_explanation_spinner": "Generating AI-powered explanation...",
|
| 56 |
+
"enable_ai_explanation_checkbox": "Enable AI Explanation",
|
| 57 |
+
"enable_ai_explanation_help": "Generate a natural language explanation of the analysis results using the Qwen-72B-VL model.",
|
| 58 |
+
"pca_explanation_prompt": "You are an expert AI analyst. Your task is to explain the positioning of a user's prompt on a 3D PCA plot of function vectors. The plot visualizes how a language model categorizes prompts based on their underlying function, with similar functions clustering together.\\n\\n**User's Prompt:** \"{input_text}\"\\n\\n**Analysis Data (Top 3 Closest Matches):**\\n- **Function Types:** {top_types}\\n- **Specific Categories:** {top_cats}\\n\\nBased on this data, please provide a concise, analytical explanation in three distinct parts. **Crucially, you MUST use markdown headings (`####`) for each part and follow the requested structure exactly.**\\n\\n#### Overall Placement\\nStart with a high-level summary of where the prompt is located in the PCA plot. Mention which general functional neighborhood it falls into.\\n\\n#### Top Function Type Attributions\\nAnalyze the top 3 most dominant function types. For each of the top 3 types, briefly explain why the user's prompt aligns with it, referencing the prompt's content and the nature of that function type.\\n\\n#### Top Specific Category Attribution\\nDiscuss the top 3 specific categories. For each category, briefly explain the connection and why it makes sense as a close neighbor to the user's prompt.\\n\\nStructure your answer with clear headings for each of the three parts. Ground your entire explanation in the provided data.",
|
| 59 |
+
"function_type_attribution_header": "This chart shows how strongly your input aligns with the six major function types defined in the model's training data. A higher score indicates a stronger match.",
|
| 60 |
+
"top_category_attribution_header": "This sunburst chart breaks down the attribution into more granular categories, showing the top 20 most similar functions to your input.",
|
| 61 |
+
"sunburst_chart_title": "Top 20 Category Attributions",
|
| 62 |
+
"missing_category_mapping_warning": "Some categories could not be assigned to a function type. They were skipped in the chart: {categories}",
|
| 63 |
+
"no_mapped_categories_info": "No categories with valid function-type mappings were available to display.",
|
| 64 |
+
"unmapped_function_type": "Unmapped Function Type",
|
| 65 |
+
"layer_evolution_header": "A language model isn't a single entity; it's composed of many sequential layers, much like a factory assembly line. When you provide a prompt, the information passes through each layer, getting progressively refined. Early layers handle basic syntax and word meanings, middle layers build more complex relationships, and final layers synthesize this information to produce an output. This analysis visualizes that journey, showing how the model's 'understanding' of your prompt evolves. The charts below reveal which parts of this 'assembly line' are most active for your specific text, offering clues into the model's reasoning process.",
|
| 66 |
+
"evolution_explanation_prompt": "You are an expert AI analyst. Your task is to explain two charts about layer evolution for a user's prompt.\\n\\n**User's Prompt:** \"{input_text}\"\\n\\n**Analysis Data:**\\n- **Peak Activation:** Layer {peak_activation_layer} (Strength: {peak_activation_strength:.2f})\\n- **Biggest Change:** Between Layer {biggest_change_start_layer} and {biggest_change_end_layer} (Change Magnitude: {biggest_change_magnitude:.2f})\\n\\nBased on this data, provide a detailed (2-3 sentences per part) explanation in two parts. **You MUST use markdown headings (`####`) for each part.**\\n\\n#### Activation Strength Analysis\\nExplain the significance of the peak activation occurring at layer {peak_activation_layer}. What does this suggest about the model's processing stage (e.g., early feature extraction, mid-level abstraction, or late-stage decision making)?\\n\\n#### Layer-to-Layer Change Analysis\\nExplain the significance of the largest change occurring between layers {biggest_change_start_layer} and {biggest_change_end_layer}. What does this shift imply about the model's processing?\\n\\nGround your explanation in the provided data.",
|
| 67 |
+
"attribution_score_xaxis": "Attribution Score (Cosine Similarity)",
|
| 68 |
+
"running_layer_evolution_spinner": "Running layer evolution analysis...",
|
| 69 |
+
"evolution_not_available_info": "Layer evolution analysis was not run or failed. Please enable it in the options and try again.",
|
| 70 |
+
"pca_3d_title": "3D PCA of {lang} Function Categories",
|
| 71 |
+
"legend_title": "Function Types",
|
| 72 |
+
"category_examples_desc": "",
|
| 73 |
+
"no_examples_for_type": "No examples available for this function type in the selected language.",
|
| 74 |
+
"prompt_examples_for_category": "Prompt Examples for {category}",
|
| 75 |
+
"no_examples_for_category_specific": "No examples available for this specific category.",
|
| 76 |
+
"function_types_subheader": "Function Types",
|
| 77 |
+
"select_function_type_label": "Select a function type to explore",
|
| 78 |
+
"prompt_examples_for_category_header": "Prompts Used for {category}",
|
| 79 |
+
"show_all_button": "Show all {count} categories",
|
| 80 |
+
"show_less_button": "Show less",
|
| 81 |
+
"abstractive_tasks": "Abstractive Tasks",
|
| 82 |
+
"multiple_choice_qa": "Multiple Choice QA",
|
| 83 |
+
"text_classification": "Text Classification",
|
| 84 |
+
"extractive_tasks": "Extractive Tasks",
|
| 85 |
+
"named_entity_recognition": "Named Entity Recognition",
|
| 86 |
+
"text_generation": "Text Generation",
|
| 87 |
+
"feedback_survey_header": "Feedback & Comprehension Survey",
|
| 88 |
+
"feedback_survey_desc": "Your feedback is valuable for improving this tool. Please take a moment to answer these questions.",
|
| 89 |
+
"ux_feedback_subheader": "User Experience Feedback",
|
| 90 |
+
"comprehension_subheader": "Comprehension Questions",
|
| 91 |
+
"likert_scale_meaning": "Rate on a scale of 1 (Not clear at all) to 5 (Very clear).",
|
| 92 |
+
"q1_pca_clarity": "How clear was the 3D PCA visualization for showing where your input fits among other functions?",
|
| 93 |
+
"q2_cognitive_load": "How mentally demanding did you find it to interpret the analysis results as a whole?",
|
| 94 |
+
"submit_feedback_button": "Submit Feedback",
|
| 95 |
+
"feedback_success_message": "Thank you for your feedback!",
|
| 96 |
+
"feedback_error_message": "Sorry, there was an error submitting your feedback: {e}",
|
| 97 |
+
"feedback_please_answer_all_qs": "Please answer all comprehension questions before submitting.",
|
| 98 |
+
"comprehension_qs_subheader": "Comprehension Questions",
|
| 99 |
+
"comprehension_qs_desc": "Please answer the following questions to the best of your ability. Your answers help us evaluate the clarity of the visualizations.",
|
| 100 |
+
|
| 101 |
+
"desc_text_generation": "Open-ended text generation, including creative writing or continuing a story.",
|
| 102 |
+
|
| 103 |
+
"how_vectors_are_made_header": "How Are These Vectors Created?",
|
| 104 |
+
"how_vectors_are_made_desc": "The process of creating a function vector is a multi-step pipeline that transforms raw text into a meaningful numerical representation. The diagram below illustrates this transformation, showing how a simple prompt is processed by the model to produce a vector that encapsulates its core function.",
|
| 105 |
+
"how_vectors_are_made_step1_title": "STEP 1: INPUT PROMPT",
|
| 106 |
+
"how_vectors_are_made_step2_title": "STEP 2: TOKENIZER",
|
| 107 |
+
"how_vectors_are_made_step3_title": "STEP 3: OLMo-2-7B MODEL",
|
| 108 |
+
"how_vectors_are_made_step3_desc": "Hidden States from all 32 Layers",
|
| 109 |
+
"how_vectors_are_made_step4_title": "STEP 4: FINAL LAYER EXTRACTION",
|
| 110 |
+
"how_vectors_are_made_step4_desc": "Vector of 4096 numbers",
|
| 111 |
+
"how_vectors_are_made_step5_title": "STEP 5: FUNCTION VECTOR",
|
| 112 |
+
"how_vectors_are_made_step1_example": "Translate 'Good morning' to German",
|
| 113 |
+
"how_vectors_are_made_step2_example": "[\"Translate\", \"'\", \"Good\", ..., \"German\"]",
|
| 114 |
+
|
| 115 |
+
"fv_q1": "What does a 'function vector' represent in this context?",
|
| 116 |
+
"fv_q1_option_a": "A single word from the input prompt.",
|
| 117 |
+
"fv_q1_option_b": "The grammatical structure of the prompt.",
|
| 118 |
+
"fv_q1_option_c": "A numerical fingerprint of the prompt's core purpose.",
|
| 119 |
+
|
| 120 |
+
"fv_q2": "What is the primary purpose of using Principal Component Analysis (PCA) for the 3D visualization?",
|
| 121 |
+
"fv_q2_option_a": "To make the plot look more colorful.",
|
| 122 |
+
"fv_q2_option_b": "To reduce high-dimensional vector data into a 3D space for visualization.",
|
| 123 |
+
"fv_q2_option_c": "To speed up the model's processing time.",
|
| 124 |
+
|
| 125 |
+
"fv_q3": "In the 3D PCA plot, what does the distance between two points indicate?",
|
| 126 |
+
"fv_q3_option_a": "The difference in length between two prompts.",
|
| 127 |
+
"fv_q3_option_c": "The functional similarity between the prompts (closer points are more similar).",
|
| 128 |
+
"fv_q3_option_d": "The number of layers activated by each prompt.",
|
| 129 |
+
"activation_strength_plot_title": "Activation Strength Across Layers",
|
| 130 |
+
"layer_changes_plot_title": "Representational Change Between Layers",
|
| 131 |
+
"fv_faithfulness_explanation_pca_html": "<div style='font-size: 0.9rem; margin-bottom: 1rem;'><strong>How This Works:</strong> The faithfulness checker verifies three types of claims from the AI's explanation:<ul><li><strong>Ranking Claims:</strong> Checks if a claimed 'most similar' function type or category is actually within the top 3 matches based on cosine similarity scores.</li><li><strong>Positional Claims:</strong> Semantically verifies if the AI's description of the input's position (e.g., 'near text classification') is a plausible summary of the actual top-ranked functions.</li><li><strong>Justification Claims:</strong> Semantically analyzes whether the reasoning provided for a category's relevance is plausible and logically consistent with the input prompt.</li></ul></div>",
|
| 132 |
+
"fv_faithfulness_explanation_evolution_html": "<div style='font-size: 0.9rem; margin-bottom: 1rem;'><strong>How This Works:</strong> The faithfulness checker verifies three types of claims from the AI's explanation:<ul><li><strong>Peak/Trough Claims:</strong> Checks if a claim about a peak event correctly identifies the layer where the event occurred.</li><li><strong>Numerical Claims:</strong> Checks if a specific numerical value mentioned in the explanation correctly matches the calculated value.</li><li><strong>Layer Claims:</strong> Checks if a claim correctly identifies the layer index for a specific metric.</li></ul></div>",
|
| 133 |
+
"fv_claim_extraction_prompt_header": "You are an expert claim extraction system. Your task is to read an explanation of a data visualization and extract all verifiable, factual claims into a structured JSON list. A single sentence may contain multiple claims.",
|
| 134 |
+
"fv_claim_extraction_prompt_instruction": "Each object in the list MUST have the following keys:\n1. `claim_text`: The exact sentence or phrase from the explanation that makes the claim.\n2. `claim_type`: One of the available claim types for the given context.\n3. `details`: An object containing the specific parameters for verification.",
|
| 135 |
+
"fv_claim_extraction_prompt_context_header": "**Context of this explanation:** {context}",
|
| 136 |
+
"fv_claim_extraction_prompt_types_header": "**Available Claim Types:**",
|
| 137 |
+
"fv_claim_extraction_prompt_pca_types_details": "- `top_k_similarity`: A claim that one or more function types/categories are the most similar to the input.\n - `details`: {{ \"item_type\": \"function_type\" or \"category\", \"items\": [\"...\"], \"rank_description\": \"most/least\" }}\n- `positional_claim`: A claim about the input's position relative to one or more clusters in the PCA plot.\n - `details`: {{ \"cluster_names\": [\"...\"], \"position\": \"near/far/between\" }}\n- `category_justification_claim`: A claim that provides a specific reason for a category's relevance to the input prompt.\n - `details`: {{ \"category_name\": \"...\", \"justification\": \"...\" }}",
|
| 138 |
+
"fv_claim_extraction_prompt_evolution_types_details": "- `peak_activation`: A claim about which layer had the highest activation strength.\n - `details`: {{ \"layer_index\": 12 }}\n- `biggest_change`: A claim about which layer transition had the biggest change.\n - `details`: {{ \"start_layer\": 10, \"end_layer\": 11 }}\n- `specific_value_claim`: A claim about a specific numerical value.\n - `details`: {{ \"metric\": \"activation_strength\" or \"change_magnitude\", \"layer_index\": 12, \"value\": 65.91 }}\n - **Note:** For \"change_magnitude\", `layer_index` refers to the **starting layer** of the transition (e.g., for layer 1->2, `layer_index` is 1).",
|
| 139 |
+
"fv_claim_extraction_prompt_pca_example_header": "**Example for a 'pca' context:**",
|
| 140 |
+
"fv_claim_extraction_prompt_pca_example_explanation": "- **Explanation sentence:** \"Specifically, it falls into a region characterized by abstractive tasks, text classification, and text generation.\"",
|
| 141 |
+
"fv_claim_extraction_prompt_pca_example_json": "- **Resulting JSON object:**\n ```json\n [\n {{\n \"claim_text\": \"Specifically, it falls into a region characterized by abstractive tasks, text classification, and text generation.\",\n \"claim_type\": \"positional_claim\",\n \"details\": {{\n \"cluster_names\": [\"abstractive tasks\", \"text classification\", \"text generation\"],\n \"position\": \"near\"\n }}\n }},\n {{\n \"claim_text\": \"The prompt is closely linked to Language QA because it involves answering a question about a literary work.\",\n \"claim_type\": \"category_justification_claim\",\n \"details\": {{\n \"category_name\": \"Language QA\",\n \"justification\": \"it involves answering a question about a literary work\"\n }}\n }}\n ]\n ```",
|
| 142 |
+
"fv_claim_extraction_prompt_evolution_example_header": "**Example for an 'evolution' context:**",
|
| 143 |
+
"fv_claim_extraction_prompt_evolution_example_explanation": "- **Explanation sentence:** \"The biggest change occurring between Layer 1 and 2, with a magnitude of 0.40...\"",
|
| 144 |
+
"fv_claim_extraction_prompt_evolution_example_json": "- **Resulting JSON object:**\n ```json\n [\n {{\n \"claim_text\": \"The biggest change occurring between Layer 1 and 2, with a magnitude of 0.40...\",\n \"claim_type\": \"biggest_change\",\n \"details\": {{ \"start_layer\": 1, \"end_layer\": 2 }}\n }},\n {{\n \"claim_text\": \"The biggest change occurring between Layer 1 and 2, with a magnitude of 0.40...\",\n \"claim_type\": \"specific_value_claim\",\n \"details\": {{ \"metric\": \"change_magnitude\", \"layer_index\": 1, \"value\": 0.40 }}\n }}\n ]\n ```",
|
| 145 |
+
"fv_claim_extraction_prompt_analyze_header": "**Explanation to Analyze:**",
|
| 146 |
+
"fv_claim_extraction_prompt_footer": "Respond with ONLY the JSON list of claims. If no verifiable claims are found, return an empty list `[]`.",
|
| 147 |
+
"fv_semantic_verification_prompt_header": "You are an AI fact-checker specializing in semantic analysis. Your task is to determine if a claimed \"functional neighborhood\" is plausibly related to the actual top-ranked functions for a given prompt.",
|
| 148 |
+
"fv_semantic_verification_prompt_rule": "**Crucial Rule:** The claimed neighborhood does not need to be a direct summary of the top functions. It should be considered \"verified\" if it represents a plausible, contextually relevant, or semantically adjacent concept. Flag as \"not verified\" if the claimed neighborhood is unrelated or logically inconsistent with the top functions.",
|
| 149 |
+
"fv_semantic_verification_prompt_actual_header": "**Actual Top-Ranked Functions:**",
|
| 150 |
+
"fv_semantic_verification_prompt_claimed_header": "**Claimed Functional Neighborhood:**",
|
| 151 |
+
"fv_semantic_verification_prompt_task_header": "**Your Task:**",
|
| 152 |
+
"fv_semantic_verification_prompt_task_instruction": "Based on the rule above, is the \"Claimed Functional Neighborhood\" plausibly related to the \"Actual Top-Ranked Functions\"? Give a clear verdict and cite concrete evidence.",
|
| 153 |
+
"fv_semantic_verification_prompt_json_instruction": "Respond with a JSON object with two keys:\n1. `is_verified`: boolean (true if plausibly related, false otherwise).\n2. `reasoning`: A detailed 2-3 sentence explanation that references at least one item from the actual list, describes why the claim aligns or conflicts, and avoids simply repeating the claim verbatim.",
|
| 154 |
+
"fv_semantic_verification_prompt_footer": "Respond with ONLY the JSON object and nothing else.",
|
| 155 |
+
"fv_justification_verification_prompt_header": "You are an AI fact-checker specializing in semantic reasoning. Your task is to determine if a justification for a functional category's relevance to an input prompt is plausible and logically consistent.",
|
| 156 |
+
"fv_justification_verification_prompt_rule": "**Crucial Rule:** The justification does not need to be the strongest possible argument. It should be considered \"verified\" if it presents a plausible, creative, or contextually relevant connection, even if it seems like a stretch. Only flag it as \"not verified\" if the reasoning is completely illogical, factually incorrect, or directly contradicts the prompt.",
|
| 157 |
+
"fv_justification_verification_prompt_input_header": "**Input Prompt:**",
|
| 158 |
+
"fv_justification_verification_prompt_category_header": "**Functional Category:**",
|
| 159 |
+
"fv_justification_verification_prompt_justification_header": "**Provided Justification:**",
|
| 160 |
+
"fv_justification_verification_prompt_task_header": "**Your Task:**",
|
| 161 |
+
"fv_justification_verification_prompt_task_instruction": "Based on the rule above, is the justification plausible? Refer directly to the prompt and category when explaining your decision.",
|
| 162 |
+
"fv_justification_verification_prompt_json_instruction": "Respond with a JSON object with two keys:\n1. `is_verified`: boolean (true if the justification is plausible, false if it is illogical or incorrect).\n2. `reasoning`: A 2-3 sentence explanation that explicitly references the input prompt and category, and explains why the justification holds or fails without merely echoing the original wording.",
|
| 163 |
+
"fv_justification_verification_prompt_footer": "Respond with ONLY the JSON object and nothing else."
|
| 164 |
+
}
|
locales/en/welcome_page.json
ADDED
|
@@ -0,0 +1,39 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"welcome_page_title": "Welcome & Setup",
|
| 3 |
+
"welcome_page_header": "Before you begin...",
|
| 4 |
+
"welcome_page_intro": "To help with our research on the usability of this tool, please provide some anonymous information. This will be stored securely and used only for academic purposes.",
|
| 5 |
+
"research_tool_intro": "An advanced research tool for exploring the inner workings of Large Language Models.",
|
| 6 |
+
"about_this_tool": "About This Tool",
|
| 7 |
+
"research_study_info": "This application is part of a research study aiming to understand how users interact with and interpret complex AI models. By using this tool, you are participating in this study.",
|
| 8 |
+
"your_role": "Your Role as a Participant:",
|
| 9 |
+
"role_1": "You will use the different analysis tools to explore the behavior of a language model.",
|
| 10 |
+
"role_2": "You will be asked to provide feedback on the usability and clarity of the visualizations.",
|
| 11 |
+
"role_3": "Your interactions and feedback will help us build better, more transparent AI tools.",
|
| 12 |
+
"data_privacy": "Data Privacy & Consent:",
|
| 13 |
+
"privacy_1": "Your responses and interactions are anonymous. We will only store your age, expertise level, and feedback.",
|
| 14 |
+
"privacy_2": "All collected data will be used exclusively for academic research purposes.",
|
| 15 |
+
"privacy_3": "By proceeding, you consent to the collection and use of this anonymous data.",
|
| 16 |
+
"tell_us_about_yourself": "Tell Us About Yourself",
|
| 17 |
+
"what_is_your_age_group": "What is your age group?",
|
| 18 |
+
"under_18": "Under 18",
|
| 19 |
+
"18_24": "18-24",
|
| 20 |
+
"25_34": "25-34",
|
| 21 |
+
"35_44": "35-44",
|
| 22 |
+
"45_54": "45-54",
|
| 23 |
+
"55_64": "55-64",
|
| 24 |
+
"65_or_over": "65 or over",
|
| 25 |
+
"prefer_not_to_say": "Prefer not to say",
|
| 26 |
+
"rate_your_expertise": "How would you rate your expertise with AI and language models?",
|
| 27 |
+
"novice": "Novice (Limited to no experience with AI tools)",
|
| 28 |
+
"intermediate": "Intermediate (Comfortable using AI for everyday tasks)",
|
| 29 |
+
"expert": "Expert (Deep technical knowledge or research in AI)",
|
| 30 |
+
"start_analysis_button": "Start Analysis",
|
| 31 |
+
"form_submitted": "form_submitted",
|
| 32 |
+
"thank_you_proceed": "Thank you! You can now proceed to the analysis.",
|
| 33 |
+
"thank_you_main_suite": "Thank you! Loading the main analysis suite...",
|
| 34 |
+
"welcome_to_llm_analysis_suite": "Welcome to the Explainable Language Interpretability Analysis Tool!",
|
| 35 |
+
"toolkit_description": "This toolkit offers a collection of advanced methods to interpret and understand the inner workings of language models. Select an analysis from the sidebar to begin.",
|
| 36 |
+
"attribution_analysis_description": "<strong>Attribution Analysis:</strong> Understand which parts of the input text influence the model's output using methods like Integrated Gradients, Occlusion, and Saliency.",
|
| 37 |
+
"function_vectors_description": "<strong>Function Vectors:</strong> Analyze how text activates different functional capabilities within the model.",
|
| 38 |
+
"circuit_tracing_description": "<strong>Circuit Tracing:</strong> Explore the computational pathways inside the model to see how information flows."
|
| 39 |
+
}
|
packages.txt
ADDED
|
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
|
|
|
| 1 |
+
libgomp1
|
| 2 |
+
|
requirements.txt
ADDED
|
@@ -0,0 +1,20 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
streamlit>=1.28.0
|
| 2 |
+
streamlit-option-menu>=0.3.0
|
| 3 |
+
torch>=2.0.0
|
| 4 |
+
transformers>=4.30.0
|
| 5 |
+
inseq>=0.5.0
|
| 6 |
+
pandas>=1.5.0
|
| 7 |
+
numpy>=1.24.0
|
| 8 |
+
scikit-learn>=1.3.0
|
| 9 |
+
plotly>=5.15.0
|
| 10 |
+
requests>=2.25.0
|
| 11 |
+
beautifulsoup4>=4.11.0
|
| 12 |
+
Pillow>=9.0.0
|
| 13 |
+
markdown>=3.0.0
|
| 14 |
+
faiss-cpu>=1.7.0
|
| 15 |
+
sentence-transformers>=2.2.0
|
| 16 |
+
sentence-splitter>=1.0.0
|
| 17 |
+
thefuzz>=0.19.0
|
| 18 |
+
python-Levenshtein>=0.20.0
|
| 19 |
+
networkx>=3.0
|
| 20 |
+
matplotlib>=3.6.0
|
run_webapp.py
ADDED
|
@@ -0,0 +1,40 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# A simple launcher script for the web app.
|
| 2 |
+
|
| 3 |
+
import subprocess
|
| 4 |
+
import sys
|
| 5 |
+
import os
|
| 6 |
+
|
| 7 |
+
def main():
    """Launch the Streamlit web app, installing dependencies if needed.

    Performs three steps:
    1. Verifies ``web_app.py`` exists in the current working directory
       (the launcher must be run from the project root); returns early
       with an error message otherwise.
    2. Verifies ``streamlit`` is importable; if not, installs the
       dependencies from ``requirements.txt`` via pip.
    3. Runs ``streamlit run web_app.py``, treating Ctrl+C as a normal
       shutdown and reporting a missing ``streamlit`` executable.
    """
    print("LLM Attribution Analysis Web App")
    print("=" * 50)

    # Check if the script is being run from the correct directory.
    if not os.path.exists("web_app.py"):
        print("Error: web_app.py not found!")
        print("Please run this script from the Bachelor Arbeit directory.")
        return

    # Check if streamlit is installed; install dependencies on first run.
    try:
        import streamlit
        print("Streamlit found")
    except ImportError:
        print("Streamlit not found. Installing dependencies...")
        # Use sys.executable so pip targets the interpreter running this script.
        subprocess.run([sys.executable, "-m", "pip", "install", "-r", "requirements.txt"])

    print("Starting the web application...")
    print("The app will open in your browser at http://localhost:8501")
    # Fixed typo in the original message: "isn this terminal" -> "in this terminal".
    print("To stop the app, press Ctrl+C in this terminal")
    print("=" * 50)

    # Run the streamlit app. KeyboardInterrupt is the expected shutdown path.
    try:
        subprocess.run(["streamlit", "run", "web_app.py"])
    except KeyboardInterrupt:
        print("\nWeb app stopped. Goodbye!")
    except FileNotFoundError:
        # `streamlit` is importable but its CLI entry point is not on PATH.
        print("Error: streamlit command not found.")
        print("Please install streamlit: pip install streamlit")
| 39 |
+
# Standard script entry-point guard: run the launcher only when executed
# directly, not when imported as a module.
if __name__ == "__main__":
    main()
|