wenbopan committed
Commit 55b60a8 · 1 Parent(s): 12ad26e

Sync FlashTrace package from GitHub

This view is limited to 50 files because the commit contains too many changes.
Files changed (50)
  1. .gitattributes +6 -35
  2. .gitignore +33 -0
  3. .python-version +1 -0
  4. .vscode/launch.json +25 -0
  5. LICENSE +21 -0
  6. MANIFEST.in +5 -0
  7. README.md +293 -0
  8. attribution_datasets.py +265 -0
  9. docs/superpowers/plans/2026-05-03-flashtrace-public-package.md +1605 -0
  10. docs/superpowers/specs/2026-05-03-flashtrace-public-package-design.md +231 -0
  11. dump_exp2_hop_vh.py +412 -0
  12. evaluations/attribution_recovery.py +490 -0
  13. evaluations/attribution_recovery.sh +18 -0
  14. evaluations/faithfulness.py +491 -0
  15. evaluations/faithfulness.sh +80 -0
  16. example.ipynb +0 -0
  17. examples/quickstart.py +44 -0
  18. exp/case_study/README.md +152 -0
  19. exp/case_study/analysis.py +74 -0
  20. exp/case_study/faithfulness_trace.py +183 -0
  21. exp/case_study/run_ifr_case.py +1225 -0
  22. exp/case_study/run_mas_case.py +805 -0
  23. exp/case_study/viz.py +647 -0
  24. exp/exp1/README.md +46 -0
  25. exp/exp1/run_time_curve.py +757 -0
  26. exp/exp2/DATASETS.md +231 -0
  27. exp/exp2/README.md +106 -0
  28. exp/exp2/dataset_utils.py +386 -0
  29. exp/exp2/map_math_mine_to_exp2_cache.py +584 -0
  30. exp/exp2/migrate_indices_to_explain_token_span.py +129 -0
  31. exp/exp2/out.log +102 -0
  32. exp/exp2/run_exp.py +1296 -0
  33. exp/exp2/sample_and_filter.py +363 -0
  34. exp/exp3/README.md +50 -0
  35. exp/exp3/extract_segment_weights.py +250 -0
  36. exp/exp3/part_weights.py +228 -0
  37. exp/exp3/run_exp.py +430 -0
  38. exp/exp3/sample_and_filter.py +628 -0
  39. exp/exp4/README.md +85 -0
  40. exp/exp4/run_exp.py +487 -0
  41. exp/exp5/README.md +119 -0
  42. exp/exp5/map_exp2_cache_token_spans.py +407 -0
  43. exp/proc/README.md +98 -0
  44. exp/proc/map_exp2_traces_to_proc.py +411 -0
  45. exp/proc_1/README.md +72 -0
  46. exp/proc_1/map_exp2_traces_to_proc_1.py +338 -0
  47. flashtrace/__init__.py +7 -0
  48. flashtrace/attribution.py +0 -0
  49. flashtrace/baselines/__init__.py +5 -0
  50. flashtrace/baselines/attnlrp.py +12 -0
.gitattributes CHANGED
@@ -1,35 +1,6 @@
1
- *.7z filter=lfs diff=lfs merge=lfs -text
2
- *.arrow filter=lfs diff=lfs merge=lfs -text
3
- *.bin filter=lfs diff=lfs merge=lfs -text
4
- *.bz2 filter=lfs diff=lfs merge=lfs -text
5
- *.ckpt filter=lfs diff=lfs merge=lfs -text
6
- *.ftz filter=lfs diff=lfs merge=lfs -text
7
- *.gz filter=lfs diff=lfs merge=lfs -text
8
- *.h5 filter=lfs diff=lfs merge=lfs -text
9
- *.joblib filter=lfs diff=lfs merge=lfs -text
10
- *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
- *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
- *.model filter=lfs diff=lfs merge=lfs -text
13
- *.msgpack filter=lfs diff=lfs merge=lfs -text
14
- *.npy filter=lfs diff=lfs merge=lfs -text
15
- *.npz filter=lfs diff=lfs merge=lfs -text
16
- *.onnx filter=lfs diff=lfs merge=lfs -text
17
- *.ot filter=lfs diff=lfs merge=lfs -text
18
- *.parquet filter=lfs diff=lfs merge=lfs -text
19
- *.pb filter=lfs diff=lfs merge=lfs -text
20
- *.pickle filter=lfs diff=lfs merge=lfs -text
21
- *.pkl filter=lfs diff=lfs merge=lfs -text
22
- *.pt filter=lfs diff=lfs merge=lfs -text
23
- *.pth filter=lfs diff=lfs merge=lfs -text
24
- *.rar filter=lfs diff=lfs merge=lfs -text
25
- *.safetensors filter=lfs diff=lfs merge=lfs -text
26
- saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
- *.tar.* filter=lfs diff=lfs merge=lfs -text
28
- *.tar filter=lfs diff=lfs merge=lfs -text
29
- *.tflite filter=lfs diff=lfs merge=lfs -text
30
- *.tgz filter=lfs diff=lfs merge=lfs -text
31
- *.wasm filter=lfs diff=lfs merge=lfs -text
32
- *.xz filter=lfs diff=lfs merge=lfs -text
33
- *.zip filter=lfs diff=lfs merge=lfs -text
34
- *.zst filter=lfs diff=lfs merge=lfs -text
35
- *tfevents* filter=lfs diff=lfs merge=lfs -text
 
1
+ # Keep language stats focused on the public Python package.
2
+ *.ipynb linguist-vendored
3
+ *.html linguist-generated
4
+ exp/** linguist-vendored
5
+ evaluations/** linguist-vendored
6
+ docs/** linguist-documentation
 
.gitignore ADDED
@@ -0,0 +1,33 @@
1
+ # Python-generated files
2
+ __pycache__/
3
+ *.py[oc]
4
+ build/
5
+ dist/
6
+ wheels/
7
+ *.egg-info
8
+
9
+ # Virtual environments
10
+ .venv
11
+
12
+ # Local data
13
+ data/
14
+
15
+ # dev
16
+ AGENTS.md
17
+ readme_dev.md
18
+ .superpowers/
19
+ contribute/ruler/
20
+ repos/.DS_Store
21
+ repomix-output.xml
22
+
23
+ # FlashTrace generated artifacts
24
+ trace.json
25
+ trace.html
26
+ *.trace.json
27
+ *.trace.html
28
+ exp/**/output/
29
+ exp/**/out/
30
+ exp/**/out-*/
31
+ *.npz
32
+ .DS_Store
33
+ repomix-output.xml
.python-version ADDED
@@ -0,0 +1 @@
1
+ 3.13
.vscode/launch.json ADDED
@@ -0,0 +1,25 @@
1
+ {
2
+ "version": "0.2.0",
3
+ "configurations": [
4
+ {
5
+ "name": "Faithfulness Eval",
6
+ "type": "debugpy",
7
+ "request": "launch",
8
+ "module": "evaluations.faithfulness",
9
+ "args": [
10
+ "--model",
11
+ "qwen-4B",
12
+ "--cuda_num",
13
+ "1",
14
+ "--num_examples",
15
+ "500",
16
+ "--attr_func",
17
+ "IG",
18
+ "--dataset",
19
+ "facts"
20
+ ],
21
+ "console": "integratedTerminal",
22
+ "justMyCode": true
23
+ }
24
+ ]
25
+ }
LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Wenbo Pan
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
MANIFEST.in ADDED
@@ -0,0 +1,5 @@
1
+ include README.md
2
+ include LICENSE
3
+ include pyproject.toml
4
+ include examples/*.py
5
+ include tests/*.py
README.md ADDED
@@ -0,0 +1,293 @@
1
+ <p align="center">
2
+ <img src="https://raw.githubusercontent.com/wbopan/flashtrace/master/docs/assets/flashtrace-logo.png" alt="FlashTrace logo" width="160">
3
+ </p>
4
+
5
+ <h1 align="center">FlashTrace</h1>
6
+
7
+ <p align="center">
8
+ <em>Fast token attribution for reasoning language models.</em>
9
+ </p>
10
+
11
+ <p align="center">
12
+ <a href="https://pypi.org/project/flashtrace/"><img alt="PyPI" src="https://img.shields.io/pypi/v/flashtrace.svg?style=flat-square&logo=pypi&logoColor=white&label=PyPI"></a>
13
+ <a href="https://pypi.org/project/flashtrace/"><img alt="Python" src="https://img.shields.io/pypi/pyversions/flashtrace.svg?style=flat-square&logo=python&logoColor=white"></a>
14
+ <a href="https://github.com/wbopan/flashtrace/blob/master/LICENSE"><img alt="License" src="https://img.shields.io/badge/License-MIT-blue.svg?style=flat-square"></a>
15
+ <a href="https://pytorch.org/"><img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-2.5%2B-EE4C2C.svg?style=flat-square&logo=pytorch&logoColor=white"></a>
16
+ <a href="https://arxiv.org/abs/2602.01914"><img alt="arXiv" src="https://img.shields.io/badge/arXiv-2602.01914-B31B1B.svg?style=flat-square&logo=arxiv&logoColor=white"></a>
17
+ </p>
18
+
19
+ FlashTrace traces generated answers back to the prompt tokens that shaped them. Use it from Python or the command line, export JSON traces, and render standalone HTML heatmaps for inspection and sharing.
20
+
21
+ <p align="center">
22
+ <a href="https://arxiv.org/abs/2602.01914">📄 Paper</a>
23
+ &nbsp;·&nbsp;
24
+ <a href="#quickstart">🚀 Quickstart</a>
25
+ &nbsp;·&nbsp;
26
+ <a href="#command-line">💻 CLI</a>
27
+ &nbsp;·&nbsp;
28
+ <a href="#citation">📝 Citation</a>
29
+ </p>
30
+
31
+ ## Why FlashTrace
32
+
33
+ Reasoning models produce long generated chains, final answers, and intermediate spans that deserve targeted inspection. FlashTrace gives researchers a package-first workflow for tracing a selected generated span back to its supporting prompt tokens.
34
+
35
+ You get:
36
+
37
+ - top-k prompt tokens ranked by attribution score
38
+ - JSON traces for downstream analysis
39
+ - standalone HTML token heatmaps
40
+ - optional per-hop attribution panels
41
+ - inclusive generation-token span controls for answer and reasoning segments
42
+
43
+ ## Install
44
+
45
+ From PyPI:
46
+
47
+ ```bash
48
+ pip install flashtrace
49
+ ```
50
+
51
+ From a local checkout:
52
+
53
+ ```bash
54
+ pip install -e .
55
+ ```
56
+
57
+ For development:
58
+
59
+ ```bash
60
+ pip install -e ".[dev]"
61
+ ```
62
+
63
+ FlashTrace uses PyTorch, Transformers, Accelerate, NumPy, and tqdm. A CUDA-capable GPU is recommended for public-scale Hugging Face models.
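+
+ A quick way to confirm the installation resolved correctly (assuming only the public exports listed in the Python API section below):
+
+ ```python
+ # Minimal post-install sanity check: import the public API surface.
+ import flashtrace
+
+ print(flashtrace.FlashTrace, flashtrace.TraceResult, flashtrace.load_model_and_tokenizer)
+ ```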
64
+
65
+ ## Quickstart
66
+
67
+ ```python
68
+ from flashtrace import FlashTrace, load_model_and_tokenizer
69
+
70
+ prompt = """Context: Paris is the capital of France.
71
+ Question: What is the capital of France?"""
72
+ target = "Paris"
73
+
74
+ model, tokenizer = load_model_and_tokenizer("Qwen/Qwen3-8B", device_map="auto")
75
+ tracer = FlashTrace(model, tokenizer, chunk_tokens=128, sink_chunk_tokens=32)
76
+
77
+ trace = tracer.trace(
78
+ prompt=prompt,
79
+ target=target,
80
+ output_span=(0, 0),
81
+ hops=1,
82
+ )
83
+
84
+ print(trace.topk_inputs(10))
85
+ trace.to_json("trace.json")
86
+ trace.to_html("trace.html")
87
+ ```
88
+
89
+ `trace.topk_inputs(10)` returns `TokenScore` objects aligned to prompt-token indices:
90
+
91
+ ```text
92
+ rank index token score
93
+ 1 2 Paris 0.184
94
+ 2 7 capital 0.131
95
+ 3 10 France 0.119
96
+ ```
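+
+ Each `TokenScore` carries the `index`, `token`, and `score` shown in the columns above, so the ranking can be consumed directly in Python rather than read from the printed table:
+
+ ```python
+ # Sketch: iterate the ranked TokenScore entries programmatically.
+ for item in trace.topk_inputs(3):
+     print(item.index, repr(item.token), round(item.score, 3))
+ ```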
97
+
98
+ `trace.html` is a standalone heatmap that highlights prompt tokens by final attribution score and includes trace metadata for the selected generated span.
99
+
100
+ `FlashTrace(..., use_chat_template=True)` formats prompts with the tokenizer chat template for chat-tuned models.
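+
+ A minimal sketch of the chat-template path, reusing the quickstart objects (whether a given chat model needs further generation settings is model-dependent):
+
+ ```python
+ # Format the prompt with the tokenizer's chat template before tracing.
+ tracer = FlashTrace(model, tokenizer, use_chat_template=True)
+ trace = tracer.trace(prompt=prompt, target=target, output_span=(0, 0))
+ print(trace.topk_inputs(5))
+ ```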
101
+
102
+ ## Command Line
103
+
104
+ Create prompt and target files:
105
+
106
+ ```bash
107
+ printf "Context: Paris is the capital of France.\nQuestion: What is the capital of France?\n" > prompt.txt
108
+ printf "Paris" > target.txt
109
+ ```
110
+
111
+ Run a trace:
112
+
113
+ ```bash
114
+ flashtrace trace \
115
+ --model Qwen/Qwen3-8B \
116
+ --prompt prompt.txt \
117
+ --target target.txt \
118
+ --output-span 0:0 \
119
+ --hops 1 \
120
+ --html trace.html \
121
+ --json trace.json
122
+ ```
123
+
124
+ The command prints a compact top-k table and writes the requested artifacts.
125
+
126
+ Useful flags:
127
+
128
+ - `--model`: Hugging Face model id or local model path
129
+ - `--prompt`: UTF-8 prompt text file
130
+ - `--target`: UTF-8 target text file
131
+ - `--output-span`: inclusive `START:END` indices over generated tokens
132
+ - `--reasoning-span`: inclusive `START:END` indices for a reasoning segment
133
+ - `--method`: `flashtrace`, `ifr-span`, or `ifr-matrix`
134
+ - `--recompute-attention`: lower-memory attention recomputation path
135
+ - `--use-chat-template`: format prompts with the tokenizer chat template
136
+ - `--device-map`: Transformers device map, default `auto`
137
+ - `--dtype`: `auto`, `float16`, `bfloat16`, or `float32`
138
+
139
+ ## Token Spans
140
+
141
+ `output_span` and `reasoning_span` use inclusive generation-token indices. The first generated token has index `0`.
142
+
143
+ Use an initial trace to inspect tokenization:
144
+
145
+ ```python
146
+ for index, token in enumerate(trace.generation_tokens):
147
+     print(index, repr(token))
148
+ ```
149
+
150
+ Then choose spans:
151
+
152
+ ```python
153
+ trace = tracer.trace(
154
+ prompt=prompt,
155
+ target=target,
156
+ reasoning_span=(0, 79),
157
+ output_span=(80, 85),
158
+ hops=1,
159
+ )
160
+ ```
161
+
162
+ Scores are aligned to `trace.prompt_tokens`. `trace.per_hop_scores` stores the same prompt-token alignment for each hop.
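+
+ A short sketch of that alignment, assuming the spans chosen above:
+
+ ```python
+ # Final scores line up one-to-one with prompt tokens...
+ for token, score in zip(trace.prompt_tokens, trace.scores):
+     print(f"{score:8.4f} {token!r}")
+
+ # ...and each per-hop panel uses the same prompt-token indexing.
+ for hop, hop_scores in enumerate(trace.per_hop_scores):
+     assert len(hop_scores) == len(trace.prompt_tokens)
+ ```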
163
+
164
+ ## Interpreting Results
165
+
166
+ High-scoring prompt tokens are the tokens FlashTrace attributes most strongly to the selected generated span. For answer inspection, use `output_span` around the final answer tokens. For chain-of-thought or reasoning inspection, use `reasoning_span` around the generated reasoning segment.
167
+
168
+ Recommended workflow:
169
+
170
+ 1. Run a trace with your prompt and target.
171
+ 2. Inspect `trace.generation_tokens`.
172
+ 3. Select the answer or reasoning span.
173
+ 4. Export `trace.html`.
174
+ 5. Compare top-k tokens with the source prompt and any expected evidence.
175
+
176
+ ## Supported Models
177
+
178
+ FlashTrace targets Llama/Qwen-style decoder-only Hugging Face causal LMs with the following structure (a rough compatibility check is sketched after this list):
179
+
180
+ - `model.layers`
181
+ - Q/K/V/O attention projections
182
+ - RMSNorm or LayerNorm
183
+ - RoPE metadata
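+
+ The sketch below is an illustration only; it assumes the standard `model.model.layers[i].self_attn.{q,k,v,o}_proj` layout of the Transformers Llama/Qwen implementations and is not an official compatibility API:
+
+ ```python
+ # Rough structural probe for a loaded causal LM.
+ layer = model.model.layers[0]
+ for name in ("q_proj", "k_proj", "v_proj", "o_proj"):
+     assert hasattr(layer.self_attn, name), f"missing attention projection: {name}"
+ ```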
184
+
185
+ Validated model families for the first public release:
186
+
187
+ - Qwen2
188
+ - Qwen3
189
+ - Llama
190
+
191
+ ## Python API
192
+
193
+ The public package exports:
194
+
195
+ ```python
196
+ from flashtrace import FlashTrace, TraceResult, load_model_and_tokenizer
197
+ ```
198
+
199
+ `FlashTrace.trace(...)` accepts the following keyword arguments (an illustrative call follows the list):
200
+
201
+ - `prompt: str`
202
+ - `target: str | None`
203
+ - `output_span: tuple[int, int] | None`
204
+ - `reasoning_span: tuple[int, int] | None`
205
+ - `hops: int`
206
+ - `method: "flashtrace" | "ifr-span" | "ifr-matrix"`
207
+ - `renorm_threshold: float | None`
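+
+ For illustration, a call that exercises most of these arguments (the span and hop values are placeholders):
+
+ ```python
+ trace = tracer.trace(
+     prompt=prompt,
+     target=target,
+     output_span=(80, 85),
+     reasoning_span=(0, 79),
+     hops=2,
+     method="ifr-span",
+ )
+ ```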
208
+
209
+ `TraceResult` includes:
210
+
211
+ - `prompt_tokens`
212
+ - `generation_tokens`
213
+ - `scores`
214
+ - `per_hop_scores`
215
+ - `thinking_ratios`
216
+ - `output_span`
217
+ - `reasoning_span`
218
+ - `method`
219
+ - `metadata`
220
+
221
+ Export helpers:
222
+
223
+ ```python
224
+ trace.topk_inputs(20)
225
+ trace.to_dict()
226
+ trace.to_json("trace.json")
227
+ trace.to_html("trace.html")
228
+ ```
229
+
230
+ ## Examples
231
+
232
+ ```bash
233
+ python examples/quickstart.py --help
234
+ python examples/quickstart.py \
235
+ --model Qwen/Qwen3-8B \
236
+ --prompt "Context: Paris is the capital of France. Question: What is the capital of France?" \
237
+ --target "Paris" \
238
+ --output-span 0:0 \
239
+ --html trace.html
240
+ ```
241
+
242
+ Heavy model examples are intended for GPU environments. CPU smoke tests use tiny randomly initialized models.
243
+
244
+ ## Repository Map
245
+
246
+ - `flashtrace/`: reusable Python package
247
+ - `examples/`: public quickstarts
248
+ - `tests/`: CPU smoke tests
249
+ - `exp/`: paper experiments and research artifacts
250
+ - `docs/superpowers/`: design and implementation planning documents
251
+
252
+ ## Research Experiments
253
+
254
+ The `exp/` directory contains the paper-era experiment runners, case studies, and saved artifacts. The public package API lives in `flashtrace/`; experiment scripts keep compatibility imports during the package migration.
255
+
256
+ ## Troubleshooting
257
+
258
+ **CUDA memory**
259
+
260
+ Use smaller models, lower precision, `device_map="auto"`, shorter prompts, or `--recompute-attention`.
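+
+ One possible lower-memory configuration from Python (chunk sizes are illustrative; the CLI equivalent of the last option is `--recompute-attention`):
+
+ ```python
+ model, tokenizer = load_model_and_tokenizer("Qwen/Qwen3-8B", device_map="auto")
+ tracer = FlashTrace(
+     model,
+     tokenizer,
+     chunk_tokens=64,           # smaller chunks trade speed for memory
+     sink_chunk_tokens=16,
+     recompute_attention=True,  # recompute attention instead of storing it
+ )
+ ```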
261
+
262
+ **Span selection**
263
+
264
+ Print `trace.generation_tokens` and select inclusive generated-token indices. Tokenization can split visible words into multiple model tokens.
265
+
266
+ **Deterministic generation**
267
+
268
+ Pass a `target` file for attribution against a known output. Leave `--target` out when you want the CLI to generate with deterministic defaults.
269
+
270
+ **Tokenizer alignment**
271
+
272
+ Inspect `trace.prompt_tokens` and `trace.generation_tokens` when scores appear shifted from visible text. Attribution scores follow tokenizer-level alignment.
273
+
274
+ **HTML export**
275
+
276
+ `trace.to_html("trace.html")` writes a standalone file that can be opened locally or shared as an artifact.
277
+
278
+ ## Paper
279
+
280
+ FlashTrace implements the method described in [Towards Long-Horizon Interpretability: Efficient and Faithful Multi-Token Attribution for Reasoning LLMs](https://arxiv.org/abs/2602.01914).
281
+
282
+ ## Citation
283
+
284
+ ```bibtex
285
+ @misc{pan2026flashtrace,
286
+ title={Towards Long-Horizon Interpretability: Efficient and Faithful Multi-Token Attribution for Reasoning LLMs},
287
+ author={Pan, Wenbo and Liu, Zhichao and Wang, Xianlong and Yu, Haining and Jia, Xiaohua},
288
+ year={2026},
289
+ eprint={2602.01914},
290
+ archivePrefix={arXiv},
291
+ primaryClass={cs.LG}
292
+ }
293
+ ```
attribution_datasets.py ADDED
@@ -0,0 +1,265 @@
1
+ from __future__ import annotations
2
+
3
+ import json
4
+ from dataclasses import dataclass, field
5
+ from pathlib import Path
6
+ from typing import Any, Dict, Iterable, Iterator, List, Optional, Sequence
7
+
8
+ # Import sentence splitter from shared utils; fallback when unavailable
9
+ try:
10
+ from shared_utils import create_sentences, create_sentences_fallback, nlp
11
+ except Exception:
12
+ from shared_utils import create_sentences_fallback as create_sentences
13
+ nlp = None
14
+
15
+
16
+ @dataclass
17
+ class AttributionExample:
18
+ prompt: str
19
+ target: Optional[str] = None
20
+ indices_to_explain: Optional[List[int]] = None
21
+ attr_mask_indices: Optional[List[int]] = None
22
+ metadata: Dict[str, Any] = field(default_factory=dict)
23
+
24
+
25
+ class AttributionDataset(Iterable[AttributionExample]):
26
+ """Base iterable for attribution-ready datasets."""
27
+
28
+ name: str = "dataset"
29
+
30
+ def __init__(self) -> None:
31
+ self.examples: List[AttributionExample] = []
32
+
33
+ def __iter__(self) -> Iterator[AttributionExample]:
34
+ return iter(self.examples)
35
+
36
+ def __len__(self) -> int: # pragma: no cover - trivial
37
+ return len(self.examples)
38
+
39
+ def __getitem__(self, item): # pragma: no cover - convenience
40
+ return self.examples[item]
41
+
42
+
43
+ def _add_dummy_facts_to_prompt(text_sentences: Sequence[str]) -> List[str]:
44
+ """
45
+ Reproduces the original behaviour of interleaving dummy sentences with the
46
+ provided text segments so attribution heads can be masked easily.
47
+ """
48
+ result: List[str] = []
49
+ for sentence in text_sentences:
50
+ result.append(sentence)
51
+ result.append(" Unrelated Sentence.")
52
+ return result
53
+
54
+
55
+ class MathAttributionDataset(AttributionDataset):
56
+ """Dataset wrapper for synthetic math problems with dummy context facts."""
57
+
58
+ name = "math"
59
+
60
+ def __init__(self, path: str | Path, tokenizer: Any) -> None:
61
+ super().__init__()
62
+ data_path = Path(path)
63
+ with data_path.open("r", encoding="utf-8") as f:
64
+ raw_examples = json.load(f)
65
+
66
+ for entry in raw_examples:
67
+ question_text = entry["question"]
68
+ sentences = create_sentences(question_text, tokenizer)
69
+ if not sentences:
70
+ continue
71
+
72
+ context_sentences = sentences[:-1]
73
+ question_sentence = sentences[-1]
74
+ if question_sentence.startswith(" "):
75
+ question_sentence = question_sentence[1:]
76
+
77
+ context_with_dummy = _add_dummy_facts_to_prompt(context_sentences)
78
+ question_with_dummy = _add_dummy_facts_to_prompt([question_sentence])
79
+
80
+ prompt = "".join(context_with_dummy) + "\n" + "".join(question_with_dummy)
81
+ total_sentences = len(context_with_dummy) + len(question_with_dummy)
82
+ attr_mask_indices = list(range(0, total_sentences, 2))
83
+
84
+ self.examples.append(
85
+ AttributionExample(
86
+ prompt=prompt,
87
+ target=None,
88
+ indices_to_explain=[-2],
89
+ attr_mask_indices=attr_mask_indices,
90
+ metadata={"raw_question": question_text},
91
+ )
92
+ )
93
+
94
+
95
+ class FactsAttributionDataset(AttributionDataset):
96
+ """Dataset wrapper for curated factual prompts with explicit gold attributions."""
97
+
98
+ name = "facts"
99
+
100
+ def __init__(self, path: str | Path) -> None:
101
+ super().__init__()
102
+ data_path = Path(path)
103
+ with data_path.open("r", encoding="utf-8") as f:
104
+ raw_examples = json.load(f)
105
+
106
+ for entry in raw_examples:
107
+ metadata = {
108
+ key: value
109
+ for key, value in entry.items()
110
+ if key not in {"prompt", "target", "indices_to_explain", "attr_mask_indices"}
111
+ }
112
+ self.examples.append(
113
+ AttributionExample(
114
+ prompt=entry["prompt"],
115
+ target=entry.get("target"),
116
+ indices_to_explain=entry.get("indices_to_explain"),
117
+ attr_mask_indices=entry.get("attr_mask_indices"),
118
+ metadata=metadata,
119
+ )
120
+ )
121
+
122
+
123
+ class MoreHopQAAttributionDataset(AttributionDataset):
124
+ """Dataset wrapper for multi-hop QA prompts without explicit gold attribution."""
125
+
126
+ name = "morehopqa"
127
+
128
+ def __init__(self, path: str | Path) -> None:
129
+ super().__init__()
130
+ data_path = Path(path)
131
+ with data_path.open("r", encoding="utf-8") as f:
132
+ raw_examples = json.load(f)
133
+
134
+ for entry in raw_examples:
135
+ context_chunks = ["".join(item[1]) for item in entry.get("context", [])]
136
+ context = " ".join(context_chunks)
137
+ prompt = context + "\n" + entry["question"]
138
+
139
+ self.examples.append(
140
+ AttributionExample(
141
+ prompt=prompt,
142
+ target=None,
143
+ indices_to_explain=[-2],
144
+ attr_mask_indices=None,
145
+ metadata={
146
+ "answer": entry.get("answer"),
147
+ "id": entry.get("_id"),
148
+ "original_context": entry.get("context"),
149
+ },
150
+ )
151
+ )
152
+
153
+
154
+ # added
155
+ class RulerAttributionDataset(AttributionDataset):
156
+ """Dataset wrapper for raw RULER JSONL files with needle spans.
157
+
158
+ Expects a JSONL file produced by repos/RULER (with added `needle_spans`).
159
+ Each line must contain at least: `input`, `answer_prefix`, `outputs`, and
160
+ optionally `needle_spans` with character spans relative to `input`.
161
+
162
+ Mapping logic:
163
+ - prompt = input + answer_prefix
164
+ - target = answer_prefix (+ optional space) + ", ".join(outputs)
165
+ - sentence indices computed over " " + prompt (leading space to match evaluator)
166
+ - each span is shifted by +1 to account for that leading space
167
+ - attr_mask_indices = union of all sentences covered by any span
168
+ - indices_to_explain = [0] when target is present
169
+ """
170
+
171
+ name = "ruler"
172
+
173
+ def __init__(self, path: str | Path) -> None:
174
+ super().__init__()
175
+ data_path = Path(path)
176
+ if not data_path.exists():
177
+ raise FileNotFoundError(f"RULER file not found: {data_path}")
178
+
179
+ # Use shared nlp pipeline; fallback to a naive splitter if unavailable
180
+ if nlp is not None:
181
+ def _sentence_bounds(text: str) -> List[tuple[int, int]]:
182
+ doc = nlp(text)
183
+ return [(s.start_char, s.end_char) for s in doc.sents]
184
+ else:
185
+ def _sentence_bounds(text: str) -> List[tuple[int, int]]:
186
+ # Naive fallback: split on newlines, produce contiguous ranges
187
+ bounds: List[tuple[int, int]] = []
188
+ start = 0
189
+ parts = text.split("\n")
190
+ for idx, part in enumerate(parts):
191
+ end = start + len(part)
192
+ if end > start:
193
+ bounds.append((start, end))
194
+ start = end + 1
195
+ if not bounds:
196
+ bounds = [(0, len(text))]
197
+ return bounds
198
+
199
+ def _map_spans(bounds: Sequence[tuple[int, int]], spans: Sequence[tuple[int, int]]) -> List[int]:
200
+ indices: set[int] = set()
201
+ for start, end in spans:
202
+ matched = False
203
+ for i, (bs, be) in enumerate(bounds):
204
+ if start >= bs and end <= be:
205
+ indices.add(i)
206
+ matched = True
207
+ break
208
+ if not matched:
209
+ # fallback: include all sentences with any overlap
210
+ for i, (bs, be) in enumerate(bounds):
211
+ if not (end <= bs or start >= be):
212
+ indices.add(i)
213
+ return sorted(indices)
214
+
215
+ def _read_jsonl(fp: Path) -> Iterator[Dict[str, Any]]:
216
+ with fp.open("r", encoding="utf-8") as f:
217
+ for line in f:
218
+ line = line.strip()
219
+ if line:
220
+ yield json.loads(line)
221
+
222
+ for entry in _read_jsonl(data_path):
223
+ input_text: str = entry.get("input", "")
224
+ answer_prefix: str = entry.get("answer_prefix", "")
225
+ outputs = entry.get("outputs", []) or []
226
+
227
+ # Build prompt/target
228
+ prompt = input_text + answer_prefix
229
+ if outputs:
230
+ sep = " " if answer_prefix and not answer_prefix.endswith((" ", "\n", "\t")) else ""
231
+ target = answer_prefix + sep + ", ".join(outputs)
232
+ else:
233
+ target = answer_prefix
234
+
235
+ # Sentence bounds over leading-space prompt to match evaluator
236
+ prompt_for_seg = " " + prompt
237
+ bounds = _sentence_bounds(prompt_for_seg)
238
+
239
+ # Collect spans and shift by +1 for the leading space
240
+ spans_raw = []
241
+ for item in entry.get("needle_spans", []) or []:
242
+ span = item.get("span")
243
+ if isinstance(span, list) and len(span) == 2:
244
+ spans_raw.append((int(span[0]) + 1, int(span[1]) + 1))
245
+
246
+ attr_indices = _map_spans(bounds, spans_raw) if spans_raw else None
247
+
248
+ self.examples.append(
249
+ AttributionExample(
250
+ prompt=prompt,
251
+ target=target or None,
252
+ indices_to_explain=[0] if target else None,
253
+ attr_mask_indices=attr_indices,
254
+ metadata={
255
+ "dataset": "ruler",
256
+ "length": entry.get("length"),
257
+ "length_w_model_temp": entry.get("length_w_model_temp"),
258
+ "outputs": outputs,
259
+ "answer_prefix": answer_prefix,
260
+ "token_position_answer": entry.get("token_position_answer"),
261
+ "needle_spans": entry.get("needle_spans"),
262
+ "prompt_sentence_count": len(bounds),
263
+ },
264
+ )
265
+ )
docs/superpowers/plans/2026-05-03-flashtrace-public-package.md ADDED
@@ -0,0 +1,1605 @@
1
+ # FlashTrace Public Package Implementation Plan
2
+
3
+ > **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking.
4
+
5
+ **Goal:** Build an installable `flashtrace` package with a stable Python API, CLI tracing command, JSON export, HTML heatmap export, README quickstart, and CPU smoke tests.
6
+
7
+ **Architecture:** Create a package-first structure while preserving temporary root compatibility wrappers for existing experiment scripts. Move the IFR implementation and attribution engines into `flashtrace/`, wrap them with `FlashTrace` and `TraceResult`, then expose a CLI and public examples.
8
+
9
+ **Tech Stack:** Python 3.10+, PyTorch, Transformers, Accelerate, NumPy, tqdm, argparse, pytest.
10
+
11
+ ---
12
+
13
+ ## File Structure
14
+
15
+ Create or modify these files:
16
+
17
+ - Create: `flashtrace/__init__.py` for public exports.
18
+ - Create: `flashtrace/core.py` from `ifr_core.py`.
19
+ - Create: `flashtrace/shared_utils.py` from `shared_utils.py`.
20
+ - Create: `flashtrace/lrp_rules.py` from `lrp_rules.py`.
21
+ - Create: `flashtrace/lrp_patches.py` from `lrp_patches.py`.
22
+ - Create: `flashtrace/attribution.py` from `llm_attr.py`.
23
+ - Create: `flashtrace/improved.py` from `ft_ifr_improve.py`.
24
+ - Create: `flashtrace/result.py` for `TokenScore` and `TraceResult`.
25
+ - Create: `flashtrace/viz.py` for standalone HTML token heatmaps.
26
+ - Create: `flashtrace/tracer.py` for the `FlashTrace` facade.
27
+ - Create: `flashtrace/model_io.py` for Hugging Face loading helpers.
28
+ - Create: `flashtrace/cli.py` for `flashtrace trace`.
29
+ - Create: `flashtrace/baselines/__init__.py`.
30
+ - Create: `flashtrace/baselines/attnlrp.py`.
31
+ - Modify: `ifr_core.py`, `shared_utils.py`, `lrp_rules.py`, `lrp_patches.py`, `llm_attr.py`, `ft_ifr_improve.py` into root compatibility wrappers.
32
+ - Modify: `pyproject.toml` package metadata and console script.
33
+ - Modify: `.gitignore` generated artifact rules.
34
+ - Create: `README.md`.
35
+ - Create: `LICENSE`.
36
+ - Create: `examples/quickstart.py`.
37
+ - Create: `tests/helpers.py`.
38
+ - Create: `tests/test_imports.py`.
39
+ - Create: `tests/test_core_recompute.py`.
40
+ - Create: `tests/test_result.py`.
41
+ - Create: `tests/test_tracer.py`.
42
+ - Create: `tests/test_cli.py`.
43
+ - Delete: `model_generation.py`.
44
+
45
+ ## Task 1: Package Metadata And Skeleton
46
+
47
+ **Files:**
48
+ - Modify: `pyproject.toml`
49
+ - Create: `flashtrace/__init__.py`
50
+ - Create: `flashtrace/tracer.py`
51
+ - Create: `flashtrace/result.py`
52
+ - Create: `flashtrace/model_io.py`
53
+ - Create: `flashtrace/cli.py`
54
+ - Create: `flashtrace/baselines/__init__.py`
55
+ - Create: `flashtrace/baselines/attnlrp.py`
56
+ - Test: `tests/test_imports.py`
57
+
58
+ - [ ] **Step 1: Write the failing public import test**
59
+
60
+ Create `tests/test_imports.py`:
61
+
62
+ ```python
63
+ def test_public_imports():
64
+ import flashtrace
65
+
66
+ assert flashtrace.FlashTrace.__name__ == "FlashTrace"
67
+ assert flashtrace.TraceResult.__name__ == "TraceResult"
68
+ assert callable(flashtrace.load_model_and_tokenizer)
69
+ ```
70
+
71
+ - [ ] **Step 2: Run the import test and see the expected failure**
72
+
73
+ Run:
74
+
75
+ ```bash
76
+ uv run pytest tests/test_imports.py -q
77
+ ```
78
+
79
+ Expected: pytest reports an import failure for `flashtrace`.
80
+
81
+ - [ ] **Step 3: Create package directories**
82
+
83
+ Run:
84
+
85
+ ```bash
86
+ mkdir -p flashtrace/baselines tests
87
+ ```
88
+
89
+ - [ ] **Step 4: Add minimal public package files**
90
+
91
+ Create `flashtrace/tracer.py`:
92
+
93
+ ```python
94
+ class FlashTrace:
95
+ """Public facade for FlashTrace attribution."""
96
+
97
+ def __init__(self, model, tokenizer, **kwargs):
98
+ self.model = model
99
+ self.tokenizer = tokenizer
100
+ self.options = dict(kwargs)
101
+ ```
102
+
103
+ Create `flashtrace/result.py`:
104
+
105
+ ```python
106
+ from __future__ import annotations
107
+
108
+ from dataclasses import dataclass
109
+
110
+
111
+ @dataclass(frozen=True)
112
+ class TraceResult:
113
+ """Public attribution result returned by FlashTrace."""
114
+
115
+ prompt_tokens: list[str]
116
+ generation_tokens: list[str]
117
+ scores: list[float]
118
+ ```
119
+
120
+ Create `flashtrace/model_io.py`:
121
+
122
+ ```python
123
+ def load_model_and_tokenizer(*args, **kwargs):
124
+ """Load a Hugging Face causal LM and tokenizer."""
125
+
126
+ raise RuntimeError("load_model_and_tokenizer will be implemented in the model IO task.")
127
+ ```
128
+
129
+ Create `flashtrace/cli.py`:
130
+
131
+ ```python
132
+ def main(argv=None):
133
+ """FlashTrace command-line entrypoint."""
134
+
135
+ raise RuntimeError("CLI will be implemented in the CLI task.")
136
+ ```
137
+
138
+ Create `flashtrace/baselines/__init__.py`:
139
+
140
+ ```python
141
+ """Baseline attribution methods for FlashTrace."""
142
+ ```
143
+
144
+ Create `flashtrace/baselines/attnlrp.py`:
145
+
146
+ ```python
147
+ """AttnLRP baseline exports."""
148
+ ```
149
+
150
+ Create `flashtrace/__init__.py`:
151
+
152
+ ```python
153
+ """FlashTrace: efficient multi-token attribution for reasoning LLMs."""
154
+
155
+ from .model_io import load_model_and_tokenizer
156
+ from .result import TraceResult
157
+ from .tracer import FlashTrace
158
+
159
+ __all__ = ["FlashTrace", "TraceResult", "load_model_and_tokenizer"]
160
+ ```
161
+
162
+ - [ ] **Step 5: Update package metadata**
163
+
164
+ Replace `pyproject.toml` with:
165
+
166
+ ```toml
167
+ [project]
168
+ name = "flashtrace"
169
+ version = "0.1.0"
170
+ description = "Efficient multi-token attribution for reasoning language models."
171
+ readme = "README.md"
172
+ requires-python = ">=3.10"
173
+ dependencies = [
174
+ "accelerate>=1.11.0",
175
+ "matplotlib>=3.6",
176
+ "networkx>=3.3",
177
+ "numpy>=2.0",
178
+ "seaborn>=0.11",
179
+ "spacy>=3.8",
180
+ "torch>=2.5",
181
+ "tqdm>=4.67",
182
+ "transformers>=4.53",
183
+ "wordfreq>=3.1.1",
184
+ ]
185
+
186
+ [project.optional-dependencies]
187
+ baselines = [
188
+ "bert-score>=0.3.13",
189
+ "evaluate>=0.4.6",
190
+ "sentence-transformers>=4.1.0",
191
+ ]
192
+ eval = [
193
+ "datasets>=2.21",
194
+ "evaluate>=0.4.6",
195
+ ]
196
+ dev = [
197
+ "pytest>=8.0",
198
+ ]
199
+
200
+ [project.scripts]
201
+ flashtrace = "flashtrace.cli:main"
202
+
203
+ [tool.setuptools.packages.find]
204
+ include = ["flashtrace*"]
205
+ ```
206
+
207
+ - [ ] **Step 6: Run the import test**
208
+
209
+ Run:
210
+
211
+ ```bash
212
+ uv run pytest tests/test_imports.py -q
213
+ ```
214
+
215
+ Expected: `1 passed`.
216
+
217
+ - [ ] **Step 7: Commit**
218
+
219
+ Run:
220
+
221
+ ```bash
222
+ git add pyproject.toml flashtrace tests/test_imports.py
223
+ git commit -m "feat: add flashtrace package skeleton"
224
+ ```
225
+
226
+ ## Task 2: Core IFR Migration
227
+
228
+ **Files:**
229
+ - Create: `flashtrace/core.py`
230
+ - Create: `flashtrace/shared_utils.py`
231
+ - Modify: `ifr_core.py`
232
+ - Modify: `shared_utils.py`
233
+ - Create: `tests/helpers.py`
234
+ - Create: `tests/test_core_recompute.py`
235
+
236
+ - [ ] **Step 1: Add the tiny-model test helper**
237
+
238
+ Create `tests/helpers.py`:
239
+
240
+ ```python
241
+ from __future__ import annotations
242
+
243
+ from tokenizers import Tokenizer, models, pre_tokenizers
244
+ from transformers import AutoConfig, AutoModelForCausalLM, PreTrainedTokenizerFast
245
+
246
+
247
+ def make_tiny_qwen2_model_and_tokenizer(
248
+ *,
249
+ n_layers: int = 3,
250
+ d_model: int = 48,
251
+ n_heads: int = 4,
252
+ n_kv_heads: int = 2,
253
+ max_pos: int = 128,
254
+ ):
255
+ config = AutoConfig.for_model(
256
+ "qwen2",
257
+ vocab_size=500,
258
+ hidden_size=d_model,
259
+ intermediate_size=d_model * 2,
260
+ num_hidden_layers=n_layers,
261
+ num_attention_heads=n_heads,
262
+ num_key_value_heads=n_kv_heads,
263
+ max_position_embeddings=max_pos,
264
+ use_sliding_window=False,
265
+ attn_implementation="eager",
266
+ )
267
+ model = AutoModelForCausalLM.from_config(config, attn_implementation="eager")
268
+ model.eval()
269
+
270
+ backend = Tokenizer(models.WordLevel(vocab={f"t{i}": i for i in range(500)}, unk_token="t0"))
271
+ backend.pre_tokenizer = pre_tokenizers.Whitespace()
272
+ tokenizer = PreTrainedTokenizerFast(tokenizer_object=backend, eos_token="t1", pad_token="t2")
273
+ tokenizer.chat_template = "{% for m in messages %}{{ m['content'] }}{% endfor %}"
274
+ return model, tokenizer
275
+ ```
276
+
277
+ - [ ] **Step 2: Write the failing core import smoke test**
278
+
279
+ Create `tests/test_core_recompute.py`:
280
+
281
+ ```python
282
+ import torch
283
+
284
+ from flashtrace import core
285
+ from tests.helpers import make_tiny_qwen2_model_and_tokenizer
286
+
287
+
288
+ def test_core_metadata_and_weight_pack():
289
+ model, _ = make_tiny_qwen2_model_and_tokenizer()
290
+
291
+ metadata = core.extract_model_metadata(model)
292
+ weight_pack = core.build_weight_pack(metadata, next(model.parameters()).dtype)
293
+
294
+ assert metadata.n_layers == 3
295
+ assert metadata.n_heads_q == 4
296
+ assert metadata.n_kv_heads == 2
297
+ assert len(weight_pack) == 3
298
+ assert torch.is_tensor(weight_pack[0]["v_w"])
299
+ ```
300
+
301
+ - [ ] **Step 3: Run the core smoke test and see the expected failure**
302
+
303
+ Run:
304
+
305
+ ```bash
306
+ uv run pytest tests/test_core_recompute.py::test_core_metadata_and_weight_pack -q
307
+ ```
308
+
309
+ Expected: pytest reports missing `flashtrace.core`.
310
+
311
+ - [ ] **Step 4: Copy the IFR core into the package**
312
+
313
+ Run:
314
+
315
+ ```bash
316
+ cp ifr_core.py flashtrace/core.py
317
+ ```
318
+
319
+ - [ ] **Step 5: Copy shared utilities into the package**
320
+
321
+ Run:
322
+
323
+ ```bash
324
+ cp shared_utils.py flashtrace/shared_utils.py
325
+ ```
326
+
327
+ - [ ] **Step 6: Replace root `ifr_core.py` with a compatibility wrapper**
328
+
329
+ Replace `ifr_core.py` with:
330
+
331
+ ```python
332
+ """Compatibility wrapper for package-era imports."""
333
+
334
+ from flashtrace.core import * # noqa: F401,F403
335
+ ```
336
+
337
+ - [ ] **Step 7: Replace root `shared_utils.py` with a compatibility wrapper**
338
+
339
+ Replace `shared_utils.py` with:
340
+
341
+ ```python
342
+ """Compatibility wrapper for package-era imports."""
343
+
344
+ from flashtrace.shared_utils import * # noqa: F401,F403
345
+ ```
346
+
347
+ - [ ] **Step 8: Run the core smoke test**
348
+
349
+ Run:
350
+
351
+ ```bash
352
+ uv run pytest tests/test_core_recompute.py::test_core_metadata_and_weight_pack -q
353
+ ```
354
+
355
+ Expected: `1 passed`.
356
+
357
+ - [ ] **Step 9: Commit**
358
+
359
+ Run:
360
+
361
+ ```bash
362
+ git add flashtrace/core.py flashtrace/shared_utils.py ifr_core.py shared_utils.py tests/helpers.py tests/test_core_recompute.py
363
+ git commit -m "feat: move IFR core into package"
364
+ ```
365
+
366
+ ## Task 3: Attribution Engine Migration
367
+
368
+ **Files:**
369
+ - Create: `flashtrace/lrp_rules.py`
370
+ - Create: `flashtrace/lrp_patches.py`
371
+ - Create: `flashtrace/attribution.py`
372
+ - Create: `flashtrace/improved.py`
373
+ - Modify: `lrp_rules.py`
374
+ - Modify: `lrp_patches.py`
375
+ - Modify: `llm_attr.py`
376
+ - Modify: `ft_ifr_improve.py`
377
+ - Modify: `flashtrace/baselines/attnlrp.py`
378
+ - Test: `tests/test_core_recompute.py`
379
+
380
+ - [ ] **Step 1: Extend the recompute test with package attribution paths**
381
+
382
+ Append to `tests/test_core_recompute.py`:
383
+
384
+ ```python
385
+ from flashtrace.attribution import LLMIFRAttribution
386
+
387
+
388
+ def test_package_attribution_recompute_matches_stored_attention():
389
+ model, tokenizer = make_tiny_qwen2_model_and_tokenizer(n_layers=2, d_model=32, n_heads=4, n_kv_heads=2)
390
+ prompt = "t10 t20 t30 t40"
391
+ target = "t60 t70"
392
+
393
+ stored = LLMIFRAttribution(model, tokenizer, recompute_attention=False).calculate_ifr_span(prompt, target)
394
+ recomputed = LLMIFRAttribution(model, tokenizer, recompute_attention=True).calculate_ifr_span(prompt, target)
395
+
396
+ diff = (stored.attribution_matrix - recomputed.attribution_matrix).abs().max().item()
397
+ assert diff < 1e-5
398
+ ```
399
+
400
+ - [ ] **Step 2: Run the package attribution test and see the expected failure**
401
+
402
+ Run:
403
+
404
+ ```bash
405
+ uv run pytest tests/test_core_recompute.py::test_package_attribution_recompute_matches_stored_attention -q
406
+ ```
407
+
408
+ Expected: pytest reports missing `flashtrace.attribution`.
409
+
410
+ - [ ] **Step 3: Copy LRP helpers and attribution engines into the package**
411
+
412
+ Run:
413
+
414
+ ```bash
415
+ cp lrp_rules.py flashtrace/lrp_rules.py
416
+ cp lrp_patches.py flashtrace/lrp_patches.py
417
+ cp llm_attr.py flashtrace/attribution.py
418
+ cp ft_ifr_improve.py flashtrace/improved.py
419
+ ```
420
+
421
+ - [ ] **Step 4: Update imports in `flashtrace/attribution.py`**
422
+
423
+ Edit package-local imports to this form:
424
+
425
+ ```python
426
+ from .core import (
427
+ IFRParameters,
428
+ ModelMetadata,
429
+ attach_hooks,
430
+ build_weight_pack,
431
+ compute_ifr_for_all_positions,
432
+ compute_ifr_sentence_aggregate,
433
+ compute_multi_hop_ifr,
434
+ extract_model_metadata,
435
+ )
436
+ from .shared_utils import (
437
+ DEFAULT_GENERATE_KWARGS,
438
+ DEFAULT_PROMPT_TEMPLATE,
439
+ create_sentences,
440
+ create_sentence_masks,
441
+ )
442
+ from .lrp_patches import lrp_context, detect_model_type
443
+ ```
444
+
445
+ - [ ] **Step 5: Update imports in `flashtrace/lrp_patches.py`**
446
+
447
+ Edit the LRP helper import to:
448
+
449
+ ```python
450
+ from .lrp_rules import stop_gradient, divide_gradient, identity_rule_implicit
451
+ ```
452
+
453
+ - [ ] **Step 6: Update imports in `flashtrace/improved.py`**
454
+
455
+ Edit the top-level package imports to:
456
+
457
+ ```python
458
+ from . import attribution as llm_attr
459
+ from .core import IFRAggregate, MultiHopIFRResult, compute_ifr_sentence_aggregate
460
+ ```
461
+
462
+ - [ ] **Step 7: Replace root compatibility modules**
463
+
464
+ Replace `lrp_rules.py` with:
465
+
466
+ ```python
467
+ """Compatibility wrapper for package-era imports."""
468
+
469
+ from flashtrace.lrp_rules import * # noqa: F401,F403
470
+ ```
471
+
472
+ Replace `lrp_patches.py` with:
473
+
474
+ ```python
475
+ """Compatibility wrapper for package-era imports."""
476
+
477
+ from flashtrace.lrp_patches import * # noqa: F401,F403
478
+ ```
479
+
480
+ Replace `llm_attr.py` with:
481
+
482
+ ```python
483
+ """Compatibility wrapper for package-era imports."""
484
+
485
+ from flashtrace.attribution import * # noqa: F401,F403
486
+ ```
487
+
488
+ Replace `ft_ifr_improve.py` with:
489
+
490
+ ```python
491
+ """Compatibility wrapper for package-era imports."""
492
+
493
+ from flashtrace.improved import * # noqa: F401,F403
494
+ ```
495
+
496
+ - [ ] **Step 8: Export the AttnLRP baseline**
497
+
498
+ Replace `flashtrace/baselines/attnlrp.py` with:
499
+
500
+ ```python
501
+ """AttnLRP baseline API."""
502
+
503
+ from flashtrace.attribution import AttnLRPSpanAggregate, LLMLRPAttribution, MultiHopAttnLRPResult
504
+ from flashtrace.lrp_patches import detect_model_type, lrp_context
505
+
506
+ __all__ = [
507
+ "AttnLRPSpanAggregate",
508
+ "LLMLRPAttribution",
509
+ "MultiHopAttnLRPResult",
510
+ "detect_model_type",
511
+ "lrp_context",
512
+ ]
513
+ ```
514
+
515
+ Replace `flashtrace/baselines/__init__.py` with:
516
+
517
+ ```python
518
+ """Baseline attribution methods for FlashTrace."""
519
+
520
+ from .attnlrp import LLMLRPAttribution
521
+
522
+ __all__ = ["LLMLRPAttribution"]
523
+ ```
524
+
525
+ - [ ] **Step 9: Run attribution migration tests**
526
+
527
+ Run:
528
+
529
+ ```bash
530
+ uv run pytest tests/test_core_recompute.py -q
531
+ ```
532
+
533
+ Expected: all tests in the file pass.
534
+
535
+ - [ ] **Step 10: Run a root compatibility import check**
536
+
537
+ Run:
538
+
539
+ ```bash
540
+ uv run python -c "import ifr_core, llm_attr, ft_ifr_improve; print(llm_attr.LLMIFRAttribution.__name__)"
541
+ ```
542
+
543
+ Expected: prints `LLMIFRAttribution`.
544
+
545
+ - [ ] **Step 11: Commit**
546
+
547
+ Run:
548
+
549
+ ```bash
550
+ git add flashtrace lrp_rules.py lrp_patches.py llm_attr.py ft_ifr_improve.py tests/test_core_recompute.py
551
+ git commit -m "feat: move attribution engines into package"
552
+ ```
553
+
554
+ ## Task 4: TraceResult And HTML Heatmap
555
+
556
+ **Files:**
557
+ - Modify: `flashtrace/result.py`
558
+ - Create: `flashtrace/viz.py`
559
+ - Create: `tests/test_result.py`
560
+
561
+ - [ ] **Step 1: Write result object tests**
562
+
563
+ Create `tests/test_result.py`:
564
+
565
+ ```python
566
+ import json
567
+
568
+ from flashtrace.result import TokenScore, TraceResult
569
+
570
+
571
+ def make_result():
572
+ return TraceResult(
573
+ prompt_tokens=[" alpha", " beta", " gamma"],
574
+ generation_tokens=[" answer"],
575
+ scores=[0.2, 0.7, 0.1],
576
+ per_hop_scores=[[0.1, 0.4, 0.0], [0.1, 0.3, 0.1]],
577
+ thinking_ratios=[0.5, 0.2],
578
+ output_span=(0, 0),
579
+ reasoning_span=(0, 0),
580
+ method="flashtrace",
581
+ metadata={"model": "tiny"},
582
+ )
583
+
584
+
585
+ def test_topk_inputs_sorted():
586
+ result = make_result()
587
+
588
+ top = result.topk_inputs(2)
589
+
590
+ assert top == [
591
+ TokenScore(index=1, token=" beta", score=0.7),
592
+ TokenScore(index=0, token=" alpha", score=0.2),
593
+ ]
594
+
595
+
596
+ def test_to_dict_is_json_serializable():
597
+ result = make_result()
598
+
599
+ payload = result.to_dict()
600
+
601
+ assert payload["method"] == "flashtrace"
602
+ assert payload["top_inputs"][0]["token"] == " beta"
603
+ json.dumps(payload)
604
+
605
+
606
+ def test_to_dict_sanitizes_tensor_metadata():
607
+ import torch
608
+
609
+ result = TraceResult(
610
+ prompt_tokens=[" alpha"],
611
+ generation_tokens=[" answer"],
612
+ scores=[1.0],
613
+ metadata={"tensor": torch.tensor([1.0, 2.0]), "object": object()},
614
+ )
615
+
616
+ payload = result.to_dict()
617
+
618
+ assert payload["metadata"]["tensor"] == [1.0, 2.0]
619
+ assert isinstance(payload["metadata"]["object"], str)
620
+ json.dumps(payload)
621
+
622
+
623
+ def test_json_and_html_export(tmp_path):
624
+ result = make_result()
625
+ json_path = tmp_path / "trace.json"
626
+ html_path = tmp_path / "trace.html"
627
+
628
+ result.to_json(json_path)
629
+ result.to_html(html_path)
630
+
631
+ assert json_path.read_text(encoding="utf-8").startswith("{")
632
+ html = html_path.read_text(encoding="utf-8")
633
+ assert "<html" in html
634
+ assert " beta" in html
635
+ ```
636
+
637
+ - [ ] **Step 2: Run result tests and see the expected failure**
638
+
639
+ Run:
640
+
641
+ ```bash
642
+ uv run pytest tests/test_result.py -q
643
+ ```
644
+
645
+ Expected: pytest reports missing `TokenScore` or missing methods.
646
+
647
+ - [ ] **Step 3: Implement `TraceResult`**
648
+
649
+ Replace `flashtrace/result.py` with:
650
+
651
+ ```python
652
+ from __future__ import annotations
653
+
654
+ import json
655
+ from dataclasses import asdict, dataclass, field, is_dataclass
656
+ from pathlib import Path
657
+ from typing import Any
658
+
659
+
660
+ @dataclass(frozen=True)
661
+ class TokenScore:
662
+ index: int
663
+ token: str
664
+ score: float
665
+
666
+
667
+ @dataclass(frozen=True)
668
+ class TraceResult:
669
+ """Public attribution result returned by FlashTrace."""
670
+
671
+ prompt_tokens: list[str]
672
+ generation_tokens: list[str]
673
+ scores: list[float]
674
+ per_hop_scores: list[list[float]] = field(default_factory=list)
675
+ thinking_ratios: list[float] = field(default_factory=list)
676
+ output_span: tuple[int, int] | None = None
677
+ reasoning_span: tuple[int, int] | None = None
678
+ method: str = "flashtrace"
679
+ metadata: dict[str, Any] = field(default_factory=dict)
680
+
681
+ def topk_inputs(self, k: int = 20) -> list[TokenScore]:
682
+ limit = max(0, int(k))
683
+ items = [
684
+ TokenScore(index=i, token=tok, score=float(score))
685
+ for i, (tok, score) in enumerate(zip(self.prompt_tokens, self.scores))
686
+ ]
687
+ items.sort(key=lambda item: item.score, reverse=True)
688
+ return items[:limit]
689
+
690
+ def to_dict(self) -> dict[str, Any]:
691
+ return {
692
+ "method": self.method,
693
+ "prompt_tokens": list(self.prompt_tokens),
694
+ "generation_tokens": list(self.generation_tokens),
695
+ "scores": [float(x) for x in self.scores],
696
+ "per_hop_scores": [[float(x) for x in row] for row in self.per_hop_scores],
697
+ "thinking_ratios": [float(x) for x in self.thinking_ratios],
698
+ "output_span": list(self.output_span) if self.output_span is not None else None,
699
+ "reasoning_span": list(self.reasoning_span) if self.reasoning_span is not None else None,
700
+ "top_inputs": [asdict(item) for item in self.topk_inputs()],
701
+ "metadata": _jsonable(self.metadata),
702
+ }
703
+
704
+ def to_json(self, path: str | Path) -> None:
705
+ target = Path(path)
706
+ target.write_text(json.dumps(self.to_dict(), indent=2, ensure_ascii=False), encoding="utf-8")
707
+
708
+ def to_html(self, path: str | Path) -> None:
709
+ from .viz import render_trace_html
710
+
711
+ target = Path(path)
712
+ target.write_text(render_trace_html(self), encoding="utf-8")
713
+
714
+
715
+ def _jsonable(value: Any) -> Any:
716
+ if value is None or isinstance(value, (str, int, float, bool)):
717
+ return value
718
+ if hasattr(value, "detach") and hasattr(value, "cpu"):
719
+ try:
720
+ return value.detach().cpu().tolist()
721
+ except Exception:
722
+ return repr(value)
723
+ if is_dataclass(value):
724
+ return _jsonable(asdict(value))
725
+ if isinstance(value, dict):
726
+ return {str(k): _jsonable(v) for k, v in value.items()}
727
+ if isinstance(value, (list, tuple)):
728
+ return [_jsonable(v) for v in value]
729
+ return repr(value)
730
+ ```
731
+
732
+ - [ ] **Step 4: Implement the standalone HTML renderer**
733
+
734
+ Create `flashtrace/viz.py`:
735
+
736
+ ```python
737
+ from __future__ import annotations
738
+
739
+ from html import escape
740
+ from typing import TYPE_CHECKING
741
+
742
+ if TYPE_CHECKING:
743
+ from .result import TraceResult
744
+
745
+
746
+ def _score_color(score: float, max_score: float) -> str:
747
+ if max_score <= 0.0:
748
+ return "rgba(245,245,245,0.75)"
749
+ ratio = min(1.0, abs(float(score)) / (max_score + 1e-12))
750
+ red = 255
751
+ green = int(246 - 105 * ratio)
752
+ blue = int(226 - 170 * ratio)
753
+ alpha = 0.22 + 0.58 * ratio
754
+ return f"rgba({red},{green},{blue},{alpha:.3f})"
755
+
756
+
757
+ def _render_token_row(tokens: list[str], scores: list[float]) -> str:
758
+ max_score = max((abs(float(x)) for x in scores), default=0.0)
759
+ spans = []
760
+ for index, token in enumerate(tokens):
761
+ score = float(scores[index]) if index < len(scores) else 0.0
762
+ color = _score_color(score, max_score)
763
+ spans.append(
764
+ "<span class='tok' "
765
+ f"title='idx={index} score={score:.6f}' "
766
+ f"style='background:{color}'>{escape(token)}</span>"
767
+ )
768
+ return "".join(spans)
769
+
770
+
771
+ def render_trace_html(result: "TraceResult") -> str:
772
+ top_rows = "\n".join(
773
+ f"<tr><td>{item.index}</td><td><code>{escape(item.token)}</code></td><td>{item.score:.6f}</td></tr>"
774
+ for item in result.topk_inputs(20)
775
+ )
776
+ hop_sections = []
777
+ for hop_index, hop_scores in enumerate(result.per_hop_scores):
778
+ hop_sections.append(
779
+ f"<section><h2>Hop {hop_index}</h2><div class='tokens'>{_render_token_row(result.prompt_tokens, hop_scores)}</div></section>"
780
+ )
781
+ hop_html = "\n".join(hop_sections)
782
+ metadata = escape(str(result.metadata))
783
+ return f"""<!doctype html>
784
+ <html lang="en">
785
+ <head>
786
+ <meta charset="utf-8">
787
+ <title>FlashTrace</title>
788
+ <style>
789
+ body {{ font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", sans-serif; margin: 32px; color: #151515; }}
790
+ h1, h2 {{ margin: 0 0 12px; }}
791
+ section {{ margin: 24px 0; }}
792
+ .tokens {{ line-height: 2.2; font-family: ui-monospace, SFMono-Regular, Menlo, monospace; }}
793
+ .tok {{ display: inline-block; margin: 2px; padding: 2px 4px; border-radius: 4px; white-space: pre-wrap; }}
794
+ table {{ border-collapse: collapse; margin-top: 12px; }}
795
+ td, th {{ border-bottom: 1px solid #ddd; padding: 6px 10px; text-align: left; }}
796
+ .meta {{ color: #555; font-size: 13px; }}
797
+ </style>
798
+ </head>
799
+ <body>
800
+ <h1>FlashTrace</h1>
801
+ <p class="meta">method={escape(result.method)} output_span={escape(str(result.output_span))} reasoning_span={escape(str(result.reasoning_span))}</p>
802
+ <section>
803
+ <h2>Prompt Attribution</h2>
804
+ <div class="tokens">{_render_token_row(result.prompt_tokens, result.scores)}</div>
805
+ </section>
806
+ {hop_html}
807
+ <section>
808
+ <h2>Top Input Tokens</h2>
809
+ <table><thead><tr><th>Index</th><th>Token</th><th>Score</th></tr></thead><tbody>{top_rows}</tbody></table>
810
+ </section>
811
+ <section>
812
+ <h2>Metadata</h2>
813
+ <pre>{metadata}</pre>
814
+ </section>
815
+ </body>
816
+ </html>
817
+ """
818
+ ```
819
+
820
+ - [ ] **Step 5: Run result tests**
821
+
822
+ Run:
823
+
824
+ ```bash
825
+ uv run pytest tests/test_result.py -q
826
+ ```
827
+
828
+ Expected: `4 passed`.
829
+
830
+ - [ ] **Step 6: Commit**
831
+
832
+ Run:
833
+
834
+ ```bash
835
+ git add flashtrace/result.py flashtrace/viz.py tests/test_result.py
836
+ git commit -m "feat: add trace result exports"
837
+ ```
838
+
839
+ ## Task 5: FlashTrace Facade
840
+
841
+ **Files:**
842
+ - Modify: `flashtrace/tracer.py`
843
+ - Modify: `flashtrace/__init__.py`
844
+ - Create: `tests/test_tracer.py`
845
+
846
+ - [ ] **Step 1: Write tracer API tests**
847
+
848
+ Create `tests/test_tracer.py`:
849
+
850
+ ```python
851
+ from flashtrace import FlashTrace, TraceResult
852
+ from tests.helpers import make_tiny_qwen2_model_and_tokenizer
853
+
854
+
855
+ def test_flashtrace_trace_returns_public_result():
856
+ model, tokenizer = make_tiny_qwen2_model_and_tokenizer(n_layers=2, d_model=32, n_heads=4, n_kv_heads=2)
857
+ tracer = FlashTrace(model, tokenizer, chunk_tokens=16, sink_chunk_tokens=4, recompute_attention=True)
858
+
859
+ result = tracer.trace(
860
+ prompt="t10 t20 t30 t40",
861
+ target="t60 t70 t80",
862
+ output_span=(1, 2),
863
+ reasoning_span=(0, 1),
864
+ hops=1,
865
+ )
866
+
867
+ assert isinstance(result, TraceResult)
868
+ assert result.method == "flashtrace"
869
+ assert len(result.prompt_tokens) > 0
870
+ assert len(result.scores) == len(result.prompt_tokens)
871
+ assert result.output_span == (1, 2)
872
+ assert result.reasoning_span == (0, 1)
873
+
874
+
875
+ def test_ifr_span_method_returns_public_result():
876
+ model, tokenizer = make_tiny_qwen2_model_and_tokenizer(n_layers=2, d_model=32, n_heads=4, n_kv_heads=2)
877
+ tracer = FlashTrace(model, tokenizer, chunk_tokens=16, sink_chunk_tokens=4, recompute_attention=True)
878
+
879
+ result = tracer.trace(
880
+ prompt="t10 t20 t30 t40",
881
+ target="t60 t70",
882
+ output_span=(0, 1),
883
+ method="ifr-span",
884
+ )
885
+
886
+ assert result.method == "ifr-span"
887
+ assert len(result.scores) == len(result.prompt_tokens)
888
+ ```
889
+
890
+ - [ ] **Step 2: Run tracer tests and see the expected failure**
891
+
892
+ Run:
893
+
894
+ ```bash
895
+ uv run pytest tests/test_tracer.py -q
896
+ ```
897
+
898
+ Expected: pytest reports missing `trace`.
899
+
900
+ - [ ] **Step 3: Implement result adaptation helpers and facade**
901
+
902
+ Replace `flashtrace/tracer.py` with:
903
+
904
+ ```python
905
+ from __future__ import annotations
906
+
907
+ from typing import Any, Literal
908
+
909
+ import torch
910
+
911
+ from .attribution import LLMIFRAttribution, LLMAttributionResult
912
+ from .improved import LLMIFRAttributionBoth
913
+ from .result import TraceResult
914
+
915
+ TraceMethod = Literal["flashtrace", "ifr-span", "ifr-matrix"]
916
+
917
+
918
+ def _to_float_list(values: Any) -> list[float]:
919
+ if torch.is_tensor(values):
920
+ values = values.detach().cpu().to(dtype=torch.float32).tolist()
921
+ return [float(x) for x in (values or [])]
922
+
923
+
924
+ class FlashTrace:
925
+ """Public facade for FlashTrace attribution."""
926
+
927
+ def __init__(
928
+ self,
929
+ model,
930
+ tokenizer,
931
+ *,
932
+ chunk_tokens: int = 128,
933
+ sink_chunk_tokens: int = 32,
934
+ recompute_attention: bool = False,
935
+ generate_kwargs: dict[str, Any] | None = None,
936
+ ) -> None:
937
+ self.model = model
938
+ self.tokenizer = tokenizer
939
+ self.chunk_tokens = int(chunk_tokens)
940
+ self.sink_chunk_tokens = int(sink_chunk_tokens)
941
+ self.recompute_attention = bool(recompute_attention)
942
+ self.generate_kwargs = generate_kwargs
943
+
944
+ def trace(
945
+ self,
946
+ *,
947
+ prompt: str,
948
+ target: str | None = None,
949
+ output_span: tuple[int, int] | None = None,
950
+ reasoning_span: tuple[int, int] | None = None,
951
+ hops: int = 1,
952
+ method: TraceMethod = "flashtrace",
953
+ renorm_threshold: float | None = None,
954
+ ) -> TraceResult:
955
+ if method == "flashtrace":
956
+ engine = LLMIFRAttributionBoth(
957
+ self.model,
958
+ self.tokenizer,
959
+ generate_kwargs=self.generate_kwargs,
960
+ chunk_tokens=self.chunk_tokens,
961
+ sink_chunk_tokens=self.sink_chunk_tokens,
962
+ recompute_attention=self.recompute_attention,
963
+ )
964
+ raw = engine.calculate_ifr_multi_hop_both(
965
+ prompt,
966
+ target=target,
967
+ sink_span=output_span,
968
+ thinking_span=reasoning_span,
969
+ n_hops=int(hops),
970
+ renorm_threshold=renorm_threshold,
971
+ )
972
+ elif method == "ifr-span":
973
+ engine = LLMIFRAttribution(
974
+ self.model,
975
+ self.tokenizer,
976
+ generate_kwargs=self.generate_kwargs,
977
+ chunk_tokens=self.chunk_tokens,
978
+ sink_chunk_tokens=self.sink_chunk_tokens,
979
+ recompute_attention=self.recompute_attention,
980
+ )
981
+ raw = engine.calculate_ifr_span(
982
+ prompt,
983
+ target=target,
984
+ span=output_span,
985
+ renorm_threshold=renorm_threshold,
986
+ )
987
+ elif method == "ifr-matrix":
988
+ engine = LLMIFRAttribution(
989
+ self.model,
990
+ self.tokenizer,
991
+ generate_kwargs=self.generate_kwargs,
992
+ chunk_tokens=self.chunk_tokens,
993
+ sink_chunk_tokens=self.sink_chunk_tokens,
994
+ recompute_attention=self.recompute_attention,
995
+ )
996
+ raw = engine.calculate_ifr_for_all_positions_output_only(
997
+ prompt,
998
+ target=target,
999
+ sink_span=output_span,
1000
+ renorm_threshold=renorm_threshold,
1001
+ )
1002
+ else:
1003
+ raise ValueError(f"Unsupported method: {method}")
1004
+
1005
+ return self._build_result(raw, method=method, output_span=output_span, reasoning_span=reasoning_span)
1006
+
1007
+ def _build_result(
1008
+ self,
1009
+ raw: LLMAttributionResult,
1010
+ *,
1011
+ method: str,
1012
+ output_span: tuple[int, int] | None,
1013
+ reasoning_span: tuple[int, int] | None,
1014
+ ) -> TraceResult:
1015
+ prompt_tokens = list(raw.prompt_tokens)
1016
+ generation_tokens = list(raw.generation_tokens)
1017
+ prompt_len = len(prompt_tokens)
1018
+ metadata = dict(raw.metadata or {})
1019
+ if "method" not in metadata:
1020
+ metadata["method"] = method
1021
+
1022
+ ifr_meta = metadata.get("ifr") if isinstance(metadata.get("ifr"), dict) else {}
1023
+ observation = ifr_meta.get("observation_projected") if isinstance(ifr_meta, dict) else None
1024
+ per_hop_projected = ifr_meta.get("per_hop_projected") if isinstance(ifr_meta, dict) else None
1025
+
1026
+ if isinstance(observation, dict) and "sum" in observation:
1027
+ vector = _to_float_list(observation["sum"])
1028
+ scores = vector[:prompt_len]
1029
+ else:
1030
+ matrix = torch.nan_to_num(raw.attribution_matrix.detach().cpu().to(dtype=torch.float32), nan=0.0)
1031
+ if output_span is not None:
1032
+ start, end = output_span
1033
+ selected = matrix[int(start) : int(end) + 1, :prompt_len]
1034
+ else:
1035
+ selected = matrix[:, :prompt_len]
1036
+ scores = selected.mean(dim=0).tolist() if selected.numel() else [0.0 for _ in prompt_tokens]
1037
+
1038
+ per_hop_scores: list[list[float]] = []
1039
+ if per_hop_projected:
1040
+ for hop_vector in per_hop_projected:
1041
+ per_hop_scores.append(_to_float_list(hop_vector)[:prompt_len])
1042
+
1043
+ ratios = ifr_meta.get("thinking_ratios", []) if isinstance(ifr_meta, dict) else []
1044
+ return TraceResult(
1045
+ prompt_tokens=prompt_tokens,
1046
+ generation_tokens=generation_tokens,
1047
+ scores=[float(x) for x in scores],
1048
+ per_hop_scores=per_hop_scores,
1049
+ thinking_ratios=_to_float_list(ratios),
1050
+ output_span=output_span,
1051
+ reasoning_span=reasoning_span,
1052
+ method=method,
1053
+ metadata=metadata,
1054
+ )
1055
+ ```
1056
+
1057
+ - [ ] **Step 4: Confirm public exports**
1058
+
1059
+ Keep `flashtrace/__init__.py` as:
1060
+
1061
+ ```python
1062
+ """FlashTrace: efficient multi-token attribution for reasoning LLMs."""
1063
+
1064
+ from .model_io import load_model_and_tokenizer
1065
+ from .result import TokenScore, TraceResult
1066
+ from .tracer import FlashTrace
1067
+
1068
+ __all__ = ["FlashTrace", "TraceResult", "TokenScore", "load_model_and_tokenizer"]
1069
+ ```
1070
+
1071
+ - [ ] **Step 5: Run tracer tests**
1072
+
1073
+ Run:
1074
+
1075
+ ```bash
1076
+ uv run pytest tests/test_tracer.py -q
1077
+ ```
1078
+
1079
+ Expected: `2 passed`.
1080
+
1081
+ - [ ] **Step 6: Run package tests created so far**
1082
+
1083
+ Run:
1084
+
1085
+ ```bash
1086
+ uv run pytest tests/test_imports.py tests/test_core_recompute.py tests/test_result.py tests/test_tracer.py -q
1087
+ ```
1088
+
1089
+ Expected: all selected tests pass.
1090
+
1091
+ - [ ] **Step 7: Commit**
1092
+
1093
+ Run:
1094
+
1095
+ ```bash
1096
+ git add flashtrace/tracer.py flashtrace/__init__.py tests/test_tracer.py
1097
+ git commit -m "feat: add FlashTrace public facade"
1098
+ ```
1099
+
1100
+ ## Task 6: Model IO And CLI
1101
+
1102
+ **Files:**
1103
+ - Modify: `flashtrace/model_io.py`
1104
+ - Modify: `flashtrace/cli.py`
1105
+ - Create: `tests/test_cli.py`
1106
+
1107
+ - [ ] **Step 1: Write CLI tests**
1108
+
1109
+ Create `tests/test_cli.py`:
1110
+
1111
+ ```python
1112
+ import pytest
1113
+
1114
+ from flashtrace.cli import main, parse_span
1115
+
1116
+
1117
+ def test_parse_span():
1118
+ assert parse_span("3:8") == (3, 8)
1119
+ assert parse_span(None) is None
1120
+
1121
+
1122
+ @pytest.mark.parametrize("value", ["3", "8:3", "a:b"])
1123
+ def test_parse_span_rejects_invalid_values(value):
1124
+ with pytest.raises(ValueError):
1125
+ parse_span(value)
1126
+
1127
+
1128
+ def test_cli_help_exits_successfully(capsys):
1129
+ with pytest.raises(SystemExit) as exc:
1130
+ main(["--help"])
1131
+
1132
+ assert exc.value.code == 0
1133
+ assert "trace" in capsys.readouterr().out
1134
+ ```
1135
+
1136
+ - [ ] **Step 2: Run CLI tests and see the expected failure**
1137
+
1138
+ Run:
1139
+
1140
+ ```bash
1141
+ uv run pytest tests/test_cli.py -q
1142
+ ```
1143
+
1144
+ Expected: pytest reports missing `parse_span`.
1145
+
1146
+ - [ ] **Step 3: Implement model loading**
1147
+
1148
+ Replace `flashtrace/model_io.py` with:
1149
+
1150
+ ```python
1151
+ from __future__ import annotations
1152
+
1153
+ from typing import Any
1154
+
1155
+ import torch
1156
+ from transformers import AutoModelForCausalLM, AutoTokenizer
1157
+
1158
+
1159
+ def _resolve_dtype(dtype: str | torch.dtype = "auto") -> str | torch.dtype:
1160
+ if isinstance(dtype, torch.dtype):
1161
+ return dtype
1162
+ value = str(dtype).lower()
1163
+ if value == "auto":
1164
+ return "auto"
1165
+ mapping = {
1166
+ "float16": torch.float16,
1167
+ "fp16": torch.float16,
1168
+ "bfloat16": torch.bfloat16,
1169
+ "bf16": torch.bfloat16,
1170
+ "float32": torch.float32,
1171
+ "fp32": torch.float32,
1172
+ }
1173
+ if value not in mapping:
1174
+ raise ValueError(f"Unsupported dtype: {dtype}")
1175
+ return mapping[value]
1176
+
1177
+
1178
+ def load_model_and_tokenizer(
1179
+ model_name_or_path: str,
1180
+ *,
1181
+ device_map: str | dict[str, Any] | None = "auto",
1182
+ dtype: str | torch.dtype = "auto",
1183
+ trust_remote_code: bool = True,
1184
+ **model_kwargs: Any,
1185
+ ):
1186
+ """Load a Hugging Face causal LM and matching tokenizer."""
1187
+
1188
+ tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=trust_remote_code)
1189
+ model = AutoModelForCausalLM.from_pretrained(
1190
+ model_name_or_path,
1191
+ torch_dtype=_resolve_dtype(dtype),
1192
+ device_map=device_map,
1193
+ trust_remote_code=trust_remote_code,
1194
+ **model_kwargs,
1195
+ )
1196
+ model.eval()
1197
+ if tokenizer.pad_token_id is None and tokenizer.eos_token_id is not None:
1198
+ tokenizer.pad_token = tokenizer.eos_token
1199
+ return model, tokenizer
1200
+ ```
1201
+
1202
+ - [ ] **Step 4: Implement CLI**
1203
+
1204
+ Replace `flashtrace/cli.py` with:
1205
+
1206
+ ```python
1207
+ from __future__ import annotations
1208
+
1209
+ import argparse
1210
+ from pathlib import Path
1211
+ from typing import Sequence
1212
+
1213
+ from .model_io import load_model_and_tokenizer
1214
+ from .tracer import FlashTrace
1215
+
1216
+
1217
+ def parse_span(value: str | None) -> tuple[int, int] | None:
1218
+ if value is None:
1219
+ return None
1220
+ parts = str(value).split(":")
1221
+ if len(parts) != 2:
1222
+ raise ValueError("Span must use START:END format.")
1223
+ try:
1224
+ start = int(parts[0])
1225
+ end = int(parts[1])
1226
+ except ValueError as exc:
1227
+ raise ValueError("Span bounds must be integers.") from exc
1228
+ if start < 0 or end < start:
1229
+ raise ValueError("Span must satisfy 0 <= START <= END.")
1230
+ return start, end
1231
+
1232
+
1233
+ def build_parser() -> argparse.ArgumentParser:
1234
+ parser = argparse.ArgumentParser(prog="flashtrace", description="Trace language model outputs with FlashTrace.")
1235
+ sub = parser.add_subparsers(dest="command")
1236
+
1237
+ trace = sub.add_parser("trace", help="Run attribution for a prompt and target.")
1238
+ trace.add_argument("--model", required=True, help="Hugging Face model id or local path.")
1239
+ trace.add_argument("--prompt", required=True, help="UTF-8 text file containing the prompt.")
1240
+ trace.add_argument("--target", help="UTF-8 text file containing the target response.")
1241
+ trace.add_argument("--output-span", help="Inclusive generation-token span START:END.")
1242
+ trace.add_argument("--reasoning-span", help="Inclusive generation-token span START:END.")
1243
+ trace.add_argument("--hops", type=int, default=1)
1244
+ trace.add_argument("--method", default="flashtrace", choices=["flashtrace", "ifr-span", "ifr-matrix"])
1245
+ trace.add_argument("--html", help="Write standalone HTML heatmap.")
1246
+ trace.add_argument("--json", help="Write JSON trace.")
1247
+ trace.add_argument("--device-map", default="auto")
1248
+ trace.add_argument("--dtype", default="auto", choices=["auto", "float16", "bfloat16", "float32"])
1249
+ trace.add_argument("--chunk-tokens", type=int, default=128)
1250
+ trace.add_argument("--sink-chunk-tokens", type=int, default=32)
1251
+ trace.add_argument("--recompute-attention", action="store_true")
1252
+ return parser
1253
+
1254
+
1255
+ def _read_text(path: str | None) -> str | None:
1256
+ if path is None:
1257
+ return None
1258
+ return Path(path).read_text(encoding="utf-8")
1259
+
1260
+
1261
+ def _run_trace(args: argparse.Namespace) -> int:
1262
+ model, tokenizer = load_model_and_tokenizer(args.model, device_map=args.device_map, dtype=args.dtype)
1263
+ tracer = FlashTrace(
1264
+ model,
1265
+ tokenizer,
1266
+ chunk_tokens=args.chunk_tokens,
1267
+ sink_chunk_tokens=args.sink_chunk_tokens,
1268
+ recompute_attention=args.recompute_attention,
1269
+ )
1270
+ result = tracer.trace(
1271
+ prompt=_read_text(args.prompt) or "",
1272
+ target=_read_text(args.target),
1273
+ output_span=parse_span(args.output_span),
1274
+ reasoning_span=parse_span(args.reasoning_span),
1275
+ hops=args.hops,
1276
+ method=args.method,
1277
+ )
1278
+ for item in result.topk_inputs(20):
1279
+ print(f"{item.index}\t{item.score:.6f}\t{item.token!r}")
1280
+ if args.json:
1281
+ result.to_json(args.json)
1282
+ if args.html:
1283
+ result.to_html(args.html)
1284
+ return 0
1285
+
1286
+
1287
+ def main(argv: Sequence[str] | None = None) -> int:
1288
+ parser = build_parser()
1289
+ args = parser.parse_args(argv)
1290
+ if args.command == "trace":
1291
+ return _run_trace(args)
1292
+ parser.print_help()
1293
+ return 0
1294
+ ```
1295
+
1296
+ - [ ] **Step 5: Run CLI tests**
1297
+
1298
+ Run:
1299
+
1300
+ ```bash
1301
+ uv run pytest tests/test_cli.py -q
1302
+ ```
1303
+
1304
+ Expected: all CLI tests pass.
1305
+
1306
+ - [ ] **Step 6: Verify console script metadata**
1307
+
1308
+ Run:
1309
+
1310
+ ```bash
1311
+ uv run flashtrace --help
1312
+ ```
1313
+
1314
+ Expected: help text includes `trace`.
1315
+
1316
+ - [ ] **Step 7: Commit**
1317
+
1318
+ Run:
1319
+
1320
+ ```bash
1321
+ git add flashtrace/model_io.py flashtrace/cli.py tests/test_cli.py
1322
+ git commit -m "feat: add model loader and CLI"
1323
+ ```
1324
+
1325
+ ## Task 7: README, Example, License, And Release Hygiene
1326
+
1327
+ **Files:**
1328
+ - Create: `README.md`
1329
+ - Create: `LICENSE`
1330
+ - Create: `examples/quickstart.py`
1331
+ - Modify: `.gitignore`
1332
+ - Delete: `model_generation.py`
1333
+
1334
+ - [ ] **Step 1: Create the quickstart example**
1335
+
1336
+ Create `examples/quickstart.py`:
1337
+
1338
+ ```python
1339
+ from __future__ import annotations
1340
+
1341
+ import argparse
1342
+
1343
+ from flashtrace import FlashTrace, load_model_and_tokenizer
1344
+
1345
+
1346
+ def build_parser() -> argparse.ArgumentParser:
1347
+ parser = argparse.ArgumentParser(description="FlashTrace quickstart example.")
1348
+ parser.add_argument("--model", required=True, help="Hugging Face model id or local model path.")
1349
+ parser.add_argument("--prompt", required=True, help="Prompt text.")
1350
+ parser.add_argument("--target", help="Target response text.")
1351
+ parser.add_argument("--output-span", default=None, help="Inclusive generation-token span START:END.")
1352
+ parser.add_argument("--reasoning-span", default=None, help="Inclusive generation-token span START:END.")
1353
+ parser.add_argument("--html", default="trace.html", help="Output HTML path.")
1354
+ return parser
1355
+
1356
+
1357
+ def parse_span(value: str | None) -> tuple[int, int] | None:
1358
+ from flashtrace.cli import parse_span as parse_cli_span
1359
+
1360
+ return parse_cli_span(value)
1361
+
1362
+
1363
+ def main() -> int:
1364
+ args = build_parser().parse_args()
1365
+ model, tokenizer = load_model_and_tokenizer(args.model)
1366
+ tracer = FlashTrace(model, tokenizer)
1367
+ trace = tracer.trace(
1368
+ prompt=args.prompt,
1369
+ target=args.target,
1370
+ output_span=parse_span(args.output_span),
1371
+ reasoning_span=parse_span(args.reasoning_span),
1372
+ )
1373
+ for item in trace.topk_inputs(10):
1374
+ print(f"{item.index}\t{item.score:.6f}\t{item.token!r}")
1375
+ trace.to_html(args.html)
1376
+ print(f"wrote {args.html}")
1377
+ return 0
1378
+
1379
+
1380
+ if __name__ == "__main__":
1381
+ raise SystemExit(main())
1382
+ ```
1383
+
1384
+ - [ ] **Step 2: Add README**
1385
+
1386
+ Create `README.md`:
1387
+
1388
+ ````markdown
1389
+ # FlashTrace
1390
+
1391
+ FlashTrace is an efficient multi-token attribution toolkit for reasoning language models. It implements the method described in [Towards Long-Horizon Interpretability: Efficient and Faithful Multi-Token Attribution for Reasoning LLMs](https://arxiv.org/abs/2602.01914).
1392
+
1393
+ ## Install
1394
+
1395
+ ```bash
1396
+ pip install -e .
1397
+ ```
1398
+
1399
+ ## Python Quickstart
1400
+
1401
+ ```python
1402
+ from flashtrace import FlashTrace, load_model_and_tokenizer
1403
+
1404
+ model, tokenizer = load_model_and_tokenizer("Qwen/Qwen3-8B")
1405
+ tracer = FlashTrace(model, tokenizer)
1406
+
1407
+ trace = tracer.trace(
1408
+ prompt="Context: Paris is the capital of France.\nQuestion: What is the capital of France?",
1409
+ target="Paris",
1410
+ output_span=(0, 0),
1411
+ hops=1,
1412
+ )
1413
+
1414
+ print(trace.topk_inputs(10))
1415
+ trace.to_html("trace.html")
1416
+ trace.to_json("trace.json")
1417
+ ```
1418
+
1419
+ ## CLI Quickstart
1420
+
1421
+ ```bash
1422
+ flashtrace trace \
1423
+ --model Qwen/Qwen3-8B \
1424
+ --prompt prompt.txt \
1425
+ --target target.txt \
1426
+ --output-span 0:0 \
1427
+ --hops 1 \
1428
+ --html trace.html \
1429
+ --json trace.json
1430
+ ```
1431
+
1432
+ ## Token Spans
1433
+
1434
+ `output_span` and `reasoning_span` use inclusive generation-token indices. Inspect `trace.generation_tokens` after an initial run to choose spans for a target answer or reasoning segment.
1435
+
1436
+ ## Supported Models
1437
+
1438
+ The package targets Llama/Qwen-style decoder-only Hugging Face causal LMs with standard Q/K/V/O projections, RMSNorm or LayerNorm, and RoPE metadata. Qwen2, Qwen3, and Llama are the first validated model families.
1439
+
1440
+ ## Repository Map
1441
+
1442
+ - `flashtrace/`: reusable package
1443
+ - `examples/`: public examples
1444
+ - `tests/`: CPU smoke tests
1445
+ - `exp/`: paper experiments and artifacts
1446
+
1447
+ ## Citation
1448
+
1449
+ ```bibtex
1450
+ @misc{pan2026flashtrace,
1451
+ title={Towards Long-Horizon Interpretability: Efficient and Faithful Multi-Token Attribution for Reasoning LLMs},
1452
+ author={Pan, Wenbo and Liu, Zhichao and Wang, Xianlong and Yu, Haining and Jia, Xiaohua},
1453
+ year={2026},
1454
+ eprint={2602.01914},
1455
+ archivePrefix={arXiv},
1456
+ primaryClass={cs.LG}
1457
+ }
1458
+ ```
1459
+ ````
1460
+
1461
+ - [ ] **Step 3: Add MIT license**
1462
+
1463
+ Create `LICENSE`:
1464
+
1465
+ ```text
1466
+ MIT License
1467
+
1468
+ Copyright (c) 2026 Wenbo Pan
1469
+
1470
+ Permission is hereby granted, free of charge, to any person obtaining a copy
1471
+ of this software and associated documentation files (the "Software"), to deal
1472
+ in the Software without restriction, including without limitation the rights
1473
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
1474
+ copies of the Software, and to permit persons to whom the Software is
1475
+ furnished to do so, subject to the following conditions:
1476
+
1477
+ The above copyright notice and this permission notice shall be included in all
1478
+ copies or substantial portions of the Software.
1479
+
1480
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
1481
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
1482
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
1483
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
1484
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
1485
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
1486
+ SOFTWARE.
1487
+ ```
1488
+
1489
+ - [ ] **Step 4: Update generated artifact ignore rules**
1490
+
1491
+ Append to `.gitignore`:
1492
+
1493
+ ```gitignore
1494
+
1495
+ # FlashTrace generated artifacts
1496
+ trace.json
1497
+ trace.html
1498
+ *.trace.json
1499
+ *.trace.html
1500
+ exp/**/output/
1501
+ exp/**/out/
1502
+ exp/**/out-*/
1503
+ *.npz
1504
+ ```
1505
+
1506
+ - [ ] **Step 5: Remove the template artifact**
1507
+
1508
+ Run:
1509
+
1510
+ ```bash
1511
+ git rm model_generation.py
1512
+ ```
1513
+
1514
+ - [ ] **Step 6: Verify quickstart help**
1515
+
1516
+ Run:
1517
+
1518
+ ```bash
1519
+ uv run python examples/quickstart.py --help
1520
+ ```
1521
+
1522
+ Expected: help text includes `FlashTrace quickstart example`.
1523
+
1524
+ - [ ] **Step 7: Commit**
1525
+
1526
+ Run:
1527
+
1528
+ ```bash
1529
+ git add README.md LICENSE examples/quickstart.py .gitignore
1530
+ git commit -m "docs: add public quickstart and release hygiene"
1531
+ ```
1532
+
1533
+ ## Task 8: Final Verification And Package Audit
1534
+
1535
+ **Files:**
1536
+ - Modify: any files needed to fix verification failures.
1537
+
1538
+ - [ ] **Step 1: Run the full CPU test suite**
1539
+
1540
+ Run:
1541
+
1542
+ ```bash
1543
+ uv run pytest tests -q
1544
+ ```
1545
+
1546
+ Expected: all tests pass.
1547
+
1548
+ - [ ] **Step 2: Verify editable install import**
1549
+
1550
+ Run:
1551
+
1552
+ ```bash
1553
+ uv run python -c "import flashtrace; print(flashtrace.FlashTrace.__name__)"
1554
+ ```
1555
+
1556
+ Expected: prints `FlashTrace`.
1557
+
1558
+ - [ ] **Step 3: Verify CLI help**
1559
+
1560
+ Run:
1561
+
1562
+ ```bash
1563
+ uv run flashtrace --help
1564
+ uv run flashtrace trace --help
1565
+ ```
1566
+
1567
+ Expected: both commands print help text.
1568
+
1569
+ - [ ] **Step 4: Verify root compatibility imports**
1570
+
1571
+ Run:
1572
+
1573
+ ```bash
1574
+ uv run python -c "import ifr_core, llm_attr, ft_ifr_improve; print(ifr_core.compute_multi_hop_ifr.__name__)"
1575
+ ```
1576
+
1577
+ Expected: prints `compute_multi_hop_ifr`.
1578
+
1579
+ - [ ] **Step 5: Inspect package file list**
1580
+
1581
+ Run:
1582
+
1583
+ ```bash
1584
+ git status --short
1585
+ find flashtrace -maxdepth 3 -type f | sort
1586
+ ```
1587
+
1588
+ Expected: package files match the design spec and only intended changes appear.
1589
+
1590
+ - [ ] **Step 6: Commit final fixes**
1591
+
1592
+ Run after any verification fixes:
1593
+
1594
+ ```bash
1595
+ git add .
1596
+ git commit -m "test: verify public package smoke tests"
1597
+ ```
1598
+
1599
+ If verification passes with a clean tree after prior commits, record the passing commands in the final implementation response.
1600
+
1601
+ ## Self-Review Checklist
1602
+
1603
+ - Spec coverage: package layout, public API, result export, CLI, visualization, packaging, compatibility, tests, README, and release hygiene each have at least one task.
1604
+ - Type consistency: `FlashTrace.trace`, `TraceResult`, `TokenScore`, `load_model_and_tokenizer`, and CLI span parsing use the same names across tests and implementation steps.
1605
+ - Test path: every implementation task starts with a failing test or a verification command, then ends with a passing command and commit.
docs/superpowers/specs/2026-05-03-flashtrace-public-package-design.md ADDED
@@ -0,0 +1,231 @@
1
+ # FlashTrace Public Package Design
2
+
3
+ ## Goal
4
+
5
+ Turn the current FlashTrace research repository into an installable, documented Python package that researchers can use from Python or the command line to trace LLM outputs, export JSON traces, and render HTML token heatmaps.
6
+
7
+ ## Release Scope
8
+
9
+ This first public release ships four user-facing capabilities:
10
+
11
+ - A stable Python API centered on `FlashTrace`.
12
+ - A `flashtrace trace` CLI for prompt/target files and Hugging Face model ids or local model paths.
13
+ - A `TraceResult` object with top-k, JSON, and HTML export helpers.
14
+ - A README quickstart that demonstrates Python, CLI, and heatmap workflows.
15
+
16
+ Paper experiment runners and saved experiment artifacts remain in `exp/` as research assets. Their full reproducibility cleanup belongs to a later phase.
17
+
18
+ ## Repository Shape
19
+
20
+ The reusable package lives under `flashtrace/`, examples under `examples/`, tests under `tests/`, and paper experiments under `exp/`.
21
+
22
+ ```text
23
+ flashtrace/
24
+ __init__.py
25
+ tracer.py
26
+ result.py
27
+ core.py
28
+ model_io.py
29
+ viz.py
30
+ cli.py
31
+ baselines/
32
+ __init__.py
33
+ attnlrp.py
34
+ examples/
35
+ quickstart.py
36
+ tests/
37
+ test_core_recompute.py
38
+ test_tracer.py
39
+ test_result.py
40
+ test_cli.py
41
+ exp/
42
+ exp1/
43
+ exp2/
44
+ case_study/
45
+ ```
46
+
47
+ Existing root modules are migrated gradually. During migration, compatibility wrappers remain at the root for experiment scripts that still import `llm_attr`, `ifr_core`, or `ft_ifr_improve`.
48
+
49
+ ## Core Implementation Mapping
50
+
51
+ `flashtrace.core` contains the IFR tensor implementation from `ifr_core.py`:
52
+
53
+ - `extract_model_metadata`
54
+ - `build_weight_pack`
55
+ - `attach_hooks`
56
+ - `recompute_layer_attention`
57
+ - `compute_ifr_sentence_aggregate`
58
+ - `compute_multi_hop_ifr`
59
+ - `compute_ifr_for_all_positions`
60
+
61
+ `flashtrace.tracer` wraps the current high-level attribution classes:
62
+
63
+ - Default `method="flashtrace"` uses the current `LLMIFRAttributionBoth.calculate_ifr_multi_hop_both` behavior.
64
+ - `method="ifr-span"` uses `LLMIFRAttribution.calculate_ifr_span`.
65
+ - `method="ifr-matrix"` uses `LLMIFRAttribution.calculate_ifr_for_all_positions_output_only`.
66
+
67
+ `flashtrace.baselines.attnlrp` contains the AttnLRP patching and recursive baseline code from `lrp_rules.py`, `lrp_patches.py`, and `LLMLRPAttribution`.
68
+
69
+ ## Public Python API
70
+
71
+ The package exports `FlashTrace`, `TraceResult`, and `load_model_and_tokenizer`.
72
+
73
+ ```python
74
+ from flashtrace import FlashTrace, load_model_and_tokenizer
75
+
76
+ model, tokenizer = load_model_and_tokenizer("Qwen/Qwen3-8B", device_map="auto")
77
+ tracer = FlashTrace(model, tokenizer, chunk_tokens=128, sink_chunk_tokens=32)
78
+
79
+ trace = tracer.trace(
80
+ prompt=prompt,
81
+ target=target,
82
+ output_span=(80, 85),
83
+ reasoning_span=(0, 79),
84
+ hops=1,
85
+ )
86
+
87
+ print(trace.topk_inputs(20))
88
+ trace.to_json("trace.json")
89
+ trace.to_html("trace.html")
90
+ ```
91
+
92
+ `FlashTrace.trace(...)` accepts:
93
+
94
+ - `prompt: str`
95
+ - `target: str | None`
96
+ - `output_span: tuple[int, int] | None`
97
+ - `reasoning_span: tuple[int, int] | None`
98
+ - `hops: int`
99
+ - `method: Literal["flashtrace", "ifr-span", "ifr-matrix"]`
100
+ - `renorm_threshold: float | None`
101
+
102
+ Generation-token spans are inclusive and use the tokenizer alignment already produced by the attribution path. The README explains this convention and shows how to inspect `trace.generation_tokens`.
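+
+ A minimal sketch of that span-selection workflow, reusing the `tracer`, `prompt`, and `target` names from the example above (the indices in the comments are illustrative, not real tokenizer output):
+
+ ```python
+ # First pass: trace without spans so the result exposes every generation token.
+ probe = tracer.trace(prompt=prompt, target=target)
+ for index, token in enumerate(probe.generation_tokens):
+     print(index, repr(token))  # read off the inclusive answer range, e.g. (80, 85)
+
+ # Second pass: attribute only the chosen inclusive answer span.
+ trace = tracer.trace(prompt=prompt, target=target, output_span=(80, 85))
+ ```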
103
+
104
+ ## TraceResult
105
+
106
+ `TraceResult` is a small dataclass that hides the older `LLMAttributionResult` shape from public users.
107
+
108
+ Fields:
109
+
110
+ - `prompt_tokens: list[str]`
111
+ - `generation_tokens: list[str]`
112
+ - `scores: list[float]`
113
+ - `per_hop_scores: list[list[float]]`
114
+ - `thinking_ratios: list[float]`
115
+ - `output_span: tuple[int, int] | None`
116
+ - `reasoning_span: tuple[int, int] | None`
117
+ - `method: str`
118
+ - `metadata: dict[str, Any]`
119
+
120
+ Methods:
121
+
122
+ - `topk_inputs(k: int = 20) -> list[TokenScore]`
123
+ - `to_dict() -> dict[str, Any]`
124
+ - `to_json(path: str | Path) -> None`
125
+ - `to_html(path: str | Path) -> None`
126
+
127
+ `TokenScore` contains `index`, `token`, and `score`. Scores are aligned to `prompt_tokens`.
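+
+ A minimal sketch of these shapes (illustrative only: the concrete ranking rule, field defaults, and serialization details belong to `flashtrace/result.py` in the implementation plan):
+
+ ```python
+ import json
+ from dataclasses import asdict, dataclass, field
+ from pathlib import Path
+ from typing import Any
+
+
+ @dataclass
+ class TokenScore:
+     index: int
+     token: str
+     score: float
+
+
+ @dataclass
+ class TraceResult:
+     prompt_tokens: list[str]
+     generation_tokens: list[str]
+     scores: list[float]
+     per_hop_scores: list[list[float]] = field(default_factory=list)
+     thinking_ratios: list[float] = field(default_factory=list)
+     output_span: tuple[int, int] | None = None
+     reasoning_span: tuple[int, int] | None = None
+     method: str = "flashtrace"
+     metadata: dict[str, Any] = field(default_factory=dict)
+
+     def topk_inputs(self, k: int = 20) -> list[TokenScore]:
+         # Assumption: rank prompt tokens by absolute score, largest first.
+         order = sorted(range(len(self.scores)), key=lambda i: abs(self.scores[i]), reverse=True)
+         return [TokenScore(i, self.prompt_tokens[i], self.scores[i]) for i in order[:k]]
+
+     def to_dict(self) -> dict[str, Any]:
+         return asdict(self)
+
+     def to_json(self, path: str | Path) -> None:
+         Path(path).write_text(json.dumps(self.to_dict(), ensure_ascii=False, indent=2), encoding="utf-8")
+ ```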
128
+
129
+ ## CLI
130
+
131
+ The package exposes one console script:
132
+
133
+ ```bash
134
+ flashtrace trace \
135
+ --model Qwen/Qwen3-8B \
136
+ --prompt prompt.txt \
137
+ --target target.txt \
138
+ --output-span 80:85 \
139
+ --reasoning-span 0:79 \
140
+ --hops 1 \
141
+ --html trace.html \
142
+ --json trace.json
143
+ ```
144
+
145
+ CLI behavior:
146
+
147
+ - `--model` accepts a Hugging Face id or local path.
148
+ - `--prompt` and `--target` read UTF-8 text files.
149
+ - `--target` is optional; the model generates with deterministic defaults when this flag is absent.
150
+ - `--output-span` and `--reasoning-span` parse inclusive `START:END` generation-token spans.
151
+ - `--method` defaults to `flashtrace`.
152
+ - `--recompute-attention` enables lower-memory attention recomputation.
153
+ - `--device-map` defaults to `auto`.
154
+ - `--dtype` accepts `auto`, `float16`, `bfloat16`, or `float32`.
155
+
156
+ The command prints a compact top-k table to stdout and writes requested artifacts.
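+
+ The span flags follow the `parse_span` contract from the implementation plan; a small illustration:
+
+ ```python
+ from flashtrace.cli import parse_span
+
+ assert parse_span("80:85") == (80, 85)  # inclusive generation-token indices
+ assert parse_span(None) is None         # both span flags are optional
+ ```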
157
+
158
+ ## Visualization
159
+
160
+ `flashtrace.viz` adapts the token heatmap renderer from `exp/case_study/viz.py`.
161
+
162
+ The public heatmap focuses on:
163
+
164
+ - prompt tokens colored by final attribution score,
165
+ - optional per-hop panels,
166
+ - output and reasoning span summary,
167
+ - model/method metadata.
168
+
169
+ The renderer returns a standalone HTML string and writes standalone HTML files through `TraceResult.to_html`.
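+
+ The intended wiring is a thin layer over that renderer. A sketch, assuming `render_trace_html` is exported from `flashtrace.viz` as in the implementation plan and reusing the `trace` object from the API example above:
+
+ ```python
+ from pathlib import Path
+
+ from flashtrace.viz import render_trace_html
+
+ html = render_trace_html(trace)                        # standalone HTML string
+ Path("trace.html").write_text(html, encoding="utf-8")  # equivalent to trace.to_html("trace.html")
+ ```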
170
+
171
+ ## Packaging
172
+
173
+ `pyproject.toml` becomes package metadata for `flashtrace`:
174
+
175
+ - `name = "flashtrace"`
176
+ - a `requires-python` range that matches the Python versions supported by current PyTorch and Transformers,
177
+ - console script `flashtrace = "flashtrace.cli:main"`,
178
+ - core dependencies: `torch`, `transformers`, `accelerate`, `numpy`, `tqdm`,
179
+ - optional extras: `viz`, `eval`, `dev`, `baselines`.
180
+
181
+ The root README includes:
182
+
183
+ - project tagline,
184
+ - paper link and citation,
185
+ - install instructions,
186
+ - Python quickstart,
187
+ - CLI quickstart,
188
+ - supported model family notes,
189
+ - output interpretation,
190
+ - experiment directory map,
191
+ - troubleshooting for GPU memory and tokenizer spans.
192
+
193
+ ## Compatibility
194
+
195
+ The release supports Llama/Qwen-style decoder-only Hugging Face causal LMs with `model.layers`, Q/K/V/O projections, RMSNorm/LayerNorm, and RoPE metadata. The README names Qwen2, Qwen3, and Llama as validated families.
196
+
197
+ Existing experiment scripts continue to run through temporary root-level compatibility modules while package imports are introduced. A later cleanup can remove the compatibility layer after `exp/` imports are migrated.
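+
+ One possible shape for such a root-level shim, using `ifr_core` as the example (illustrative; the shipped wrapper may differ):
+
+ ```python
+ # ifr_core.py -- root-level compatibility shim so legacy `import ifr_core` keeps working.
+ # Re-exports the package implementation listed under "Core Implementation Mapping".
+ from flashtrace.core import (  # noqa: F401
+     attach_hooks,
+     build_weight_pack,
+     compute_ifr_for_all_positions,
+     compute_ifr_sentence_aggregate,
+     compute_multi_hop_ifr,
+     extract_model_metadata,
+     recompute_layer_attention,
+ )
+ ```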
198
+
199
+ ## Testing
200
+
201
+ Tests use a tiny randomly initialized Qwen2 model on CPU, following the existing `test_recompute.py` approach.
202
+
203
+ Required coverage:
204
+
205
+ - stored-attention and recomputed-attention paths return close values on the tiny model,
206
+ - `FlashTrace.trace(...)` returns a `TraceResult`,
207
+ - `TraceResult.topk_inputs(...)` sorts and truncates correctly,
208
+ - `TraceResult.to_dict()` is JSON serializable,
209
+ - `TraceResult.to_html()` writes standalone HTML containing token spans,
210
+ - `flashtrace trace --help` exits successfully.
211
+
212
+ Heavy GPU model tests remain manual examples.
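+
+ A sketch of the kind of CPU-only result check meant here (illustrative; it assumes `TraceResult` can be constructed directly with the fields listed above):
+
+ ```python
+ import json
+
+ from flashtrace import TraceResult
+
+
+ def test_topk_inputs_sorts_and_truncates():
+     result = TraceResult(
+         prompt_tokens=["a", "b", "c"],
+         generation_tokens=["x"],
+         scores=[0.1, 0.9, 0.5],
+         per_hop_scores=[],
+         thinking_ratios=[],
+         output_span=None,
+         reasoning_span=None,
+         method="flashtrace",
+         metadata={},
+     )
+     top = result.topk_inputs(2)
+     assert len(top) == 2 and top[0].token == "b"  # highest-score token first
+     assert json.dumps(result.to_dict())           # to_dict() stays JSON serializable
+ ```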
213
+
214
+ ## Release Hygiene
215
+
216
+ The release cleanup updates `.gitignore` to cover generated traces, experiment outputs, checkpoints, caches, and HTML/JSON artifacts created by examples.
217
+
218
+ Tracked historical experiment outputs stay untouched during the first package migration. A later artifact cleanup can move them to release assets or remove them with a dedicated confirmation step.
219
+
220
+ `model_generation.py` is a template artifact and is removed or moved outside the package path during implementation.
221
+
222
+ ## Success Criteria
223
+
224
+ The release work is complete when:
225
+
226
+ - `pip install -e .` exposes `flashtrace`,
227
+ - `python examples/quickstart.py --help` works,
228
+ - `flashtrace trace --help` works,
229
+ - package smoke tests pass on CPU,
230
+ - README quickstart matches the implemented API,
231
+ - existing experiment entrypoints either run with compatibility imports or document their package-era invocation.
dump_exp2_hop_vh.py ADDED
@@ -0,0 +1,412 @@
1
+ #!/usr/bin/env python3
2
+ """One-off: add per-hop IFR vectors (vh) into an existing exp2 trace .npz.
3
+
4
+ This is useful when the original exp2 run saved sample-level traces but did not
5
+ include per-hop vectors for some multi-hop IFR variants (e.g. ifr_multi_hop_both).
6
+
7
+ Defaults are written to match the reference commands in `exp/exp2/README.md`.
8
+
9
+ Example (matches the path in the question):
10
+
11
+ python dump_exp2_hop_vh.py \
12
+ --trace_npz exp/exp2/output/traces/exp/exp2/data/morehopqa.jsonl/qwen-8B/ifr_multi_hop_both_n1_mfaithfulness_gen_95ex/ex_000026.npz \
13
+ --dataset exp/exp2/data/morehopqa.jsonl \
14
+ --attr_func ifr_multi_hop_both \
15
+ --model qwen-8B \
16
+ --model_path /opt/share/models/Qwen/Qwen3-8B/ \
17
+ --cuda 2,3,4,5,6,7 \
18
+ --n_hops 1 \
19
+ --chunk_tokens 128 \
20
+ --sink_chunk_tokens 32 \
21
+ --inplace
22
+ """
23
+
24
+ from __future__ import annotations
25
+
26
+ import argparse
27
+ import hashlib
28
+ import json
29
+ import os
30
+ import re
31
+ import sys
32
+ from dataclasses import dataclass
33
+ from pathlib import Path
34
+ from typing import Any, Optional
35
+
36
+
37
+ def _early_set_cuda_visible_devices() -> None:
38
+ parser = argparse.ArgumentParser(add_help=False)
39
+ parser.add_argument("--cuda", type=str, default=None)
40
+ args, _ = parser.parse_known_args(sys.argv[1:])
41
+ if args.cuda and "," in str(args.cuda):
42
+ os.environ["CUDA_VISIBLE_DEVICES"] = str(args.cuda)
43
+
44
+
45
+ _early_set_cuda_visible_devices()
46
+
47
+ import numpy as np
48
+ import torch
49
+ from transformers import AutoModelForCausalLM, AutoTokenizer
50
+
51
+ import ft_ifr_improve
52
+ import llm_attr
53
+ from exp.exp2 import dataset_utils as ds_utils
54
+
55
+
56
+ def _sha1_text(text: str) -> str:
57
+ return hashlib.sha1(text.encode("utf-8")).hexdigest()
58
+
59
+
60
+ def _resolve_device(cuda: Optional[str], cuda_num: int) -> str:
61
+ """Mirror exp/exp2/run_exp.py device selection policy."""
62
+ if cuda is not None and "," in cuda:
63
+ # _early_set_cuda_visible_devices already applied.
64
+ return "auto"
65
+ if cuda is not None and str(cuda).strip():
66
+ return f"cuda:{cuda}" if torch.cuda.is_available() else "cpu"
67
+ return f"cuda:{int(cuda_num)}" if torch.cuda.is_available() else "cpu"
68
+
69
+
70
+ def _load_model(model_name: str, device: str):
71
+ """Mirror exp/exp2/run_exp.py model loading knobs."""
72
+ model = AutoModelForCausalLM.from_pretrained(
73
+ model_name,
74
+ device_map="auto" if device == "auto" else {"": int(device.split(":")[1])} if device.startswith("cuda:") else None,
75
+ torch_dtype=torch.float16,
76
+ attn_implementation="eager",
77
+ )
78
+ tokenizer = AutoTokenizer.from_pretrained(model_name)
79
+ tokenizer.pad_token = tokenizer.eos_token
80
+ model.eval()
81
+ return model, tokenizer
82
+
83
+
84
+ @dataclass(frozen=True)
85
+ class ManifestRecord:
86
+ example_idx: int
87
+ prompt_sha1: str
88
+ target_sha1: Optional[str]
89
+
90
+
91
+ def _load_manifest_record(manifest_path: Path, *, example_idx: int) -> Optional[ManifestRecord]:
92
+ if not manifest_path.exists():
93
+ return None
94
+ with manifest_path.open("r", encoding="utf-8") as f:
95
+ for line in f:
96
+ if not line.strip():
97
+ continue
98
+ obj = json.loads(line)
99
+ if int(obj.get("example_idx", -1)) != int(example_idx):
100
+ continue
101
+ return ManifestRecord(
102
+ example_idx=int(example_idx),
103
+ prompt_sha1=str(obj.get("prompt_sha1") or ""),
104
+ target_sha1=str(obj["target_sha1"]) if obj.get("target_sha1") is not None else None,
105
+ )
106
+ return None
107
+
108
+
109
+ def _parse_example_idx_from_npz_name(path: Path) -> Optional[int]:
110
+ m = re.match(r"^ex_(\d+)$", path.stem)
111
+ if not m:
112
+ return None
113
+ try:
114
+ return int(m.group(1))
115
+ except Exception:
116
+ return None
117
+
118
+
119
+ def _pick_example(
120
+ examples: list[ds_utils.CachedExample],
121
+ *,
122
+ example_idx: int,
123
+ record: Optional[ManifestRecord],
124
+ ) -> ds_utils.CachedExample:
125
+ if record is not None and record.prompt_sha1:
126
+ matches: list[ds_utils.CachedExample] = []
127
+ for ex in examples:
128
+ if _sha1_text(ex.prompt) != record.prompt_sha1:
129
+ continue
130
+ if record.target_sha1 is None:
131
+ if ex.target is None:
132
+ matches.append(ex)
133
+ else:
134
+ if ex.target is not None and _sha1_text(ex.target) == record.target_sha1:
135
+ matches.append(ex)
136
+ if len(matches) == 1:
137
+ return matches[0]
138
+ if len(matches) > 1:
139
+ raise SystemExit(
140
+ f"Manifest sha1 matched multiple dataset entries ({len(matches)}). "
141
+ "Please pass --example_idx to select by index or use a smaller dataset cache."
142
+ )
143
+ raise SystemExit(
144
+ "Failed to locate the trace example in the provided dataset by sha1. "
145
+ "Ensure --dataset points to the same cached JSONL used to produce the trace."
146
+ )
147
+
148
+ if not (0 <= int(example_idx) < len(examples)):
149
+ raise SystemExit(f"example_idx out of range: {example_idx} not in [0, {len(examples)}).")
150
+ return examples[int(example_idx)]
151
+
152
+
153
+ def _extract_vh(attr: Any) -> np.ndarray:
154
+ ifr = (getattr(attr, "metadata", None) or {}).get("ifr") or {}
155
+ per_hop = ifr.get("per_hop_projected") or []
156
+ if not per_hop:
157
+ raise RuntimeError("Attribution result missing metadata['ifr']['per_hop_projected']; cannot build vh.")
158
+ stacked = torch.stack([torch.as_tensor(v, dtype=torch.float32).reshape(-1) for v in per_hop], dim=0)
159
+ return stacked.detach().cpu().numpy().astype(np.float32, copy=False)
160
+
161
+
162
+ def _run_ifr_attr(
163
+ attr_func: str,
164
+ *,
165
+ model: Any,
166
+ tokenizer: Any,
167
+ prompt: str,
168
+ target: str,
169
+ sink_span: Optional[tuple[int, int]],
170
+ thinking_span: Optional[tuple[int, int]],
171
+ n_hops: int,
172
+ chunk_tokens: int,
173
+ sink_chunk_tokens: int,
174
+ ) -> Any:
175
+ if attr_func == "ifr_multi_hop":
176
+ attributor = llm_attr.LLMIFRAttribution(
177
+ model,
178
+ tokenizer,
179
+ chunk_tokens=chunk_tokens,
180
+ sink_chunk_tokens=sink_chunk_tokens,
181
+ )
182
+ return attributor.calculate_ifr_multi_hop(
183
+ prompt,
184
+ target=target,
185
+ sink_span=sink_span,
186
+ thinking_span=thinking_span,
187
+ n_hops=int(n_hops),
188
+ )
189
+ if attr_func == "ifr_in_all_gen":
190
+ attributor = ft_ifr_improve.LLMIFRAttributionInAllGen(
191
+ model,
192
+ tokenizer,
193
+ chunk_tokens=chunk_tokens,
194
+ sink_chunk_tokens=sink_chunk_tokens,
195
+ )
196
+ return attributor.calculate_ifr_in_all_gen(
197
+ prompt,
198
+ target=target,
199
+ sink_span=sink_span,
200
+ thinking_span=thinking_span,
201
+ n_hops=int(n_hops),
202
+ )
203
+ if attr_func == "ifr_multi_hop_stop_words":
204
+ attributor = ft_ifr_improve.LLMIFRAttributionImproved(
205
+ model,
206
+ tokenizer,
207
+ chunk_tokens=chunk_tokens,
208
+ sink_chunk_tokens=sink_chunk_tokens,
209
+ )
210
+ return attributor.calculate_ifr_multi_hop_stop_words(
211
+ prompt,
212
+ target=target,
213
+ sink_span=sink_span,
214
+ thinking_span=thinking_span,
215
+ n_hops=int(n_hops),
216
+ )
217
+ if attr_func == "ifr_multi_hop_both":
218
+ attributor = ft_ifr_improve.LLMIFRAttributionBoth(
219
+ model,
220
+ tokenizer,
221
+ chunk_tokens=chunk_tokens,
222
+ sink_chunk_tokens=sink_chunk_tokens,
223
+ )
224
+ return attributor.calculate_ifr_multi_hop_both(
225
+ prompt,
226
+ target=target,
227
+ sink_span=sink_span,
228
+ thinking_span=thinking_span,
229
+ n_hops=int(n_hops),
230
+ )
231
+ if attr_func == "ifr_multi_hop_split_hop":
232
+ attributor = ft_ifr_improve.LLMIFRAttributionSplitHop(
233
+ model,
234
+ tokenizer,
235
+ chunk_tokens=chunk_tokens,
236
+ sink_chunk_tokens=sink_chunk_tokens,
237
+ )
238
+ return attributor.calculate_ifr_multi_hop_split_hop(
239
+ prompt,
240
+ target=target,
241
+ sink_span=sink_span,
242
+ thinking_span=thinking_span,
243
+ n_hops=int(n_hops),
244
+ )
245
+ raise SystemExit(
246
+ f"Unsupported --attr_func '{attr_func}'. "
247
+ "Supported (vh-capable IFR variants): "
248
+ "ifr_multi_hop, ifr_in_all_gen, ifr_multi_hop_stop_words, ifr_multi_hop_both, ifr_multi_hop_split_hop."
249
+ )
250
+
251
+
252
+ def _save_npz(
253
+ out_path: Path,
254
+ *,
255
+ payload: dict[str, np.ndarray],
256
+ inplace_src: Optional[Path] = None,
257
+ backup: bool = True,
258
+ overwrite_backup: bool = False,
259
+ ) -> None:
260
+ out_path.parent.mkdir(parents=True, exist_ok=True)
261
+ if inplace_src is not None:
262
+ if backup and inplace_src.exists():
263
+ backup_path = inplace_src.with_name(inplace_src.name + ".bak")
264
+ if overwrite_backup and backup_path.exists():
265
+ backup_path.unlink()
266
+ if not backup_path.exists():
267
+ backup_path.write_bytes(inplace_src.read_bytes())
268
+
269
+ # NOTE: numpy.savez* appends ".npz" if the filename does not already end with ".npz".
270
+ # So we must ensure our temporary path ends with ".npz", otherwise we'd write
271
+ # "<name>.tmp.npz" but later try to os.replace("<name>.tmp", ...).
272
+ tmp_path = out_path.with_name(out_path.stem + ".tmp.npz")
273
+ if tmp_path.exists():
274
+ tmp_path.unlink()
275
+ np.savez_compressed(tmp_path, **payload)
276
+ os.replace(tmp_path, out_path)
277
+ return
278
+
279
+ if out_path.exists():
280
+ raise SystemExit(f"Refusing to overwrite existing file: {out_path} (use --inplace).")
281
+ np.savez_compressed(out_path, **payload)
282
+
283
+
284
+ def main() -> None:
285
+ parser = argparse.ArgumentParser("One-off exp2 trace patcher: add per-hop vh vectors.")
286
+ parser.add_argument(
287
+ "--trace_npz",
288
+ type=str,
289
+ default=(
290
+ "exp/exp2/output/traces/exp/exp2/data/morehopqa.jsonl/qwen-8B/"
291
+ "ifr_multi_hop_both_n1_mfaithfulness_gen_95ex/ex_000026.npz"
292
+ ),
293
+ help="Path to the existing exp2 trace npz (ex_*.npz).",
294
+ )
295
+ parser.add_argument(
296
+ "--dataset",
297
+ type=str,
298
+ default="exp/exp2/data/morehopqa.jsonl",
299
+ help="Path to the exp2 cached dataset JSONL used to produce the trace.",
300
+ )
301
+ parser.add_argument(
302
+ "--attr_func",
303
+ type=str,
304
+ default="ifr_multi_hop_both",
305
+ help="Attribution method to rerun (vh-capable IFR variants only).",
306
+ )
307
+ parser.add_argument("--example_idx", type=int, default=None, help="Override example_idx (0-based).")
308
+ parser.add_argument("--sample", type=int, default=None, help="If the original run used --sample, set it here.")
309
+ parser.add_argument("--seed", type=int, default=42, help="Seed for --sample shuffling (must match original).")
310
+
311
+ parser.add_argument("--model", type=str, default="qwen-8B", help="HF repo id (used when --model_path not set).")
312
+ parser.add_argument(
313
+ "--model_path",
314
+ type=str,
315
+ default="/opt/share/models/Qwen/Qwen3-8B/",
316
+ help="Local model path; overrides --model for loading (matches exp2 README examples).",
317
+ )
318
+ parser.add_argument(
319
+ "--cuda",
320
+ type=str,
321
+ default="2,3,4,5,6,7",
322
+ help="CUDA selection (same semantics as exp2): '0' or '0,1,2'.",
323
+ )
324
+ parser.add_argument("--cuda_num", type=int, default=0, help="Single-device index when --cuda not set.")
325
+
326
+ parser.add_argument("--chunk_tokens", type=int, default=128)
327
+ parser.add_argument("--sink_chunk_tokens", type=int, default=32)
328
+ parser.add_argument("--n_hops", type=int, default=1)
329
+
330
+ parser.add_argument(
331
+ "--inplace",
332
+ action="store_true",
333
+ help="Overwrite the trace npz in place (recommended so manifest.jsonl stays valid).",
334
+ )
335
+ parser.add_argument("--no_backup", action="store_true", help="Disable .bak creation when using --inplace.")
336
+ parser.add_argument(
337
+ "--overwrite_backup",
338
+ action="store_true",
339
+ help="Allow replacing an existing .bak when using --inplace.",
340
+ )
341
+ args = parser.parse_args()
342
+
343
+ trace_npz = Path(args.trace_npz)
344
+ if not trace_npz.exists():
345
+ raise SystemExit(f"Missing trace npz: {trace_npz}")
346
+
347
+ example_idx = args.example_idx
348
+ if example_idx is None:
349
+ example_idx = _parse_example_idx_from_npz_name(trace_npz)
350
+ if example_idx is None:
351
+ raise SystemExit("Failed to infer --example_idx from trace filename; please pass --example_idx explicitly.")
352
+
353
+ manifest_path = trace_npz.with_name("manifest.jsonl")
354
+ record = _load_manifest_record(manifest_path, example_idx=int(example_idx))
355
+
356
+ dataset_path = Path(args.dataset)
357
+ if not dataset_path.exists():
358
+ raise SystemExit(f"Missing cached dataset JSONL: {dataset_path}")
359
+ examples = ds_utils.load_cached(dataset_path, sample=args.sample, seed=args.seed)
360
+ ex = _pick_example(examples, example_idx=int(example_idx), record=record)
361
+
362
+ if ex.target is None:
363
+ raise SystemExit("Cached dataset example has target=None; this script requires cached targets (CoT+answer).")
364
+ prompt = ex.prompt
365
+ target = ex.target
366
+
367
+ sink_span = tuple(ex.sink_span) if ex.sink_span else None
368
+ thinking_span = tuple(ex.thinking_span) if ex.thinking_span else None
369
+
370
+ model_name = str(args.model_path or args.model).strip()
371
+ if not model_name:
372
+ raise SystemExit("Please set --model or --model_path.")
373
+ device = _resolve_device(args.cuda, args.cuda_num)
374
+ model, tokenizer = _load_model(model_name, device)
375
+
376
+ attr = _run_ifr_attr(
377
+ str(args.attr_func),
378
+ model=model,
379
+ tokenizer=tokenizer,
380
+ prompt=prompt,
381
+ target=target,
382
+ sink_span=sink_span,
383
+ thinking_span=thinking_span,
384
+ n_hops=int(args.n_hops),
385
+ chunk_tokens=int(args.chunk_tokens),
386
+ sink_chunk_tokens=int(args.sink_chunk_tokens),
387
+ )
388
+ vh = _extract_vh(attr)
389
+
390
+ with np.load(trace_npz, allow_pickle=False) as old:
391
+ payload = {k: old[k] for k in old.files}
392
+ payload["vh"] = vh
393
+
394
+ if args.inplace:
395
+ out_path = trace_npz
396
+ else:
397
+ out_path = trace_npz.with_name(trace_npz.stem + "_with_vh.npz")
398
+
399
+ _save_npz(
400
+ out_path,
401
+ payload=payload,
402
+ inplace_src=trace_npz if args.inplace else None,
403
+ backup=not bool(args.no_backup),
404
+ overwrite_backup=bool(args.overwrite_backup),
405
+ )
406
+
407
+ print(f"Saved vh -> {out_path}")
408
+ print(f"vh shape: {vh.shape} (n_hops+1, prompt_len+gen_len)")
409
+
410
+
411
+ if __name__ == "__main__":
412
+ main()
evaluations/attribution_recovery.py ADDED
@@ -0,0 +1,490 @@
1
+ import os
2
+ import sys
3
+
4
+ # Ensure project root is importable regardless of CWD
5
+ sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
6
+
7
+ import argparse
8
+ import csv
9
+ import json
10
+ import math
11
+ import random
12
+ import time
13
+ from itertools import islice
14
+ from pathlib import Path
15
+ from typing import List, Optional, Tuple
16
+
17
+ import numpy as np
18
+ import torch
19
+ from tqdm import tqdm
20
+ from transformers import AutoModelForCausalLM, AutoTokenizer, utils
21
+
22
+ import llm_attr
23
+ import llm_attr_eval
24
+ from exp.exp2 import dataset_utils as ds_utils
25
+
26
+
27
+ utils.logging.set_verbosity_error()
28
+
29
+
30
+ def _first_json_obj(path: Path) -> dict:
31
+ with path.open("r", encoding="utf-8") as f:
32
+ for line in f:
33
+ line = line.strip()
34
+ if line:
35
+ return json.loads(line)
36
+ return {}
37
+
38
+
39
+ def _load_ruler_examples(args) -> Tuple[str, List[ds_utils.CachedExample]]:
40
+ ds_arg = args.dataset
41
+ cache_dir = Path(args.data_root)
42
+
43
+ # 1) If dataset points to an existing file, detect cache vs raw RULER.
44
+ p = Path(ds_arg)
45
+ if p.exists():
46
+ obj = _first_json_obj(p)
47
+ if "prompt" in obj:
48
+ return p.stem, ds_utils.load_cached(p, sample=args.sample, seed=args.seed)
49
+ if "input" in obj and "needle_spans" in obj:
50
+ return p.stem, ds_utils.load_ruler(p, sample=args.sample, seed=args.seed)
51
+ raise SystemExit(
52
+ f"Unsupported JSONL schema for recovery_ruler: {p}. "
53
+ "Expected either exp2 cache (has 'prompt') or raw RULER JSONL (has 'input'+'needle_spans')."
54
+ )
55
+
56
+ # 2) Prefer exp2 cache under --data_root by dataset name.
57
+ cached = cache_dir / f"{ds_arg}.jsonl"
58
+ if cached.exists():
59
+ return ds_arg, ds_utils.load_cached(cached, sample=args.sample, seed=args.seed)
60
+
61
+ # 3) Fall back to raw RULER resolution by name.
62
+ resolved = ds_utils.dataset_from_name(ds_arg)
63
+ if resolved is None:
64
+ raise SystemExit(f"Could not resolve RULER dataset name '{ds_arg}'.")
65
+ return ds_arg, ds_utils.load_ruler(resolved, sample=args.sample, seed=args.seed)
66
+
67
+
68
+ def _resolve_indices_to_explain_token_span(
69
+ attr_result: llm_attr.LLMAttributionResult, indices_to_explain: list[int] | None
70
+ ) -> list[int]:
71
+ if (
72
+ isinstance(indices_to_explain, list)
73
+ and len(indices_to_explain) == 2
74
+ and all(isinstance(x, int) and x >= 0 for x in indices_to_explain)
75
+ and indices_to_explain[0] <= indices_to_explain[1]
76
+ ):
77
+ return indices_to_explain
78
+
79
+ gen_len = int(attr_result.attribution_matrix.shape[0])
80
+ if gen_len <= 0:
81
+ return [0, 0]
82
+
83
+ # Default: explain the full generation excluding the appended EOS token.
84
+ end_tok = max(0, gen_len - 2)
85
+ return [0, end_tok]
86
+
87
+
88
+ def run_attribution(
89
+ testing_dict, example: ds_utils.CachedExample, batch_size: int, target: Optional[str]
90
+ ) -> tuple[List[torch.Tensor], dict | None]:
91
+ model = testing_dict["model"]
92
+ tokenizer = testing_dict["tokenizer"]
93
+ attr_func = testing_dict["attr_func"]
94
+
95
+ if "IG" in attr_func:
96
+ llm_attributor = llm_attr.LLMGradientAttribtion(model, tokenizer)
97
+ attr = llm_attributor.calculate_IG_per_generation(
98
+ example.prompt,
99
+ 20,
100
+ tokenizer.eos_token_id,
101
+ batch_size=batch_size,
102
+ target=target,
103
+ )
104
+ token_span = _resolve_indices_to_explain_token_span(attr, example.indices_to_explain)
105
+ return list(attr.get_all_token_attrs(token_span)), None
106
+
107
+ if "perturbation" in attr_func:
108
+ llm_attributor = llm_attr.LLMPerturbationAttribution(model, tokenizer)
109
+ if attr_func == "perturbation_all":
110
+ attr = llm_attributor.calculate_feature_ablation_sentences(
111
+ example.prompt, baseline=tokenizer.eos_token_id, measure="log_loss", target=target
112
+ )
113
+ elif attr_func == "perturbation_CLP":
114
+ attr = llm_attributor.calculate_feature_ablation_sentences(
115
+ example.prompt, baseline=tokenizer.eos_token_id, measure="KL", target=target
116
+ )
117
+ elif attr_func == "perturbation_REAGENT":
118
+ attr = llm_attributor.calculate_feature_ablation_sentences_mlm(example.prompt, target=target)
119
+ else:
120
+ raise ValueError(f"Unsupported perturbation attr_func {attr_func}")
121
+ token_span = _resolve_indices_to_explain_token_span(attr, example.indices_to_explain)
122
+ return list(attr.get_all_token_attrs(token_span)), None
123
+
124
+ if "attention" in attr_func:
125
+ llm_attributor = llm_attr.LLMAttentionAttribution(model, tokenizer)
126
+ llm_attributor_ig = llm_attr.LLMGradientAttribtion(model, tokenizer)
127
+ attr = llm_attributor.calculate_attention_attribution(example.prompt, target=target)
128
+ if attr_func == "attention_I_G":
129
+ attr_b = llm_attributor_ig.calculate_IG_per_generation(
130
+ example.prompt, 20, tokenizer.eos_token_id, batch_size=batch_size, target=target
131
+ )
132
+ attr.attribution_matrix = attr.attribution_matrix * attr_b.attribution_matrix
133
+ token_span = _resolve_indices_to_explain_token_span(attr, example.indices_to_explain)
134
+ return list(attr.get_all_token_attrs(token_span)), None
135
+
136
+ if attr_func == "ifr_all_positions":
137
+ llm_attributor = llm_attr.LLMIFRAttribution(model, tokenizer)
138
+ attr = llm_attributor.calculate_ifr_for_all_positions(example.prompt, target=target)
139
+ token_span = _resolve_indices_to_explain_token_span(attr, example.indices_to_explain)
140
+ return list(attr.get_all_token_attrs(token_span)), None
141
+
142
+ if attr_func == "ifr_all_positions_output_only":
143
+ llm_attributor = llm_attr.LLMIFRAttribution(model, tokenizer)
144
+ sink_span = tuple(example.sink_span) if example.sink_span else None
145
+ attr = llm_attributor.calculate_ifr_for_all_positions_output_only(
146
+ example.prompt,
147
+ target=target,
148
+ sink_span=sink_span,
149
+ )
150
+ token_span = _resolve_indices_to_explain_token_span(attr, example.indices_to_explain)
151
+ return list(attr.get_all_token_attrs(token_span)), None
152
+
153
+ if attr_func == "ifr_span":
154
+ llm_attributor = llm_attr.LLMIFRAttribution(model, tokenizer)
155
+ span = example.sink_span if example.sink_span else None
156
+ attr = llm_attributor.calculate_ifr_span(example.prompt, target=target, span=tuple(span) if span else None)
157
+ token_span = _resolve_indices_to_explain_token_span(attr, example.indices_to_explain)
158
+ return list(attr.get_all_token_attrs(token_span)), None
159
+
160
+ if attr_func == "ifr_multi_hop":
161
+ llm_attributor = llm_attr.LLMIFRAttribution(model, tokenizer)
162
+ attr = llm_attributor.calculate_ifr_multi_hop(
163
+ example.prompt,
164
+ target=target,
165
+ sink_span=tuple(example.sink_span) if example.sink_span else None,
166
+ thinking_span=tuple(example.thinking_span) if example.thinking_span else None,
167
+ n_hops=testing_dict.get("n_hops", 1),
168
+ )
169
+ token_span = _resolve_indices_to_explain_token_span(attr, example.indices_to_explain)
170
+ return list(attr.get_all_token_attrs(token_span)), None
171
+
172
+ if attr_func == "ifr_in_all_gen":
173
+ import ft_ifr_improve
174
+
175
+ llm_attributor = ft_ifr_improve.LLMIFRAttributionInAllGen(model, tokenizer)
176
+ attr = llm_attributor.calculate_ifr_in_all_gen(
177
+ example.prompt,
178
+ target=target,
179
+ sink_span=tuple(example.sink_span) if example.sink_span else None,
180
+ thinking_span=tuple(example.thinking_span) if example.thinking_span else None,
181
+ n_hops=testing_dict.get("n_hops", 1),
182
+ )
183
+ token_span = _resolve_indices_to_explain_token_span(attr, example.indices_to_explain)
184
+ return list(attr.get_all_token_attrs(token_span)), None
185
+
186
+ if attr_func == "ifr_multi_hop_stop_words":
187
+ import ft_ifr_improve
188
+
189
+ llm_attributor = ft_ifr_improve.LLMIFRAttributionImproved(model, tokenizer)
190
+ attr = llm_attributor.calculate_ifr_multi_hop_stop_words(
191
+ example.prompt,
192
+ target=target,
193
+ sink_span=tuple(example.sink_span) if example.sink_span else None,
194
+ thinking_span=tuple(example.thinking_span) if example.thinking_span else None,
195
+ n_hops=testing_dict.get("n_hops", 1),
196
+ )
197
+ token_span = _resolve_indices_to_explain_token_span(attr, example.indices_to_explain)
198
+ extra = {
199
+ "keep_prompt_token_indices": ft_ifr_improve.keep_token_indices(list(attr.prompt_tokens)),
200
+ }
201
+ return list(attr.get_all_token_attrs(token_span)), extra
202
+
203
+ if attr_func == "ifr_multi_hop_both":
204
+ import ft_ifr_improve
205
+
206
+ llm_attributor = ft_ifr_improve.LLMIFRAttributionBoth(model, tokenizer)
207
+ attr = llm_attributor.calculate_ifr_multi_hop_both(
208
+ example.prompt,
209
+ target=target,
210
+ sink_span=tuple(example.sink_span) if example.sink_span else None,
211
+ thinking_span=tuple(example.thinking_span) if example.thinking_span else None,
212
+ n_hops=testing_dict.get("n_hops", 1),
213
+ )
214
+ token_span = _resolve_indices_to_explain_token_span(attr, example.indices_to_explain)
215
+ extra = {
216
+ "keep_prompt_token_indices": ft_ifr_improve.keep_token_indices(list(attr.prompt_tokens)),
217
+ }
218
+ return list(attr.get_all_token_attrs(token_span)), extra
219
+
220
+ if attr_func == "ifr_multi_hop_split_hop":
221
+ import ft_ifr_improve
222
+
223
+ llm_attributor = ft_ifr_improve.LLMIFRAttributionSplitHop(model, tokenizer)
224
+ attr = llm_attributor.calculate_ifr_multi_hop_split_hop(
225
+ example.prompt,
226
+ target=target,
227
+ sink_span=tuple(example.sink_span) if example.sink_span else None,
228
+ thinking_span=tuple(example.thinking_span) if example.thinking_span else None,
229
+ n_hops=testing_dict.get("n_hops", 1),
230
+ )
231
+ token_span = _resolve_indices_to_explain_token_span(attr, example.indices_to_explain)
232
+ return list(attr.get_all_token_attrs(token_span)), None
233
+
234
+ if attr_func == "basic":
235
+ llm_attributor = llm_attr.LLMBasicAttribution(model, tokenizer)
236
+ attr = llm_attributor.calculate_basic_attribution(example.prompt, target=target)
237
+ token_span = _resolve_indices_to_explain_token_span(attr, example.indices_to_explain)
238
+ return list(attr.get_all_token_attrs(token_span)), None
239
+
240
+ if attr_func == "attnlrp":
241
+ llm_attributor = llm_attr.LLMLRPAttribution(model, tokenizer)
242
+ sink_span = getattr(example, "sink_span", None)
243
+ thinking_span = getattr(example, "thinking_span", None)
244
+ attr = llm_attributor.calculate_attnlrp_ft_hop0(
245
+ example.prompt,
246
+ target=target,
247
+ sink_span=tuple(sink_span) if sink_span else None,
248
+ thinking_span=tuple(thinking_span) if thinking_span else None,
249
+ )
250
+ token_span = _resolve_indices_to_explain_token_span(attr, example.indices_to_explain)
251
+ return list(attr.get_all_token_attrs(token_span)), None
252
+
253
+ if attr_func == "attnlrp_aggregated":
254
+ llm_attributor = llm_attr.LLMLRPAttribution(model, tokenizer)
255
+ attr = llm_attributor.calculate_attnlrp_aggregated(example.prompt, target=target)
256
+ token_span = _resolve_indices_to_explain_token_span(attr, example.indices_to_explain)
257
+ return list(attr.get_all_token_attrs(token_span)), None
258
+
259
+ if attr_func == "attnlrp_aggregated_multi_hop":
260
+ llm_attributor = llm_attr.LLMLRPAttribution(model, tokenizer)
261
+ attr = llm_attributor.calculate_attnlrp_aggregated_multi_hop(
262
+ example.prompt,
263
+ target=target,
264
+ sink_span=tuple(example.sink_span) if example.sink_span else None,
265
+ thinking_span=tuple(example.thinking_span) if example.thinking_span else None,
266
+ n_hops=testing_dict.get("n_hops", 1),
267
+ )
268
+ token_span = _resolve_indices_to_explain_token_span(attr, example.indices_to_explain)
269
+ return list(attr.get_all_token_attrs(token_span)), None
270
+
271
+ raise ValueError(f"Unsupported attribution function '{attr_func}'.")
272
+
273
+
274
+ def evaluate_dataset_recovery_ruler(testing_dict, dataset_name: str, examples: List[ds_utils.CachedExample]) -> Tuple[np.ndarray, np.ndarray, float, int, int]:
275
+ tokenizer = testing_dict["tokenizer"]
276
+ llm_evaluator = llm_attr_eval.LLMAttributionEvaluator(testing_dict["model"], tokenizer)
277
+
278
+ results: List[np.ndarray] = []
279
+ durations: List[float] = []
280
+ skipped = 0
281
+
282
+ num_examples = testing_dict["num_examples"]
283
+ total = min(len(examples), num_examples)
284
+ iterator = islice(examples, total)
285
+
286
+ description = f"Recovery@10pct {testing_dict['model_name']} {dataset_name} {testing_dict['attr_func']}"
287
+ for ex in tqdm(iterator, desc=description, total=total):
288
+ needle_spans = (ex.metadata or {}).get("needle_spans")
289
+ if not isinstance(needle_spans, list) or not needle_spans:
290
+ raise SystemExit(
291
+ "recovery_ruler only supports RULER examples with metadata.needle_spans; "
292
+ f"dataset={dataset_name} has missing/empty needle_spans."
293
+ )
294
+
295
+ gold_prompt = ds_utils.ruler_gold_prompt_token_indices(ex, tokenizer)
296
+ if not gold_prompt:
297
+ skipped += 1
298
+ continue
299
+
300
+ # Batch size is set based on the max_input_len (same policy as faithfulness).
301
+ target = ex.target
302
+ if target is None:
303
+ generation, full_output = llm_evaluator.response(ex.prompt)
304
+ target = generation
305
+ response_len = len(tokenizer(full_output).input_ids)
306
+ else:
307
+ response_len = len(tokenizer(llm_evaluator.format_prompt(" " + ex.prompt) + target).input_ids)
308
+ batch_size = max(1, math.floor((testing_dict["max_input_len"] - 100) / max(1, response_len)))
309
+
310
+ sample_start = time.perf_counter()
311
+ attr_list, extra = run_attribution(testing_dict, ex, batch_size, target)
312
+ durations.append(time.perf_counter() - sample_start)
313
+
314
+ seq_attr = attr_list[0]
315
+ prompt_len = int(seq_attr.shape[1] - seq_attr.shape[0]) # cols=(P+G), rows=G
316
+ if prompt_len <= 0:
317
+ skipped += 1
318
+ continue
319
+
320
+ if testing_dict["attr_func"] in ("ifr_multi_hop_stop_words", "ifr_multi_hop_both") and extra is not None:
321
+ import ft_ifr_improve
322
+
323
+ keep_prompt_token_indices = extra.get("keep_prompt_token_indices") or []
324
+ gold_filtered = [idx for idx in gold_prompt if int(idx) in set(int(x) for x in keep_prompt_token_indices)]
325
+ if not gold_filtered:
326
+ skipped += 1
327
+ continue
328
+ scores = [
329
+ ft_ifr_improve.evaluate_attr_recovery_skip_tokens(
330
+ attr[:, :prompt_len],
331
+ keep_prompt_token_indices=keep_prompt_token_indices,
332
+ gold_prompt_token_indices=gold_prompt,
333
+ top_fraction=0.1,
334
+ )
335
+ for attr in attr_list
336
+ ]
337
+ else:
338
+ scores = [
339
+ llm_evaluator.evaluate_attr_recovery(
340
+ attr,
341
+ prompt_len=prompt_len,
342
+ gold_prompt_token_indices=gold_prompt,
343
+ top_fraction=0.1,
344
+ )
345
+ for attr in attr_list
346
+ ]
347
+ results.append(np.asarray(scores, dtype=np.float64))
348
+
349
+ scores = np.stack(results, axis=0) if results else np.zeros((0, 3), dtype=np.float64)
350
+ used = int(scores.shape[0])
351
+ mean = scores.mean(0) if used else np.full((3,), np.nan, dtype=np.float64)
352
+ std = scores.std(0) if used else np.full((3,), np.nan, dtype=np.float64)
353
+ avg_time = float(np.mean(durations)) if durations else 0.0
354
+ return mean, std, avg_time, used, int(skipped)
355
+
356
+
357
+ def load_model(model_name: str, device: str) -> Tuple[AutoModelForCausalLM, AutoTokenizer]:
358
+ seed = 42
359
+ random.seed(seed)
360
+ np.random.seed(seed)
361
+ torch.manual_seed(seed)
362
+ torch.cuda.manual_seed(seed)
363
+ torch.cuda.manual_seed_all(seed)
364
+
365
+ if device == "auto":
366
+ model = AutoModelForCausalLM.from_pretrained(
367
+ model_name,
368
+ device_map="auto",
369
+ attn_implementation="eager",
370
+ torch_dtype=torch.float16,
371
+ )
372
+ elif isinstance(device, str) and device.startswith("cuda:"):
373
+ try:
374
+ gpu_idx = int(device.split(":")[1])
375
+ except Exception:
376
+ gpu_idx = 0
377
+ model = AutoModelForCausalLM.from_pretrained(
378
+ model_name,
379
+ device_map={"": gpu_idx},
380
+ attn_implementation="eager",
381
+ torch_dtype=torch.float16,
382
+ )
383
+ else:
384
+ model = AutoModelForCausalLM.from_pretrained(
385
+ model_name,
386
+ attn_implementation="eager",
387
+ torch_dtype=torch.float16,
388
+ )
389
+ model.eval()
390
+ tokenizer = AutoTokenizer.from_pretrained(model_name)
391
+ tokenizer.pad_token = tokenizer.eos_token
392
+ return model, tokenizer
393
+
394
+
395
+ def main(args) -> None:
396
+ if args.cuda is not None and isinstance(args.cuda, str) and "," in args.cuda:
397
+ os.environ["CUDA_VISIBLE_DEVICES"] = args.cuda
398
+ device = "auto"
399
+ elif args.cuda is not None and isinstance(args.cuda, str) and args.cuda.strip() != "":
400
+ try:
401
+ idx = int(args.cuda)
402
+ except Exception:
403
+ idx = 0
404
+ device = f"cuda:{idx}" if torch.cuda.is_available() else "cpu"
405
+ else:
406
+ device = f"cuda:{args.cuda_num}" if torch.cuda.is_available() else "cpu"
407
+
408
+ if args.model == "llama-1B":
409
+ model_name = "meta-llama/Llama-3.2-1B-Instruct"
410
+ max_input_len = 5500
411
+ elif args.model == "llama-3B":
412
+ model_name = "meta-llama/Llama-3.2-3B-Instruct"
413
+ max_input_len = 4800
414
+ elif args.model == "llama-8B":
415
+ model_name = "meta-llama/Llama-3.1-8B-Instruct"
416
+ max_input_len = 3500
417
+ elif args.model == "qwen-1.7B":
418
+ model_name = "Qwen/Qwen3-1.7B"
419
+ max_input_len = 5500
420
+ elif args.model == "qwen-4B":
421
+ model_name = "Qwen/Qwen3-4B-Instruct-2507"
422
+ max_input_len = 3500
423
+ elif args.model == "qwen-8B":
424
+ model_name = "Qwen/Qwen3-8B"
425
+ max_input_len = 3000
426
+ elif args.model == "qwen-32B":
427
+ model_name = "Qwen/Qwen3-32B"
428
+ max_input_len = 1500
429
+ elif args.model == "gemma-12B":
430
+ model_name = "gemma/gemma-3-12b-it"
431
+ max_input_len = 1500
432
+ elif args.model == "gemma-27B":
433
+ model_name = "gemma/gemma-3-27b-it"
434
+ max_input_len = 2000
435
+ else:
436
+ model_name = args.model_path if args.model_path is not None else args.model
437
+ max_input_len = 2000
438
+
439
+ model, tokenizer = load_model(model_name if args.model_path is None else args.model_path, device)
440
+
441
+ dataset_name, examples = _load_ruler_examples(args)
442
+
443
+ testing_dict = {
444
+ "model": model,
445
+ "model_name": args.model,
446
+ "tokenizer": tokenizer,
447
+ "dataset_name": dataset_name,
448
+ "attr_func": args.attr_func,
449
+ "num_examples": args.num_examples,
450
+ "max_input_len": max_input_len,
451
+ "n_hops": args.n_hops,
452
+ }
453
+
454
+ mean, std, avg_time, used, skipped = evaluate_dataset_recovery_ruler(testing_dict, dataset_name, examples)
455
+
456
+ out_dir = Path("./test_results") / "attribution_recovery" / dataset_name / args.model
457
+ out_dir.mkdir(parents=True, exist_ok=True)
458
+ file_name = f"{args.attr_func}_{args.num_examples}_examples.csv"
459
+ with open(out_dir / file_name, "w", newline="") as f:
460
+ writer = csv.writer(f)
461
+ writer.writerow(["Method", "Recovery@10pct"])
462
+ writer.writerow(["Seq Attr Recovery Mean", mean[0]])
463
+ writer.writerow(["Row Attr Recovery Mean", mean[1]])
464
+ writer.writerow(["Recursive Attr Recovery Mean", mean[2]])
465
+ writer.writerow(["Seq Attr Recovery Std", std[0]])
466
+ writer.writerow(["Row Attr Recovery Std", std[1]])
467
+ writer.writerow(["Recursive Attr Recovery Std", std[2]])
468
+ writer.writerow(["Examples Used", used])
469
+ writer.writerow(["Examples Skipped", skipped])
470
+ writer.writerow(["Avg Sample Time (s)", avg_time])
471
+
472
+ print(f"[{dataset_name}] {args.attr_func} -> {out_dir/file_name} (used={used} skipped={skipped} avg {avg_time:.2f}s)")
473
+
474
+
475
+ if __name__ == "__main__":
476
+ parser = argparse.ArgumentParser("RULER-only token-level attribution recovery evaluation (Recall@10pct).")
477
+ parser.add_argument("--num_examples", type=int, default=100, help="How many examples to evaluate.")
478
+ parser.add_argument("--sample", type=int, default=None, help="Optional subsample before num_examples.")
479
+ parser.add_argument("--seed", type=int, default=42)
480
+ parser.add_argument("--model", type=str, default="qwen-8B")
481
+ parser.add_argument("--model_path", type=str, default=None, help="Optional local model path to load.")
482
+ parser.add_argument("--attr_func", type=str, default="ifr_multi_hop")
483
+ parser.add_argument("--cuda_num", type=int, default=0)
484
+ parser.add_argument("--cuda", type=str, default=None)
485
+ parser.add_argument("--dataset", type=str, required=True, help="RULER dataset name or JSONL path (raw or exp2 cache).")
486
+ parser.add_argument("--data_root", type=str, default="exp/exp2/data", help="Cache directory to search by dataset name.")
487
+ parser.add_argument("--n_hops", type=int, default=3)
488
+
489
+ args, _ = parser.parse_known_args()
490
+ main(args)
evaluations/attribution_recovery.sh ADDED
@@ -0,0 +1,18 @@
1
+ # RULER-only token-level recovery (Recall@10pct) examples.
2
+ #
3
+ # Dataset can be:
4
+ # - a RULER name (hotpotqa_long / niah_* / vt_*) resolved under data/ruler_multihop/<len>/.../validation.jsonl
5
+ # - a raw RULER JSONL path
6
+ # - an exp2 cache JSONL path (must contain metadata.needle_spans)
7
+
8
+ # Example: evaluate on exp2 cache
9
+ # CUDA_VISIBLE_DEVICES=0 python3 evaluations/attribution_recovery.py \
10
+ # --model qwen-8B --model_path /opt/share/models/Qwen/Qwen3-8B/ \
11
+ # --cuda 0 --num_examples 50 --attr_func ifr_multi_hop \
12
+ # --dataset exp/exp2/data/hotpotqa.jsonl
13
+
14
+ # Example: evaluate on raw RULER JSONL
15
+ # CUDA_VISIBLE_DEVICES=0 python3 evaluations/attribution_recovery.py \
16
+ # --model qwen-8B --model_path /opt/share/models/Qwen/Qwen3-8B/ \
17
+ # --cuda 0 --num_examples 50 --attr_func ifr_multi_hop \
18
+ # --dataset data/ruler_multihop/4096/hotpotqa_long/validation.jsonl
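+
+ # Example (sketch): evaluate by RULER dataset name. This assumes the name resolves to a
+ # validation.jsonl as described above; adjust the name and layout to your local data.
+ # CUDA_VISIBLE_DEVICES=0 python3 evaluations/attribution_recovery.py \
+ #   --model qwen-8B --model_path /opt/share/models/Qwen/Qwen3-8B/ \
+ #   --cuda 0 --num_examples 50 --attr_func ifr_multi_hop \
+ #   --dataset hotpotqa_long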
evaluations/faithfulness.py ADDED
@@ -0,0 +1,491 @@
1
+ import os
2
+ import sys
3
+ # Ensure project root is importable regardless of CWD
4
+ sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
5
+
6
+ from transformers import AutoModelForCausalLM, BitsAndBytesConfig, AutoTokenizer
7
+ import torch
8
+ import numpy as np
9
+ from transformers import utils
10
+ import math
11
+ from tqdm import tqdm
12
+ import random
13
+ import argparse
14
+ import csv
15
+ from itertools import islice
16
+ from typing import Tuple
17
+ from huggingface_hub import login
18
+
19
+ from attribution_datasets import (
20
+ AttributionDataset,
21
+ FactsAttributionDataset,
22
+ MathAttributionDataset,
23
+ MoreHopQAAttributionDataset,
24
+ )
25
+
26
+ utils.logging.set_verbosity_error() # Suppress standard warnings
27
+
28
+ import llm_attr
29
+ import llm_attr_eval
30
+
31
+
32
+ def _resolve_indices_to_explain_token_span(
33
+ attr_result: llm_attr.LLMAttributionResult, indices_to_explain: list[int] | None
34
+ ) -> list[int]:
35
+ if (
36
+ isinstance(indices_to_explain, list)
37
+ and len(indices_to_explain) == 2
38
+ and all(isinstance(x, int) and x >= 0 for x in indices_to_explain)
39
+ and indices_to_explain[0] <= indices_to_explain[1]
40
+ ):
41
+ return indices_to_explain
42
+
43
+ gen_len = int(attr_result.attribution_matrix.shape[0])
44
+ if gen_len <= 0:
45
+ return [0, 0]
46
+
47
+ # Default: explain the full generation excluding the appended EOS token.
48
+ end_tok = max(0, gen_len - 2)
49
+ return [0, end_tok]
50
+
51
+
52
+ def run_attribution(testing_dict, prompt, batch_size, indices_to_explain = [1], target = None) -> tuple[list[torch.Tensor], dict | None]:
53
+ model = testing_dict["model"]
54
+ tokenizer = testing_dict["tokenizer"]
55
+
56
+ # Now we create an attribution for the full response
57
+ if "IG" in testing_dict["attr_func"]:
58
+ llm_attributor = llm_attr.LLMGradientAttribtion(model, tokenizer)
59
+
60
+ if testing_dict["attr_func"] == "IG":
61
+ attr = llm_attributor.calculate_IG_per_generation(prompt, 20, tokenizer.eos_token_id, batch_size = batch_size, target = target)
62
+
63
+ token_span = _resolve_indices_to_explain_token_span(attr, indices_to_explain)
64
+ attributions = list(attr.get_all_token_attrs(token_span))
65
+
66
+ elif "perturbation" in testing_dict["attr_func"]:
67
+ llm_attributor = llm_attr.LLMPerturbationAttribution(model, tokenizer)
68
+
69
+ if testing_dict["attr_func"] == "perturbation_all":
70
+ attr = llm_attributor.calculate_feature_ablation_sentences(prompt, baseline = tokenizer.eos_token_id, measure="log_loss", target = target)
71
+ elif testing_dict["attr_func"] == "perturbation_CLP":
72
+ attr = llm_attributor.calculate_feature_ablation_sentences(prompt, baseline = tokenizer.eos_token_id, measure="KL", target = target)
73
+ elif testing_dict["attr_func"] == "perturbation_REAGENT":
74
+ attr = llm_attributor.calculate_feature_ablation_sentences_mlm(prompt, target = target)
75
+
76
+ token_span = _resolve_indices_to_explain_token_span(attr, indices_to_explain)
77
+ attributions = list(attr.get_all_token_attrs(token_span))
78
+
79
+ elif "attention" in testing_dict["attr_func"]:
80
+ llm_attributor = llm_attr.LLMAttentionAttribution(model, tokenizer)
81
+ llm_attributor_ig = llm_attr.LLMGradientAttribtion(model, tokenizer)
82
+
83
+ if testing_dict["attr_func"] == "attention_I_G":
84
+ attr = llm_attributor.calculate_attention_attribution(prompt, target = target)
85
+ attr_b = llm_attributor_ig.calculate_IG_per_generation(prompt, 20, tokenizer.eos_token_id, batch_size = batch_size, target = target)
86
+ attr.attribution_matrix = attr.attribution_matrix * attr_b.attribution_matrix
87
+
88
+ token_span = _resolve_indices_to_explain_token_span(attr, indices_to_explain)
89
+ attributions = list(attr.get_all_token_attrs(token_span))
90
+
91
+ elif "ifr" in testing_dict["attr_func"].lower():
92
+ llm_attributor = llm_attr.LLMIFRAttribution(model, tokenizer)
93
+ attr_func = testing_dict["attr_func"].lower()
94
+ renorm_threshold = testing_dict.get("renorm_threshold")
95
+
96
+ if attr_func == "ifr_all_positions":
97
+ attr = llm_attributor.calculate_ifr_for_all_positions(prompt, target=target, renorm_threshold=renorm_threshold)
98
+ elif attr_func == "ifr_all_positions_output_only":
99
+ attr = llm_attributor.calculate_ifr_for_all_positions_output_only(
100
+ prompt,
101
+ target=target,
102
+ sink_span=tuple(testing_dict.get("sink_span")) if testing_dict.get("sink_span") is not None else None,
103
+ renorm_threshold=renorm_threshold,
104
+ )
105
+ elif attr_func == "ifr_span":
106
+ span = testing_dict.get("sink_span")
107
+ attr = llm_attributor.calculate_ifr_span(
108
+ prompt,
109
+ target=target,
110
+ span=tuple(span) if span is not None else None,
111
+ renorm_threshold=renorm_threshold,
112
+ )
113
+ elif attr_func == "ifr_multi_hop":
114
+ attr = llm_attributor.calculate_ifr_multi_hop(
115
+ prompt,
116
+ target=target,
117
+ sink_span=tuple(testing_dict.get("sink_span")) if testing_dict.get("sink_span") is not None else None,
118
+ thinking_span=tuple(testing_dict.get("thinking_span")) if testing_dict.get("thinking_span") is not None else None,
119
+ n_hops=testing_dict.get("n_hops", 1),
120
+ renorm_threshold=renorm_threshold,
121
+ observation_mask=testing_dict.get("observation_mask"),
122
+ )
123
+ elif attr_func == "ifr_in_all_gen":
124
+ import ft_ifr_improve
125
+
126
+ llm_attributor = ft_ifr_improve.LLMIFRAttributionInAllGen(model, tokenizer)
127
+ attr = llm_attributor.calculate_ifr_in_all_gen(
128
+ prompt,
129
+ target=target,
130
+ sink_span=tuple(testing_dict.get("sink_span")) if testing_dict.get("sink_span") is not None else None,
131
+ thinking_span=tuple(testing_dict.get("thinking_span")) if testing_dict.get("thinking_span") is not None else None,
132
+ n_hops=testing_dict.get("n_hops", 1),
133
+ renorm_threshold=renorm_threshold,
134
+ observation_mask=testing_dict.get("observation_mask"),
135
+ )
136
+ elif attr_func == "ifr_multi_hop_stop_words":
137
+ import ft_ifr_improve
138
+
139
+ llm_attributor = ft_ifr_improve.LLMIFRAttributionImproved(model, tokenizer)
140
+ attr = llm_attributor.calculate_ifr_multi_hop_stop_words(
141
+ prompt,
142
+ target=target,
143
+ sink_span=tuple(testing_dict.get("sink_span")) if testing_dict.get("sink_span") is not None else None,
144
+ thinking_span=tuple(testing_dict.get("thinking_span")) if testing_dict.get("thinking_span") is not None else None,
145
+ n_hops=testing_dict.get("n_hops", 1),
146
+ renorm_threshold=renorm_threshold,
147
+ observation_mask=testing_dict.get("observation_mask"),
148
+ )
149
+ elif attr_func == "ifr_multi_hop_both":
150
+ import ft_ifr_improve
151
+
152
+ llm_attributor = ft_ifr_improve.LLMIFRAttributionBoth(model, tokenizer)
153
+ attr = llm_attributor.calculate_ifr_multi_hop_both(
154
+ prompt,
155
+ target=target,
156
+ sink_span=tuple(testing_dict.get("sink_span")) if testing_dict.get("sink_span") is not None else None,
157
+ thinking_span=tuple(testing_dict.get("thinking_span")) if testing_dict.get("thinking_span") is not None else None,
158
+ n_hops=testing_dict.get("n_hops", 1),
159
+ renorm_threshold=renorm_threshold,
160
+ observation_mask=testing_dict.get("observation_mask"),
161
+ )
162
+ elif attr_func == "ifr_multi_hop_split_hop":
163
+ import ft_ifr_improve
164
+
165
+ llm_attributor = ft_ifr_improve.LLMIFRAttributionSplitHop(model, tokenizer)
166
+ attr = llm_attributor.calculate_ifr_multi_hop_split_hop(
167
+ prompt,
168
+ target=target,
169
+ sink_span=tuple(testing_dict.get("sink_span")) if testing_dict.get("sink_span") is not None else None,
170
+ thinking_span=tuple(testing_dict.get("thinking_span")) if testing_dict.get("thinking_span") is not None else None,
171
+ n_hops=testing_dict.get("n_hops", 1),
172
+ renorm_threshold=renorm_threshold,
173
+ observation_mask=testing_dict.get("observation_mask"),
174
+ )
175
+ else:
176
+ raise ValueError(f"Unsupported IFR attribution function '{testing_dict['attr_func']}'.")
177
+
178
+ token_span = _resolve_indices_to_explain_token_span(attr, indices_to_explain)
179
+ attributions = list(attr.get_all_token_attrs(token_span))
180
+
181
+ elif "basic" in testing_dict["attr_func"]:
182
+ llm_attributor = llm_attr.LLMBasicAttribution(model, tokenizer)
183
+ attr = llm_attributor.calculate_basic_attribution(prompt, target = target)
184
+ token_span = _resolve_indices_to_explain_token_span(attr, indices_to_explain)
185
+ attributions = list(attr.get_all_token_attrs(token_span))
186
+
187
+ elif testing_dict["attr_func"] == "attnlrp":
188
+ llm_attributor = llm_attr.LLMLRPAttribution(model, tokenizer)
189
+ attr = llm_attributor.calculate_attnlrp_ft_hop0(prompt, target=target)
190
+ token_span = _resolve_indices_to_explain_token_span(attr, indices_to_explain)
191
+ attributions = list(attr.get_all_token_attrs(token_span))
192
+
193
+ elif testing_dict["attr_func"] == "attnlrp_aggregated":
194
+ llm_attributor = llm_attr.LLMLRPAttribution(model, tokenizer)
195
+ attr = llm_attributor.calculate_attnlrp_aggregated(prompt, target=target)
196
+ token_span = _resolve_indices_to_explain_token_span(attr, indices_to_explain)
197
+ attributions = list(attr.get_all_token_attrs(token_span))
198
+
199
+ elif testing_dict["attr_func"] == "attnlrp_aggregated_multi_hop":
200
+ llm_attributor = llm_attr.LLMLRPAttribution(model, tokenizer)
201
+ attr = llm_attributor.calculate_attnlrp_aggregated_multi_hop(
202
+ prompt,
203
+ target=target,
204
+ sink_span=tuple(testing_dict.get("sink_span")) if testing_dict.get("sink_span") is not None else None,
205
+ thinking_span=tuple(testing_dict.get("thinking_span")) if testing_dict.get("thinking_span") is not None else None,
206
+ n_hops=testing_dict.get("n_hops", 1),
207
+ )
208
+ token_span = _resolve_indices_to_explain_token_span(attr, indices_to_explain)
209
+ attributions = list(attr.get_all_token_attrs(token_span))
210
+
211
+ else:
212
+ raise ValueError(f"Unsupported attribution function '{testing_dict['attr_func']}'.")
213
+
214
+ extra = None
215
+ if testing_dict["attr_func"].lower() in ("ifr_multi_hop_stop_words", "ifr_multi_hop_both"):
216
+ import ft_ifr_improve
217
+
218
+ extra = {
219
+ "keep_prompt_token_indices": ft_ifr_improve.keep_token_indices(list(attr.prompt_tokens)),
220
+ "user_prompt_indices": list(getattr(llm_attributor, "user_prompt_indices", []) or []),
221
+ }
222
+
223
+ return attributions, extra
224
+
225
+ def faithfulness_test(testing_dict, llm_evaluator, prompt, indices_to_explain, target = None) -> np.ndarray[float]:
226
+ tokenizer = testing_dict["tokenizer"]
227
+ faithfulness_k = int(testing_dict.get("faithfulness_k", 20))
228
+
229
+ scores = []
230
+
231
+ # batch size is set based on the max_input_len in main(). Currently set to fully fill a 196GB GPU.
232
+ if target is None:
233
+ generation, full_output = llm_evaluator.response(prompt)
234
+ batch_size = math.floor((testing_dict["max_input_len"] - 100) / len(tokenizer(full_output).input_ids))
235
+ else:
236
+ generation = target
237
+ batch_size = math.floor(
238
+ (testing_dict["max_input_len"] - 100)
239
+ / len(tokenizer(llm_evaluator.format_prompt(" " + prompt) + generation).input_ids)
240
+ )
241
+
242
+ # We run an attribution on the input
243
+ # A list of attribution tensors will be returned and scored individually.
244
+ attr_list, extra = run_attribution(testing_dict, prompt, batch_size, indices_to_explain = indices_to_explain, target = target)
245
+
246
+ seq_attr = attr_list[0]
247
+ prompt_len = int(seq_attr.shape[1] - seq_attr.shape[0]) # cols=(P+G), rows=G
248
+
249
+ for i in range(len(attr_list)):
250
+ attr = attr_list[i][:, :prompt_len]
251
+ if testing_dict["attr_func"].lower() in ("ifr_multi_hop_stop_words", "ifr_multi_hop_both") and extra is not None:
252
+ import ft_ifr_improve
253
+
254
+ scores.append(
255
+ ft_ifr_improve.faithfulness_test_skip_tokens(
256
+ llm_evaluator,
257
+ attr,
258
+ prompt,
259
+ generation,
260
+ keep_prompt_token_indices=extra.get("keep_prompt_token_indices") or [],
261
+ user_prompt_indices=extra.get("user_prompt_indices"),
262
+ k=faithfulness_k,
263
+ )
264
+ )
265
+ else:
266
+ scores.append(llm_evaluator.faithfulness_test(attr, prompt, generation, k=faithfulness_k)) # [3 scores]
267
+
268
+ return np.array(scores)
269
+
270
+ def clean_trailing_space(text) -> str:
271
+     if text and text[-1] == ' ':
272
+ return text[:-1]
273
+ else:
274
+ return text
275
+
276
+ def evaluate_attribution(testing_dict) -> None:
277
+ model = testing_dict["model"]
278
+ tokenizer = testing_dict["tokenizer"]
279
+
280
+ llm_evaluator = llm_attr_eval.LLMAttributionEvaluator(model, tokenizer)
281
+
282
+ scores = []
283
+
284
+ description = "Faithfulness " + testing_dict["model_name"] + " " + testing_dict["dataset_name"] + " " + testing_dict["attr_func"]
285
+
286
+ dataset: AttributionDataset = testing_dict["dataset"]
287
+ num_examples = testing_dict["num_examples"]
288
+ total = min(len(dataset), num_examples) if hasattr(dataset, "__len__") else num_examples
289
+ example_iterator = islice(dataset, num_examples)
290
+
291
+ for example in tqdm(example_iterator, desc=description, total=total):
292
+ indices_to_explain = example.indices_to_explain if example.indices_to_explain is not None else [-2]
293
+ scores.append(
294
+ faithfulness_test(
295
+ testing_dict,
296
+ llm_evaluator,
297
+ example.prompt,
298
+ indices_to_explain=indices_to_explain,
299
+ target=example.target,
300
+ )
301
+ )
302
+
303
+ scores = np.array(scores) # [num_examples, num_attrs, 3 scores]
304
+ scores_mean = scores.mean(0) # [num_attrs, 3 scores]
305
+     scores_var = scores.std(0) # standard deviation, [num_attrs, 3 scores]
306
+
307
+ # make the test folder if it doesn't exist
308
+ folder = "./test_results/faithfulness/" + testing_dict["dataset_name"] + "/" + testing_dict["model_name"] + "/"
309
+ if not os.path.exists(folder):
310
+ os.makedirs(folder)
311
+
312
+ # save all data
313
+ file_name = testing_dict["attr_func"] + "_" + str(testing_dict["num_examples"]) + "_examples"
314
+ with open(folder + file_name + ".csv", 'w') as f:
315
+ write = csv.writer(f)
316
+
317
+ write.writerow(["Method", "RISE", "MAS", "RISE + AP"])
318
+
319
+ write.writerow(["Seq Attr Scores Mean"] + scores_mean[0].tolist())
320
+ write.writerow(["Row Attr Scores Mean"] + scores_mean[1].tolist())
321
+ write.writerow(["Recursive Attr Scores Mean"] + scores_mean[2].tolist())
322
+
323
+     write.writerow(["Seq Attr Scores Std"] + scores_var[0].tolist())
324
+     write.writerow(["Row Attr Scores Std"] + scores_var[1].tolist())
325
+     write.writerow(["Recursive Attr Scores Std"] + scores_var[2].tolist())
326
+
327
+ return
328
+
329
+ def load_model(model_name, device) -> Tuple[AutoModelForCausalLM, AutoTokenizer]:
330
+ seed = 42
331
+ random.seed(seed)
332
+ np.random.seed(seed)
333
+ torch.manual_seed(seed)
334
+ torch.cuda.manual_seed(seed)
335
+ torch.cuda.manual_seed_all(seed) # if multi-GPU
336
+
337
+ # Respect three modes:
338
+ # - device == 'auto' -> multi-GPU sharding across all visible devices
339
+ # - device startswith('cuda:IDX') -> place entire model on a single GPU IDX (relative to visible devices)
340
+ # - device == 'cpu' -> CPU
341
+ if device == "auto":
342
+ model = AutoModelForCausalLM.from_pretrained(
343
+ model_name,
344
+ device_map="auto",
345
+ attn_implementation="eager",
346
+ torch_dtype=torch.float16,
347
+ )
348
+ elif isinstance(device, str) and device.startswith("cuda:"):
349
+ try:
350
+ gpu_idx = int(device.split(":")[1])
351
+ except Exception:
352
+ gpu_idx = 0
353
+ model = AutoModelForCausalLM.from_pretrained(
354
+ model_name,
355
+ device_map={"": gpu_idx},
356
+ attn_implementation="eager",
357
+ torch_dtype=torch.float16,
358
+ )
359
+ else:
360
+ model = AutoModelForCausalLM.from_pretrained(
361
+ model_name,
362
+ attn_implementation="eager",
363
+ torch_dtype=torch.float16,
364
+ )
365
+ model.eval()
366
+ tokenizer = AutoTokenizer.from_pretrained(model_name)
367
+
368
+ # Needed for LLaMA tokenizer
369
+ tokenizer.pad_token = tokenizer.eos_token
370
+
371
+ return model, tokenizer
372
+
373
+ def main(args) -> None:
374
+ # login(token = "")
375
+
376
+ # Device selection policy (mirrors attribution_recovery):
377
+ # - If --cuda is a comma-separated list (e.g. "0,1"), set visibility to that list and shard with device_map='auto'.
378
+ # - If --cuda is a single index (e.g. "0"), do NOT override CUDA_VISIBLE_DEVICES; place model on cuda:{index}.
379
+ # - Else (no --cuda), use --cuda_num as single-device index relative to current visibility.
380
+ if args.cuda is not None and isinstance(args.cuda, str) and "," in args.cuda:
381
+ os.environ["CUDA_VISIBLE_DEVICES"] = args.cuda
382
+ device = "auto"
383
+ elif args.cuda is not None and isinstance(args.cuda, str) and args.cuda.strip() != "":
384
+ try:
385
+ idx = int(args.cuda)
386
+ except Exception:
387
+ idx = 0
388
+ device = f"cuda:{idx}" if torch.cuda.is_available() else "cpu"
389
+ else:
390
+ device = f"cuda:{args.cuda_num}" if torch.cuda.is_available() else "cpu"
391
+
392
+ # set up model
393
+ if args.model == "llama-1B":
394
+ model_name = "meta-llama/Llama-3.2-1B-Instruct"
395
+ max_input_len = 5500
396
+ elif args.model == "llama-3B":
397
+ model_name = "meta-llama/Llama-3.2-3B-Instruct"
398
+ max_input_len = 4800
399
+ elif args.model == "llama-8B":
400
+ model_name = "meta-llama/Llama-3.1-8B-Instruct"
401
+ max_input_len = 3500
402
+ elif args.model == "qwen-1.7B":
403
+ model_name = "Qwen/Qwen3-1.7B"
404
+ max_input_len = 5500
405
+ elif args.model == "qwen-4B":
406
+ model_name = "Qwen/Qwen3-4B-Instruct-2507"
407
+ max_input_len = 3500
408
+ elif args.model == "qwen-8B":
409
+ model_name = "Qwen/Qwen3-8B"
410
+ max_input_len = 3000
411
+ elif args.model == "qwen-32B":
412
+ model_name = "Qwen/Qwen3-32B"
413
+ max_input_len = 1500
414
+ elif args.model == "gemma-12B":
415
+ model_name = "gemma/gemma-3-12b-it"
416
+ max_input_len = 1500
417
+ elif args.model == "gemma-27B":
418
+ model_name = "gemma/gemma-3-27b-it"
419
+ max_input_len = 2000
420
+ else:
421
+ model_name = args.model_path if args.model_path is not None else args.model
422
+ max_input_len = 2000
423
+
424
+ model, tokenizer = load_model(model_name if args.model_path is None else args.model_path, device)
425
+
426
+ dataset_registry = {
427
+ "math": lambda: MathAttributionDataset("./data/math_mine.json", tokenizer),
428
+ "facts": lambda: FactsAttributionDataset("./data/10000_facts_9_choose_3.json"),
429
+ "morehopqa": lambda: MoreHopQAAttributionDataset("./data/with_human_verification.json"),
430
+ }
431
+ dataset_loader = dataset_registry.get(args.dataset)
432
+ if dataset_loader is None:
433
+ print("You have not specified an acceptable dataset. Exiting.")
434
+ exit()
435
+ dataset = dataset_loader()
436
+
437
+ testing_dict = {
438
+ "model" : model,
439
+ "model_name": args.model,
440
+ "tokenizer" : tokenizer,
441
+ "dataset" : dataset,
442
+ "dataset_name" : args.dataset,
443
+ "max_input_len": max_input_len,
444
+ "attr_func": args.attr_func,
445
+ "num_examples": args.num_examples,
446
+ "device": device,
447
+ "faithfulness_k": args.faithfulness_k,
448
+ }
449
+
450
+ # call the test function
451
+ evaluate_attribution(testing_dict)
452
+
453
+ return
454
+
455
+ if __name__ == "__main__":
456
+ parser = argparse.ArgumentParser('')
457
+ parser.add_argument('--num_examples',
458
+ type = int, default = 100,
459
+ help='How many dataset examples to test with.')
460
+ parser.add_argument('--model',
461
+ type = str,
462
+ default = "llama",
463
+                         help='Model alias (e.g. llama-1B/3B/8B, qwen-1.7B/4B/8B/32B, gemma-12B/27B) or a raw model id/path.')
464
+ parser.add_argument('--model_path',
465
+ type=str, default=None,
466
+ help='Optional local model path to load (overrides model repo id only).')
467
+ parser.add_argument('--attr_func',
468
+ type = str,
469
+ default = "IG",
470
+                         help="Attribution method to use: IG, attention_I_G, perturbation_all, perturbation_CLP, perturbation_REAGENT, basic, attnlrp, attnlrp_aggregated, attnlrp_aggregated_multi_hop, or an ifr_* variant (see run_attribution).")
473
+ parser.add_argument('--cuda_num',
474
+ type=int, default = 0,
475
+ help='The number of the GPU you want to use.')
476
+ parser.add_argument('--cuda',
477
+ type=str, default=None,
478
+ help='GPU selection: use comma-separated ids for multi-GPU sharding (e.g. "0,1"); use a single index for one GPU relative to current CUDA_VISIBLE_DEVICES (e.g. "0").')
479
+ parser.add_argument('--dataset',
480
+ type = str, default = "math",
481
+ help = 'The dataset to evaluate on: math, facts, or morehopqa')
482
+ parser.add_argument(
483
+ "--faithfulness_k",
484
+ type=int,
485
+ default=20,
486
+ help="Total perturbation steps k for MAS/RISE (each step perturbs ~1/k of prompt tokens).",
487
+ )
488
+
489
+ args, unparsed = parser.parse_known_args()
490
+
491
+ main(args)
evaluations/faithfulness.sh ADDED
@@ -0,0 +1,80 @@
1
+ # python3 faithfulness.py --model llama-3B --cuda_num 0 --num_examples 500 --attr_func IG --dataset facts
2
+ # python3 faithfulness.py --model llama-8B --cuda_num 0 --num_examples 500 --attr_func IG --dataset facts
3
+ # python3 faithfulness.py --model qwen-4B --cuda_num 0 --num_examples 500 --attr_func IG --dataset facts
4
+ # python3 faithfulness.py --model qwen-8B --cuda_num 0 --num_examples 500 --attr_func IG --dataset facts
5
+
6
+ # python3 faithfulness.py --model llama-3B --cuda_num 0 --num_examples 500 --attr_func attention_I_G --dataset facts
7
+ # python3 faithfulness.py --model llama-8B --cuda_num 0 --num_examples 500 --attr_func attention_I_G --dataset facts
8
+ # python3 faithfulness.py --model qwen-4B --cuda_num 0 --num_examples 500 --attr_func attention_I_G --dataset facts
9
+ # python3 faithfulness.py --model qwen-8B --cuda_num 0 --num_examples 500 --attr_func attention_I_G --dataset facts
10
+
11
+ # python3 faithfulness.py --model llama-3B --cuda_num 0 --num_examples 500 --attr_func perturbation_CLP --dataset facts
12
+ # python3 faithfulness.py --model llama-8B --cuda_num 0 --num_examples 500 --attr_func perturbation_CLP --dataset facts
13
+ # python3 faithfulness.py --model qwen-4B --cuda_num 0 --num_examples 500 --attr_func perturbation_CLP --dataset facts
14
+ # python3 faithfulness.py --model qwen-8B --cuda_num 0 --num_examples 500 --attr_func perturbation_CLP --dataset facts
15
+
16
+ # python3 faithfulness.py --model llama-3B --cuda_num 0 --num_examples 500 --attr_func perturbation_REAGENT --dataset facts
17
+ # python3 faithfulness.py --model llama-8B --cuda_num 0 --num_examples 500 --attr_func perturbation_REAGENT --dataset facts
18
+ # python3 faithfulness.py --model qwen-4B --cuda_num 0 --num_examples 500 --attr_func perturbation_REAGENT --dataset facts
19
+ # python3 faithfulness.py --model qwen-8B --cuda_num 0 --num_examples 500 --attr_func perturbation_REAGENT --dataset facts
20
+
21
+ # python3 faithfulness.py --model llama-3B --cuda_num 0 --num_examples 500 --attr_func perturbation_all --dataset facts
22
+ # python3 faithfulness.py --model llama-8B --cuda_num 0 --num_examples 500 --attr_func perturbation_all --dataset facts
23
+ # python3 faithfulness.py --model qwen-4B --cuda_num 0 --num_examples 500 --attr_func perturbation_all --dataset facts
24
+ # python3 faithfulness.py --model qwen-8B --cuda_num 0 --num_examples 500 --attr_func perturbation_all --dataset facts
25
+
26
+
27
+
28
+ # python3 faithfulness.py --model llama-3B --cuda_num 0 --num_examples 500 --attr_func IG --dataset math
29
+ # python3 faithfulness.py --model llama-8B --cuda_num 0 --num_examples 500 --attr_func IG --dataset math
30
+ # python3 faithfulness.py --model qwen-4B --cuda_num 0 --num_examples 500 --attr_func IG --dataset math
31
+ # python3 faithfulness.py --model qwen-8B --cuda_num 0 --num_examples 500 --attr_func IG --dataset math
32
+
33
+ # python3 faithfulness.py --model llama-3B --cuda_num 0 --num_examples 500 --attr_func attention_I_G --dataset math
34
+ # python3 faithfulness.py --model llama-8B --cuda_num 0 --num_examples 500 --attr_func attention_I_G --dataset math
35
+ # python3 faithfulness.py --model qwen-4B --cuda_num 0 --num_examples 500 --attr_func attention_I_G --dataset math
36
+ # python3 faithfulness.py --model qwen-8B --cuda_num 0 --num_examples 500 --attr_func attention_I_G --dataset math
37
+
38
+ # python3 faithfulness.py --model llama-3B --cuda_num 0 --num_examples 500 --attr_func perturbation_CLP --dataset math
39
+ # python3 faithfulness.py --model llama-8B --cuda_num 0 --num_examples 500 --attr_func perturbation_CLP --dataset math
40
+ # python3 faithfulness.py --model qwen-4B --cuda_num 0 --num_examples 500 --attr_func perturbation_CLP --dataset math
41
+ # python3 faithfulness.py --model qwen-8B --cuda_num 0 --num_examples 500 --attr_func perturbation_CLP --dataset math
42
+
43
+ # python3 faithfulness.py --model llama-3B --cuda_num 0 --num_examples 500 --attr_func perturbation_REAGENT --dataset math
44
+ # python3 faithfulness.py --model llama-8B --cuda_num 0 --num_examples 500 --attr_func perturbation_REAGENT --dataset math
45
+ # python3 faithfulness.py --model qwen-4B --cuda_num 0 --num_examples 500 --attr_func perturbation_REAGENT --dataset math
46
+ # python3 faithfulness.py --model qwen-8B --cuda_num 0 --num_examples 500 --attr_func perturbation_REAGENT --dataset math
47
+
48
+ # python3 faithfulness.py --model llama-3B --cuda_num 0 --num_examples 500 --attr_func perturbation_all --dataset math
49
+ # python3 faithfulness.py --model llama-8B --cuda_num 0 --num_examples 500 --attr_func perturbation_all --dataset math
50
+ # python3 faithfulness.py --model qwen-4B --cuda_num 0 --num_examples 500 --attr_func perturbation_all --dataset math
51
+ # python3 faithfulness.py --model qwen-8B --cuda_num 0 --num_examples 500 --attr_func perturbation_all --dataset math
52
+
53
+
54
+
55
+ # python3 faithfulness.py --model llama-3B --cuda_num 0 --num_examples 500 --attr_func IG --dataset morehopqa
56
+ # python3 faithfulness.py --model llama-8B --cuda_num 0 --num_examples 500 --attr_func IG --dataset morehopqa
57
+ # python3 faithfulness.py --model qwen-4B --cuda_num 0 --num_examples 500 --attr_func IG --dataset morehopqa
58
+ # python3 faithfulness.py --model qwen-8B --cuda_num 0 --num_examples 500 --attr_func IG --dataset morehopqa
59
+
60
+ # python3 faithfulness.py --model llama-3B --cuda_num 0 --num_examples 500 --attr_func attention_I_G --dataset morehopqa
61
+ # python3 faithfulness.py --model llama-8B --cuda_num 0 --num_examples 500 --attr_func attention_I_G --dataset morehopqa
62
+ # python3 faithfulness.py --model qwen-4B --cuda_num 0 --num_examples 500 --attr_func attention_I_G --dataset morehopqa
63
+ # python3 faithfulness.py --model qwen-8B --cuda_num 0 --num_examples 500 --attr_func attention_I_G --dataset morehopqa
64
+
65
+ # python3 faithfulness.py --model llama-3B --cuda_num 0 --num_examples 500 --attr_func perturbation_CLP --dataset morehopqa
66
+ # python3 faithfulness.py --model llama-8B --cuda_num 0 --num_examples 500 --attr_func perturbation_CLP --dataset morehopqa
67
+ # python3 faithfulness.py --model qwen-4B --cuda_num 0 --num_examples 500 --attr_func perturbation_CLP --dataset morehopqa
68
+ # python3 faithfulness.py --model qwen-8B --cuda_num 0 --num_examples 500 --attr_func perturbation_CLP --dataset morehopqa
69
+
70
+ # python3 faithfulness.py --model llama-3B --cuda_num 0 --num_examples 500 --attr_func perturbation_REAGENT --dataset morehopqa
71
+ # python3 faithfulness.py --model llama-8B --cuda_num 0 --num_examples 500 --attr_func perturbation_REAGENT --dataset morehopqa
72
+ # python3 faithfulness.py --model qwen-4B --cuda_num 0 --num_examples 500 --attr_func perturbation_REAGENT --dataset morehopqa
73
+ # python3 faithfulness.py --model qwen-8B --cuda_num 0 --num_examples 500 --attr_func perturbation_REAGENT --dataset morehopqa
74
+
75
+ # python3 faithfulness.py --model llama-3B --cuda_num 0 --num_examples 500 --attr_func perturbation_all --dataset morehopqa
76
+ # python3 faithfulness.py --model llama-8B --cuda_num 0 --num_examples 500 --attr_func perturbation_all --dataset morehopqa
77
+ # python3 faithfulness.py --model qwen-4B --cuda_num 0 --num_examples 500 --attr_func perturbation_all --dataset morehopqa
78
+ # python3 faithfulness.py --model qwen-8B --cuda_num 0 --num_examples 500 --attr_func perturbation_all --dataset morehopqa
79
+
80
+ CUDA_VISIBLE_DEVICES=4,6 python3 evaluations/faithfulness.py --model qwen-8B --model_path /opt/share/models/Qwen/Qwen3-8B/ --cuda '0,1' --num_examples 50 --attr_func IG --dataset math
example.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
examples/quickstart.py ADDED
@@ -0,0 +1,44 @@
1
+ from __future__ import annotations
2
+
3
+ import argparse
4
+
5
+ from flashtrace import FlashTrace, load_model_and_tokenizer
6
+
7
+
8
+ def build_parser() -> argparse.ArgumentParser:
9
+ parser = argparse.ArgumentParser(description="FlashTrace quickstart example.")
10
+ parser.add_argument("--model", required=True, help="Hugging Face model id or local model path.")
11
+ parser.add_argument("--prompt", required=True, help="Prompt text.")
12
+ parser.add_argument("--target", help="Target response text.")
13
+ parser.add_argument("--output-span", default=None, help="Inclusive generation-token span START:END.")
14
+ parser.add_argument("--reasoning-span", default=None, help="Inclusive generation-token span START:END.")
15
+ parser.add_argument("--html", default="trace.html", help="Output HTML path.")
16
+ parser.add_argument("--use-chat-template", action="store_true", help="Format prompts with the tokenizer chat template.")
17
+ return parser
18
+
19
+
20
+ def parse_span(value: str | None) -> tuple[int, int] | None:
21
+ from flashtrace.cli import parse_span as parse_cli_span
22
+
23
+ return parse_cli_span(value)
24
+
25
+
26
+ def main() -> int:
27
+ args = build_parser().parse_args()
28
+ model, tokenizer = load_model_and_tokenizer(args.model)
29
+ tracer = FlashTrace(model, tokenizer, use_chat_template=args.use_chat_template)
30
+ trace = tracer.trace(
31
+ prompt=args.prompt,
32
+ target=args.target,
33
+ output_span=parse_span(args.output_span),
34
+ reasoning_span=parse_span(args.reasoning_span),
35
+ )
36
+ for item in trace.topk_inputs(10):
37
+ print(f"{item.index}\t{item.score:.6f}\t{item.token!r}")
38
+ trace.to_html(args.html)
39
+ print(f"wrote {args.html}")
40
+ return 0
41
+
42
+
43
+ if __name__ == "__main__":
44
+ raise SystemExit(main())
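+
+ # Example invocation (hypothetical model id, prompt, and span; the flags are defined in build_parser above):
+ #   python examples/quickstart.py --model Qwen/Qwen3-8B \
+ #       --prompt "Who wrote Hamlet?" --target "Shakespeare wrote Hamlet." \
+ #       --output-span 0:4 --html trace.html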
exp/case_study/README.md ADDED
@@ -0,0 +1,152 @@
1
+ # FT multi-hop case study & standard IFR visualization (exp/case_study)
+
+ This directory provides a lightweight single-example IFR visualization pipeline that does not touch the core evaluation code.
+
+ ## Features
+ - Reads a single example (default `exp/exp2/data/morehopqa.jsonl`, index 0).
+ - Supports several modes:
+   - `ft`: the multi-hop FT attribution currently in use (internally calls `LLMIFRAttribution.calculate_ifr_multi_hop`).
+   - `ifr`: standard IFR (single hop); by default computes **aggregated IFR** over the specified sink span (only one panel is shown).
+   - `ifr_all_positions_output_only`: computes the token-level IFR matrix only for output tokens inside `sink_span`, then derives the Row / Recursive (CAGE) panels from that matrix.
+   - `attnlrp`: AttnLRP hop0 (reuses the FT-AttnLRP span-aggregate logic, equivalent to `LLMLRPAttribution.calculate_attnlrp_multi_hop(n_hops=0)`; visualizes `raw_attributions[0].token_importance_total`).
+   - `ft_attnlrp`: FT-AttnLRP (strictly reuses `LLMLRPAttribution.calculate_attnlrp_aggregated_multi_hop`, consistent with `exp/exp2/`; directly visualizes each hop's `token_importance_total`).
+ - Renders two views:
+   - **Full token-level (pre-truncation)**: heatmap over the complete chat-templated sequence (template + user prompt + generation).
+   - **Prompt-only token-level**: heatmap over the user prompt tokens only (generation tokens excluded).
+ - Heatmaps are colored by `|score|` (sign is ignored); within each panel, the full and prompt views are each normalized independently by the p99.5 of `|score|` (see the sketch after this list).
+ - Outputs JSON (full numeric values) and HTML (per-hop heatmaps).
+ - Additionally provides a MAS (faithfulness / token perturbation) visualization: runs a token-level perturbation evaluation for the chosen attribution method and renders a perturbation-impact heatmap plus the MAS scores.
19
+
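+
+ A minimal sketch of the per-panel color normalization described above (the actual rendering lives in `viz.py`; the function name below is only illustrative):
+
+ ```python
+ import numpy as np
+
+ def heatmap_intensity(scores) -> np.ndarray:
+     """Color by |score|, normalized by this panel's own p99.5 of |score|."""
+     mag = np.abs(np.asarray(scores, dtype=np.float64))
+     if mag.size == 0:
+         return mag
+     denom = max(float(np.percentile(mag, 99.5)), 1e-12)  # guard against an all-zero panel
+     return np.clip(mag / denom, 0.0, 1.0)
+ ```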
20
+ ## Quick start
21
+ ```bash
22
+ # Adjust model/model_path to your local model
23
+ # Multi-hop FT (default); also: ft_split_hop, ft_improve
24
+ python exp/case_study/run_ifr_case.py \
25
+ --mode ft_split_hop \
26
+ --dataset exp/exp2/data/morehopqa.jsonl \
27
+ --index 0 \
28
+ --model qwen-8B \
29
+ --model_path /opt/share/models/Qwen/Qwen3-8B/ \
30
+ --cuda 0 \
31
+ --n_hops 3
32
+
33
+ # Standard IFR (single hop; a sink span can be specified)
34
+ python exp/case_study/run_ifr_case.py \
35
+ --mode ifr \
36
+ --dataset exp/exp2/data/morehopqa.jsonl \
37
+ --index 0 \
38
+ --model qwen-8B \
39
+ --model_path /opt/share/models/Qwen/Qwen3-8B/ \
40
+ --cuda 0 \
41
+ --sink_span 0 0
42
+
43
+ # IFR output-only: compute the IFR matrix only over the output range and produce the Row/Recursive (CAGE) panels
44
+ python exp/case_study/run_ifr_case.py \
45
+ --mode ifr_all_positions_output_only \
46
+ --dataset exp/exp2/data/short-morehopqa.jsonl \
47
+ --index 0 \
48
+ --model qwen-8B \
49
+ --model_path /opt/share/models/Qwen/Qwen3-8B/ \
50
+ --cuda 0
51
+
52
+ # AttnLRP hop0 (reuses the FT-AttnLRP span-aggregate; visualizes the hop0 raw vector)
53
+ python exp/case_study/run_ifr_case.py \
54
+ --mode attnlrp \
55
+ --dataset exp/exp2/data/morehopqa.jsonl \
56
+ --index 0 \
57
+ --model qwen-8B \
58
+ --model_path /opt/share/models/Qwen/Qwen3-8B/ \
59
+ --cuda 0 \
60
+ --sink_span 0 20
61
+
62
+ # FT-AttnLRP (multi-hop recursive AttnLRP)
63
+ python exp/case_study/run_ifr_case.py \
64
+ --mode ft_attnlrp \
65
+ --dataset exp/exp2/data/morehopqa.jsonl \
66
+ --index 0 \
67
+ --model qwen-8B \
68
+ --model_path /opt/share/models/Qwen/Qwen3-8B/ \
69
+ --cuda 0,2,3,4,5,7 \
70
+ --n_hops 3 \
71
+ --attnlrp_neg_handling abs \
72
+ --attnlrp_norm_mode norm
73
+ ```
74
+
75
+ Outputs are written to `exp/case_study/out/`; the file-name prefix depends on the mode, e.g.:
76
+ - `ft_case_<dataset>_idx<idx>.json/html`
77
+ - `ifr_case_<dataset>_idx<idx>.json/html`
78
+ - `ifr_output_only_case_<dataset>_idx<idx>.json/html`
79
+ - `attnlrp_case_<dataset>_idx<idx>.json/html`
80
+ - `ft_attnlrp_case_<dataset>_idx<idx>.json/html`
81
+
82
+ ## MAS(Faithfulness / Token Perturbation)可视化
83
+
84
+ > 说明:这里的 MAS 与项目 `llm_attr_eval.LLMAttributionEvaluator.faithfulness_test()` 保持一致:
85
+ > 1) 先对样本跑指定方法的归因,并取 token-level attribution(Seq / Row / Recursive)。
86
+ > 2) 按 prompt token 的重要性排序,逐步将 token id 替换为 `tokenizer.pad_token_id`(token 级扰动)。
87
+ > 3) 用 `sum log p(generation + EOS | prompt)` 得到分数曲线,计算 RISE / MAS / RISE+AP。
88
+ > 4) 可视化时用“每一步扰动带来的边际 logprob 变化”作为 token 分数,渲染为 token spans 的“扰动影响热力图”。
89
+
90
+ ```bash
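+
+ A minimal sketch of steps 2) to 4) above (helper names follow `exp/case_study/faithfulness_trace.py`; the real implementation additionally maps user-prompt token indices and computes the RISE / MAS / RISE+AP aggregates):
+
+ ```python
+ import torch
+
+ @torch.inference_mode()
+ def mas_marginal_deltas(evaluator, prompt_ids, generation_ids, sorted_prompt_positions, k=20):
+     """Progressively pad the highest-attributed prompt tokens and record the marginal
+     change in sum log p(generation + EOS | prompt) after each perturbation step."""
+     ids = prompt_ids.clone()
+     curve = [evaluator.compute_logprob_response_given_prompt(ids, generation_ids).sum().item()]
+     chunk = max(1, len(sorted_prompt_positions) // k)
+     for start in range(0, len(sorted_prompt_positions), chunk):
+         cols = list(sorted_prompt_positions[start:start + chunk])
+         ids[0, cols] = evaluator.tokenizer.pad_token_id  # token-level perturbation, cumulative across steps
+         curve.append(evaluator.compute_logprob_response_given_prompt(ids, generation_ids).sum().item())
+     # Marginal log-prob change per step; these deltas drive the perturbation-impact heatmap.
+     return [b - a for a, b in zip(curve[:-1], curve[1:])]
+ ```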
91
+ # FT-IFR(ifr_multi_hop;默认 --method ft)
92
+ python exp/case_study/run_mas_case.py \
93
+ --dataset exp/exp2/data/short-morehopqa.jsonl \
94
+ --index 0 \
95
+ --model qwen-8B \
96
+ --model_path /opt/share/models/Qwen/Qwen3-8B/ \
97
+ --cuda 0 \
98
+ --method ft \
99
+ --n_hops 3
100
+ ```
101
+
102
+ Common method choices (aligned with the mode names in `run_ifr_case.py`):
103
+ ```bash
104
+ # IFR (requires sink_span; cached dataset fields are preferred by default)
105
+ python exp/case_study/run_mas_case.py --method ifr --sink_span 0 20 ...
106
+
107
+ # IFR output-only (computes the token-level IFR matrix only for output tokens inside sink_span)
108
+ python exp/case_study/run_mas_case.py --method ifr_all_positions_output_only --sink_span 0 20 ...
109
+
110
+ # FT-IFR (ifr_multi_hop)
111
+ python exp/case_study/run_mas_case.py --method ft --n_hops 1 --sink_span 0 20 --thinking_span 0 20 ...
112
+
113
+ # AttnLRP hop0 (reuses FT-AttnLRP hop0; indices_to_explain/sink_span are still needed to extract Seq/Row/Rec)
114
+ python exp/case_study/run_mas_case.py --method attnlrp --sink_span 0 20 ...
115
+
116
+ # FT-AttnLRP (attnlrp_aggregated_multi_hop)
117
+ python exp/case_study/run_mas_case.py --method ft_attnlrp --n_hops 1 --sink_span 0 20 --thinking_span 0 20 ...
118
+ ```
119
+
120
+ Outputs are written to `exp/case_study/out/` with the file-name prefix:
121
+ - `mas_case_<method>_<dataset>_idx<idx>.json/html`
122
+
123
+ By default the HTML contains 3 attribution-view panels (Seq / Row / Recursive); each panel has 2 rows of token-level heatmaps:
+ - **Method attribution (token weights)**: the method's token attribution weights (used for ordering/density).
+ - **Attribution-guided MAS marginal (path deltas)**: the marginal impact of progressively replacing tokens in attribution order (this is the perturbation path actually used in the evaluation).
126
+
127
+ ## Viewing the HTML in a browser
+ 1) Run one of the commands above to generate the `.html` file (the terminal prints a line like `wrote exp/case_study/out/...html`).
+
+ 2) From the repository root, start a static file server (any port, e.g. 8888):
131
+ ```bash
132
+ python -m http.server 8888 --directory exp/case_study/out
133
+ ```
134
+
135
+ 3) Open it in a browser (note: `http://`, not `https://`):
+ - Local machine: `http://127.0.0.1:8888/<your-html-file>`
+ - Remote machine (port forwarding recommended): run `ssh -L 8888:127.0.0.1:8888 <user>@<server>` locally, then open `http://127.0.0.1:8888/<your-html-file>` in your local browser.
+
+ If the `http.server` log shows many `400 Bad request version` entries with garbled bytes, a client is usually hitting the HTTP port over HTTPS; make sure the address bar says `http://...`.
140
+
141
+ ## Optional arguments
+ - `--sink_span a b` / `--thinking_span a b`: override the generation-side sink/thinking sentence spans (cached fields are used by default).
+ - `--attnlrp_neg_handling drop|abs`: per-hop handling of negative values in FT-AttnLRP (drop = clamp to >= 0, abs = take absolute values).
+ - `--attnlrp_norm_mode norm|no_norm`: FT-AttnLRP normalization and hop-ratio switches (norm = global + thinking normalization with the ratio enabled; no_norm = all three disabled).
+ - `--chunk_tokens` / `--sink_chunk_tokens`: IFR chunking parameters.
+ - `--output_dir`: change the output directory.
147
+
148
+ ## Files
+ - `run_ifr_case.py`: CLI entry point and output writer (supports the `ft`/`ifr`/`ifr_all_positions_output_only`/`attnlrp`/`ft_attnlrp` modes).
+ - `run_mas_case.py`: MAS (faithfulness / token perturbation) visualization entry point and output writer (supports `ifr`/`ifr_all_positions_output_only`/`ft`/`attnlrp`/`ft_attnlrp`).
+ - `analysis.py`: per-hop sanitization and packaging (token-level).
+ - `viz.py`: HTML rendering and heatmaps.
exp/case_study/analysis.py ADDED
@@ -0,0 +1,74 @@
1
+ """Helpers for IFR case studies (hop-wise aggregation + sanitization).
2
+
3
+ All utilities stay local to exp/case_study to avoid touching core eval code.
4
+ """
5
+
6
+ from __future__ import annotations
7
+
8
+ from typing import Any, Dict, Iterable, List, Optional, Sequence
9
+
10
+ import torch
11
+
12
+
13
+ def vector_stats(vec: torch.Tensor) -> Dict[str, float]:
14
+ if vec.numel() == 0:
15
+ return {"min": 0.0, "max": 0.0, "abs_max": 0.0, "mean": 0.0, "sum": 0.0}
16
+ v = vec.detach().to(dtype=torch.float32)
17
+ return {
18
+ "min": float(v.min().item()),
19
+ "max": float(v.max().item()),
20
+ "abs_max": float(v.abs().max().item()),
21
+ "mean": float(v.mean().item()),
22
+ "sum": float(v.sum().item()),
23
+ }
24
+
25
+
26
+ def tensor_to_list(x: Any) -> Any:
27
+ if torch.is_tensor(x):
28
+ return x.detach().cpu().tolist()
29
+ if isinstance(x, list):
30
+ return [tensor_to_list(v) for v in x]
31
+ if isinstance(x, dict):
32
+ return {k: tensor_to_list(v) for k, v in x.items()}
33
+ return x
34
+
35
+
36
+ def sanitize_ifr_meta(meta: Optional[Dict[str, Any]]) -> Optional[Dict[str, Any]]:
37
+ """Drop bulky raw objects and convert tensors to Python lists for JSON."""
38
+
39
+ if meta is None:
40
+ return None
41
+
42
+ cleaned: Dict[str, Any] = {}
43
+ for key, value in meta.items():
44
+ if key == "raw":
45
+ continue
46
+ cleaned[key] = tensor_to_list(value)
47
+ return cleaned
48
+
49
+
50
+ def package_token_hops(
51
+ hop_vectors: Iterable[Sequence[float]],
52
+ ) -> List[Dict[str, Any]]:
53
+ """Package per-hop token vectors without sentence aggregation.
54
+
55
+ hop_vectors are assumed to already match the experiment's configured
56
+ postprocessing (e.g., FT-AttnLRP neg_handling/norm_mode).
57
+ """
58
+
59
+ packaged: List[Dict[str, Any]] = []
60
+ for hop_idx, vec in enumerate(hop_vectors):
61
+ vec_tensor = torch.nan_to_num(torch.as_tensor(vec, dtype=torch.float32), nan=0.0)
62
+ token_scores = vec_tensor.tolist()
63
+ token_max = float(vec_tensor.abs().max().item()) if vec_tensor.numel() > 0 else 0.0
64
+ total = float(vec_tensor.sum().item())
65
+ packaged.append(
66
+ {
67
+ "hop": hop_idx,
68
+ "token_scores": token_scores,
69
+ "token_score_max": token_max,
70
+ "token_stats": vector_stats(vec_tensor),
71
+ "total_mass": total,
72
+ }
73
+ )
74
+ return packaged
exp/case_study/faithfulness_trace.py ADDED
@@ -0,0 +1,183 @@
1
+ """Faithfulness (MAS/RISE) trace utilities for exp/case_study.
2
+
3
+ This module is intentionally aligned with `llm_attr_eval.LLMAttributionEvaluator.faithfulness_test`,
4
+ but additionally returns the full trace arrays needed for visualization and supports providing
5
+ `user_prompt_indices` to avoid fragile subsequence matching.
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ from typing import Any, Dict, Optional, Sequence, List
11
+
12
+ import numpy as np
13
+ import torch
14
+
15
+ import llm_attr_eval
16
+
17
+
18
+ def _auc(arr: np.ndarray) -> float:
19
+ return float((arr.sum() - arr[0] / 2 - arr[-1] / 2) / max(1, (arr.shape[0] - 1)))
20
+
21
+
22
+ @torch.inference_mode()
23
+ def mas_trace(
24
+ llm_evaluator: llm_attr_eval.LLMAttributionEvaluator,
25
+ *,
26
+ attribution: torch.Tensor,
27
+ prompt: str,
28
+ generation: str,
29
+ user_prompt_indices: Optional[Sequence[int]] = None,
30
+ k: int = 20,
31
+ ) -> Dict[str, Any]:
32
+ """Return a token-level faithfulness trace (RISE/MAS/RISE+AP) plus per-token deltas.
33
+
34
+ attribution: [R, P] token attribution on prompt-side tokens only.
35
+ prompt: raw prompt string.
36
+ generation: target generation string; scored as generation + eos (if defined).
37
+ user_prompt_indices: optional absolute positions of each prompt token inside formatted prompt ids.
38
+ k: number of perturbation steps; each step perturbs ~1/k of prompt tokens.
39
+ """
40
+
41
+ if attribution.ndim != 2:
42
+ raise ValueError("Expected 2D prompt-side attribution matrix [R, P].")
43
+
44
+ pad_token_id = llm_evaluator._ensure_pad_token_id()
45
+
46
+ user_prompt = " " + prompt
47
+ formatted_prompt = llm_evaluator.format_prompt(user_prompt)
48
+ formatted_ids = llm_evaluator.tokenizer(formatted_prompt, return_tensors="pt", add_special_tokens=False).input_ids
49
+
50
+ prompt_ids = formatted_ids.to(llm_evaluator.device)
51
+ prompt_ids_perturbed = prompt_ids.clone()
52
+
53
+ eos = llm_evaluator.tokenizer.eos_token or ""
54
+ generation_ids = llm_evaluator.tokenizer(
55
+ generation + eos,
56
+ return_tensors="pt",
57
+ add_special_tokens=False,
58
+ ).input_ids.to(llm_evaluator.device)
59
+
60
+ attr_cpu = attribution.detach().cpu()
61
+ w = attr_cpu.sum(0)
62
+ sorted_attr_indices = torch.argsort(w, descending=True)
63
+ attr_sum = float(w.sum().item())
64
+
65
+ P = int(w.numel())
66
+
67
+ prompt_positions: List[int]
68
+ if user_prompt_indices is not None:
69
+ prompt_positions = [int(x) for x in user_prompt_indices]
70
+ if len(prompt_positions) != P:
71
+ raise ValueError(
72
+ "user_prompt_indices length does not match prompt-side attribution length: "
73
+ f"indices P={len(prompt_positions)}, attr P={P}."
74
+ )
75
+ if P and max(prompt_positions) >= int(prompt_ids_perturbed.shape[1]):
76
+ raise ValueError("user_prompt_indices contains an out-of-bounds index for formatted prompt ids.")
77
+ else:
78
+ user_ids = llm_evaluator.tokenizer(user_prompt, return_tensors="pt", add_special_tokens=False).input_ids
79
+ user_start = llm_evaluator._find_subsequence_start(formatted_ids[0], user_ids[0])
80
+ if user_start is None:
81
+ raise RuntimeError("Failed to locate user prompt token span inside formatted chat prompt.")
82
+ if int(user_ids.shape[1]) != P:
83
+ raise ValueError(
84
+ "Prompt-side attribution length does not match tokenized user prompt length: "
85
+ f"attr P={P}, user_prompt P={int(user_ids.shape[1])}."
86
+ )
87
+ prompt_positions = [int(user_start) + j for j in range(P)]
88
+
89
+ if P > 0:
90
+ steps = int(k) if k is not None else 0
91
+ if steps <= 0:
92
+ steps = 1
93
+ steps = min(steps, P)
94
+ else:
95
+ steps = 0
96
+
97
+ scores = np.zeros(steps + 1, dtype=np.float64)
98
+ density = np.zeros(steps + 1, dtype=np.float64)
99
+
100
+ scores[0] = (
101
+ llm_evaluator.compute_logprob_response_given_prompt(prompt_ids_perturbed, generation_ids).sum().cpu().detach().item()
102
+ )
103
+ density[0] = 1.0
104
+
105
+ if P == 0:
106
+ return {
107
+ "num_tokens": 0,
108
+ "sorted_attr_indices": [],
109
+ "scores_raw": scores.tolist(),
110
+ "density": density.tolist(),
111
+ "normalized_model_response": [1.0],
112
+ "alignment_penalty": [0.0],
113
+ "corrected_scores": [1.0],
114
+ "token_deltas_raw": [],
115
+ "attr_weights": [],
116
+ "metrics": {"RISE": 0.0, "MAS": 0.0, "RISE+AP": 0.0},
117
+ }
118
+
119
+ if attr_sum <= 0:
120
+ density = np.linspace(1.0, 0.0, steps + 1)
121
+
122
+ per_token_delta = np.zeros(P, dtype=np.float64)
123
+
124
+ base = P // steps
125
+ remainder = P % steps
126
+ start = 0
127
+ for step in range(steps):
128
+ size = base + (1 if step < remainder else 0)
129
+ group = sorted_attr_indices[start : start + size]
130
+ start += size
131
+
132
+ for idx_t in group:
133
+ idx = int(idx_t.item())
134
+ abs_pos = int(prompt_positions[idx])
135
+ prompt_ids_perturbed[0, abs_pos] = pad_token_id
136
+ scores[step + 1] = (
137
+ llm_evaluator.compute_logprob_response_given_prompt(prompt_ids_perturbed, generation_ids).sum().cpu().detach().item()
138
+ )
139
+ if attr_sum > 0:
140
+ dec = float(w.index_select(0, group).sum().item()) / attr_sum
141
+ density[step + 1] = density[step] - dec
142
+
143
+ delta = scores[step] - scores[step + 1]
144
+ for idx_t in group:
145
+ idx = int(idx_t.item())
146
+ per_token_delta[idx] = delta
147
+
148
+ min_normalized_pred = 1.0
149
+ normalized_model_response = scores.copy()
150
+ for i in range(len(scores)):
151
+ normalized_pred = (normalized_model_response[i] - scores[-1]) / (abs(scores[0] - scores[-1]))
152
+ normalized_pred = np.clip(normalized_pred, 0.0, 1.0)
153
+ min_normalized_pred = min(min_normalized_pred, normalized_pred)
154
+ normalized_model_response[i] = min_normalized_pred
155
+
156
+ alignment_penalty = np.abs(normalized_model_response - density)
157
+ corrected_scores = normalized_model_response + alignment_penalty
158
+ corrected_scores = corrected_scores.clip(0.0, 1.0)
159
+ corrected_scores = (corrected_scores - np.min(corrected_scores)) / (np.max(corrected_scores) - np.min(corrected_scores))
160
+ if np.isnan(corrected_scores).any():
161
+ corrected_scores = np.linspace(1.0, 0.0, len(scores))
162
+
163
+ rise = _auc(normalized_model_response)
164
+ mas = _auc(corrected_scores)
165
+ rise_ap = _auc(normalized_model_response + alignment_penalty)
166
+
167
+ if attr_sum > 0:
168
+ attr_weights = (w.numpy() / (attr_sum + 1e-12)).astype(np.float64)
169
+ else:
170
+ attr_weights = np.zeros(P, dtype=np.float64)
171
+
172
+ return {
173
+ "num_tokens": P,
174
+ "sorted_attr_indices": [int(i.item()) for i in sorted_attr_indices],
175
+ "scores_raw": scores.tolist(),
176
+ "density": density.tolist(),
177
+ "normalized_model_response": normalized_model_response.tolist(),
178
+ "alignment_penalty": alignment_penalty.tolist(),
179
+ "corrected_scores": corrected_scores.tolist(),
180
+ "token_deltas_raw": per_token_delta.tolist(),
181
+ "attr_weights": attr_weights.tolist(),
182
+ "metrics": {"RISE": rise, "MAS": mas, "RISE+AP": rise_ap},
183
+ }
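The RISE/MAS/RISE+AP numbers come out of the post-processing at the end of ``mas_trace``. A self-contained sketch of that math on made-up score/density arrays (no model or evaluator needed; ``np.minimum.accumulate`` stands in for the running-min loop above):

import numpy as np

def auc(arr: np.ndarray) -> float:
    # Trapezoidal mean over unit-spaced points, same as _auc above.
    return float((arr.sum() - arr[0] / 2 - arr[-1] / 2) / max(1, (arr.shape[0] - 1)))

scores = np.array([-2.0, -3.5, -6.0, -9.0])   # generation log-prob after each perturbation step
density = np.array([1.0, 0.55, 0.2, 0.0])     # remaining attribution mass after each step

norm = np.clip((scores - scores[-1]) / abs(scores[0] - scores[-1]), 0.0, 1.0)
norm = np.minimum.accumulate(norm)            # enforce a monotone non-increasing response
penalty = np.abs(norm - density)
corrected = np.clip(norm + penalty, 0.0, 1.0)
corrected = (corrected - corrected.min()) / (corrected.max() - corrected.min())

print({"RISE": auc(norm), "MAS": auc(corrected), "RISE+AP": auc(norm + penalty)})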
exp/case_study/run_ifr_case.py ADDED
@@ -0,0 +1,1225 @@
1
+ #!/usr/bin/env python3
2
+ """Case study runner for FlashTrace and attribution baselines.
3
+
4
+ Modes supported (all emit JSON + HTML under ``exp/case_study/out``):
5
+
6
+ - ``ft``: FlashTrace (current project implementation; multi-hop IFR)
+ - ``ft_improve``: FlashTrace with stop-token soft deletion (experimental multi-hop IFR)
+ - ``ft_split_hop``: FlashTrace with split-hop IFR over the segmented thinking span (experimental)
7
+ - ``ifr_in_all_gen``: Experimental multi-hop IFR variant (hops over CoT+output; scheme B, aligns with exp/exp2)
8
+ - ``ifr``: IFR span-aggregate visualization (single hop; one panel)
9
+ - ``ifr_all_positions``: IFR full matrix + CAGE (Row/Recursive panels)
10
+ - ``ifr_all_positions_output_only``: IFR output-only token matrix + CAGE (Row/Recursive panels)
11
+ - ``attnlrp``: AttnLRP hop0 (reuse FT-AttnLRP span-aggregate; visualize raw hop0 vector)
12
+ - ``ft_attnlrp``: FT-AttnLRP (multi-hop aggregated AttnLRP; matches exp/exp2)
13
+ """
14
+
15
+ from __future__ import annotations
16
+
17
+ import argparse
18
+ import json
19
+ import os
20
+ import sys
21
+ import types
22
+ from pathlib import Path
23
+ from typing import Any, Dict, List, Optional, Sequence, Tuple
24
+
25
+ # Avoid torchvision dependency when importing transformers (Longformer).
26
+ os.environ.setdefault("TRANSFORMERS_NO_TORCHVISION", "1")
27
+ os.environ.setdefault("DISABLE_TRANSFORMERS_IMAGE_TRANSFORMS", "1")
28
+
29
+ def _early_set_cuda_visible_devices() -> None:
30
+ """Set CUDA_VISIBLE_DEVICES before importing torch/transformers.
31
+
32
+ Note: CUDA device indices are re-mapped inside the process after applying the mask.
33
+ """
34
+
35
+ parser = argparse.ArgumentParser(add_help=False)
36
+ parser.add_argument("--cuda", type=str, default=None)
37
+ args, _ = parser.parse_known_args(sys.argv[1:])
38
+ cuda = args.cuda.strip() if isinstance(args.cuda, str) else ""
39
+ if cuda and "," in cuda:
40
+ os.environ["CUDA_VISIBLE_DEVICES"] = cuda
41
+
42
+
43
+ if __name__ == "__main__":
44
+ _early_set_cuda_visible_devices()
45
+
46
+ import torch
47
+
48
+ REPO_ROOT = Path(__file__).resolve().parents[2]
49
+ if str(REPO_ROOT) not in sys.path:
50
+ sys.path.insert(0, str(REPO_ROOT))
51
+
52
+
53
+ def _stub_torchvision() -> None:
54
+ """Provide minimal torchvision stubs so Longformer imports succeed without the real package."""
55
+
56
+ if "torchvision" in sys.modules:
57
+ return
58
+
59
+ from importlib.machinery import ModuleSpec
60
+
61
+ def _mk(name: str) -> types.ModuleType:
62
+ mod = types.ModuleType(name)
63
+ mod.__spec__ = ModuleSpec(name, loader=None)
64
+ return mod
65
+
66
+ tv = _mk("torchvision")
67
+ tv.__dict__["__path__"] = []
68
+ submods = ["transforms", "_meta_registrations", "datasets", "io", "models", "ops", "utils"]
69
+ for name in submods:
70
+ mod = _mk(f"torchvision.{name}")
71
+ sys.modules[f"torchvision.{name}"] = mod
72
+ setattr(tv, name, mod)
73
+
74
+ class _InterpolationMode:
75
+ NEAREST = 0
76
+ NEAREST_EXACT = 0
77
+ BILINEAR = 1
78
+ BICUBIC = 2
79
+ LANCZOS = 3
80
+ BOX = 4
81
+ HAMMING = 5
82
+
83
+ sys.modules["torchvision.transforms"].InterpolationMode = _InterpolationMode
84
+ sys.modules["torchvision.transforms"].__all__ = ["InterpolationMode"]
85
+
86
+ # ops + misc stub for timm/transformers imports
87
+ ops_mod = sys.modules.get("torchvision.ops") or _mk("torchvision.ops")
88
+ sys.modules["torchvision.ops"] = ops_mod
89
+ setattr(tv, "ops", ops_mod)
90
+ misc_mod = _mk("torchvision.ops.misc")
91
+ sys.modules["torchvision.ops.misc"] = misc_mod
92
+ setattr(ops_mod, "misc", misc_mod)
93
+
94
+ class _FrozenBatchNorm2d:
95
+ def __init__(self, *args, **kwargs):
96
+ pass
97
+
98
+ misc_mod.FrozenBatchNorm2d = _FrozenBatchNorm2d
99
+
100
+ sys.modules["torchvision"] = tv
101
+
102
+
103
+ _stub_torchvision()
104
+
105
+
106
+ def _stub_timm() -> None:
107
+ """Provide minimal timm stubs to avoid optional vision deps."""
108
+
109
+ if "timm" in sys.modules:
110
+ return
111
+
112
+ from importlib.machinery import ModuleSpec
113
+
114
+ def _mk(name: str) -> types.ModuleType:
115
+ mod = types.ModuleType(name)
116
+ mod.__spec__ = ModuleSpec(name, loader=None)
117
+ return mod
118
+
119
+ timm = _mk("timm")
120
+ timm.__dict__["__path__"] = []
121
+ sys.modules["timm"] = timm
122
+
123
+ data_mod = _mk("timm.data")
124
+ sys.modules["timm.data"] = data_mod
125
+ timm.data = data_mod
126
+
127
+ class _ImageNetInfo:
128
+ pass
129
+
130
+ def _infer_imagenet_subset(*args, **kwargs):
131
+ return None
132
+
133
+ data_mod.ImageNetInfo = _ImageNetInfo
134
+ data_mod.infer_imagenet_subset = _infer_imagenet_subset
135
+
136
+ layers_mod = _mk("timm.layers")
137
+ sys.modules["timm.layers"] = layers_mod
138
+ timm.layers = layers_mod
139
+
140
+ create_norm_mod = _mk("timm.layers.create_norm")
141
+ sys.modules["timm.layers.create_norm"] = create_norm_mod
142
+ layers_mod.create_norm = create_norm_mod
143
+
144
+ def _get_norm_layer(*args, **kwargs):
145
+ return None
146
+
147
+ create_norm_mod.get_norm_layer = _get_norm_layer
148
+
149
+ classifier_mod = _mk("timm.layers.classifier")
150
+ sys.modules["timm.layers.classifier"] = classifier_mod
151
+ layers_mod.classifier = classifier_mod
152
+
153
+
154
+ _stub_timm()
155
+
156
+ import transformers
157
+
158
+ # Provide light stubs if Longformer classes are unavailable; IFR case study does not use them.
159
+ if not hasattr(transformers, "LongformerTokenizer"):
160
+ class _DummyLongformerTokenizer:
161
+ def __init__(self, *args, **kwargs):
162
+ raise ImportError("LongformerTokenizer stubbed; install full transformers+torchvision if needed.")
163
+ transformers.LongformerTokenizer = _DummyLongformerTokenizer
164
+
165
+ if not hasattr(transformers, "LongformerForMaskedLM"):
166
+ class _DummyLongformerForMaskedLM:
167
+ def __init__(self, *args, **kwargs):
168
+ raise ImportError("LongformerForMaskedLM stubbed; install full transformers+torchvision if needed.")
169
+ transformers.LongformerForMaskedLM = _DummyLongformerForMaskedLM
170
+
171
+ if hasattr(transformers, "__all__"):
172
+ for _name in ["LongformerTokenizer", "LongformerForMaskedLM"]:
173
+ if _name not in transformers.__all__:
174
+ transformers.__all__.append(_name)
175
+
176
+ # Gemma3n stubs (transformers may attempt to import even if unused)
177
+ if "transformers.models.gemma3n.configuration_gemma3n" not in sys.modules:
178
+ from importlib.machinery import ModuleSpec
179
+
180
+ gemma_pkg = types.ModuleType("transformers.models.gemma3n")
181
+ gemma_pkg.__spec__ = ModuleSpec("transformers.models.gemma3n", loader=None, is_package=True)
182
+ sys.modules["transformers.models.gemma3n"] = gemma_pkg
183
+
184
+ gemma_conf = types.ModuleType("transformers.models.gemma3n.configuration_gemma3n")
185
+ gemma_conf.__spec__ = ModuleSpec("transformers.models.gemma3n.configuration_gemma3n", loader=None)
186
+
187
+ class Gemma3nConfig:
188
+ def __init__(self, *args, **kwargs):
189
+ self.model_type = "gemma3n"
190
+
191
+ class Gemma3nTextConfig(Gemma3nConfig):
192
+ pass
193
+
194
+ gemma_conf.Gemma3nConfig = Gemma3nConfig
195
+ gemma_conf.Gemma3nTextConfig = Gemma3nTextConfig
196
+ gemma_conf.__all__ = ["Gemma3nConfig", "Gemma3nTextConfig"]
197
+ sys.modules["transformers.models.gemma3n.configuration_gemma3n"] = gemma_conf
198
+ setattr(gemma_pkg, "configuration_gemma3n", gemma_conf)
199
+
200
+ if hasattr(transformers, "__all__"):
201
+ for _nm in ["Gemma3nConfig", "Gemma3nTextConfig"]:
202
+ if _nm not in transformers.__all__:
203
+ transformers.__all__.append(_nm)
204
+
205
+ import llm_attr
206
+ from exp.exp2 import dataset_utils as ds_utils
207
+ from evaluations.attribution_recovery import load_model
208
+
209
+ from exp.case_study import analysis, viz
210
+
211
+
212
+ def resolve_device(cuda: Optional[str], cuda_num: int) -> str:
213
+ if cuda and isinstance(cuda, str) and "," in cuda:
214
+ os.environ["CUDA_VISIBLE_DEVICES"] = cuda
215
+ return "auto"
216
+ if cuda and isinstance(cuda, str) and cuda.strip():
217
+ try:
218
+ idx = int(cuda)
219
+ except Exception:
220
+ idx = 0
221
+ return f"cuda:{idx}" if torch.cuda.is_available() else "cpu"
222
+ return f"cuda:{cuda_num}" if torch.cuda.is_available() else "cpu"
223
+
224
+
225
+ def load_example(dataset: str, index: int, data_root: Path) -> Tuple[ds_utils.CachedExample, str]:
226
+ """Load a single example from a cache path or dataset name."""
227
+
228
+ ds_path = Path(dataset)
229
+ if ds_path.exists():
230
+ examples = ds_utils.read_cached_jsonl(ds_path)
231
+ dataset_name = ds_path.name
232
+ else:
233
+ loader = ds_utils.DatasetLoader(data_root=data_root)
234
+ examples = loader.load(dataset)
235
+ dataset_name = dataset
236
+
237
+ if not examples:
238
+ raise ValueError(f"No examples found for dataset={dataset}")
239
+
240
+ if index < 0:
241
+ index = len(examples) + index
242
+ if not (0 <= index < len(examples)):
243
+ raise IndexError(f"index {index} out of range for dataset with {len(examples)} examples")
244
+
245
+ return examples[index], dataset_name
246
+
247
+
248
+ def parse_args() -> argparse.Namespace:
249
+ parser = argparse.ArgumentParser("IFR multi-hop case study")
250
+ parser.add_argument("--dataset", type=str, default="exp/exp2/data/morehopqa.jsonl", help="Dataset name or JSONL path.")
251
+ parser.add_argument("--data_root", type=str, default="exp/exp2/data", help="Cache root for dataset names.")
252
+ parser.add_argument("--index", type=int, default=0, help="Sample index (supports negative for reverse).")
253
+ parser.add_argument(
254
+ "--mode",
255
+ type=str,
256
+ choices=[
257
+ "ft",
258
+ "ft_improve",
259
+ "ft_split_hop",
260
+ "ifr_in_all_gen",
261
+ "ifr",
262
+ "ifr_all_positions",
263
+ "ifr_all_positions_output_only",
264
+ "attnlrp",
265
+ "ft_attnlrp",
266
+ ],
267
+ default="ft",
268
+ help=(
269
+ "ft = FlashTrace (multi-hop IFR); ifr = standard IFR span-aggregate; "
270
+ "ifr_in_all_gen = multi-hop IFR over CoT+output (scheme B; exp2-aligned); "
271
+ "ifr_all_positions = full IFR matrix + CAGE row/rec; "
272
+ "ft_improve = FlashTrace (multi-hop IFR, stop-token soft deletion); "
273
+ "ft_split_hop = FlashTrace (split-hop IFR over segmented thinking span); "
274
+ "ifr_all_positions_output_only = output-only IFR matrix + CAGE row/rec; "
275
+ "attnlrp = AttnLRP hop0 (FT-AttnLRP span-aggregate); "
276
+ "ft_attnlrp = FT-AttnLRP (multi-hop aggregated; exp2)."
277
+ ),
278
+ )
279
+ parser.add_argument("--model", type=str, default="qwen-8B", help="HF repo id (ignored if --model_path set).")
280
+ parser.add_argument("--model_path", type=str, default=None, help="Local model path to override --model.")
281
+ parser.add_argument("--cuda", type=str, default=None, help="CUDA spec (e.g., '0' or '0,1').")
282
+ parser.add_argument("--cuda_num", type=int, default=0, help="Fallback GPU index when --cuda unset.")
283
+ parser.add_argument("--n_hops", type=int, default=1, help="Number of hops for IFR multi-hop.")
284
+ parser.add_argument("--sink_span", type=int, nargs=2, default=None, help="Optional sink span over generation tokens.")
285
+ parser.add_argument("--thinking_span", type=int, nargs=2, default=None, help="Optional thinking span over generation tokens.")
286
+ parser.add_argument(
287
+ "--attnlrp_neg_handling",
288
+ type=str,
289
+ choices=["drop", "abs"],
290
+ default="drop",
291
+ help="FT-AttnLRP: how to handle negative values after each hop (drop=clamp>=0, abs=absolute value).",
292
+ )
293
+ parser.add_argument(
294
+ "--attnlrp_norm_mode",
295
+ type=str,
296
+ choices=["norm", "no_norm"],
297
+ default="norm",
298
+ help="FT-AttnLRP: norm enables per-hop global+thinking normalization + ratios; no_norm disables all three.",
299
+ )
300
+ parser.add_argument("--chunk_tokens", type=int, default=128, help="IFR chunk size.")
301
+ parser.add_argument("--sink_chunk_tokens", type=int, default=32, help="IFR sink chunk size.")
302
+ parser.add_argument("--output_dir", type=str, default="exp/case_study/out", help="Where to write HTML/JSON artifacts.")
303
+ return parser.parse_args()
304
+
305
+
306
+ def run_ft_multihop(
307
+ example: ds_utils.CachedExample,
308
+ model: Any,
309
+ tokenizer: Any,
310
+ *,
311
+ n_hops: int,
312
+ sink_span: Optional[Sequence[int]],
313
+ thinking_span: Optional[Sequence[int]],
314
+ chunk_tokens: int,
315
+ sink_chunk_tokens: int,
316
+ ) -> Tuple[Any, Optional[Tuple[int, int]], Optional[Tuple[int, int]], Dict[str, Any]]:
317
+ """Execute FT (current multi-hop IFR) attribution for the selected example."""
318
+
319
+ attr = llm_attr.LLMIFRAttribution(
320
+ model,
321
+ tokenizer,
322
+ chunk_tokens=chunk_tokens,
323
+ sink_chunk_tokens=sink_chunk_tokens,
324
+ )
325
+
326
+ sink = tuple(sink_span) if sink_span is not None else tuple(example.sink_span) if example.sink_span else None
327
+ thinking = (
328
+ tuple(thinking_span)
329
+ if thinking_span is not None
330
+ else tuple(example.thinking_span) if example.thinking_span else None
331
+ )
332
+
333
+ result = attr.calculate_ifr_multi_hop(
334
+ example.prompt,
335
+ target=example.target,
336
+ sink_span=sink,
337
+ thinking_span=thinking,
338
+ n_hops=n_hops,
339
+ )
340
+ debug_info: Dict[str, Any] = {
341
+ "full_prompt_tokens": list(getattr(attr, "prompt_tokens", []) or []),
342
+ "generation_tokens": list(getattr(attr, "generation_tokens", []) or []),
343
+ "user_prompt_indices": list(getattr(attr, "user_prompt_indices", []) or []),
344
+ "chat_prompt_indices": list(getattr(attr, "chat_prompt_indices", []) or []),
345
+ "prompt_ids": getattr(attr, "prompt_ids", None).detach().cpu().tolist() if getattr(attr, "prompt_ids", None) is not None else None,
346
+ "generation_ids": getattr(attr, "generation_ids", None).detach().cpu().tolist() if getattr(attr, "generation_ids", None) is not None else None,
347
+ }
348
+
349
+ raw_vectors = []
350
+ if result.metadata and "ifr" in result.metadata:
351
+ raw_ifr = result.metadata["ifr"].get("raw")
352
+ if raw_ifr is not None and hasattr(raw_ifr, "raw_attributions"):
353
+ try:
354
+ raw_vectors = [r.token_importance_total.detach().cpu() for r in raw_ifr.raw_attributions]
355
+ except Exception:
356
+ raw_vectors = []
357
+ debug_info["raw_hop_vectors"] = raw_vectors
358
+
359
+ return result, sink, thinking, debug_info
360
+
361
+
362
+ def run_ft_multihop_improve(
363
+ example: ds_utils.CachedExample,
364
+ model: Any,
365
+ tokenizer: Any,
366
+ *,
367
+ n_hops: int,
368
+ sink_span: Optional[Sequence[int]],
369
+ thinking_span: Optional[Sequence[int]],
370
+ chunk_tokens: int,
371
+ sink_chunk_tokens: int,
372
+ ) -> Tuple[Any, Optional[Tuple[int, int]], Optional[Tuple[int, int]], Dict[str, Any]]:
373
+ """Execute experimental FT (multi-hop IFR) with stop-token soft deletion."""
374
+
375
+ import ft_ifr_improve
376
+
377
+ attr = ft_ifr_improve.LLMIFRAttributionImproved(
378
+ model,
379
+ tokenizer,
380
+ chunk_tokens=chunk_tokens,
381
+ sink_chunk_tokens=sink_chunk_tokens,
382
+ )
383
+
384
+ sink = tuple(sink_span) if sink_span is not None else tuple(example.sink_span) if example.sink_span else None
385
+ thinking = (
386
+ tuple(thinking_span)
387
+ if thinking_span is not None
388
+ else tuple(example.thinking_span) if example.thinking_span else None
389
+ )
390
+
391
+ result = attr.calculate_ifr_multi_hop_stop_words(
392
+ example.prompt,
393
+ target=example.target,
394
+ sink_span=sink,
395
+ thinking_span=thinking,
396
+ n_hops=n_hops,
397
+ )
398
+
399
+ debug_info: Dict[str, Any] = {
400
+ "full_prompt_tokens": list(getattr(attr, "prompt_tokens", []) or []),
401
+ "generation_tokens": list(getattr(attr, "generation_tokens", []) or []),
402
+ "user_prompt_indices": list(getattr(attr, "user_prompt_indices", []) or []),
403
+ "chat_prompt_indices": list(getattr(attr, "chat_prompt_indices", []) or []),
404
+ "prompt_ids": getattr(attr, "prompt_ids", None).detach().cpu().tolist() if getattr(attr, "prompt_ids", None) is not None else None,
405
+ "generation_ids": getattr(attr, "generation_ids", None).detach().cpu().tolist() if getattr(attr, "generation_ids", None) is not None else None,
406
+ }
407
+
408
+ raw_vectors = []
409
+ if result.metadata and "ifr" in result.metadata:
410
+ raw_ifr = result.metadata["ifr"].get("raw")
411
+ if raw_ifr is not None and hasattr(raw_ifr, "raw_attributions"):
412
+ try:
413
+ raw_vectors = [r.token_importance_total.detach().cpu() for r in raw_ifr.raw_attributions]
414
+ except Exception:
415
+ raw_vectors = []
416
+ debug_info["raw_hop_vectors"] = raw_vectors
417
+
418
+ return result, sink, thinking, debug_info
419
+
420
+
421
+ def run_ft_multihop_split_hop(
422
+ example: ds_utils.CachedExample,
423
+ model: Any,
424
+ tokenizer: Any,
425
+ *,
426
+ n_hops: int,
427
+ sink_span: Optional[Sequence[int]],
428
+ thinking_span: Optional[Sequence[int]],
429
+ chunk_tokens: int,
430
+ sink_chunk_tokens: int,
431
+ ) -> Tuple[Any, Optional[Tuple[int, int]], Optional[Tuple[int, int]], Dict[str, Any]]:
432
+ """Execute experimental FT (split-hop IFR over segmented thinking span)."""
433
+
434
+ import ft_ifr_improve
435
+
436
+ attr = ft_ifr_improve.LLMIFRAttributionSplitHop(
437
+ model,
438
+ tokenizer,
439
+ chunk_tokens=chunk_tokens,
440
+ sink_chunk_tokens=sink_chunk_tokens,
441
+ )
442
+
443
+ sink = tuple(sink_span) if sink_span is not None else tuple(example.sink_span) if example.sink_span else None
444
+ thinking = (
445
+ tuple(thinking_span)
446
+ if thinking_span is not None
447
+ else tuple(example.thinking_span) if example.thinking_span else None
448
+ )
449
+
450
+ result = attr.calculate_ifr_multi_hop_split_hop(
451
+ example.prompt,
452
+ target=example.target,
453
+ sink_span=sink,
454
+ thinking_span=thinking,
455
+ n_hops=int(n_hops),
456
+ )
457
+
458
+ debug_info: Dict[str, Any] = {
459
+ "full_prompt_tokens": list(getattr(attr, "prompt_tokens", []) or []),
460
+ "generation_tokens": list(getattr(attr, "generation_tokens", []) or []),
461
+ "user_prompt_indices": list(getattr(attr, "user_prompt_indices", []) or []),
462
+ "chat_prompt_indices": list(getattr(attr, "chat_prompt_indices", []) or []),
463
+ "prompt_ids": getattr(attr, "prompt_ids", None).detach().cpu().tolist() if getattr(attr, "prompt_ids", None) is not None else None,
464
+ "generation_ids": getattr(attr, "generation_ids", None).detach().cpu().tolist() if getattr(attr, "generation_ids", None) is not None else None,
465
+ }
466
+
467
+ raw_vectors = []
468
+ if result.metadata and "ifr" in result.metadata:
469
+ raw_ifr = result.metadata["ifr"].get("raw")
470
+ if raw_ifr is not None and hasattr(raw_ifr, "raw_attributions"):
471
+ try:
472
+ raw_vectors = [r.token_importance_total.detach().cpu() for r in raw_ifr.raw_attributions]
473
+ except Exception:
474
+ raw_vectors = []
475
+ debug_info["raw_hop_vectors"] = raw_vectors
476
+
477
+ return result, sink, thinking, debug_info
478
+
479
+
480
+ def run_ifr_in_all_gen(
481
+ example: ds_utils.CachedExample,
482
+ model: Any,
483
+ tokenizer: Any,
484
+ *,
485
+ n_hops: int,
486
+ sink_span: Optional[Sequence[int]],
487
+ thinking_span: Optional[Sequence[int]],
488
+ chunk_tokens: int,
489
+ sink_chunk_tokens: int,
490
+ ) -> Tuple[Any, Optional[Tuple[int, int]], Optional[Tuple[int, int]], Dict[str, Any]]:
491
+ """Execute experimental IFR variant: multi-hop over all generation (CoT + output)."""
492
+
493
+ import ft_ifr_improve
494
+
495
+ attr = ft_ifr_improve.LLMIFRAttributionInAllGen(
496
+ model,
497
+ tokenizer,
498
+ chunk_tokens=chunk_tokens,
499
+ sink_chunk_tokens=sink_chunk_tokens,
500
+ )
501
+
502
+ sink = tuple(sink_span) if sink_span is not None else tuple(example.sink_span) if example.sink_span else None
503
+ thinking = (
504
+ tuple(thinking_span)
505
+ if thinking_span is not None
506
+ else tuple(example.thinking_span) if example.thinking_span else None
507
+ )
508
+
509
+ result = attr.calculate_ifr_in_all_gen(
510
+ example.prompt,
511
+ target=example.target,
512
+ sink_span=sink,
513
+ thinking_span=thinking,
514
+ n_hops=int(n_hops),
515
+ )
516
+
517
+ debug_info: Dict[str, Any] = {
518
+ "full_prompt_tokens": list(getattr(attr, "prompt_tokens", []) or []),
519
+ "generation_tokens": list(getattr(attr, "generation_tokens", []) or []),
520
+ "user_prompt_indices": list(getattr(attr, "user_prompt_indices", []) or []),
521
+ "chat_prompt_indices": list(getattr(attr, "chat_prompt_indices", []) or []),
522
+ "prompt_ids": getattr(attr, "prompt_ids", None).detach().cpu().tolist() if getattr(attr, "prompt_ids", None) is not None else None,
523
+ "generation_ids": getattr(attr, "generation_ids", None).detach().cpu().tolist() if getattr(attr, "generation_ids", None) is not None else None,
524
+ }
525
+
526
+ raw_vectors = []
527
+ if result.metadata and "ifr" in result.metadata:
528
+ raw_ifr = result.metadata["ifr"].get("raw")
529
+ if raw_ifr is not None and hasattr(raw_ifr, "raw_attributions"):
530
+ try:
531
+ raw_vectors = [r.token_importance_total.detach().cpu() for r in raw_ifr.raw_attributions]
532
+ except Exception:
533
+ raw_vectors = []
534
+ debug_info["raw_hop_vectors"] = raw_vectors
535
+
536
+ return result, sink, thinking, debug_info
537
+
538
+
539
+ def make_output_stem(dataset_name: str, index: int, mode: str) -> str:
540
+ safe_name = dataset_name.replace("/", "_").replace(" ", "_")
541
+ prefix = {
542
+ "ft": "ft_case_",
543
+ "ft_improve": "ft_improve_case_",
544
+ "ifr": "ifr_case_",
545
+ "ifr_all_positions": "ifr_all_positions_case_",
546
+ "ifr_all_positions_output_only": "ifr_output_only_case_",
547
+ "attnlrp": "attnlrp_case_",
548
+ "ft_attnlrp": "ft_attnlrp_case_",
549
+ }.get(mode, f"{mode}_case_")
550
+ return f"{prefix}{safe_name}_idx{index}"
551
+
552
+
553
+ def _decode_token_ids(tokenizer: Any, ids: Sequence[int]) -> List[str]:
554
+ """Decode each token id into a readable text piece (keeps special tokens)."""
555
+
556
+ pieces: List[str] = []
557
+ for tok_id in ids:
558
+ try:
559
+ pieces.append(
560
+ tokenizer.decode([int(tok_id)], skip_special_tokens=False, clean_up_tokenization_spaces=False)
561
+ )
562
+ except Exception:
563
+ pieces.append(str(tok_id))
564
+ return pieces
565
+
566
+
567
+ def build_raw_tokens_from_ids(tokenizer: Any, prompt_ids: Optional[Sequence[int]], generation_ids: Optional[Sequence[int]]) -> List[str]:
568
+ if not prompt_ids:
569
+ prompt_ids = []
570
+ if not generation_ids:
571
+ generation_ids = []
572
+ return _decode_token_ids(tokenizer, prompt_ids) + _decode_token_ids(tokenizer, generation_ids)
573
+
574
+
575
+ def build_trimmed_roles(tokens: Sequence[str], segments: Dict[str, Any]) -> List[str]:
576
+ """Assign role labels for trimmed tokens (prompt + generation)."""
577
+
578
+ roles = ["prompt" for _ in range(len(tokens))]
579
+ prompt_len_tokens = segments.get("prompt_len", 0)
580
+ for idx in range(prompt_len_tokens, len(tokens)):
581
+ roles[idx] = "gen"
582
+ thinking_span = segments.get("thinking_span")
583
+ sink_span = segments.get("sink_span")
584
+ if thinking_span is not None:
585
+ start = prompt_len_tokens + int(thinking_span[0])
586
+ end = prompt_len_tokens + int(thinking_span[1])
587
+ for i in range(start, min(len(tokens), end + 1)):
588
+ roles[i] = "think"
589
+ if sink_span is not None:
590
+ start = prompt_len_tokens + int(sink_span[0])
591
+ end = prompt_len_tokens + int(sink_span[1])
592
+ for i in range(start, min(len(tokens), end + 1)):
593
+ roles[i] = "output"
594
+ return roles
595
+
596
+
597
+ def build_raw_roles(
598
+ tokens: Sequence[str],
599
+ prompt_len_full: int,
600
+ user_indices: Sequence[int],
601
+ template_indices: Sequence[int],
602
+ thinking_span_abs: Optional[Sequence[int]],
603
+ sink_span_abs: Optional[Sequence[int]],
604
+ ) -> List[str]:
605
+ """Assign role labels for raw tokens (template + user + generation)."""
606
+
607
+ roles = ["template" for _ in range(len(tokens))]
608
+ user_set = set(int(i) for i in user_indices)
609
+ tmpl_set = set(int(i) for i in template_indices)
610
+
611
+ for i in range(min(len(tokens), prompt_len_full)):
612
+ if i in user_set:
613
+ roles[i] = "user"
614
+ elif i in tmpl_set:
615
+ roles[i] = "template"
616
+ else:
617
+ roles[i] = "prompt"
618
+
619
+ for i in range(prompt_len_full, len(tokens)):
620
+ roles[i] = "gen"
621
+
622
+ if thinking_span_abs is not None:
623
+ start, end = int(thinking_span_abs[0]), int(thinking_span_abs[1])
624
+ for i in range(start, min(len(tokens), end + 1)):
625
+ roles[i] = "think"
626
+
627
+ if sink_span_abs is not None:
628
+ start, end = int(sink_span_abs[0]), int(sink_span_abs[1])
629
+ for i in range(start, min(len(tokens), end + 1)):
630
+ roles[i] = "output"
631
+
632
+ return roles
633
+
634
+
635
+ def extract_prompt_only_vectors(hop_vectors: Sequence[torch.Tensor], prompt_len: int) -> List[torch.Tensor]:
636
+ """Slice hop vectors down to user-prompt tokens only (no generation tokens)."""
637
+
638
+ if prompt_len < 0:
639
+ raise ValueError("prompt_len must be >= 0.")
640
+
641
+ out: List[torch.Tensor] = []
642
+ for vec in hop_vectors:
643
+ v = torch.as_tensor(vec, dtype=torch.float32).detach().cpu()
644
+ if int(v.numel()) < int(prompt_len):
645
+ raise ValueError(f"Hop vector too short for prompt-only slice: len={int(v.numel())} prompt_len={int(prompt_len)}.")
646
+ out.append(v[:prompt_len])
647
+ return out
648
+
649
+
650
+ def _lift_trimmed_to_full(
651
+ trimmed: torch.Tensor,
652
+ *,
653
+ prompt_len_full: int,
654
+ gen_len: int,
655
+ user_prompt_indices: Sequence[int],
656
+ ) -> torch.Tensor:
657
+ """Lift a trimmed (user prompt + generation) vector into full token space with zeros for chat-template tokens."""
658
+
659
+ t = torch.as_tensor(trimmed, dtype=torch.float32).detach().cpu()
660
+ user_len = len(user_prompt_indices)
661
+ expected = int(user_len + gen_len)
662
+ if int(t.numel()) != expected:
663
+ raise ValueError(f"Trimmed vector length mismatch: got {int(t.numel())}, expected {expected}.")
664
+
665
+ total_len = int(prompt_len_full + gen_len)
666
+ full = torch.zeros((total_len,), dtype=torch.float32)
667
+ for j, abs_pos in enumerate(user_prompt_indices):
668
+ full[int(abs_pos)] = t[j]
669
+ full[int(prompt_len_full) : int(prompt_len_full + gen_len)] = t[user_len:]
670
+ return full
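# Illustration (made-up numbers, not part of the module): with a 5-token chat-formatted
# prompt whose user tokens sit at absolute positions [2, 3] and 2 generation tokens,
# a trimmed vector [u0, u1, g0, g1] is lifted so template positions stay zero:
#   _lift_trimmed_to_full(torch.tensor([0.4, 0.6, 0.1, 0.9]),
#                         prompt_len_full=5, gen_len=2, user_prompt_indices=[2, 3])
#   -> tensor([0.0, 0.0, 0.4, 0.6, 0.0, 0.1, 0.9])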
671
+
672
+
673
+ def _postprocess_attnlrp_full_vector(
674
+ raw_full: torch.Tensor,
675
+ *,
676
+ prompt_len_full: int,
677
+ gen_len: int,
678
+ user_prompt_indices: Sequence[int],
679
+ neg_handling: str,
680
+ norm_mode: str,
681
+ ) -> torch.Tensor:
682
+ """Mirror FT-AttnLRP hop postprocessing while preserving stripped-token normalization.
683
+
684
+ The underlying AttnLRP implementation postprocesses the *stripped* vector (user prompt + generation):
685
+ - NaN->0, then neg_handling ('drop' or 'abs')
686
+ - if norm_mode=='norm': normalize by sum over stripped tokens
687
+
688
+ For the pre-trim full view (chat template + user prompt + generation), we apply the same non-negativity transform
689
+ to the full vector and normalize using *only the stripped indices*, so overlapping token scores
690
+ match the trimmed vectors used by the evaluation/case-study hop outputs.
691
+ """
692
+
693
+ v = torch.as_tensor(raw_full, dtype=torch.float32).detach().cpu()
694
+ v = torch.nan_to_num(v, nan=0.0)
695
+
696
+ if neg_handling == "drop":
697
+ v = v.clamp(min=0.0)
698
+ elif neg_handling == "abs":
699
+ v = v.abs()
700
+ else:
701
+ raise ValueError(f"Unsupported neg_handling={neg_handling!r} (expected 'drop' or 'abs').")
702
+
703
+ ratio_enabled = norm_mode == "norm"
704
+ if not ratio_enabled:
705
+ return v
706
+
707
+ keep = list(int(i) for i in user_prompt_indices) + list(range(int(prompt_len_full), int(prompt_len_full + gen_len)))
708
+ if not keep:
709
+ return torch.zeros_like(v)
710
+
711
+ keep_idx = torch.as_tensor(keep, dtype=torch.long)
712
+ denom = float(v.index_select(0, keep_idx).sum().item())
713
+ if denom <= 0.0:
714
+ return torch.zeros_like(v)
715
+ return v / (denom + 1e-12)
716
+
717
+
718
+ def main() -> None:
719
+ args = parse_args()
720
+ device = resolve_device(args.cuda, args.cuda_num)
721
+ if torch.cuda.is_available():
722
+ visible = os.environ.get("CUDA_VISIBLE_DEVICES")
723
+ print(f"[info] CUDA_VISIBLE_DEVICES={visible!r} torch.cuda.device_count()={torch.cuda.device_count()} device={device}")
724
+
725
+ model_name = args.model_path if args.model_path is not None else args.model
726
+ # Align with exp/exp2: always use the shared fp16 loader.
727
+ model, tokenizer = load_model(model_name, device)
728
+
729
+ example, ds_name = load_example(args.dataset, args.index, Path(args.data_root))
730
+ mode = args.mode
731
+
732
+ sink_span: Optional[Tuple[int, int]] = None
733
+ thinking_span: Optional[Tuple[int, int]] = None
734
+ thinking_ratios: Optional[Sequence[float]] = None
735
+
736
+ prompt_tokens_trimmed: List[str] = []
737
+ generation_tokens_trimmed: List[str] = []
738
+ hop_vectors_trimmed: List[torch.Tensor] = []
739
+ hop_vectors_raw: List[torch.Tensor] = []
740
+ prompt_len_full: Optional[int] = None
741
+ user_prompt_indices: List[int] = []
742
+ chat_prompt_indices: List[int] = []
743
+ method_meta: Dict[str, Any] = {}
744
+ raw_prompt_ids: Optional[List[int]] = None
745
+ raw_generation_ids: Optional[List[int]] = None
746
+ attnlrp_raw_attributions: Optional[List[Any]] = None
747
+
748
+ if mode in ("ft", "ft_improve", "ft_split_hop", "ifr_in_all_gen"):
749
+ if mode == "ft":
750
+ attr_result, sink_span, thinking_span, debug_info = run_ft_multihop(
751
+ example,
752
+ model,
753
+ tokenizer,
754
+ n_hops=args.n_hops,
755
+ sink_span=args.sink_span,
756
+ thinking_span=args.thinking_span,
757
+ chunk_tokens=args.chunk_tokens,
758
+ sink_chunk_tokens=args.sink_chunk_tokens,
759
+ )
760
+ elif mode == "ft_improve":
761
+ attr_result, sink_span, thinking_span, debug_info = run_ft_multihop_improve(
762
+ example,
763
+ model,
764
+ tokenizer,
765
+ n_hops=args.n_hops,
766
+ sink_span=args.sink_span,
767
+ thinking_span=args.thinking_span,
768
+ chunk_tokens=args.chunk_tokens,
769
+ sink_chunk_tokens=args.sink_chunk_tokens,
770
+ )
771
+ elif mode == "ft_split_hop":
772
+ attr_result, sink_span, thinking_span, debug_info = run_ft_multihop_split_hop(
773
+ example,
774
+ model,
775
+ tokenizer,
776
+ n_hops=args.n_hops,
777
+ sink_span=args.sink_span,
778
+ thinking_span=args.thinking_span,
779
+ chunk_tokens=args.chunk_tokens,
780
+ sink_chunk_tokens=args.sink_chunk_tokens,
781
+ )
782
+ elif mode == "ifr_in_all_gen":
783
+ attr_result, sink_span, thinking_span, debug_info = run_ifr_in_all_gen(
784
+ example,
785
+ model,
786
+ tokenizer,
787
+ n_hops=args.n_hops,
788
+ sink_span=args.sink_span,
789
+ thinking_span=args.thinking_span,
790
+ chunk_tokens=args.chunk_tokens,
791
+ sink_chunk_tokens=args.sink_chunk_tokens,
792
+ )
793
+ else:
794
+ raise ValueError(f"Unsupported mode={mode}")
795
+ ifr_meta = (attr_result.metadata or {}).get("ifr") or {}
796
+ hop_vectors_trimmed = list(ifr_meta.get("per_hop_projected") or [])
797
+ if not hop_vectors_trimmed:
798
+ raise RuntimeError(f"No per-hop vectors found for {mode} mode.")
799
+
800
+ prompt_tokens_trimmed = list(attr_result.prompt_tokens)
801
+ generation_tokens_trimmed = list(attr_result.generation_tokens)
802
+ thinking_ratios = ifr_meta.get("thinking_ratios")
803
+
804
+ raw_prompt_ids = debug_info.get("prompt_ids")
805
+ if isinstance(raw_prompt_ids, list) and raw_prompt_ids and isinstance(raw_prompt_ids[0], list):
806
+ raw_prompt_ids = raw_prompt_ids[0]
807
+ raw_generation_ids = debug_info.get("generation_ids")
808
+ if isinstance(raw_generation_ids, list) and raw_generation_ids and isinstance(raw_generation_ids[0], list):
809
+ raw_generation_ids = raw_generation_ids[0]
810
+
811
+ user_prompt_indices = list(debug_info.get("user_prompt_indices") or [])
812
+ chat_prompt_indices = list(debug_info.get("chat_prompt_indices") or [])
813
+ prompt_len_full = len(raw_prompt_ids) if isinstance(raw_prompt_ids, list) else None
814
+
815
+ raw_vectors = debug_info.get("raw_hop_vectors") or []
816
+ hop_vectors_raw = [vec.detach().cpu() if hasattr(vec, "detach") else torch.as_tensor(vec) for vec in raw_vectors]
817
+ method_meta = {"ifr": analysis.sanitize_ifr_meta(ifr_meta)}
818
+
819
+ elif mode == "ifr":
820
+ # Standard IFR (single-hop span aggregate), with pre/post trim views.
821
+ attr = llm_attr.LLMIFRAttribution(
822
+ model,
823
+ tokenizer,
824
+ chunk_tokens=args.chunk_tokens,
825
+ sink_chunk_tokens=args.sink_chunk_tokens,
826
+ )
827
+ sink_span = tuple(args.sink_span) if args.sink_span is not None else tuple(example.sink_span) if example.sink_span else None
828
+ thinking_span = tuple(args.thinking_span) if args.thinking_span is not None else tuple(example.thinking_span) if example.thinking_span else sink_span
829
+
830
+ if sink_span is None:
831
+ raise ValueError("sink_span is required for IFR mode (use dataset sink_span or pass --sink_span).")
832
+ span_result = attr.calculate_ifr_span(
833
+ example.prompt,
834
+ target=example.target,
835
+ span=tuple(sink_span),
836
+ )
837
+ span_meta = span_result.metadata.get("ifr") if span_result.metadata else None
838
+ aggregate = span_meta.get("aggregate") if isinstance(span_meta, dict) else None
839
+ if aggregate is None or not hasattr(aggregate, "token_importance_total"):
840
+ raise RuntimeError("IFR span aggregate missing from metadata; cannot render pre-trim view.")
841
+
842
+ raw_vector = aggregate.token_importance_total.detach().cpu()
843
+ trimmed_vector = attr._project_vector(raw_vector)
844
+ hop_vectors_raw = [raw_vector]
845
+ hop_vectors_trimmed = [trimmed_vector]
846
+
847
+ prompt_tokens_trimmed = list(attr.user_prompt_tokens)
848
+ generation_tokens_trimmed = list(attr.generation_tokens)
849
+
850
+ raw_prompt_ids = attr.prompt_ids.detach().cpu().tolist()[0]
851
+ raw_generation_ids = attr.generation_ids.detach().cpu().tolist()[0]
852
+ user_prompt_indices = list(getattr(attr, "user_prompt_indices", []) or [])
853
+ chat_prompt_indices = list(getattr(attr, "chat_prompt_indices", []) or [])
854
+ prompt_len_full = len(raw_prompt_ids)
855
+
856
+ sink_abs = (prompt_len_full + sink_span[0], prompt_len_full + sink_span[1])
857
+ think_abs = (prompt_len_full + thinking_span[0], prompt_len_full + thinking_span[1]) if thinking_span else None
858
+
859
+ meta = {
860
+ "type": "span_aggregate",
861
+ "ifr_view": "aggregate",
862
+ "sink_span_generation": sink_span,
863
+ "sink_span_absolute": sink_abs,
864
+ "thinking_span_generation": thinking_span,
865
+ "thinking_span_absolute": think_abs,
866
+ }
867
+ method_meta = {"ifr": analysis.tensor_to_list(meta)}
868
+
869
+ elif mode == "ifr_all_positions_output_only":
870
+ # IFR all-positions (output-only) + token-level CAGE (row/recursive) derived from the matrix.
871
+ attr = llm_attr.LLMIFRAttribution(
872
+ model,
873
+ tokenizer,
874
+ chunk_tokens=args.chunk_tokens,
875
+ sink_chunk_tokens=args.sink_chunk_tokens,
876
+ )
877
+ sink_span = tuple(args.sink_span) if args.sink_span is not None else tuple(example.sink_span) if example.sink_span else None
878
+ thinking_span = tuple(args.thinking_span) if args.thinking_span is not None else tuple(example.thinking_span) if example.thinking_span else sink_span
879
+
880
+ if sink_span is None:
881
+ raise ValueError(
882
+ "sink_span is required for ifr_all_positions_output_only mode "
883
+ "(use dataset sink_span or pass --sink_span)."
884
+ )
885
+
886
+ attr_result = attr.calculate_ifr_for_all_positions_output_only(
887
+ example.prompt,
888
+ target=example.target,
889
+ sink_span=tuple(sink_span),
890
+ )
891
+
892
+ indices_to_explain = list(sink_span)
893
+ _, row_attr, rec_attr = attr_result.get_all_token_attrs(indices_to_explain)
894
+ row_vec = row_attr.squeeze(0).detach().cpu()
895
+ rec_vec = rec_attr.squeeze(0).detach().cpu()
896
+
897
+ hop_vectors_trimmed = [row_vec, rec_vec]
898
+
899
+ prompt_tokens_trimmed = list(attr.user_prompt_tokens)
900
+ generation_tokens_trimmed = list(attr.generation_tokens)
901
+
902
+ raw_prompt_ids = attr.prompt_ids.detach().cpu().tolist()[0]
903
+ raw_generation_ids = attr.generation_ids.detach().cpu().tolist()[0]
904
+ user_prompt_indices = list(getattr(attr, "user_prompt_indices", []) or [])
905
+ chat_prompt_indices = list(getattr(attr, "chat_prompt_indices", []) or [])
906
+ prompt_len_full = len(raw_prompt_ids)
907
+
908
+ gen_len = len(raw_generation_ids or [])
909
+ hop_vectors_raw = [
910
+ _lift_trimmed_to_full(
911
+ v,
912
+ prompt_len_full=int(prompt_len_full or 0),
913
+ gen_len=gen_len,
914
+ user_prompt_indices=user_prompt_indices,
915
+ )
916
+ for v in hop_vectors_trimmed
917
+ ]
918
+
919
+ ifr_meta = dict((attr_result.metadata or {}).get("ifr") or {})
920
+ ifr_meta["ifr_view"] = "all_positions_output_only (row+rec)"
921
+ ifr_meta["panel_titles"] = ["Row attribution", "Recursive attribution (CAGE)"]
922
+ ifr_meta["indices_to_explain"] = indices_to_explain
923
+ method_meta = {"ifr": analysis.tensor_to_list(ifr_meta)}
924
+
925
+ elif mode == "ifr_all_positions":
926
+ # IFR all-positions (full generation) + token-level CAGE (row/recursive) derived from the matrix.
927
+ attr = llm_attr.LLMIFRAttribution(
928
+ model,
929
+ tokenizer,
930
+ chunk_tokens=args.chunk_tokens,
931
+ sink_chunk_tokens=args.sink_chunk_tokens,
932
+ )
933
+ sink_span = tuple(args.sink_span) if args.sink_span is not None else tuple(example.sink_span) if example.sink_span else None
934
+ thinking_span = tuple(args.thinking_span) if args.thinking_span is not None else tuple(example.thinking_span) if example.thinking_span else sink_span
935
+
936
+ if sink_span is None:
937
+ raise ValueError(
938
+ "sink_span is required for ifr_all_positions mode (use dataset sink_span or pass --sink_span)."
939
+ )
940
+
941
+ attr_result = attr.calculate_ifr_for_all_positions(
942
+ example.prompt,
943
+ target=example.target,
944
+ )
945
+
946
+ indices_to_explain = list(sink_span)
947
+ _, row_attr, rec_attr = attr_result.get_all_token_attrs(indices_to_explain)
948
+ row_vec = row_attr.squeeze(0).detach().cpu()
949
+ rec_vec = rec_attr.squeeze(0).detach().cpu()
950
+
951
+ hop_vectors_trimmed = [row_vec, rec_vec]
952
+
953
+ prompt_tokens_trimmed = list(attr.user_prompt_tokens)
954
+ generation_tokens_trimmed = list(attr.generation_tokens)
955
+
956
+ raw_prompt_ids = attr.prompt_ids.detach().cpu().tolist()[0]
957
+ raw_generation_ids = attr.generation_ids.detach().cpu().tolist()[0]
958
+ user_prompt_indices = list(getattr(attr, "user_prompt_indices", []) or [])
959
+ chat_prompt_indices = list(getattr(attr, "chat_prompt_indices", []) or [])
960
+ prompt_len_full = len(raw_prompt_ids)
961
+
962
+ gen_len = len(raw_generation_ids or [])
963
+ hop_vectors_raw = [
964
+ _lift_trimmed_to_full(
965
+ v,
966
+ prompt_len_full=int(prompt_len_full or 0),
967
+ gen_len=gen_len,
968
+ user_prompt_indices=user_prompt_indices,
969
+ )
970
+ for v in hop_vectors_trimmed
971
+ ]
972
+
973
+ ifr_meta = dict((attr_result.metadata or {}).get("ifr") or {})
974
+ ifr_meta["ifr_view"] = "all_positions (row+rec)"
975
+ ifr_meta["panel_titles"] = ["Row attribution", "Recursive attribution (CAGE)"]
976
+ ifr_meta["indices_to_explain"] = indices_to_explain
977
+ method_meta = {"ifr": analysis.tensor_to_list(ifr_meta)}
978
+
979
+ elif mode in ("attnlrp", "ft_attnlrp"):
980
+ # Reuse the shared LLMLRPAttribution implementations (root-level).
981
+ attributor = llm_attr.LLMLRPAttribution(model, tokenizer)
982
+
983
+ sink_span = tuple(args.sink_span) if args.sink_span is not None else tuple(example.sink_span) if example.sink_span else None
984
+ thinking_span = (
985
+ tuple(args.thinking_span)
986
+ if args.thinking_span is not None
987
+ else tuple(example.thinking_span) if example.thinking_span else sink_span
988
+ )
989
+
990
+ if mode == "attnlrp":
991
+ # Case-study AttnLRP: reuse FT-AttnLRP logic but take hop0 (the first span-aggregate)
992
+ # for a full, signed attribution vector (no observation masking).
993
+ attr_result = attributor.calculate_attnlrp_ft_hop0(
994
+ example.prompt,
995
+ target=example.target,
996
+ sink_span=sink_span,
997
+ thinking_span=thinking_span,
998
+ neg_handling=args.attnlrp_neg_handling,
999
+ norm_mode=args.attnlrp_norm_mode,
1000
+ )
1001
+ meta = attr_result.metadata or {}
1002
+ multi_hop = meta.get("multi_hop_result")
1003
+ raw_attributions = getattr(multi_hop, "raw_attributions", None) or []
1004
+ attnlrp_raw_attributions = list(raw_attributions)
1005
+ base_attr = raw_attributions[0] if raw_attributions else None
1006
+ if base_attr is None or not hasattr(base_attr, "token_importance_total"):
1007
+ raise RuntimeError("AttnLRP hop0 missing from multi-hop result.")
1008
+
1009
+ hop0_vec = torch.as_tensor(getattr(base_attr, "token_importance_total"), dtype=torch.float32).detach().cpu()
1010
+ if hop0_vec.numel() <= 0:
1011
+ raise RuntimeError("Empty generation for AttnLRP case study.")
1012
+
1013
+ # Use the actual sink span applied by hop0 (defaults to full generation when unset).
1014
+ sink_span = tuple(getattr(base_attr, "sink_range"))
1015
+ if thinking_span is None:
1016
+ thinking_span = sink_span
1017
+
1018
+ hop_vectors_trimmed = [hop0_vec]
1019
+ thinking_ratios = list(getattr(multi_hop, "thinking_ratios", []) or [])
1020
+
1021
+ method_meta = {
1022
+ "attnlrp": {
1023
+ "type": "calculate_attnlrp_multi_hop(n_hops=0) hop0 raw_attributions[0]",
1024
+ "sink_span_generation": sink_span,
1025
+ "thinking_span_generation": thinking_span,
1026
+ "thinking_ratios": thinking_ratios,
1027
+ "neg_handling": args.attnlrp_neg_handling,
1028
+ "norm_mode": args.attnlrp_norm_mode,
1029
+ "ratio_enabled": args.attnlrp_norm_mode == "norm",
1030
+ }
1031
+ }
1032
+ else:
1033
+ # exp2 ft_attnlrp: multi-hop aggregated AttnLRP (metadata contains per-hop vectors).
1034
+ attr_result = attributor.calculate_attnlrp_aggregated_multi_hop(
1035
+ example.prompt,
1036
+ target=example.target,
1037
+ sink_span=sink_span,
1038
+ thinking_span=thinking_span,
1039
+ n_hops=int(args.n_hops),
1040
+ neg_handling=args.attnlrp_neg_handling,
1041
+ norm_mode=args.attnlrp_norm_mode,
1042
+ )
1043
+ meta = attr_result.metadata or {}
1044
+ multi_hop = meta.get("multi_hop_result")
1045
+ if multi_hop is None:
1046
+ raise RuntimeError("FT-AttnLRP case study missing metadata.multi_hop_result.")
1047
+
1048
+ raw_attributions = getattr(multi_hop, "raw_attributions", None) or []
1049
+ attnlrp_raw_attributions = list(raw_attributions)
1050
+ hop_vectors_trimmed = [
1051
+ torch.as_tensor(getattr(hop, "token_importance_total"), dtype=torch.float32).detach().cpu()
1052
+ for hop in raw_attributions
1053
+ ]
1054
+ thinking_ratios = list(getattr(multi_hop, "thinking_ratios", []) or [])
1055
+
1056
+ method_meta = {
1057
+ "attnlrp": {
1058
+ "type": "calculate_attnlrp_aggregated_multi_hop (exp2 ft_attnlrp)",
1059
+ "n_hops": int(args.n_hops),
1060
+ "sink_span_generation": sink_span,
1061
+ "thinking_span_generation": thinking_span,
1062
+ "thinking_ratios": thinking_ratios,
1063
+ "neg_handling": args.attnlrp_neg_handling,
1064
+ "norm_mode": args.attnlrp_norm_mode,
1065
+ "ratio_enabled": args.attnlrp_norm_mode == "norm",
1066
+ }
1067
+ }
1068
+
1069
+ prompt_tokens_trimmed = list(attributor.user_prompt_tokens)
1070
+ generation_tokens_trimmed = list(attributor.generation_tokens)
1071
+
1072
+ raw_prompt_ids = attributor.prompt_ids.detach().cpu().tolist()[0]
1073
+ raw_generation_ids = attributor.generation_ids.detach().cpu().tolist()[0]
1074
+ user_prompt_indices = list(getattr(attributor, "user_prompt_indices", []) or [])
1075
+ chat_prompt_indices = list(getattr(attributor, "chat_prompt_indices", []) or [])
1076
+ prompt_len_full = len(raw_prompt_ids)
1077
+
1078
+ else:
1079
+ raise ValueError(f"Unsupported mode={mode}")
1080
+
1081
+ if not hop_vectors_trimmed:
1082
+ raise RuntimeError("No hop vectors to visualize.")
1083
+
1084
+ raw_tokens = build_raw_tokens_from_ids(tokenizer, raw_prompt_ids, raw_generation_ids)
1085
+
1086
+ sink_span_abs = None
1087
+ thinking_span_abs = None
1088
+ if prompt_len_full is not None and sink_span is not None:
1089
+ sink_span_abs = (prompt_len_full + sink_span[0], prompt_len_full + sink_span[1])
1090
+ if prompt_len_full is not None and thinking_span is not None:
1091
+ thinking_span_abs = (prompt_len_full + thinking_span[0], prompt_len_full + thinking_span[1])
1092
+ prompt_len_full_safe = int(prompt_len_full or 0)
1093
+ roles_raw = build_raw_roles(
1094
+ raw_tokens,
1095
+ prompt_len_full_safe,
1096
+ user_prompt_indices,
1097
+ chat_prompt_indices,
1098
+ thinking_span_abs,
1099
+ sink_span_abs,
1100
+ )
1101
+
1102
+ prompt_tokens_only = list(prompt_tokens_trimmed)
1103
+ prompt_only_vectors = extract_prompt_only_vectors(hop_vectors_trimmed, len(prompt_tokens_only))
1104
+
1105
+ # Ensure every method has a pre-trim full vector per panel.
1106
+ if not hop_vectors_raw:
1107
+ if mode in ("attnlrp", "ft_attnlrp") and attnlrp_raw_attributions is not None:
1108
+ gen_len = len(raw_generation_ids or [])
1109
+ expected = int((prompt_len_full_safe + gen_len) if prompt_len_full is not None else 0)
1110
+ full_vectors: List[torch.Tensor] = []
1111
+ for hop in attnlrp_raw_attributions:
1112
+ meta = getattr(hop, "metadata", None) or {}
1113
+ raw_full = meta.get("token_importance_total_with_chat_template")
1114
+ if raw_full is None:
1115
+ full_vectors = []
1116
+ break
1117
+ v = _postprocess_attnlrp_full_vector(
1118
+ torch.as_tensor(raw_full, dtype=torch.float32),
1119
+ prompt_len_full=prompt_len_full_safe,
1120
+ gen_len=gen_len,
1121
+ user_prompt_indices=user_prompt_indices,
1122
+ neg_handling=args.attnlrp_neg_handling,
1123
+ norm_mode=args.attnlrp_norm_mode,
1124
+ )
1125
+ if expected and int(v.numel()) != expected:
1126
+ raise RuntimeError(
1127
+ "AttnLRP full-vector length mismatch for pre-trim view: "
1128
+ f"got {int(v.numel())}, expected {expected}."
1129
+ )
1130
+ full_vectors.append(v)
1131
+ hop_vectors_raw = full_vectors
1132
+
1133
+ if not hop_vectors_raw and prompt_len_full is not None:
1134
+ # Fallback: lift trimmed vectors back to full token space with zeros for template tokens.
1135
+ gen_len = len(raw_generation_ids or [])
1136
+ hop_vectors_raw = [
1137
+ _lift_trimmed_to_full(
1138
+ v,
1139
+ prompt_len_full=prompt_len_full_safe,
1140
+ gen_len=gen_len,
1141
+ user_prompt_indices=user_prompt_indices,
1142
+ )
1143
+ for v in hop_vectors_trimmed
1144
+ ]
1145
+
1146
+ if not hop_vectors_raw:
1147
+ raise RuntimeError("Missing pre-trim vectors; cannot render required full-sequence heatmap.")
1148
+
1149
+ # Lightweight debug stats to catch silent all-zero / NaN cases.
1150
+ hop_stats_raw = [analysis.vector_stats(torch.nan_to_num(v.detach().cpu(), nan=0.0)) for v in hop_vectors_raw]
1151
+ hop_stats_prompt = [analysis.vector_stats(torch.nan_to_num(v.detach().cpu(), nan=0.0)) for v in prompt_only_vectors]
1152
+ for i in range(max(len(hop_stats_raw), len(hop_stats_prompt))):
1153
+ raw_abs = hop_stats_raw[i]["abs_max"] if i < len(hop_stats_raw) else None
1154
+ prompt_abs = hop_stats_prompt[i]["abs_max"] if i < len(hop_stats_prompt) else None
1155
+ print(f"[stats] panel {i}: raw_abs_max={raw_abs} prompt_abs_max={prompt_abs}")
1156
+
1157
+ hop_token_raw = analysis.package_token_hops(hop_vectors_raw)
1158
+ hop_token_prompt = analysis.package_token_hops(prompt_only_vectors)
1159
+
1160
+ case_meta: Dict[str, Any] = {
1161
+ "dataset": ds_name,
1162
+ "index": args.index,
1163
+ "sink_span": sink_span,
1164
+ "thinking_span": thinking_span,
1165
+ "n_hops": args.n_hops,
1166
+ "thinking_ratios": thinking_ratios,
1167
+ "mode": mode,
1168
+ "ifr_view": method_meta.get("ifr", {}).get("ifr_view") if isinstance(method_meta.get("ifr"), dict) else None,
1169
+ "panel_titles": method_meta.get("ifr", {}).get("panel_titles") if isinstance(method_meta.get("ifr"), dict) else None,
1170
+ "attnlrp_neg_handling": args.attnlrp_neg_handling if mode in ("attnlrp", "ft_attnlrp") else None,
1171
+ "attnlrp_norm_mode": args.attnlrp_norm_mode if mode in ("attnlrp", "ft_attnlrp") else None,
1172
+ "attnlrp_ratio_enabled": (args.attnlrp_norm_mode == "norm") if mode in ("attnlrp", "ft_attnlrp") else None,
1173
+ "vector_stats_raw": hop_stats_raw,
1174
+ "vector_stats_prompt": hop_stats_prompt,
1175
+ }
1176
+
1177
+ generation_text = "".join(generation_tokens_trimmed) if generation_tokens_trimmed else ""
1178
+ prompt_text = example.prompt
1179
+ record = {
1180
+ "meta": case_meta,
1181
+ "prompt": prompt_text,
1182
+ "target": example.target,
1183
+ "generation": generation_text,
1184
+ "full_all_tokens": raw_tokens,
1185
+ "raw_token_roles": roles_raw,
1186
+ "prompt_tokens": prompt_tokens_only,
1187
+ "prompt_token_roles": ["user" for _ in range(len(prompt_tokens_only))],
1188
+ "token_hops_raw": hop_token_raw,
1189
+ "token_hops_prompt": hop_token_prompt,
1190
+ "ifr_meta": method_meta.get("ifr"),
1191
+ "attnlrp_meta": method_meta.get("attnlrp"),
1192
+ }
1193
+
1194
+ out_dir = Path(args.output_dir)
1195
+ out_dir.mkdir(parents=True, exist_ok=True)
1196
+ stem = make_output_stem(ds_name, args.index, mode)
1197
+ json_path = out_dir / f"{stem}.json"
1198
+ html_path = out_dir / f"{stem}.html"
1199
+
1200
+ with json_path.open("w", encoding="utf-8") as f:
1201
+ json.dump(record, f, ensure_ascii=False, indent=2)
1202
+
1203
+ html = viz.render_case_html(
1204
+ case_meta,
1205
+ token_view_raw={
1206
+ "label": "Pre-trim token-level heatmap (full sequence with chat template)",
1207
+ "tokens": raw_tokens,
1208
+ "roles": roles_raw,
1209
+ "hops": hop_token_raw,
1210
+ },
1211
+ token_view_prompt={
1212
+ "label": "Prompt-only token-level heatmap (user prompt only)",
1213
+ "tokens": prompt_tokens_only,
1214
+ "roles": ["user" for _ in range(len(prompt_tokens_only))],
1215
+ "hops": hop_token_prompt,
1216
+ },
1217
+ )
1218
+ html_path.write_text(html, encoding="utf-8")
1219
+
1220
+ print(f"[done] wrote {json_path}")
1221
+ print(f"[done] wrote {html_path}")
1222
+
1223
+
1224
+ if __name__ == "__main__":
1225
+ main()
exp/case_study/run_mas_case.py ADDED
@@ -0,0 +1,805 @@
1
+ #!/usr/bin/env python3
2
+ """MAS case study: visualize token-perturbation faithfulness for attribution methods.
3
+
4
+ This script matches the faithfulness evaluation logic implemented in:
5
+ - evaluations/faithfulness.py
6
+ - llm_attr_eval.LLMAttributionEvaluator.faithfulness_test()
7
+
8
+ For a single example and a selected attribution method, we:
9
+ 1) Compute token-level attributions (Seq / Row / Recursive) over prompt tokens.
10
+ 2) Rank prompt tokens by attribution mass.
11
+ 3) Iteratively perturb the prompt by replacing one token at a time with PAD tokens.
12
+ 4) Score the model as sum log p(generation + EOS | prompt) under the chat template.
13
+ 5) Compute RISE / MAS / RISE+AP (AUCs) and visualize the perturbation impact as token heatmaps.
14
+
15
+ Outputs JSON + HTML to exp/case_study/out/.
16
+ """
17
+
18
+ from __future__ import annotations
19
+
20
+ import argparse
21
+ import json
22
+ import os
23
+ import sys
24
+ import types
25
+ from importlib.machinery import ModuleSpec
26
+ from pathlib import Path
27
+ from typing import Any, Dict, List, Optional, Sequence, Tuple
28
+
29
+ import numpy as np
30
+
31
+
32
+ def _early_set_cuda_visible_devices() -> None:
33
+ """Set CUDA_VISIBLE_DEVICES before importing torch/transformers.
34
+
35
+ Note: CUDA device indices are re-mapped inside the process after applying the mask.
36
+ """
37
+
38
+ parser = argparse.ArgumentParser(add_help=False)
39
+ parser.add_argument("--cuda", type=str, default=None)
40
+ args, _ = parser.parse_known_args(sys.argv[1:])
41
+ cuda = args.cuda.strip() if isinstance(args.cuda, str) else ""
42
+ if cuda and "," in cuda:
43
+ os.environ["CUDA_VISIBLE_DEVICES"] = cuda
44
+
45
+
46
+ if __name__ == "__main__":
47
+ _early_set_cuda_visible_devices()
48
+
49
+ import torch
50
+
51
+ REPO_ROOT = Path(__file__).resolve().parents[2]
52
+ if str(REPO_ROOT) not in sys.path:
53
+ sys.path.insert(0, str(REPO_ROOT))
54
+
55
+ # Avoid optional vision deps when importing transformers.
56
+ os.environ.setdefault("TRANSFORMERS_NO_TORCHVISION", "1")
57
+ os.environ.setdefault("DISABLE_TRANSFORMERS_IMAGE_TRANSFORMS", "1")
58
+
59
+
60
+ def _stub_torchvision() -> None:
61
+ """Provide minimal torchvision stubs so transformers imports succeed without torchvision."""
62
+
63
+ if "torchvision" in sys.modules:
64
+ return
65
+
66
+ def _mk(name: str) -> types.ModuleType:
67
+ mod = types.ModuleType(name)
68
+ mod.__spec__ = ModuleSpec(name, loader=None)
69
+ return mod
70
+
71
+ tv = _mk("torchvision")
72
+ tv.__dict__["__path__"] = []
73
+ submods = ["transforms", "_meta_registrations", "datasets", "io", "models", "ops", "utils"]
74
+ for name in submods:
75
+ mod = _mk(f"torchvision.{name}")
76
+ sys.modules[f"torchvision.{name}"] = mod
77
+ setattr(tv, name, mod)
78
+
79
+ class _InterpolationMode:
80
+ NEAREST = 0
81
+ NEAREST_EXACT = 0
82
+ BILINEAR = 1
83
+ BICUBIC = 2
84
+ LANCZOS = 3
85
+ BOX = 4
86
+ HAMMING = 5
87
+
88
+ sys.modules["torchvision.transforms"].InterpolationMode = _InterpolationMode
89
+ sys.modules["torchvision.transforms"].__all__ = ["InterpolationMode"]
90
+
91
+ ops_mod = sys.modules.get("torchvision.ops") or _mk("torchvision.ops")
92
+ sys.modules["torchvision.ops"] = ops_mod
93
+ setattr(tv, "ops", ops_mod)
94
+ misc_mod = _mk("torchvision.ops.misc")
95
+ sys.modules["torchvision.ops.misc"] = misc_mod
96
+ setattr(ops_mod, "misc", misc_mod)
97
+
98
+ class _FrozenBatchNorm2d:
99
+ def __init__(self, *args, **kwargs):
100
+ pass
101
+
102
+ misc_mod.FrozenBatchNorm2d = _FrozenBatchNorm2d
103
+ sys.modules["torchvision"] = tv
104
+
105
+
106
+ def _stub_timm() -> None:
107
+ """Provide minimal timm stubs to avoid optional vision deps."""
108
+
109
+ if "timm" in sys.modules:
110
+ return
111
+
112
+ def _mk(name: str) -> types.ModuleType:
113
+ mod = types.ModuleType(name)
114
+ mod.__spec__ = ModuleSpec(name, loader=None)
115
+ return mod
116
+
117
+ timm = _mk("timm")
118
+ timm.__dict__["__path__"] = []
119
+ sys.modules["timm"] = timm
120
+
121
+ data_mod = _mk("timm.data")
122
+ sys.modules["timm.data"] = data_mod
123
+ timm.data = data_mod
124
+
125
+ class _ImageNetInfo:
126
+ pass
127
+
128
+ def _infer_imagenet_subset(*args, **kwargs):
129
+ return None
130
+
131
+ data_mod.ImageNetInfo = _ImageNetInfo
132
+ data_mod.infer_imagenet_subset = _infer_imagenet_subset
133
+
134
+ layers_mod = _mk("timm.layers")
135
+ sys.modules["timm.layers"] = layers_mod
136
+ timm.layers = layers_mod
137
+
138
+ create_norm_mod = _mk("timm.layers.create_norm")
139
+ sys.modules["timm.layers.create_norm"] = create_norm_mod
140
+ layers_mod.create_norm = create_norm_mod
141
+
142
+ def _get_norm_layer(*args, **kwargs):
143
+ return None
144
+
145
+ create_norm_mod.get_norm_layer = _get_norm_layer
146
+
147
+ classifier_mod = _mk("timm.layers.classifier")
148
+ sys.modules["timm.layers.classifier"] = classifier_mod
149
+ layers_mod.classifier = classifier_mod
150
+
151
+
152
+ def _stub_gemma3n() -> None:
153
+ """Stub Gemma3n config module if transformers tries to import it."""
154
+
155
+ if "transformers.models.gemma3n.configuration_gemma3n" in sys.modules:
156
+ return
157
+
158
+ gemma_pkg = types.ModuleType("transformers.models.gemma3n")
159
+ gemma_pkg.__spec__ = ModuleSpec("transformers.models.gemma3n", loader=None, is_package=True)
160
+ sys.modules["transformers.models.gemma3n"] = gemma_pkg
161
+
162
+ gemma_conf = types.ModuleType("transformers.models.gemma3n.configuration_gemma3n")
163
+ gemma_conf.__spec__ = ModuleSpec("transformers.models.gemma3n.configuration_gemma3n", loader=None)
164
+
165
+ class Gemma3nConfig:
166
+ def __init__(self, *args, **kwargs):
167
+ self.model_type = "gemma3n"
168
+
169
+ class Gemma3nTextConfig(Gemma3nConfig):
170
+ pass
171
+
172
+ gemma_conf.Gemma3nConfig = Gemma3nConfig
173
+ gemma_conf.Gemma3nTextConfig = Gemma3nTextConfig
174
+ gemma_conf.__all__ = ["Gemma3nConfig", "Gemma3nTextConfig"]
175
+ sys.modules["transformers.models.gemma3n.configuration_gemma3n"] = gemma_conf
176
+ setattr(gemma_pkg, "configuration_gemma3n", gemma_conf)
177
+
178
+
179
+ _stub_torchvision()
180
+ _stub_timm()
181
+ _stub_gemma3n()
182
+
183
+ import transformers # noqa: E402
184
+
185
+ # Provide light stubs if Longformer classes are unavailable; we don't use them here.
186
+ if not hasattr(transformers, "LongformerTokenizer"):
187
+ class _DummyLongformerTokenizer:
188
+ def __init__(self, *args, **kwargs):
189
+ raise ImportError("LongformerTokenizer stubbed; install full transformers if needed.")
190
+ transformers.LongformerTokenizer = _DummyLongformerTokenizer
191
+ if not hasattr(transformers, "LongformerForMaskedLM"):
192
+ class _DummyLongformerForMaskedLM:
193
+ def __init__(self, *args, **kwargs):
194
+ raise ImportError("LongformerForMaskedLM stubbed; install full transformers if needed.")
195
+ transformers.LongformerForMaskedLM = _DummyLongformerForMaskedLM
196
+
197
+ from exp.case_study import viz # noqa: E402
198
+ from exp.exp2 import dataset_utils as ds_utils # noqa: E402
199
+ from shared_utils import DEFAULT_PROMPT_TEMPLATE # noqa: E402
200
+
201
+ import llm_attr # noqa: E402
202
+ from evaluations.attribution_recovery import load_model # noqa: E402
203
+
204
+
205
+ def resolve_device(cuda: Optional[str], cuda_num: int) -> str:
206
+ if cuda and isinstance(cuda, str) and "," in cuda:
207
+ os.environ["CUDA_VISIBLE_DEVICES"] = cuda
208
+ return "auto"
209
+ if cuda and isinstance(cuda, str) and cuda.strip():
210
+ try:
211
+ idx = int(cuda)
212
+ except Exception:
213
+ idx = 0
214
+ return f"cuda:{idx}" if torch.cuda.is_available() else "cpu"
215
+ return f"cuda:{cuda_num}" if torch.cuda.is_available() else "cpu"
216
+
217
+
218
+ def load_example(dataset: str, index: int, data_root: Path) -> Tuple[ds_utils.CachedExample, str]:
219
+ ds_path = Path(dataset)
220
+ if ds_path.exists():
221
+ examples = ds_utils.read_cached_jsonl(ds_path)
222
+ dataset_name = ds_path.name
223
+ else:
224
+ loader = ds_utils.DatasetLoader(data_root=data_root)
225
+ examples = loader.load(dataset)
226
+ dataset_name = dataset
227
+
228
+ if not examples:
229
+ raise ValueError(f"No examples found for dataset={dataset}")
230
+
231
+ if index < 0:
232
+ index = len(examples) + index
233
+ if not (0 <= index < len(examples)):
234
+ raise IndexError(f"index {index} out of range for dataset with {len(examples)} examples")
235
+
236
+ return examples[index], dataset_name
237
+
238
+
239
+ def make_output_stem(dataset_name: str, index: int, method: str) -> str:
240
+ safe_name = dataset_name.replace("/", "_").replace(" ", "_")
241
+ return f"mas_case_{method}_{safe_name}_idx{index}"
242
+
243
+
244
+ def format_prompt(tokenizer: Any, prompt: str) -> str:
245
+ modified_prompt = DEFAULT_PROMPT_TEMPLATE.format(context=prompt, query="")
246
+ formatted_prompt = [{"role": "user", "content": modified_prompt}]
247
+ return tokenizer.apply_chat_template(
248
+ formatted_prompt,
249
+ tokenize=False,
250
+ add_generation_prompt=True,
251
+ enable_thinking=False,
252
+ )
253
+
254
+
255
+ @torch.inference_mode()
256
+ def compute_logprob_response_given_prompt(model: Any, prompt_ids: torch.Tensor, response_ids: torch.Tensor) -> torch.Tensor:
257
+ """Compute log-probabilities of response_ids given prompt_ids.
258
+
259
+ Shapes:
260
+ prompt_ids: [B, N]
261
+ response_ids: [B, M]
262
+ returns: [B, M]
263
+ """
264
+ input_ids = torch.cat([prompt_ids, response_ids], dim=1)
265
+ attention_mask = torch.ones_like(input_ids)
266
+ logits = model(input_ids=input_ids, attention_mask=attention_mask).logits # [B, N+M, V]
267
+ log_probs = torch.nn.functional.log_softmax(logits, dim=-1)
268
+
269
+ response_start = int(prompt_ids.shape[1])
270
+ logits_for_response = log_probs[:, response_start - 1 : -1, :]  # [B, M, V]; log-probs at position i predict token i+1, hence the -1 shift
271
+ gathered = logits_for_response.gather(2, response_ids.unsqueeze(-1))
272
+ return gathered.squeeze(-1)
273
+
274
+
275
+ @torch.inference_mode()
276
+ def score_prompt_ids_with_generation(model: Any, *, prompt_ids: torch.Tensor, generation_ids: torch.Tensor) -> float:
277
+ return float(compute_logprob_response_given_prompt(model, prompt_ids, generation_ids).sum().detach().cpu().item())
278
+
279
+
280
+ @torch.inference_mode()
281
+ def _ensure_pad_token_id(tokenizer: Any) -> int:
282
+ if tokenizer.pad_token_id is None:
283
+ if tokenizer.eos_token_id is None:
284
+ raise RuntimeError("tokenizer has neither pad_token_id nor eos_token_id; cannot define baseline token.")
285
+ tokenizer.pad_token = tokenizer.eos_token
286
+ return int(tokenizer.pad_token_id)
287
+
288
+
289
+ def _find_subsequence_start(haystack: torch.Tensor, needle: torch.Tensor) -> Optional[int]:
290
+ if haystack.ndim != 1 or needle.ndim != 1:
291
+ raise ValueError("Expected 1D tensors for subsequence matching.")
292
+ if needle.numel() == 0:
293
+ return 0
294
+ hay_len = int(haystack.numel())
295
+ needle_len = int(needle.numel())
296
+ if needle_len > hay_len:
297
+ return None
298
+ for i in range(hay_len - needle_len + 1):
299
+ if torch.equal(haystack[i : i + needle_len], needle):
300
+ return i
301
+ return None
302
+
303
+
304
+ def decode_text_into_tokens(tokenizer: Any, text: str) -> List[str]:
305
+ encoding = tokenizer(text, return_offsets_mapping=True, add_special_tokens=False)
306
+ offsets = list(encoding["offset_mapping"])
307
+ tokens: List[str] = []
308
+ for start, end in offsets:
309
+ tokens.append(text[start:end])
310
+ return tokens
311
+
312
+
313
+ def auc(arr: np.ndarray) -> float:
314
+ return float((arr.sum() - arr[0] / 2 - arr[-1] / 2) / (arr.shape[0] - 1))
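# A worked sketch of auc() above: it is the trapezoid rule with unit x-spacing,
# rescaled so the x-axis spans [0, 1] (values here are made up for illustration):
#   curve = np.array([1.0, 0.8, 0.5, 0.0])
#   # trapezoid area = (1.0 + 0.8)/2 + (0.8 + 0.5)/2 + (0.5 + 0.0)/2 = 1.8
#   # auc(curve)     = (2.3 - 1.0/2 - 0.0/2) / 3 = 0.6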
315
+
316
+
317
+ def mas_trace(
318
+ model: Any,
319
+ tokenizer: Any,
320
+ *,
321
+ attribution: torch.Tensor,
322
+ prompt: str,
323
+ generation: str,
324
+ user_prompt_indices: Optional[Sequence[int]] = None,
325
+ keep_prompt_token_indices: Optional[Sequence[int]] = None,
326
+ k: int = 20,
327
+ ) -> Dict[str, Any]:
328
+ """Return a token-level faithfulness trace (RISE/MAS/RISE+AP) plus per-token deltas."""
329
+
330
+ pad_token_id = _ensure_pad_token_id(tokenizer)
331
+
332
+ user_prompt = " " + prompt
333
+ formatted = format_prompt(tokenizer, user_prompt)
334
+ formatted_ids = tokenizer(formatted, return_tensors="pt", add_special_tokens=False).input_ids
335
+ user_ids = tokenizer(user_prompt, return_tensors="pt", add_special_tokens=False).input_ids
336
+
337
+ prompt_ids = formatted_ids.to(model.device)
338
+ prompt_ids_perturbed = prompt_ids.clone()
339
+ gen_ids = tokenizer(
340
+ generation + (tokenizer.eos_token or ""),
341
+ return_tensors="pt",
342
+ add_special_tokens=False,
343
+ ).input_ids.to(model.device)
344
+
345
+ attr_cpu = attribution.detach().cpu()
346
+ w = attr_cpu.sum(0)
347
+ P = int(w.numel())
348
+
349
+ if keep_prompt_token_indices is None:
350
+ keep = list(range(P))
351
+ else:
352
+ keep = []
353
+ seen: set[int] = set()
354
+ for raw in keep_prompt_token_indices:
355
+ try:
356
+ idx = int(raw)
357
+ except Exception:
358
+ continue
359
+ if 0 <= idx < P and idx not in seen:
360
+ keep.append(idx)
361
+ seen.add(idx)
362
+ keep.sort()
363
+
364
+ K = len(keep)
365
+ if K:
366
+ w_keep = w.index_select(0, torch.as_tensor(keep, dtype=torch.long))
367
+ sorted_local = torch.argsort(w_keep, descending=True)
368
+ sorted_attr_indices = torch.as_tensor([keep[int(i.item())] for i in sorted_local], dtype=torch.long)
369
+ attr_sum = float(w_keep.sum().item())
370
+ else:
371
+ sorted_attr_indices = torch.zeros((0,), dtype=torch.long)
372
+ attr_sum = 0.0
373
+
374
+ if int(user_ids.shape[1]) != P:
375
+ raise ValueError(
376
+ "Prompt-side attribution length does not match tokenized user prompt length: "
377
+ f"attr P={P}, user_prompt P={int(user_ids.shape[1])}."
378
+ )
379
+
380
+ prompt_positions: List[int]
381
+ if user_prompt_indices is not None:
382
+ prompt_positions = [int(x) for x in user_prompt_indices]
383
+ if len(prompt_positions) != P:
384
+ raise ValueError(
385
+ "user_prompt_indices length does not match prompt-side attribution length: "
386
+ f"indices P={len(prompt_positions)}, attr P={P}."
387
+ )
388
+ if P and max(prompt_positions) >= int(prompt_ids_perturbed.shape[1]):
389
+ raise ValueError("user_prompt_indices contains an out-of-bounds index for formatted prompt ids.")
390
+ else:
391
+ user_start = _find_subsequence_start(formatted_ids[0], user_ids[0])
392
+ if user_start is None:
393
+ raise RuntimeError("Failed to locate user prompt token span inside formatted chat prompt.")
394
+ prompt_positions = [int(user_start) + j for j in range(P)]
395
+
396
+ if K > 0:
397
+ steps = int(k) if k is not None else 0
398
+ if steps <= 0:
399
+ steps = 1
400
+ steps = min(steps, K)
401
+ else:
402
+ steps = 0
403
+
404
+ scores = np.zeros(steps + 1, dtype=np.float64)
405
+ density = np.zeros(steps + 1, dtype=np.float64)
406
+
407
+ scores[0] = score_prompt_ids_with_generation(model, prompt_ids=prompt_ids_perturbed, generation_ids=gen_ids)
408
+ density[0] = 1.0
409
+
410
+ if K == 0:
411
+ return {
412
+ "num_tokens": P,
413
+ "sorted_attr_indices": [],
414
+ "scores_raw": scores.tolist(),
415
+ "density": density.tolist(),
416
+ "normalized_model_response": scores.tolist(),
417
+ "alignment_penalty": np.zeros_like(scores).tolist(),
418
+ "corrected_scores": scores.tolist(),
419
+ "token_deltas_raw": np.zeros(P, dtype=np.float64).tolist(),
420
+ "attr_weights": np.zeros(P, dtype=np.float64).tolist(),
421
+ "metrics": {"RISE": 0.0, "MAS": 0.0, "RISE+AP": 0.0},
422
+ }
423
+
424
+ if attr_sum <= 0:
425
+ density = np.linspace(1.0, 0.0, steps + 1)
426
+
427
+ per_token_delta = np.zeros(P, dtype=np.float64)
428
+
429
+ base = K // steps
430
+ remainder = K % steps
431
+ start = 0
432
+ for step in range(steps):
433
+ size = base + (1 if step < remainder else 0)
434
+ group = sorted_attr_indices[start : start + size]
435
+ start += size
436
+
437
+ for idx_t in group:
438
+ idx = int(idx_t.item())
439
+ abs_pos = int(prompt_positions[idx])
440
+ prompt_ids_perturbed[0, abs_pos] = pad_token_id
441
+
442
+ scores[step + 1] = score_prompt_ids_with_generation(model, prompt_ids=prompt_ids_perturbed, generation_ids=gen_ids)
443
+ if attr_sum > 0:
444
+ dec = float(w.index_select(0, group).sum().item()) / attr_sum
445
+ density[step + 1] = density[step] - dec
446
+
447
+ delta = scores[step] - scores[step + 1]
448
+ for idx_t in group:
449
+ idx = int(idx_t.item())
450
+ per_token_delta[idx] = delta
451
+
452
+ min_normalized_pred = 1.0
453
+ normalized_model_response = scores.copy()
454
+ for i in range(len(scores)):
455
+ normalized_pred = (normalized_model_response[i] - scores[-1]) / (abs(scores[0] - scores[-1]))
456
+ normalized_pred = np.clip(normalized_pred, 0.0, 1.0)
457
+ min_normalized_pred = min(min_normalized_pred, float(normalized_pred))
458
+ normalized_model_response[i] = min_normalized_pred
459
+
460
+ alignment_penalty = np.abs(normalized_model_response - density)
461
+ corrected_scores = normalized_model_response + alignment_penalty
462
+ corrected_scores = corrected_scores.clip(0, 1)
463
+ corrected_scores = (corrected_scores - np.min(corrected_scores)) / (np.max(corrected_scores) - np.min(corrected_scores))
464
+ if np.isnan(corrected_scores).any():
465
+ corrected_scores = np.linspace(1, 0, len(scores))
466
+
467
+ rise = auc(normalized_model_response)
468
+ mas = auc(corrected_scores)
469
+ rise_ap = auc(normalized_model_response + alignment_penalty)
470
+
471
+ if attr_sum > 0:
472
+ attr_weights = np.zeros(P, dtype=np.float64)
473
+ for idx in keep:
474
+ attr_weights[idx] = float(w[idx].item()) / (attr_sum + 1e-12)
475
+ else:
476
+ attr_weights = np.zeros(P, dtype=np.float64)
477
+
478
+ return {
479
+ "num_tokens": P,
480
+ "sorted_attr_indices": [int(i.item()) for i in sorted_attr_indices],
481
+ "scores_raw": scores.tolist(),
482
+ "density": density.tolist(),
483
+ "normalized_model_response": normalized_model_response.tolist(),
484
+ "alignment_penalty": alignment_penalty.tolist(),
485
+ "corrected_scores": corrected_scores.tolist(),
486
+ "token_deltas_raw": per_token_delta.tolist(),
487
+ "attr_weights": attr_weights.tolist(),
488
+ "metrics": {"RISE": rise, "MAS": mas, "RISE+AP": rise_ap},
489
+ }
490
+
491
+
492
+ def compute_method_attribution(
493
+ method: str,
494
+ example: ds_utils.CachedExample,
495
+ model: Any,
496
+ tokenizer: Any,
497
+ *,
498
+ n_hops: int,
499
+ sink_span: Optional[Tuple[int, int]],
500
+ thinking_span: Optional[Tuple[int, int]],
501
+ chunk_tokens: int,
502
+ sink_chunk_tokens: int,
503
+ attnlrp_neg_handling: str,
504
+ attnlrp_norm_mode: str,
505
+ ) -> Tuple[str, Any, llm_attr.LLMAttributionResult]:
506
+ prompt = example.prompt
507
+ target = example.target
508
+
509
+ if method == "ifr":
510
+ if sink_span is None:
511
+ raise ValueError("IFR requires sink_span (use dataset sink_span or pass --sink_span).")
512
+ attributor = llm_attr.LLMIFRAttribution(model, tokenizer, chunk_tokens=chunk_tokens, sink_chunk_tokens=sink_chunk_tokens)
513
+ result = attributor.calculate_ifr_span(prompt, target=target, span=sink_span)
514
+ return "IFR (ifr_span)", attributor, result
515
+
516
+ if method == "ifr_all_positions_output_only":
517
+ if sink_span is None:
518
+ raise ValueError(
519
+ "ifr_all_positions_output_only requires sink_span (use dataset sink_span or pass --sink_span)."
520
+ )
521
+ attributor = llm_attr.LLMIFRAttribution(model, tokenizer, chunk_tokens=chunk_tokens, sink_chunk_tokens=sink_chunk_tokens)
522
+ result = attributor.calculate_ifr_for_all_positions_output_only(
523
+ prompt,
524
+ target=target,
525
+ sink_span=sink_span,
526
+ )
527
+ return "IFR (ifr_all_positions_output_only)", attributor, result
528
+
529
+ if method in ("ft", "ft_ifr"):
530
+ attributor = llm_attr.LLMIFRAttribution(model, tokenizer, chunk_tokens=chunk_tokens, sink_chunk_tokens=sink_chunk_tokens)
531
+ result = attributor.calculate_ifr_multi_hop(
532
+ prompt,
533
+ target=target,
534
+ sink_span=sink_span,
535
+ thinking_span=thinking_span,
536
+ n_hops=int(n_hops),
537
+ )
538
+ return "FT-IFR (ifr_multi_hop)", attributor, result
539
+
540
+ if method in ("ft_improve", "ft_ifr_improve"):
541
+ import ft_ifr_improve
542
+
543
+ attributor = ft_ifr_improve.LLMIFRAttributionImproved(
544
+ model,
545
+ tokenizer,
546
+ chunk_tokens=chunk_tokens,
547
+ sink_chunk_tokens=sink_chunk_tokens,
548
+ )
549
+ result = attributor.calculate_ifr_multi_hop_stop_words(
550
+ prompt,
551
+ target=target,
552
+ sink_span=sink_span,
553
+ thinking_span=thinking_span,
554
+ n_hops=int(n_hops),
555
+ )
556
+ return "FT-IFR (ifr_multi_hop_stop_words)", attributor, result
557
+
558
+ if method == "ft_split_hop":
559
+ import ft_ifr_improve
560
+
561
+ attributor = ft_ifr_improve.LLMIFRAttributionSplitHop(
562
+ model,
563
+ tokenizer,
564
+ chunk_tokens=chunk_tokens,
565
+ sink_chunk_tokens=sink_chunk_tokens,
566
+ )
567
+ result = attributor.calculate_ifr_multi_hop_split_hop(
568
+ prompt,
569
+ target=target,
570
+ sink_span=sink_span,
571
+ thinking_span=thinking_span,
572
+ n_hops=int(n_hops),
573
+ )
574
+ return "FT-IFR (ifr_multi_hop_split_hop)", attributor, result
575
+
576
+ if method == "attnlrp":
577
+ attributor = llm_attr.LLMLRPAttribution(model, tokenizer)
578
+ result = attributor.calculate_attnlrp_ft_hop0(
579
+ prompt,
580
+ target=target,
581
+ sink_span=sink_span,
582
+ thinking_span=thinking_span,
583
+ neg_handling=attnlrp_neg_handling,
584
+ norm_mode=attnlrp_norm_mode,
585
+ )
586
+ return "AttnLRP (ft_attnlrp hop0)", attributor, result
587
+
588
+ if method == "ft_attnlrp":
589
+ attributor = llm_attr.LLMLRPAttribution(model, tokenizer)
590
+ result = attributor.calculate_attnlrp_aggregated_multi_hop(
591
+ prompt,
592
+ target=target,
593
+ sink_span=sink_span,
594
+ thinking_span=thinking_span,
595
+ n_hops=int(n_hops),
596
+ neg_handling=attnlrp_neg_handling,
597
+ norm_mode=attnlrp_norm_mode,
598
+ )
599
+ return "FT-AttnLRP (attnlrp_aggregated_multi_hop)", attributor, result
600
+
601
+ raise ValueError(f"Unsupported method={method!r}")
602
+
603
+
604
+ def parse_args() -> argparse.Namespace:
605
+ parser = argparse.ArgumentParser("MAS case study (faithfulness perturbation visualization)")
606
+ parser.add_argument("--dataset", type=str, default="exp/exp2/data/morehopqa.jsonl", help="Dataset name or JSONL path.")
607
+ parser.add_argument("--data_root", type=str, default="exp/exp2/data", help="Cache root for dataset names.")
608
+ parser.add_argument("--index", type=int, default=0, help="Sample index (supports negative for reverse).")
609
+ parser.add_argument(
610
+ "--method",
611
+ type=str,
612
+ choices=[
613
+ "ifr",
614
+ "ifr_all_positions_output_only",
615
+ "ft",
616
+ "ft_ifr",
617
+ "ft_improve",
618
+ "ft_ifr_improve",
619
+ "ft_split_hop",
620
+ "attnlrp",
621
+ "ft_attnlrp",
622
+ ],
623
+ default="ft",
624
+ )
625
+ parser.add_argument("--model", type=str, default="qwen-8B", help="HF repo id (ignored if --model_path set).")
626
+ parser.add_argument("--model_path", type=str, default=None, help="Local model path to override --model.")
627
+ parser.add_argument("--cuda", type=str, default=None, help="CUDA spec (e.g., '0' or '0,1').")
628
+ parser.add_argument("--cuda_num", type=int, default=0, help="Fallback GPU index when --cuda unset.")
629
+ parser.add_argument("--n_hops", type=int, default=1, help="Number of hops for multi-hop methods.")
630
+ parser.add_argument("--sink_span", type=int, nargs=2, default=None, help="Optional sink span over generation tokens.")
631
+ parser.add_argument("--thinking_span", type=int, nargs=2, default=None, help="Optional thinking span over generation tokens.")
632
+ parser.add_argument("--chunk_tokens", type=int, default=128, help="IFR chunk size.")
633
+ parser.add_argument("--sink_chunk_tokens", type=int, default=32, help="IFR sink chunk size.")
634
+ parser.add_argument(
635
+ "--attnlrp_neg_handling",
636
+ type=str,
637
+ choices=["drop", "abs"],
638
+ default="drop",
639
+ help="FT-AttnLRP: how to handle negative values after each hop (drop=clamp>=0, abs=absolute value).",
640
+ )
641
+ parser.add_argument(
642
+ "--attnlrp_norm_mode",
643
+ type=str,
644
+ choices=["norm", "no_norm"],
645
+ default="norm",
646
+ help="FT-AttnLRP: norm enables per-hop global+thinking normalization + ratios; no_norm disables all three.",
647
+ )
648
+ parser.add_argument("--output_dir", type=str, default="exp/case_study/out", help="Where to write HTML/JSON artifacts.")
649
+ return parser.parse_args()
650
+
651
+
652
+ def main() -> None:
653
+ args = parse_args()
654
+ device = resolve_device(args.cuda, args.cuda_num)
655
+ if torch.cuda.is_available():
656
+ visible = os.environ.get("CUDA_VISIBLE_DEVICES")
657
+ print(f"[info] CUDA_VISIBLE_DEVICES={visible!r} torch.cuda.device_count()={torch.cuda.device_count()} device={device}")
658
+
659
+ if args.method == "ft_ifr":
660
+ method_key = "ft"
661
+ elif args.method == "ft_ifr_improve":
662
+ method_key = "ft_improve"
663
+ else:
664
+ method_key = args.method
665
+
666
+ model_name = args.model_path if args.model_path is not None else args.model
667
+ model, tokenizer = load_model(model_name, device)
668
+
669
+ example, ds_name = load_example(args.dataset, args.index, Path(args.data_root))
670
+
671
+ sink_span = tuple(args.sink_span) if args.sink_span is not None else tuple(example.sink_span) if example.sink_span else None
672
+ thinking_span = (
673
+ tuple(args.thinking_span)
674
+ if args.thinking_span is not None
675
+ else tuple(example.thinking_span) if example.thinking_span else None
676
+ )
677
+
678
+ method_label, attributor, attr_result = compute_method_attribution(
679
+ method_key,
680
+ example,
681
+ model,
682
+ tokenizer,
683
+ n_hops=args.n_hops,
684
+ sink_span=sink_span,
685
+ thinking_span=thinking_span,
686
+ chunk_tokens=args.chunk_tokens,
687
+ sink_chunk_tokens=args.sink_chunk_tokens,
688
+ attnlrp_neg_handling=args.attnlrp_neg_handling,
689
+ attnlrp_norm_mode=args.attnlrp_norm_mode,
690
+ )
691
+
692
+ indices_to_explain = example.indices_to_explain or example.sink_span
693
+ if not (isinstance(indices_to_explain, list) and len(indices_to_explain) == 2):
694
+ raise ValueError("MAS case study requires token-span indices_to_explain=[start_tok,end_tok] (e.g. sink_span).")
695
+ seq_attr, row_attr, rec_attr = attr_result.get_all_token_attrs(indices_to_explain)
696
+
697
+ prompt_tokens = decode_text_into_tokens(tokenizer, " " + example.prompt)
698
+ generation_text = example.target if example.target is not None else (getattr(attributor, "generation", None) or "")
699
+
700
+ variant_specs = [
701
+ ("seq", "Seq attribution", seq_attr),
702
+ ("row", "Row attribution", row_attr),
703
+ ("recursive", "Recursive attribution", rec_attr),
704
+ ]
705
+
706
+ formatted = format_prompt(tokenizer, " " + example.prompt)
707
+ prompt_ids = tokenizer(formatted, return_tensors="pt", add_special_tokens=False).input_ids.to(model.device)
708
+ gen_ids = tokenizer(
709
+ generation_text + (tokenizer.eos_token or ""),
710
+ return_tensors="pt",
711
+ add_special_tokens=False,
712
+ ).input_ids.to(model.device)
713
+ base_score = score_prompt_ids_with_generation(model, prompt_ids=prompt_ids, generation_ids=gen_ids)
714
+
715
+ panels_raw: List[Dict[str, Any]] = []
716
+ panels_display: List[Dict[str, Any]] = []
717
+
718
+ for variant_key, variant_label, variant_attr in variant_specs:
719
+ prompt_len = int(seq_attr.shape[1] - seq_attr.shape[0]) # cols=(P+G), rows=G
720
+ attr_prompt = variant_attr[:, :prompt_len]
721
+ keep_prompt_token_indices = None
722
+ if method_key == "ft_improve":
723
+ import ft_ifr_improve
724
+
725
+ keep_prompt_token_indices = ft_ifr_improve.keep_token_indices(list(getattr(attributor, "user_prompt_tokens", []) or []))
726
+ trace = mas_trace(
727
+ model,
728
+ tokenizer,
729
+ attribution=attr_prompt.to(device="cpu"),
730
+ prompt=example.prompt,
731
+ generation=generation_text,
732
+ user_prompt_indices=getattr(attributor, "user_prompt_indices", None),
733
+ keep_prompt_token_indices=keep_prompt_token_indices,
734
+ )
735
+ trace["variant"] = variant_key
736
+ trace["variant_label"] = variant_label
737
+
738
+ panel_raw = {
739
+ "variant": variant_key,
740
+ "variant_label": variant_label,
741
+ "metrics": trace.get("metrics"),
742
+ "sorted_attr_indices": trace.get("sorted_attr_indices"),
743
+ "attr_weights": trace.get("attr_weights"),
744
+ "token_deltas_raw": trace.get("token_deltas_raw"),
745
+ "mas_trace": trace,
746
+ }
747
+ panels_raw.append(panel_raw)
748
+
749
+ panel_display = {
750
+ "variant": variant_key,
751
+ "variant_label": variant_label,
752
+ "metrics": trace.get("metrics"),
753
+ "sorted_attr_indices": trace.get("sorted_attr_indices"),
754
+ "attr_weights": trace.get("attr_weights"),
755
+ "token_deltas_raw": trace.get("token_deltas_raw"),
756
+ }
757
+ panels_display.append(panel_display)
758
+
759
+ case_meta: Dict[str, Any] = {
760
+ "dataset": ds_name,
761
+ "index": args.index,
762
+ "mode": "mas",
763
+ "attr_method": method_key,
764
+ "attr_method_label": method_label,
765
+ "sink_span": sink_span,
766
+ "thinking_span": thinking_span,
767
+ "n_hops": int(args.n_hops),
768
+ "attnlrp_neg_handling": args.attnlrp_neg_handling if method_key in ("attnlrp", "ft_attnlrp") else None,
769
+ "attnlrp_norm_mode": args.attnlrp_norm_mode if method_key in ("attnlrp", "ft_attnlrp") else None,
770
+ "attnlrp_ratio_enabled": (args.attnlrp_norm_mode == "norm") if method_key in ("attnlrp", "ft_attnlrp") else None,
771
+ "base_score": float(base_score),
772
+ }
773
+
774
+ record = {
775
+ "meta": case_meta,
776
+ "prompt": example.prompt,
777
+ "target": example.target,
778
+ "generation": generation_text,
779
+ "prompt_tokens": prompt_tokens,
780
+ "panels": panels_raw,
781
+ }
782
+
783
+ out_dir = Path(args.output_dir)
784
+ out_dir.mkdir(parents=True, exist_ok=True)
785
+ stem = make_output_stem(ds_name, args.index, method_key)
786
+ json_path = out_dir / f"{stem}.json"
787
+ html_path = out_dir / f"{stem}.html"
788
+
789
+ with json_path.open("w", encoding="utf-8") as f:
790
+ json.dump(record, f, ensure_ascii=False, indent=2)
791
+
792
+ html = viz.render_mas_token_html(
793
+ case_meta,
794
+ prompt_tokens=prompt_tokens,
795
+ panels=panels_display,
796
+ generation=generation_text,
797
+ )
798
+ html_path.write_text(html, encoding="utf-8")
799
+
800
+ print(f"[done] wrote {json_path}")
801
+ print(f"[done] wrote {html_path}")
802
+
803
+
804
+ if __name__ == "__main__":
805
+ main()
exp/case_study/viz.py ADDED
@@ -0,0 +1,647 @@
1
+ """HTML helpers for visualizing hop-wise IFR/AttnLRP attributions."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import math
6
+ from typing import Any, Dict, List, Optional, Sequence
7
+
8
+ from html import escape
9
+
10
+
11
+ TOKEN_SCALE_QUANTILE = 0.995
12
+
13
+
14
+ def _robust_abs_max(scores: Sequence[float], *, quantile: float = TOKEN_SCALE_QUANTILE) -> float:
15
+ """Return a robust abs max to avoid a single outlier washing out the colormap.
16
+
17
+ Uses a high quantile (default: p99.5) over |scores|. Top outliers saturate.
18
+ """
19
+
20
+ abs_vals: List[float] = []
21
+ for x in scores:
22
+ try:
23
+ v = float(x)
24
+ except Exception:
25
+ continue
26
+ if math.isnan(v):
27
+ continue
28
+ abs_vals.append(abs(v))
29
+
30
+ if not abs_vals:
31
+ return 0.0
32
+
33
+ abs_vals.sort()
34
+ q = float(quantile)
35
+ if q < 0.0:
36
+ q = 0.0
37
+ if q > 1.0:
38
+ q = 1.0
39
+ idx = int(q * (len(abs_vals) - 1))
40
+ return float(abs_vals[idx])
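# A small usage sketch: with the default quantile (0.995) a single extreme outlier
# does not dictate the scale once the list is long enough; it merely saturates:
#   _robust_abs_max([0.1] * 999 + [50.0])  # -> 0.1 (index 994 of the sorted |values|)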
41
+
42
+
43
+ def _color_for_score(score: float, max_score: float) -> str:
44
+ if max_score <= 0:
45
+ return "background-color: rgba(245,245,245,0.7);"
46
+ ratio = min(1.0, score / (max_score + 1e-12))
47
+ r = 255
48
+ g = int(235 - 90 * ratio)
49
+ b = int(220 - 160 * ratio)
50
+ alpha = 0.25 + 0.55 * ratio
51
+ return f"background-color: rgba({r}, {g}, {b}, {alpha});"
52
+
53
+
54
+ def _render_sentence_list(title: str, sentences: Sequence[str], scores: Sequence[float], max_score: float) -> str:
55
+ rows: List[str] = []
56
+ for sent, sc in zip(sentences, scores):
57
+ style = _color_for_score(abs(float(sc)), max_score)
58
+ rows.append(
59
+ f'<div class="sent-row" style="{style}"><span class="score">{sc:.4f}</span>'
60
+ f'<span class="text">{escape(sent)}</span></div>'
61
+ )
62
+ return f"""
63
+ <div class="sent-block">
64
+ <div class="sent-title">{escape(title)}</div>
65
+ {''.join(rows)}
66
+ </div>
67
+ """
68
+
69
+
70
+ def _render_tokens(
71
+ tokens: Sequence[str],
72
+ scores: Sequence[float],
73
+ max_score: float,
74
+ roles: Sequence[str],
75
+ ) -> str:
76
+ spans: List[str] = []
77
+ if max_score <= 0:
78
+ max_score = 1e-8
79
+ for idx, tok in enumerate(tokens):
80
+ score = float(scores[idx]) if idx < len(scores) else 0.0
81
+ style = _color_for_score(abs(score), max_score)
82
+ role = roles[idx] if idx < len(roles) else "gen"
83
+ safe_tok = escape(tok)
84
+ spans.append(
85
+ f'<span class="tok {role}" title="idx={idx}, score={score:.6f}" style="{style}">{safe_tok}</span>'
86
+ )
87
+ return "".join(spans)
88
+
89
+
90
+ def _render_top_table(top_items: List[Dict[str, Any]]) -> str:
91
+ if not top_items:
92
+ return "<div class='top-table'><em>No attribution mass.</em></div>"
93
+
94
+ header = "<div class='top-row top-header'><span>Rank</span><span>Idx</span><span>Score</span><span>Sentence</span></div>"
95
+ body_rows = []
96
+ for rank, item in enumerate(top_items, start=1):
97
+ body_rows.append(
98
+ f"<div class='top-row'><span>{rank}</span><span>{item['idx']}</span>"
99
+ f"<span>{item['score']:.4f}</span><span>{escape(item['sentence'])}</span></div>"
100
+ )
101
+ return f"<div class='top-table'>{header}{''.join(body_rows)}</div>"
102
+
103
+
104
+ def render_case_html(
105
+ case_meta: Dict[str, Any],
106
+ *,
107
+ token_view_raw: Dict[str, Any],
108
+ token_view_prompt: Dict[str, Any],
109
+ context: Optional[Dict[str, Any]] = None,
110
+ hops_sent: Optional[Sequence[Dict[str, Any]]] = None,
111
+ ) -> str:
112
+ has_sentence_view = bool(context) and bool(hops_sent)
113
+ prompt_len = len((context or {}).get("prompt_sentences") or []) if has_sentence_view else 0
114
+ gen_len = len((context or {}).get("generation_sentences") or []) if has_sentence_view else 0
115
+
116
+ prompt_max = 0.0
117
+ gen_max = 0.0
118
+ if has_sentence_view:
119
+ prompt_max = max(
120
+ (
121
+ max(h["sentence_scores_raw"][:prompt_len])
122
+ for h in (hops_sent or [])
123
+ if h.get("sentence_scores_raw") and h["sentence_scores_raw"][:prompt_len]
124
+ ),
125
+ default=0.0,
126
+ )
127
+ gen_max = max(
128
+ (
129
+ max(h["sentence_scores_raw"][prompt_len:])
130
+ for h in (hops_sent or [])
131
+ if h.get("sentence_scores_raw") and h["sentence_scores_raw"][prompt_len:]
132
+ ),
133
+ default=0.0,
134
+ )
135
+
136
+ raw_hops = token_view_raw.get("hops", []) or []
137
+ prompt_hops = token_view_prompt.get("hops", []) or []
138
+ if len(raw_hops) != len(prompt_hops):
139
+ raise ValueError(
140
+ "token_view_raw and token_view_prompt must have the same number of panels: "
141
+ f"raw={len(raw_hops)} prompt={len(prompt_hops)}"
142
+ )
143
+
144
+ hop_sections: List[str] = []
145
+ hop_count = len(prompt_hops)
146
+ mode = case_meta.get("mode", "ft")
147
+ ifr_view = case_meta.get("ifr_view", "aggregate")
148
+ sink_span = case_meta.get("sink_span")
149
+ panel_titles = case_meta.get("panel_titles")
150
+
151
+ def _panel_title(panel_idx: int) -> str:
152
+ if isinstance(panel_titles, list) and panel_idx < len(panel_titles):
153
+ try:
154
+ title = panel_titles[panel_idx]
155
+ except Exception:
156
+ title = None
157
+ if title is not None:
158
+ return str(title)
159
+ if mode in ("ft", "ft_improve", "ft_split_hop", "ifr_in_all_gen", "ft_attnlrp"):
160
+ return f"Hop {panel_idx}"
161
+ if mode == "ifr_all_positions_output_only":
162
+ return f"IFR output-only panel {panel_idx}"
163
+ if mode == "ifr_all_positions":
164
+ return f"IFR all-positions panel {panel_idx}"
165
+ if mode == "attnlrp":
166
+ return "AttnLRP (sink-span aggregate)"
167
+ return "IFR (sink-span aggregate)"
168
+
169
+ for hop_idx in range(hop_count):
170
+ raw_entry = raw_hops[hop_idx]
171
+ raw_scores = raw_entry.get("token_scores") or []
172
+ raw_mass = float(raw_entry.get("total_mass", 0.0))
173
+ raw_scale = _robust_abs_max(raw_scores)
174
+ if raw_scale <= 0:
175
+ raw_scale = float(raw_entry.get("token_score_max") or 0.0)
176
+ if raw_scale <= 0:
177
+ raw_scale = 1e-8
178
+
179
+ prompt_entry = prompt_hops[hop_idx]
180
+ prompt_scores = prompt_entry.get("token_scores") or []
181
+ prompt_mass = float(prompt_entry.get("total_mass", 0.0))
182
+ prompt_scale = _robust_abs_max(prompt_scores)
183
+ if prompt_scale <= 0:
184
+ prompt_scale = float(prompt_entry.get("token_score_max") or 0.0)
185
+ if prompt_scale <= 0:
186
+ prompt_scale = 1e-8
187
+
188
+ tok_raw_html = f"""
189
+ <div class="tokens-block">
190
+ <div class="tokens-title">{escape(token_view_raw.get("label", "Pre-trim token-level heatmap (full)"))}</div>
191
+ <div class="tokens-row">
192
+ {_render_tokens(token_view_raw.get("tokens", []), raw_scores, raw_scale, token_view_raw.get("roles", []))}
193
+ </div>
194
+ </div>
195
+ """
196
+
197
+ tok_prompt_html = f"""
198
+ <div class="tokens-block">
199
+ <div class="tokens-title">{escape(token_view_prompt.get("label", "Prompt-only token-level heatmap"))}</div>
200
+ <div class="tokens-row">
201
+ {_render_tokens(token_view_prompt.get("tokens", []), prompt_scores, prompt_scale, token_view_prompt.get("roles", []))}
202
+ </div>
203
+ </div>
204
+ """
205
+
206
+ sentence_html = ""
207
+ top_html = ""
208
+ if has_sentence_view and hop_idx < len(hops_sent or []):
209
+ hop = (hops_sent or [])[hop_idx]
210
+ raw_scores = hop.get("sentence_scores_raw") or []
211
+ prompt_scores = raw_scores[:prompt_len]
212
+ gen_scores = raw_scores[prompt_len:]
213
+ # Sentence view is not used by the current case-study runner; keep the path for completeness.
214
+ sentence_html = f"""
215
+ <div class="columns">
216
+ {_render_sentence_list('Prompt sentences', (context or {}).get('prompt_sentences') or [], prompt_scores, prompt_max)}
217
+ {_render_sentence_list('Generation sentences', (context or {}).get('generation_sentences') or [], gen_scores, gen_max)}
218
+ </div>
219
+ """
220
+ top_html = f"""
221
+ <div class="top-wrap">
222
+ <div class="section-label">Top sentences (all)</div>
223
+ {_render_top_table(hop.get('top_sentences') or [])}
224
+ </div>
225
+ """
226
+
227
+ hop_sections.append(
228
+ f"""
229
+ <div class="hop">
230
+ <div class="hop-header">
231
+ <div class="hop-title">{escape(_panel_title(hop_idx))}</div>
232
+ <div class="hop-meta">
233
+ raw mass: {raw_mass:.6f} | raw scale(p{int(TOKEN_SCALE_QUANTILE*1000)/10:.1f} abs): {raw_scale:.6g}
234
+ &nbsp;|&nbsp;
235
+ prompt mass: {prompt_mass:.6f} | prompt scale(p{int(TOKEN_SCALE_QUANTILE*1000)/10:.1f} abs): {prompt_scale:.6g}
236
+ </div>
237
+ </div>
238
+ {tok_raw_html}
239
+ {tok_prompt_html}
240
+ {sentence_html}
241
+ {top_html}
242
+ </div>
243
+ """
244
+ )
245
+
246
+ thinking_ratios = case_meta.get("thinking_ratios") or []
247
+ ratios_str = ", ".join(f"{r:.4f}" for r in thinking_ratios) if thinking_ratios else "N/A"
248
+
249
+ if mode == "ft":
250
+ mode_label = "FT Multi-hop (IFR)"
251
+ elif mode == "ifr_in_all_gen":
252
+ mode_label = "IFR In-all-gen (multi-hop)"
253
+ elif mode == "ifr":
254
+ mode_label = "IFR Standard"
255
+ elif mode == "ifr_all_positions":
256
+ mode_label = "IFR All-positions"
257
+ elif mode == "ifr_all_positions_output_only":
258
+ mode_label = "IFR Output-only (all positions)"
259
+ elif mode == "attnlrp":
260
+ mode_label = "AttnLRP"
261
+ elif mode == "ft_attnlrp":
262
+ mode_label = "FT Multi-hop (AttnLRP)"
263
+ else:
264
+ mode_label = str(mode)
265
+
266
+ if mode in ("ft", "ifr_in_all_gen", "ft_attnlrp"):
267
+ view_key = "Recursive hops"
268
+ view_val = case_meta.get("n_hops")
269
+ elif mode in ("ifr", "ifr_all_positions", "ifr_all_positions_output_only"):
270
+ view_key = "IFR view"
271
+ view_val = ifr_view
272
+ elif mode == "attnlrp":
273
+ view_key = "AttnLRP view"
274
+ view_val = "ft_hop0_span_aggregate"
275
+ else:
276
+ view_key = "View"
277
+ view_val = "N/A"
278
+
279
+ scale_row = f"<div>Token scale: per-panel per-view p{int(TOKEN_SCALE_QUANTILE*1000)/10:.1f}(|score|)</div>"
280
+ neg_handling = case_meta.get("attnlrp_neg_handling")
281
+ norm_mode = case_meta.get("attnlrp_norm_mode")
282
+ ratio_enabled = case_meta.get("attnlrp_ratio_enabled")
283
+ attn_rows = []
284
+ if neg_handling:
285
+ attn_rows.append(f"<div>FT-AttnLRP neg_handling: {escape(str(neg_handling))}</div>")
286
+ if norm_mode:
287
+ attn_rows.append(f"<div>FT-AttnLRP norm_mode: {escape(str(norm_mode))}</div>")
288
+ if ratio_enabled is not None:
289
+ attn_rows.append(f"<div>FT-AttnLRP ratio_enabled: {escape(str(bool(ratio_enabled)))}</div>")
290
+
291
+ header = f"""
292
+ <div class="header">
293
+ <div>
294
+ <div class="title">{escape(mode_label)} Case Study</div>
295
+ <div class="subtitle">Dataset: {escape(str(case_meta.get('dataset')))} | index: {case_meta.get('index')}</div>
296
+ </div>
297
+ <div class="meta">
298
+ <div>Sink span (gen idx): {escape(str(case_meta.get('sink_span')))}</div>
299
+ <div>Thinking span (gen idx): {escape(str(case_meta.get('thinking_span')))}</div>
300
+ <div>Panels: {hop_count}</div>
301
+ <div>{escape(str(view_key))}: {escape(str(view_val))}</div>
302
+ {scale_row}
303
+ {''.join(attn_rows)}
304
+ <div>Thinking ratios: {ratios_str}</div>
305
+ </div>
306
+ </div>
307
+ """
308
+
309
+ style = """
310
+ <style>
311
+ body { font-family: "Inter", "Helvetica Neue", Arial, sans-serif; margin: 0; padding: 24px; background: #fcfcff; color: #1f2933; }
312
+ .title { font-size: 24px; font-weight: 700; }
313
+ .subtitle { font-size: 14px; color: #566; margin-top: 4px; }
314
+ .header { display: flex; justify-content: space-between; align-items: flex-start; gap: 16px; padding-bottom: 16px; border-bottom: 1px solid #e5e8ee; }
315
+ .meta { font-size: 13px; color: #334; line-height: 1.6; }
316
+ .hop { margin-top: 20px; padding: 16px; border: 1px solid #e5e8ee; border-radius: 10px; background: #fff; box-shadow: 0 2px 6px rgba(0,0,0,0.04); }
317
+ .hop-header { display: flex; justify-content: space-between; align-items: center; }
318
+ .hop-title { font-weight: 600; font-size: 16px; }
319
+ .hop-meta { font-size: 12px; color: #556; }
320
+ .tokens-block { margin-top: 12px; border: 1px solid #eef1f6; border-radius: 8px; padding: 10px; background: #f9fbff; }
321
+ .tokens-title { font-size: 13px; font-weight: 600; margin-bottom: 8px; color: #263; }
322
+ .tokens-row { font-family: "SFMono-Regular", Consolas, monospace; font-size: 12px; line-height: 1.8; word-break: break-word; }
323
+ .tok { display: inline; padding: 2px 1px; margin: 0 0px; border-radius: 3px; }
324
+ .tok.prompt { border-bottom: 1px dashed #6b8fb8; }
325
+ .tok.user { border-bottom: 1px dashed #4f72c7; }
326
+ .tok.template { border-bottom: 1px dashed #9aa9c0; }
327
+ .tok.think { border-bottom: 1px dashed #8ba86b; }
328
+ .tok.output { border-bottom: 1px dashed #c78a6e; }
329
+ .tok.gen { border-bottom: 1px dashed #999; }
330
+ .tok:hover { outline: 1px solid #8899aa; }
331
+ .columns { display: grid; grid-template-columns: repeat(auto-fit, minmax(260px, 1fr)); gap: 12px; margin-top: 12px; }
332
+ .sent-block { padding: 8px; border: 1px solid #eef1f6; border-radius: 8px; background: #f9fbff; }
333
+ .sent-title { font-weight: 600; font-size: 13px; margin-bottom: 6px; color: #263; }
334
+ .sent-row { padding: 6px 8px; border-radius: 6px; margin-bottom: 6px; display: flex; gap: 8px; align-items: flex-start; }
335
+ .sent-row:last-child { margin-bottom: 0; }
336
+ .sent-row .score { font-family: "SFMono-Regular", Consolas, monospace; font-size: 12px; color: #233; min-width: 60px; }
337
+ .sent-row .text { flex: 1; font-size: 13px; }
338
+ .top-wrap { margin-top: 10px; }
339
+ .section-label { font-size: 13px; font-weight: 600; margin-bottom: 6px; color: #263; }
340
+ .top-table { border: 1px solid #eef1f6; border-radius: 8px; background: #fff; }
341
+ .top-row { display: grid; grid-template-columns: 50px 50px 80px 1fr; padding: 6px 8px; gap: 8px; font-size: 12px; }
342
+ .top-header { background: #f3f6fb; font-weight: 700; color: #223; }
343
+ .top-row:nth-child(odd):not(.top-header) { background: #fbfdff; }
344
+ </style>
345
+ """
346
+
347
+ title = f"{mode_label} Case Study"
348
+ html = f"""<!DOCTYPE html>
349
+ <html>
350
+ <head>
351
+ <meta charset="utf-8" />
352
+ <title>{escape(title)}</title>
353
+ {style}
354
+ </head>
355
+ <body>
356
+ {header}
357
+ {''.join(hop_sections)}
358
+ </body>
359
+ </html>"""
360
+ return html
361
+
362
+
363
+ def _render_sentence_spans(title: str, sentences: Sequence[str], scores: Sequence[float]) -> str:
364
+ max_abs = max((abs(float(x)) for x in scores), default=0.0)
365
+ spans: List[str] = []
366
+ for idx, sentence in enumerate(sentences):
367
+ score = float(scores[idx]) if idx < len(scores) else 0.0
368
+ style = _color_for_score(abs(score), max_abs)
369
+ spans.append(
370
+ f'<span class="sent-span" title="idx={idx}, score={score:.6f}" style="{style}">{escape(sentence)}</span>'
371
+ )
372
+ return f"""
373
+ <div class="sentmap">
374
+ <div class="sentmap-title">{escape(title)}</div>
375
+ <div class="sentmap-text">{''.join(spans)}</div>
376
+ </div>
377
+ """
378
+
379
+
380
+ def _render_token_spans(title: str, tokens: Sequence[str], scores: Sequence[float]) -> str:
381
+ max_abs = max((abs(float(x)) for x in scores), default=0.0)
382
+ spans: List[str] = []
383
+ for idx, tok in enumerate(tokens):
384
+ score = float(scores[idx]) if idx < len(scores) else 0.0
385
+ style = _color_for_score(abs(score), max_abs)
386
+ spans.append(
387
+ f'<span class="tok-span" title="idx={idx}, score={score:.6f}" style="{style}">{escape(tok)}</span>'
388
+ )
389
+ return f"""
390
+ <div class="tokmap">
391
+ <div class="tokmap-title">{escape(title)}</div>
392
+ <div class="tokmap-text">{''.join(spans)}</div>
393
+ </div>
394
+ """
395
+
396
+
397
+ def render_mas_sentence_html(
398
+ case_meta: Dict[str, Any],
399
+ *,
400
+ prompt_sentences: Sequence[str],
401
+ panels: Sequence[Dict[str, Any]],
402
+ generation: Optional[str] = None,
403
+ ) -> str:
404
+ """Render MAS sentence-level diagnostics (attribution / pure ablation / guided marginal)."""
405
+
406
+ method_label = case_meta.get("attr_method_label") or case_meta.get("attr_method") or "Unknown method"
407
+ title = f"MAS Sentence Study ({method_label})"
408
+
409
+ neg_handling = case_meta.get("attnlrp_neg_handling")
410
+ norm_mode = case_meta.get("attnlrp_norm_mode")
411
+ ratio_enabled = case_meta.get("attnlrp_ratio_enabled")
412
+ attn_rows = []
413
+ if neg_handling:
414
+ attn_rows.append(f"<div>FT-AttnLRP neg_handling: {escape(str(neg_handling))}</div>")
415
+ if norm_mode:
416
+ attn_rows.append(f"<div>FT-AttnLRP norm_mode: {escape(str(norm_mode))}</div>")
417
+ if ratio_enabled is not None:
418
+ attn_rows.append(f"<div>FT-AttnLRP ratio_enabled: {escape(str(bool(ratio_enabled)))}</div>")
419
+
420
+ base_score = case_meta.get("base_score")
421
+ base_score_row = f"<div>Base score: {float(base_score):.6f}</div>" if isinstance(base_score, (int, float)) else ""
422
+
423
+ gen_block = ""
424
+ if isinstance(generation, str) and generation:
425
+ gen_block = f"""
426
+ <div class="text-block">
427
+ <div class="text-title">Generation (scored)</div>
428
+ <div class="text-body">{escape(generation)}</div>
429
+ </div>
430
+ """
431
+
432
+ header = f"""
433
+ <div class="header">
434
+ <div>
435
+ <div class="title">{escape(title)}</div>
436
+ <div class="subtitle">Dataset: {escape(str(case_meta.get('dataset')))} | index: {case_meta.get('index')}</div>
437
+ </div>
438
+ <div class="meta">
439
+ <div>Attribution method: {escape(str(case_meta.get('attr_method')))}</div>
440
+ <div>Sink span (gen idx): {escape(str(case_meta.get('sink_span')))}</div>
441
+ <div>Thinking span (gen idx): {escape(str(case_meta.get('thinking_span')))}</div>
442
+ <div>Panels: {len(panels)}</div>
443
+ {''.join(attn_rows)}
444
+ {base_score_row}
445
+ </div>
446
+ </div>
447
+ """
448
+
449
+ panel_sections: List[str] = []
450
+ for panel in panels:
451
+ label = panel.get("variant_label") or panel.get("panel_label") or panel.get("variant") or "Panel"
452
+ metrics = panel.get("metrics") or {}
453
+ metrics_str = " | ".join(
454
+ f"{k}: {float(metrics[k]):.4f}" if isinstance(metrics.get(k), (int, float)) else f"{k}: {metrics.get(k)}"
455
+ for k in ("RISE", "MAS", "RISE+AP")
456
+ if k in metrics
457
+ )
458
+
459
+ attr_weights = panel.get("attr_weights") or []
460
+ pure_deltas = panel.get("pure_sentence_deltas_raw") or []
461
+ guided_deltas = panel.get("guided_sentence_deltas_raw") or panel.get("sentence_deltas_raw") or []
462
+ rank_order = panel.get("sorted_attr_indices") or []
463
+ rank_str = ", ".join(str(int(x)) for x in rank_order) if rank_order else "N/A"
464
+
465
+ panel_sections.append(
466
+ f"""
467
+ <div class="panel">
468
+ <div class="panel-header">
469
+ <div class="panel-title">{escape(str(label))}</div>
470
+ <div class="panel-meta">{escape(metrics_str)}</div>
471
+ </div>
472
+
473
+ {_render_sentence_spans("Method attribution (sentence weights)", prompt_sentences, attr_weights)}
474
+ {_render_sentence_spans("Pure sentence ablation (base − score)", prompt_sentences, pure_deltas)}
475
+ {_render_sentence_spans("Attribution-guided MAS marginal (path deltas)", prompt_sentences, guided_deltas)}
476
+
477
+ <div class="panel-foot">Rank order: {escape(rank_str)}</div>
478
+ </div>
479
+ """
480
+ )
481
+
482
+ style = """
483
+ <style>
484
+ body { font-family: "Inter", "Helvetica Neue", Arial, sans-serif; margin: 0; padding: 24px; background: #fcfcff; color: #1f2933; }
485
+ .title { font-size: 24px; font-weight: 700; }
486
+ .subtitle { font-size: 14px; color: #566; margin-top: 4px; }
487
+ .header { display: flex; justify-content: space-between; align-items: flex-start; gap: 16px; padding-bottom: 16px; border-bottom: 1px solid #e5e8ee; }
488
+ .meta { font-size: 13px; color: #334; line-height: 1.6; }
489
+
490
+ .text-block { margin-top: 16px; border: 1px solid #eef1f6; border-radius: 10px; padding: 12px; background: #fff; }
491
+ .text-title { font-size: 13px; font-weight: 700; color: #263; margin-bottom: 8px; }
492
+ .text-body { font-size: 13px; line-height: 1.7; white-space: pre-wrap; word-break: break-word; }
493
+
494
+ .panel { margin-top: 18px; padding: 16px; border: 1px solid #e5e8ee; border-radius: 10px; background: #fff; box-shadow: 0 2px 6px rgba(0,0,0,0.04); }
495
+ .panel-header { display: flex; justify-content: space-between; align-items: center; }
496
+ .panel-title { font-weight: 600; font-size: 16px; }
497
+ .panel-meta { font-size: 12px; color: #556; }
498
+ .panel-foot { margin-top: 8px; font-size: 12px; color: #556; }
499
+
500
+ .sentmap { margin-top: 12px; border: 1px solid #eef1f6; border-radius: 8px; padding: 10px; background: #f9fbff; }
501
+ .sentmap-title { font-size: 13px; font-weight: 600; margin-bottom: 8px; color: #263; }
502
+ .sentmap-text { font-size: 13px; line-height: 1.8; white-space: pre-wrap; word-break: break-word; }
503
+ .sent-span { display: inline; padding: 2px 2px; margin: 0 0px; border-radius: 4px; }
504
+ .sent-span:hover { outline: 1px solid #8899aa; }
505
+ </style>
506
+ """
507
+
508
+ html = f"""<!DOCTYPE html>
509
+ <html>
510
+ <head>
511
+ <meta charset="utf-8" />
512
+ <title>{escape(title)}</title>
513
+ {style}
514
+ </head>
515
+ <body>
516
+ {header}
517
+ {gen_block}
518
+ {''.join(panel_sections)}
519
+ </body>
520
+ </html>"""
521
+ return html
522
+
523
+
524
+ def render_mas_token_html(
525
+ case_meta: Dict[str, Any],
526
+ *,
527
+ prompt_tokens: Sequence[str],
528
+ panels: Sequence[Dict[str, Any]],
529
+ generation: Optional[str] = None,
530
+ ) -> str:
531
+ """Render MAS token-level diagnostics (attribution weights + guided marginal deltas)."""
532
+
533
+ method_label = case_meta.get("attr_method_label") or case_meta.get("attr_method") or "Unknown method"
534
+ title = f"MAS Token Study ({method_label})"
535
+
536
+ neg_handling = case_meta.get("attnlrp_neg_handling")
537
+ norm_mode = case_meta.get("attnlrp_norm_mode")
538
+ ratio_enabled = case_meta.get("attnlrp_ratio_enabled")
539
+ attn_rows = []
540
+ if neg_handling:
541
+ attn_rows.append(f"<div>FT-AttnLRP neg_handling: {escape(str(neg_handling))}</div>")
542
+ if norm_mode:
543
+ attn_rows.append(f"<div>FT-AttnLRP norm_mode: {escape(str(norm_mode))}</div>")
544
+ if ratio_enabled is not None:
545
+ attn_rows.append(f"<div>FT-AttnLRP ratio_enabled: {escape(str(bool(ratio_enabled)))}</div>")
546
+
547
+ base_score = case_meta.get("base_score")
548
+ base_score_row = f"<div>Base score: {float(base_score):.6f}</div>" if isinstance(base_score, (int, float)) else ""
549
+
550
+ gen_block = ""
551
+ if isinstance(generation, str) and generation:
552
+ gen_block = f"""
553
+ <div class="text-block">
554
+ <div class="text-title">Generation (scored)</div>
555
+ <div class="text-body">{escape(generation)}</div>
556
+ </div>
557
+ """
558
+
559
+ header = f"""
560
+ <div class="header">
561
+ <div>
562
+ <div class="title">{escape(title)}</div>
563
+ <div class="subtitle">Dataset: {escape(str(case_meta.get('dataset')))} | index: {case_meta.get('index')}</div>
564
+ </div>
565
+ <div class="meta">
566
+ <div>Attribution method: {escape(str(case_meta.get('attr_method')))}</div>
567
+ <div>Sink span (gen idx): {escape(str(case_meta.get('sink_span')))}</div>
568
+ <div>Thinking span (gen idx): {escape(str(case_meta.get('thinking_span')))}</div>
569
+ <div>Prompt tokens: {len(prompt_tokens)}</div>
570
+ <div>Panels: {len(panels)}</div>
571
+ {''.join(attn_rows)}
572
+ {base_score_row}
573
+ </div>
574
+ </div>
575
+ """
576
+
577
+ panel_sections: List[str] = []
578
+ for panel in panels:
579
+ label = panel.get("variant_label") or panel.get("panel_label") or panel.get("variant") or "Panel"
580
+ metrics = panel.get("metrics") or {}
581
+ metrics_str = " | ".join(
582
+ f"{k}: {float(metrics[k]):.4f}" if isinstance(metrics.get(k), (int, float)) else f"{k}: {metrics.get(k)}"
583
+ for k in ("RISE", "MAS", "RISE+AP")
584
+ if k in metrics
585
+ )
586
+
587
+ attr_weights = panel.get("attr_weights") or []
588
+ guided_deltas = panel.get("token_deltas_raw") or []
589
+ rank_order = panel.get("sorted_attr_indices") or []
590
+ rank_str = ", ".join(str(int(x)) for x in rank_order) if rank_order else "N/A"
591
+
592
+ panel_sections.append(
593
+ f"""
594
+ <div class="panel">
595
+ <div class="panel-header">
596
+ <div class="panel-title">{escape(str(label))}</div>
597
+ <div class="panel-meta">{escape(metrics_str)}</div>
598
+ </div>
599
+
600
+ {_render_token_spans("Method attribution (token weights)", prompt_tokens, attr_weights)}
601
+ {_render_token_spans("Attribution-guided MAS marginal (path deltas)", prompt_tokens, guided_deltas)}
602
+
603
+ <div class="panel-foot">Rank order: {escape(rank_str)}</div>
604
+ </div>
605
+ """
606
+ )
607
+
608
+ style = """
609
+ <style>
610
+ body { font-family: "Inter", "Helvetica Neue", Arial, sans-serif; margin: 0; padding: 24px; background: #fcfcff; color: #1f2933; }
611
+ .title { font-size: 24px; font-weight: 700; }
612
+ .subtitle { font-size: 14px; color: #566; margin-top: 4px; }
613
+ .header { display: flex; justify-content: space-between; align-items: flex-start; gap: 16px; padding-bottom: 16px; border-bottom: 1px solid #e5e8ee; }
614
+ .meta { font-size: 13px; color: #334; line-height: 1.6; }
615
+
616
+ .text-block { margin-top: 16px; border: 1px solid #eef1f6; border-radius: 10px; padding: 12px; background: #fff; }
617
+ .text-title { font-size: 13px; font-weight: 700; color: #263; margin-bottom: 8px; }
618
+ .text-body { font-size: 13px; line-height: 1.7; white-space: pre-wrap; word-break: break-word; }
619
+
620
+ .panel { margin-top: 18px; padding: 16px; border: 1px solid #e5e8ee; border-radius: 10px; background: #fff; box-shadow: 0 2px 6px rgba(0,0,0,0.04); }
621
+ .panel-header { display: flex; justify-content: space-between; align-items: center; }
622
+ .panel-title { font-weight: 600; font-size: 16px; }
623
+ .panel-meta { font-size: 12px; color: #556; }
624
+ .panel-foot { margin-top: 8px; font-size: 12px; color: #556; }
625
+
626
+ .tokmap { margin-top: 12px; border: 1px solid #eef1f6; border-radius: 8px; padding: 10px; background: #f9fbff; }
627
+ .tokmap-title { font-size: 13px; font-weight: 600; margin-bottom: 8px; color: #263; }
628
+ .tokmap-text { font-size: 13px; line-height: 1.8; white-space: pre-wrap; word-break: break-word; }
629
+ .tok-span { display: inline; padding: 1px 1px; margin: 0 0px; border-radius: 3px; }
630
+ .tok-span:hover { outline: 1px solid #8899aa; }
631
+ </style>
632
+ """
633
+
634
+ html = f"""<!DOCTYPE html>
635
+ <html>
636
+ <head>
637
+ <meta charset="utf-8" />
638
+ <title>{escape(title)}</title>
639
+ {style}
640
+ </head>
641
+ <body>
642
+ {header}
643
+ {gen_block}
644
+ {''.join(panel_sections)}
645
+ </body>
646
+ </html>"""
647
+ return html
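+
+ # Illustrative usage sketch (not invoked anywhere in the repo; the panel / case_meta keys
+ # mirror the fields consumed by render_mas_token_html above, and all values are placeholders):
+ #
+ #   html = render_mas_token_html(
+ #       {"attr_method": "ifr_multi_hop", "dataset": "morehopqa", "index": 0,
+ #        "sink_span": [120, 131], "thinking_span": [0, 119], "base_score": 0.42},
+ #       prompt_tokens=prompt_tokens,             # list[str], one entry per prompt token
+ #       panels=[{"variant_label": "FlashTrace",
+ #                "metrics": {"RISE": 0.61, "MAS": 0.18},
+ #                "attr_weights": attr_weights,    # one float per prompt token
+ #                "token_deltas_raw": deltas,      # attribution-guided MAS marginal deltas
+ #                "sorted_attr_indices": order}],
+ #       generation=generated_text,
+ #   )
+ #   open("mas_token_case.html", "w", encoding="utf-8").write(html)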
exp/exp1/README.md ADDED
@@ -0,0 +1,46 @@
1
+ # FlashTrace long-context timing experiment (exp1)
2
+
3
+ Self-contained script: `exp/exp1/run_time_curve.py`
4
+ Purpose: on a single RULER example, measure each attribution method's wall-clock time and peak GPU memory at different context lengths, feeding the linear-growth table in the paper.
5
+
6
+ ## Method coverage
7
+ - `IG` (20 steps)
8
+ - `attention_I_G` (attention * IG)
9
+ - `attnlrp` (single-backward-pass LRP variant)
10
+ - `perturbation_all` (log-loss ablation)
11
+ - `perturbation_CLP` (KL variant)
12
+ - `perturbation_REAGENT` (MLM replacement; LED/4096-token cap, may fail beyond that)
13
+ - `ifr_all_positions` (IFR one-by-one baseline, `sink_chunk_tokens=1` fixed)
14
+ - `ifr_multi_hop` (FlashTrace, multi-hop + chunk support)
15
+ - `ifr_multi_hop_both` (FT-IFR both: stop_words + in_all_gen, multi-hop + chunk support)
16
+
17
+ ## Example run
18
+ ```bash
19
+ # Defaults: input lengths 1024,4096,8192; output lengths 32,256,512; 3 repeats per cell
20
+ python exp/exp1/run_time_curve.py \
21
+ --model qwen-8B \
22
+ --model_path /opt/share/models/Qwen/Qwen3-8B/ \
23
+ --cuda 2,3,4,5,6,7 \
24
+ --attr_funcs perturbation_all,perturbation_REAGENT,ifr_all_positions,perturbation_CLP,ifr_multi_hop,ifr_multi_hop_both,attnlrp \
25
+ --input_lengths 10 \
26
+ --output_lengths 2000,5000,10000 \
27
+ --repeats 1 \
28
+ --chunk_tokens 128 \
29
+ --sink_chunk_tokens 32 \
30
+ --catch_oom \
31
+ --ruler_file data/ruler_multihop/8192/vt_h10_c1/validation.jsonl
32
+ ```
33
+
34
+ Outputs:
35
+ - `exp/exp1/out/time_curve_runs.jsonl`: one raw record per run (attr, target input/output/total, actual lengths, time, peak_mem, status).
36
+ - `exp/exp1/out/time_curve_summary.csv`: mean/std aggregated by method + target input/output (total = input + output is also written). A post-processing sketch is given at the end of this README.
37
+
38
+ ## Notes
39
+ - `--input_lengths` controls the prompt (user prompt) length and `--output_lengths` controls the output (sink) length; each cell's total = input + output.
40
+ - Compatibility: `--total_lengths/--lengths` (deprecated) are still accepted and denote the prompt + output total length; the prompt length is derived from the difference.
41
+ - `--target_text` is tiled repeatedly to reach the target output length; it only controls length, its semantics do not matter.
42
+ - `--catch_oom/--no-catch-oom` chooses between recording an OOM as a status and continuing, or raising and aborting.
43
+ - Multi-GPU: `--cuda 0,1` sets `CUDA_VISIBLE_DEVICES` before the script starts and loads the model sharded with `device_map=balanced`; pass `--cuda 0` for a single GPU.
44
+ - Exceeding the model context (`config.max_position_embeddings`) is marked `skipped_model_ctx` (checked against the actual formatted prompt + output(+eos) token count fed to the model).
45
+ - `perturbation_REAGENT`'s Longformer only supports 4096 tokens; longer inputs may return OOM or runtime_error.
46
+ - IFR multi-hop exposes `--chunk_tokens/--sink_chunk_tokens` to force chunking on very long contexts; memory drops while time rises slightly. The `ifr_all_positions` branch fixes `sink_chunk_tokens=1`.
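+
+ ## Re-aggregating the raw runs (sketch)
+ A minimal sketch, not shipped with the script, for recomputing per-cell means from `time_curve_runs.jsonl` when the CSV summary is not enough; the field names follow the records written by `run_time_curve.py`:
+ ```python
+ import json
+ from collections import defaultdict
+ from pathlib import Path
+
+ rows = [json.loads(line) for line in Path("exp/exp1/out/time_curve_runs.jsonl").read_text().splitlines() if line.strip()]
+ ok_rows = [r for r in rows if r["status"] == "ok"]  # skip oom / skipped cells
+ by_cell = defaultdict(list)
+ for r in ok_rows:
+     by_cell[(r["attr_func"], r["target_input_tokens"], r["target_output_tokens"])].append(r["time_sec"])
+ for (attr, inp, out), times in sorted(by_cell.items()):
+     print(f"{attr:>24s}  in={inp:<6d} out={out:<6d} mean_time={sum(times) / len(times):.2f}s  n={len(times)}")
+ ```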
exp/exp1/run_time_curve.py ADDED
@@ -0,0 +1,757 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ Measure wall-clock time and GPU memory for attribution methods across
4
+ different context lengths using a single synthetic RULER-style example.
5
+
6
+ This script stays self-contained under exp/exp1 and reuses the attribution
7
+ implementations in the repo (IG, perturbation, attention*IG, IFR/FlashTrace).
8
+ The goal is to populate the time-vs-length table; correctness of the task
9
+ content is not important, only matching token lengths and running 3 repeats.
10
+ """
11
+
12
+ from __future__ import annotations
13
+
14
+ import argparse
15
+ import json
16
+ import math
17
+ import os
18
+ import random
19
+ import sys
20
+ import time
21
+ from collections import defaultdict
22
+ from pathlib import Path
23
+ from typing import Any, Dict, Iterable, List, Optional, Tuple
24
+
25
+ import numpy as np
26
+
27
+
28
+ def _early_set_cuda_visible_devices() -> None:
29
+ """Parse --cuda early to set CUDA_VISIBLE_DEVICES before torch import."""
30
+ parser = argparse.ArgumentParser(add_help=False)
31
+ parser.add_argument("--cuda", type=str, default=None)
32
+ args, _ = parser.parse_known_args(sys.argv[1:])
33
+ if args.cuda and "," in args.cuda:
34
+ os.environ["CUDA_VISIBLE_DEVICES"] = args.cuda
35
+
36
+
37
+ _early_set_cuda_visible_devices()
38
+
39
+ import torch
40
+ from transformers import AutoModelForCausalLM, AutoTokenizer
41
+
42
+ REPO_ROOT = Path(__file__).resolve().parents[2]
43
+ if str(REPO_ROOT) not in sys.path:
44
+ sys.path.insert(0, str(REPO_ROOT))
45
+
46
+ import llm_attr
47
+
48
+ DEFAULT_INPUT_LENGTHS = [1024, 4096, 8192]
49
+ DEFAULT_OUTPUT_LENGTHS = [32, 256, 512]
50
+ DEFAULT_ATTRS = [
51
+ "IG",
52
+ "perturbation_all",
53
+ "attention_I_G",
54
+ "perturbation_REAGENT",
55
+ "ifr_all_positions",
56
+ "perturbation_CLP",
57
+ "ifr_multi_hop",
58
+ "attnlrp",
59
+ ]
60
+ DEFAULT_RULER_FILE = REPO_ROOT / "data" / "ruler_multihop" / "8192" / "vt_h10_c1" / "validation.jsonl"
61
+
62
+
63
+ def parse_args() -> argparse.Namespace:
64
+ parser = argparse.ArgumentParser("FlashTrace time/memory curve.")
65
+ parser.add_argument("--model", type=str, required=True, help="Model name or HF repo id.")
66
+ parser.add_argument("--model_path", type=str, default=None, help="Optional local model path.")
67
+ parser.add_argument("--cuda", type=str, default=None, help='CUDA devices, e.g. "0,1" or "0".')
68
+ parser.add_argument("--cuda_num", type=int, default=0, help="Single GPU index if --cuda is not set.")
69
+ parser.add_argument(
70
+ "--attr_funcs",
71
+ type=str,
72
+ default=",".join(DEFAULT_ATTRS),
73
+ help="Comma-separated attribution methods.",
74
+ )
75
+
76
+ length_group = parser.add_mutually_exclusive_group()
77
+ parser.add_argument(
78
+ "--output_lengths",
79
+ type=str,
80
+ default=",".join(str(x) for x in DEFAULT_OUTPUT_LENGTHS),
81
+ help="Comma-separated target output token lengths (sink/output segment).",
82
+ )
83
+ length_group.add_argument(
84
+ "--input_lengths",
85
+ type=str,
86
+ default=",".join(str(x) for x in DEFAULT_INPUT_LENGTHS),
87
+ help="Comma-separated target input/prompt token lengths (user prompt only; excludes chat template).",
88
+ )
89
+ length_group.add_argument(
90
+ "--total_lengths",
91
+ "--lengths",
92
+ dest="total_lengths",
93
+ type=str,
94
+ default=None,
95
+ help="Deprecated. Target total token lengths (prompt + output). Use --input_lengths instead.",
96
+ )
97
+ parser.add_argument("--repeats", type=int, default=3, help="Number of runs per cell.")
98
+ parser.add_argument("--output_dir", type=str, default="exp/exp1/out", help="Output directory.")
99
+ parser.add_argument(
100
+ "--ruler_file",
101
+ type=str,
102
+ default=str(DEFAULT_RULER_FILE),
103
+ help="RULER jsonl file providing a long base passage.",
104
+ )
105
+ parser.add_argument(
106
+ "--chunk_tokens",
107
+ type=int,
108
+ default=128,
109
+ help="IFR chunk_tokens override when context is long.",
110
+ )
111
+ parser.add_argument(
112
+ "--sink_chunk_tokens",
113
+ type=int,
114
+ default=32,
115
+ help="IFR sink_chunk_tokens override when context is long.",
116
+ )
117
+ parser.add_argument(
118
+ "--catch_oom",
119
+ action=argparse.BooleanOptionalAction,
120
+ default=True,
121
+ help="If true, treat CUDA OOM as status=oom and continue; if false, let OOM raise.",
122
+ )
123
+ parser.add_argument(
124
+ "--target_text",
125
+ type=str,
126
+ default=" The answer is 42.",
127
+ help="Base text to tile when constructing outputs of a given length.",
128
+ )
129
+ return parser.parse_args()
130
+
131
+
132
+ def parse_csv_ints(value: str) -> List[int]:
133
+ return [int(x) for x in value.split(",") if x.strip()]
134
+
135
+
136
+ def resolve_device(cuda: Optional[str], cuda_num: int) -> str:
137
+ if cuda is not None and "," in cuda:
138
+ os.environ["CUDA_VISIBLE_DEVICES"] = cuda
139
+ return "auto"
140
+ if cuda is not None and cuda.strip():
141
+ try:
142
+ idx = int(cuda)
143
+ except Exception:
144
+ idx = 0
145
+ return f"cuda:{idx}" if torch.cuda.is_available() else "cpu"
146
+ return f"cuda:{cuda_num}" if torch.cuda.is_available() else "cpu"
147
+
148
+
149
+ def load_ruler_base(path: Path, fallback: str) -> str:
150
+ if not path.exists():
151
+ return fallback
152
+ with path.open() as f:
153
+ for line in f:
154
+ try:
155
+ record = json.loads(line)
156
+ if "input" in record:
157
+ return record["input"]
158
+ except json.JSONDecodeError:
159
+ continue
160
+ return fallback
161
+
162
+
163
+ def build_prompt_to_length(tokenizer, base_text: str, target_tokens: int) -> Tuple[str, int]:
164
+ """
165
+ Build a prompt whose tokenized length (without special tokens) is ~target_tokens.
166
+ If base_text is shorter, we repeat it; if longer, we truncate.
167
+ """
168
+ if target_tokens <= 0:
169
+ return "", 0
170
+
171
+ base_ids = tokenizer(base_text, add_special_tokens=False).input_ids
172
+ if not base_ids:
173
+ base_ids = [tokenizer.eos_token_id]
174
+
175
+ tiled: List[int] = []
176
+ while len(tiled) < target_tokens:
177
+ tiled.extend(base_ids)
178
+ tiled = tiled[:target_tokens]
179
+ prompt = tokenizer.decode(tiled, clean_up_tokenization_spaces=False)
180
+ return prompt, len(tiled)
181
+
182
+
183
+ def build_output_to_length(tokenizer, base_text: str, target_tokens: int) -> Tuple[str, int]:
184
+ """
185
+ Build a target/output string of ~target_tokens using a base snippet.
186
+ """
187
+ if target_tokens <= 0:
188
+ return "", 0
189
+
190
+ base_ids = tokenizer(base_text, add_special_tokens=False).input_ids
191
+ if not base_ids:
192
+ base_ids = [tokenizer.eos_token_id]
193
+
194
+ tiled: List[int] = []
195
+ while len(tiled) < target_tokens:
196
+ tiled.extend(base_ids)
197
+ tiled = tiled[:target_tokens]
198
+ text = tokenizer.decode(tiled, clean_up_tokenization_spaces=False)
199
+ return text, len(tiled)
200
+
201
+
202
+ def build_formatted_prompt(tokenizer, prompt: str) -> str:
203
+ user_prompt = " " + prompt
204
+ modified_prompt = llm_attr.DEFAULT_PROMPT_TEMPLATE.format(context=user_prompt, query="")
205
+ formatted_prompt = [{"role": "user", "content": modified_prompt}]
206
+ return tokenizer.apply_chat_template(
207
+ formatted_prompt,
208
+ tokenize=False,
209
+ add_generation_prompt=True,
210
+ enable_thinking=False,
211
+ )
212
+
213
+
214
+ def estimate_model_lengths(tokenizer, prompt: str, target: str) -> Dict[str, int]:
215
+ user_prompt = " " + prompt
216
+ formatted_prompt = build_formatted_prompt(tokenizer, prompt)
217
+
218
+ user_prompt_len = len(tokenizer(user_prompt, add_special_tokens=False).input_ids)
219
+ formatted_prompt_len = len(tokenizer(formatted_prompt, add_special_tokens=False).input_ids)
220
+ generation_len = len(tokenizer(target + tokenizer.eos_token, add_special_tokens=False).input_ids)
221
+
222
+ return {
223
+ "user_prompt_tokens": user_prompt_len,
224
+ "formatted_prompt_tokens": formatted_prompt_len,
225
+ "generation_tokens": generation_len,
226
+ "total_tokens": formatted_prompt_len + generation_len,
227
+ }
228
+
229
+
230
+ def exceeds_model_ctx(tokenizer, prompt: str, target: str, max_ctx: Optional[int]) -> bool:
231
+ if max_ctx is None:
232
+ return False
233
+ return estimate_model_lengths(tokenizer, prompt, target)["total_tokens"] > max_ctx
234
+
235
+
236
+ def load_model_balanced(model_name: str, device: str):
237
+ """Load model with an explicit balanced device_map when multi-GPU is requested."""
238
+ if device == "auto":
239
+ model = AutoModelForCausalLM.from_pretrained(
240
+ model_name,
241
+ device_map="balanced",
242
+ torch_dtype=torch.float16,
243
+ attn_implementation="eager",
244
+ )
245
+ elif isinstance(device, str) and device.startswith("cuda:"):
246
+ try:
247
+ gpu_idx = int(device.split(":")[1])
248
+ except Exception:
249
+ gpu_idx = 0
250
+ model = AutoModelForCausalLM.from_pretrained(
251
+ model_name,
252
+ device_map={"": gpu_idx},
253
+ torch_dtype=torch.float16,
254
+ attn_implementation="eager",
255
+ )
256
+ else:
257
+ model = AutoModelForCausalLM.from_pretrained(
258
+ model_name,
259
+ torch_dtype=torch.float16,
260
+ attn_implementation="eager",
261
+ )
262
+
263
+ tokenizer = AutoTokenizer.from_pretrained(model_name)
264
+ tokenizer.pad_token = tokenizer.eos_token
265
+ model.eval()
266
+ return model, tokenizer
267
+
268
+
269
+ def collect_device_indices(device_str: str, model: Any) -> List[int]:
270
+ """
271
+ Infer the CUDA device indices that should be tracked for memory stats.
272
+ Prefers the model's device map; otherwise falls back to all visible devices
273
+ or the single requested device.
274
+ """
275
+ if not torch.cuda.is_available():
276
+ return []
277
+
278
+ devices: set[int] = set()
279
+ device_map = getattr(model, "hf_device_map", None)
280
+ if isinstance(device_map, dict):
281
+ for dev in device_map.values():
282
+ if dev is None:
283
+ continue
284
+ idx: Optional[int] = None
285
+ if isinstance(dev, torch.device):
286
+ idx = dev.index if dev.index is not None else (0 if dev.type == "cuda" else None)
287
+ elif isinstance(dev, str):
288
+ try:
289
+ d = torch.device(dev)
290
+ idx = d.index if d.index is not None else (0 if d.type == "cuda" else None)
291
+ except Exception:
292
+ idx = None
293
+ elif isinstance(dev, int):
294
+ idx = dev
295
+ if idx is not None:
296
+ devices.add(idx)
297
+
298
+ if not devices:
299
+ if device_str == "auto":
300
+ devices.update(range(torch.cuda.device_count()))
301
+ elif isinstance(device_str, str) and device_str.startswith("cuda:"):
302
+ try:
303
+ devices.add(int(device_str.split(":")[1]))
304
+ except Exception:
305
+ pass
306
+ else:
307
+ devices.update(range(torch.cuda.device_count()))
308
+
309
+ return sorted(devices)
310
+
311
+
312
+ def maybe_reset_cuda(device_indices: List[int]) -> None:
313
+ if not torch.cuda.is_available() or not device_indices:
314
+ return
315
+ for idx in device_indices:
316
+ try:
317
+ torch.cuda.reset_peak_memory_stats(device=idx)
318
+ except Exception:
319
+ pass
320
+ try:
321
+ torch.cuda.empty_cache()
322
+ except Exception:
323
+ pass
324
+
325
+
326
+ def measure(
327
+ method_fn,
328
+ device_indices: List[int],
329
+ *,
330
+ catch_oom: bool,
331
+ ) -> Tuple[str, Optional[float], Optional[float], Optional[float], Dict[int, Dict[str, float]]]:
332
+ status = "ok"
333
+ wall: Optional[float] = None
334
+ mem_alloc: Optional[float] = None
335
+ mem_reserved: Optional[float] = None
336
+ mem_by_device: Dict[int, Dict[str, float]] = {}
337
+ try:
338
+ if torch.cuda.is_available() and device_indices:
339
+ for idx in device_indices:
340
+ torch.cuda.synchronize(device=idx)
341
+ t0 = time.time()
342
+ method_fn()
343
+ if torch.cuda.is_available() and device_indices:
344
+ for idx in device_indices:
345
+ torch.cuda.synchronize(device=idx)
346
+ wall = time.time() - t0
347
+ except RuntimeError as e:
348
+ if "out of memory" in str(e).lower():
349
+ status = "oom"
350
+ if not catch_oom:
351
+ raise
352
+ else:
353
+ status = f"runtime_error: {e}"
354
+ if not catch_oom:
355
+ raise
356
+ except Exception as e:
357
+ status = f"error: {e}"
358
+ if not catch_oom:
359
+ raise
360
+ finally:
361
+ if torch.cuda.is_available() and device_indices:
362
+ try:
363
+ total_alloc = 0.0
364
+ total_reserved = 0.0
365
+ for idx in device_indices:
366
+ alloc_bytes = torch.cuda.max_memory_allocated(device=idx)
367
+ reserved_bytes = torch.cuda.max_memory_reserved(device=idx)
368
+ total_alloc += alloc_bytes
369
+ total_reserved += reserved_bytes
370
+ mem_by_device[idx] = {
371
+ "allocated_gb": alloc_bytes / 1e9,
372
+ "reserved_gb": reserved_bytes / 1e9,
373
+ }
374
+ mem_alloc = total_alloc / 1e9
375
+ mem_reserved = total_reserved / 1e9
376
+ except Exception:
377
+ pass
378
+ return status, wall, mem_alloc, mem_reserved, mem_by_device
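+ # Example (illustrative): `measure` times any zero-argument callable, e.g. one of the
+ # closures built by make_attr_runner below:
+ #   status, wall, alloc_gb, reserved_gb, per_dev = measure(runner, device_indices=[0], catch_oom=True)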
379
+
380
+
381
+ def make_attr_runner(
382
+ attr_func: str,
383
+ model: Any,
384
+ tokenizer: Any,
385
+ chunk_tokens: int,
386
+ sink_chunk_tokens: int,
387
+ batch_size: int,
388
+ prompt: str,
389
+ target: str,
390
+ ):
391
+ lf = attr_func.lower()
392
+ if lf == "ig":
393
+ llm_attributor = llm_attr.LLMGradientAttribtion(model, tokenizer)
394
+
395
+ def fn():
396
+ return llm_attributor.calculate_IG_per_generation(
397
+ prompt, steps=20, baseline=tokenizer.eos_token_id, batch_size=batch_size, target=target
398
+ )
399
+
400
+ return fn
401
+
402
+ if lf == "attention_i_g":
403
+ llm_attn = llm_attr.LLMAttentionAttribution(model, tokenizer)
404
+ llm_ig = llm_attr.LLMGradientAttribtion(model, tokenizer)
405
+
406
+ def fn():
407
+ attn = llm_attn.calculate_attention_attribution(prompt, target=target)
408
+ ig = llm_ig.calculate_IG_per_generation(
409
+ prompt, steps=20, baseline=tokenizer.eos_token_id, batch_size=batch_size, target=target
410
+ )
411
+ attn.attribution_matrix = attn.attribution_matrix * ig.attribution_matrix
412
+ return attn
413
+
414
+ return fn
415
+
416
+ if lf == "perturbation_all":
417
+ llm_attrtor = llm_attr.LLMPerturbationAttribution(model, tokenizer)
418
+
419
+ def fn():
420
+ return llm_attrtor.calculate_feature_ablation_sentences(
421
+ prompt, baseline=tokenizer.eos_token_id, measure="log_loss", target=target
422
+ )
423
+
424
+ return fn
425
+
426
+ if lf == "perturbation_clp":
427
+ llm_attrtor = llm_attr.LLMPerturbationAttribution(model, tokenizer)
428
+
429
+ def fn():
430
+ return llm_attrtor.calculate_feature_ablation_sentences(
431
+ prompt, baseline=tokenizer.eos_token_id, measure="KL", target=target
432
+ )
433
+
434
+ return fn
435
+
436
+ if lf == "perturbation_reagent":
437
+ llm_attrtor = llm_attr.LLMPerturbationAttribution(model, tokenizer)
438
+
439
+ def fn():
440
+ return llm_attrtor.calculate_feature_ablation_sentences_mlm(prompt, target=target)
441
+
442
+ return fn
443
+
444
+ if lf == "ifr_all_positions":
445
+ llm_attrtor = llm_attr.LLMIFRAttribution(
446
+ model, tokenizer, chunk_tokens=chunk_tokens, sink_chunk_tokens=1
447
+ )
448
+
449
+ def fn():
450
+ return llm_attrtor.calculate_ifr_for_all_positions(prompt, target=target)
451
+
452
+ return fn
453
+
454
+ if lf == "ifr_multi_hop":
455
+ llm_attrtor = llm_attr.LLMIFRAttribution(
456
+ model, tokenizer, chunk_tokens=chunk_tokens, sink_chunk_tokens=sink_chunk_tokens
457
+ )
458
+
459
+ def fn():
460
+ return llm_attrtor.calculate_ifr_multi_hop(prompt, target=target)
461
+
462
+ return fn
463
+
464
+ if lf == "ifr_multi_hop_both":
465
+ import ft_ifr_improve
466
+
467
+ llm_attrtor = ft_ifr_improve.LLMIFRAttributionBoth(
468
+ model, tokenizer, chunk_tokens=chunk_tokens, sink_chunk_tokens=sink_chunk_tokens
469
+ )
470
+
471
+ def fn():
472
+ return llm_attrtor.calculate_ifr_multi_hop_both(prompt, target=target)
473
+
474
+ return fn
475
+
476
+ if lf == "attnlrp":
477
+ llm_attrtor = llm_attr.LLMLRPAttribution(model, tokenizer)
478
+
479
+ def fn():
480
+ return llm_attrtor.calculate_attnlrp(prompt, target=target)
481
+
482
+ return fn
483
+
484
+ raise ValueError(f"Unsupported attr_func {attr_func}")
485
+
486
+
487
+ def compute_batch_size(sequence_length: int, max_input_len: int) -> int:
488
+ denom = int(sequence_length)
489
+ return max(1, math.floor((max_input_len - 100) / max(1, denom)))
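+ # Worked example (illustrative): compute_batch_size(4096, 40960) == 9,
+ # i.e. floor((40960 - 100) / 4096), clamped to at least 1.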
490
+
491
+
492
+ def aggregate_results(rows: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
493
+ grouped: Dict[Tuple[str, int, int], Dict[str, List[float]]] = defaultdict(lambda: {"time": [], "mem": []})
494
+ statuses: Dict[Tuple[str, int, int], List[str]] = defaultdict(list)
495
+ for row in rows:
496
+ key = (row["attr_func"], row["target_input_tokens"], row["target_output_tokens"])
497
+ statuses[key].append(row["status"])
498
+ if row.get("time_sec") is not None:
499
+ grouped[key]["time"].append(row["time_sec"])
500
+ if row.get("peak_mem_gb") is not None:
501
+ grouped[key]["mem"].append(row["peak_mem_gb"])
502
+
503
+ summary = []
504
+ for key, vals in grouped.items():
505
+ attr_func, input_tokens, output_tokens = key
506
+ total_tokens = input_tokens + output_tokens
507
+ times = vals["time"]
508
+ mems = vals["mem"]
509
+ summary.append(
510
+ {
511
+ "attr_func": attr_func,
512
+ "target_input_tokens": input_tokens,
513
+ "target_total_tokens": total_tokens,
514
+ "target_output_tokens": output_tokens,
515
+ "time_mean": np.mean(times) if times else None,
516
+ "time_std": np.std(times) if times else None,
517
+ "mem_mean": np.mean(mems) if mems else None,
518
+ "mem_std": np.std(mems) if mems else None,
519
+ "statuses": statuses[key],
520
+ }
521
+ )
522
+ return summary
523
+
524
+
525
+ def append_jsonl_row(f, row: Dict[str, Any]) -> None:
526
+ f.write(json.dumps(row) + "\n")
527
+ f.flush()
528
+ try:
529
+ os.fsync(f.fileno())
530
+ except OSError:
531
+ pass
532
+
533
+
534
+ def write_summary_csv(rows: List[Dict[str, Any]], out_dir: Path) -> Path:
535
+ summary = aggregate_results(rows)
536
+ summary_path = out_dir / "time_curve_summary.csv"
537
+ tmp_path = out_dir / "time_curve_summary.csv.tmp"
538
+
539
+ with tmp_path.open("w") as f:
540
+ f.write(
541
+ "attr_func,target_input_tokens,target_output_tokens,target_total_tokens,time_mean,time_std,peak_mem_mean,peak_mem_std,statuses\n"
542
+ )
543
+ for row in summary:
544
+ f.write(
545
+ "{},{},{},{},{},{},{},{},{}\n".format(
546
+ row["attr_func"],
547
+ row["target_input_tokens"],
548
+ row["target_output_tokens"],
549
+ row["target_total_tokens"],
550
+ "" if row["time_mean"] is None else f"{row['time_mean']:.4f}",
551
+ "" if row["time_std"] is None else f"{row['time_std']:.4f}",
552
+ "" if row["mem_mean"] is None else f"{row['mem_mean']:.4f}",
553
+ "" if row["mem_std"] is None else f"{row['mem_std']:.4f}",
554
+ "|".join(row["statuses"]),
555
+ )
556
+ )
557
+ f.flush()
558
+ try:
559
+ os.fsync(f.fileno())
560
+ except OSError:
561
+ pass
562
+
563
+ tmp_path.replace(summary_path)
564
+ return summary_path
565
+
566
+
567
+ def main() -> None:
568
+ args = parse_args()
569
+ device = resolve_device(args.cuda, args.cuda_num)
570
+ attr_funcs = [a.strip() for a in args.attr_funcs.split(",") if a.strip()]
571
+ target_output_lengths = parse_csv_ints(args.output_lengths)
572
+ out_dir = Path(args.output_dir)
573
+ out_dir.mkdir(parents=True, exist_ok=True)
574
+
575
+ random.seed(42)
576
+ np.random.seed(42)
577
+ torch.manual_seed(42)
578
+
579
+ model_name = args.model if args.model_path is None else args.model_path
580
+ model, tokenizer = load_model_balanced(model_name, device)
581
+ device_indices = collect_device_indices(device, model)
582
+ max_ctx = getattr(getattr(model, "config", None), "max_position_embeddings", None)
583
+
584
+ base_text = load_ruler_base(Path(args.ruler_file), fallback="RULER fallback text. ")
585
+ target_base = args.target_text
586
+ all_rows: List[Dict[str, Any]] = []
587
+ runner = None
588
+ raised: Optional[BaseException] = None
589
+ jsonl_f = None
590
+ jsonl_path = out_dir / "time_curve_runs.jsonl"
591
+ summary_path = out_dir / "time_curve_summary.csv"
592
+
593
+ def record_row(row: Dict[str, Any]) -> None:
594
+ all_rows.append(row)
595
+ if jsonl_f is not None:
596
+ append_jsonl_row(jsonl_f, row)
597
+ write_summary_csv(all_rows, out_dir)
598
+
599
+ using_deprecated_total = args.total_lengths is not None
600
+ if using_deprecated_total:
601
+ target_total_lengths = parse_csv_ints(args.total_lengths)
602
+ length_grid: List[Tuple[int, int, int]] = []
603
+ for total_tokens in target_total_lengths:
604
+ for output_tokens in target_output_lengths:
605
+ length_grid.append((total_tokens - output_tokens, output_tokens, total_tokens))
606
+ else:
607
+ target_input_lengths = parse_csv_ints(args.input_lengths)
608
+ length_grid = []
609
+ for input_tokens in target_input_lengths:
610
+ for output_tokens in target_output_lengths:
611
+ length_grid.append((input_tokens, output_tokens, input_tokens + output_tokens))
612
+
613
+ try:
614
+ jsonl_f = jsonl_path.open("w")
615
+ write_summary_csv([], out_dir)
616
+
617
+ for input_tokens, output_tokens, total_tokens in length_grid:
618
+ if input_tokens <= 0:
619
+ for attr in attr_funcs:
620
+ for rep in range(args.repeats):
621
+ record_row(
622
+ {
623
+ "attr_func": attr,
624
+ "target_input_tokens": input_tokens,
625
+ "target_output_tokens": output_tokens,
626
+ "target_total_tokens": total_tokens,
627
+ "actual_input_tokens": None,
628
+ "actual_output_tokens": None,
629
+ "actual_total_tokens_raw": None,
630
+ "actual_user_prompt_tokens": None,
631
+ "actual_formatted_prompt_tokens": None,
632
+ "actual_generation_tokens": None,
633
+ "actual_total_tokens": None,
634
+ "status": "skipped_nonpositive_input",
635
+ "time_sec": None,
636
+ "peak_mem_gb": None,
637
+ "peak_mem_reserved_gb": None,
638
+ "repeat": rep,
639
+ "used_deprecated_total_lengths": using_deprecated_total,
640
+ }
641
+ )
642
+ continue
643
+
644
+ prompt, actual_input_len = build_prompt_to_length(tokenizer, base_text, input_tokens)
645
+ target, actual_output_len = build_output_to_length(tokenizer, target_base, output_tokens)
646
+ actual_total_tokens_raw = len(tokenizer(prompt + target, add_special_tokens=False).input_ids)
647
+ model_lens = estimate_model_lengths(tokenizer, prompt, target)
648
+
649
+ if max_ctx is not None and model_lens["total_tokens"] > max_ctx:
650
+ for attr in attr_funcs:
651
+ for rep in range(args.repeats):
652
+ record_row(
653
+ {
654
+ "attr_func": attr,
655
+ "target_input_tokens": input_tokens,
656
+ "target_output_tokens": output_tokens,
657
+ "target_total_tokens": total_tokens,
658
+ "actual_input_tokens": actual_input_len,
659
+ "actual_output_tokens": actual_output_len,
660
+ "actual_total_tokens_raw": actual_total_tokens_raw,
661
+ "actual_user_prompt_tokens": model_lens["user_prompt_tokens"],
662
+ "actual_formatted_prompt_tokens": model_lens["formatted_prompt_tokens"],
663
+ "actual_generation_tokens": model_lens["generation_tokens"],
664
+ "actual_total_tokens": model_lens["total_tokens"],
665
+ "status": "skipped_model_ctx",
666
+ "time_sec": None,
667
+ "peak_mem_gb": None,
668
+ "peak_mem_reserved_gb": None,
669
+ "repeat": rep,
670
+ "used_deprecated_total_lengths": using_deprecated_total,
671
+ }
672
+ )
673
+ continue
674
+
675
+ batch_size = compute_batch_size(model_lens["total_tokens"], max_input_len=max_ctx or 200000)
676
+
677
+ for attr in attr_funcs:
678
+ for rep in range(args.repeats):
679
+ runner = None
680
+ maybe_reset_cuda(device_indices)
681
+ try:
682
+ runner = make_attr_runner(
683
+ attr,
684
+ model=model,
685
+ tokenizer=tokenizer,
686
+ chunk_tokens=args.chunk_tokens,
687
+ sink_chunk_tokens=args.sink_chunk_tokens,
688
+ batch_size=batch_size,
689
+ prompt=prompt,
690
+ target=target,
691
+ )
692
+ except RuntimeError as e:
693
+ if "out of memory" in str(e).lower():
694
+ status = "oom"
695
+ if not args.catch_oom:
696
+ raise
697
+ else:
698
+ status = f"init_runtime_error: {e}"
699
+ if not args.catch_oom:
700
+ raise
701
+ wall = None
702
+ mem_alloc = None
703
+ mem_reserved = None
704
+ mem_by_device = {}
705
+ except Exception as e:
706
+ status = f"init_error: {e}"
707
+ if not args.catch_oom:
708
+ raise
709
+ wall = None
710
+ mem_alloc = None
711
+ mem_reserved = None
712
+ mem_by_device = {}
713
+ else:
714
+ status, wall, mem_alloc, mem_reserved, mem_by_device = measure(
715
+ runner, device_indices=device_indices, catch_oom=args.catch_oom
716
+ )
717
+ finally:
718
+ runner = None
719
+
720
+ record_row(
721
+ {
722
+ "attr_func": attr,
723
+ "target_input_tokens": input_tokens,
724
+ "target_output_tokens": output_tokens,
725
+ "target_total_tokens": total_tokens,
726
+ "actual_input_tokens": actual_input_len,
727
+ "actual_output_tokens": actual_output_len,
728
+ "actual_total_tokens_raw": actual_total_tokens_raw,
729
+ "actual_user_prompt_tokens": model_lens["user_prompt_tokens"],
730
+ "actual_formatted_prompt_tokens": model_lens["formatted_prompt_tokens"],
731
+ "actual_generation_tokens": model_lens["generation_tokens"],
732
+ "actual_total_tokens": model_lens["total_tokens"],
733
+ "status": status,
734
+ "time_sec": wall,
735
+ "peak_mem_gb": mem_reserved if mem_reserved is not None else mem_alloc,
736
+ "peak_mem_reserved_gb": mem_reserved,
737
+ "peak_mem_by_device_gb": mem_by_device if mem_by_device else None,
738
+ "repeat": rep,
739
+ "used_deprecated_total_lengths": using_deprecated_total,
740
+ }
741
+ )
742
+ except BaseException as e:
743
+ raised = e
744
+ finally:
745
+ runner = None
746
+ if jsonl_f is not None:
747
+ jsonl_f.close()
748
+ write_summary_csv(all_rows, out_dir)
749
+ print(f"Wrote per-run records to {jsonl_path}")
750
+ print(f"Wrote summary to {summary_path}")
751
+
752
+ if raised is not None:
753
+ raise raised
754
+
755
+
756
+ if __name__ == "__main__":
757
+ main()
exp/exp2/DATASETS.md ADDED
@@ -0,0 +1,231 @@
1
+ # exp/exp2 数据集与样本流说明
2
+
3
+ 本文件说明 Experiment 2 中支持的数据集、样本结构,以及在「采样阶段」与「归因阶段」的处理方式。
4
+
5
+ ## 支持的数据集
6
+ - `morehopqa`(`data/with_human_verification.json`)
7
+ - RULER 系列 JSONL:`hotpotqa_long`、`niah_*`、`vt_*`(自动在 `data/ruler_multihop/<len>/.../validation.jsonl` 搜索),或直接传入任意 RULER JSONL 路径
8
+ - 其余数据集(如 math)被显式跳过
9
+ - 归因阶段同样优先使用缓存文件 `exp/exp2/data/<name>.jsonl`,否则按上述规则解析;传入存在的 JSONL 路径也会按 RULER 结构加载
10
+
11
+ ### 共同的样本字段定义
12
+ ```json
13
+ {
14
+ "prompt": "<上下文+问题>",
15
+ "target": "<答案或生成>",
16
+ "indices_to_explain": [start_tok, end_tok] | null, // token-level:需要解释的 generation token span(闭区间)
17
+ "attr_mask_indices": [...], // legacy:覆盖率金标句子索引(当前 exp2 不再使用),可能为 null
18
+ "sink_span": [start, end] | null, // 生成 token 中的答案片段
19
+ "thinking_span": [start, end] | null, // 生成 token 中的 CoT 片段
20
+ "metadata": { ... } // 数据集特定元信息
21
+ }
22
+ ```
23
+ - **`CachedExample`**:`dataset_utils.py` 统一的内存态结构,字段与上述 JSON 完全一致,用于采样阶段(加载原始数据)与归因阶段(加载缓存或原始)。
24
+ - **缓存行(JSONL)**:`sample_and_filter.py` 写入的每行 JSON,与 `CachedExample` 字段一一对应。
25
+ - **采样阶段处理流(通用)**:
26
+ 1. 加载原始数据集样本(`prompt`/`indices_to_explain` 等保持一致)。
27
+ 2. 按模板调用生成模型,要求「思考文本 + 末尾 \\box{} 答案」。
28
+ 3. 若生成不符合「思考 + 单个 \\box{} 且无尾巴」的格式,直接丢弃该样本。
29
+ 4. 提取思考片段与 `\\box{}` 内文本,仅用 `\\box{}` 内文调用判定模型。
30
+ 5. 判定为 True 时,重新拼接「思考片段 + 去除 box 包裹的答案文本」作为 `target`,并据此记录 `sink_span`/`thinking_span`。
31
+ 6. 写入缓存:只保留 `reference_answer`、`judge_response`(可选 `boxed_answer`),不再存储 `candidate_answer`。
32
+
33
+ ### Generation splitting and span parsing
34
+ - `split_boxed_generation` (`dataset_utils.py`) validates the format: the generation must be "non-empty thinking text + a single trailing \\box{}" with no characters after the box; otherwise the sample is skipped.
35
+ - `target` is rebuilt as "thinking segment + newline + final answer text (without the box)".
36
+ - `attach_spans_from_answer` uses the tokenizer's offset mapping to project the final answer's character span inside `target` onto token indices, yielding `sink_span`; `thinking_span` is the closed interval from the start up to the token right before `sink_span`. Both are token-level spans, matching the calling convention of the downstream multi-hop IFR (see the sketch after this list).
37
+ - `indices_to_explain` is uniformly set to `sink_span` when the sample cache is written (the generation token span of the boxed content inside `target`).
38
+
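+ A minimal sketch of this offset-mapping step (illustrative only; the tokenizer name is an assumption, any fast tokenizer with `return_offsets_mapping` works):
+ ```python
+ from transformers import AutoTokenizer
+
+ tok = AutoTokenizer.from_pretrained("Qwen/Qwen3-8B")
+ target = "The capital of France is Paris, so the answer is Paris."
+ answer = "Paris"
+ start = target.rfind(answer); end = start + len(answer)            # last occurrence of the answer
+ enc = tok(target, add_special_tokens=False, return_offsets_mapping=True)
+ sink = [i for i, (s, e) in enumerate(enc["offset_mapping"]) if s < end and e > start]
+ sink_span = [min(sink), max(sink)]                                  # closed token interval of the answer
+ thinking_span = [0, max(0, sink_span[0] - 1)]                       # everything before the answer
+ print(sink_span, thinking_span)
+ ```
+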
39
+ ---
40
+
41
+ ## MoreHopQA
42
+ - **原始样本结构(`MoreHopQAAttributionDataset` → `CachedExample`)**
43
+ ```json
44
+ {
45
+ "prompt": "<context 拼接>\\n<question>",
46
+ "target": null,
47
+ "indices_to_explain": null,
48
+ "attr_mask_indices": null,
49
+ "sink_span": null,
50
+ "thinking_span": null,
51
+ "metadata": {
52
+ "answer": "<gold answer>",
53
+ "_id": "<example id>",
54
+ "original_context": <原始上下文结构>
55
+ }
56
+ }
57
+ ```
58
+ - 加载时机:`DatasetLoader.load_raw("morehopqa")` 在采样阶段、归因阶段(无缓存时)都会产出 `CachedExample`。
59
+ - 说明:exp2 的 token-level row/rec 需要 `target` + 可定位的答案 token span;建议先跑 `sample_and_filter.py` 产出缓存后再做归因评估。
60
+
61
+ - **采样阶段(生成 & 过滤后写缓存)**
62
+ ```json
63
+ {
64
+ "prompt": "<同上>",
65
+ "target": "<生成的 CoT + 最终答案文本(已去掉 box 包裹)>",
66
+ "indices_to_explain": [start_tok, end_tok],
67
+ "attr_mask_indices": null,
68
+ "sink_span": [start_tok, end_tok] | null,
69
+ "thinking_span": [start_tok, end_tok] | null,
70
+ "metadata": {
71
+ "answer": "<gold answer>",
72
+ "_id": "<example id>",
73
+ "original_context": <原始上下文结构>,
74
+ "reference_answer": "<gold answer>",
75
+ "judge_response": "<True/False 文本>",
76
+ "boxed_answer": "<可选,boxed 解析结果>"
77
+ }
78
+ }
79
+ ```
80
+ - `sink_span`/`thinking_span`:仅在成功解析 `\\box{}` 时填充;`target` 为「思考 + 最终答案文本」的裁剪版。
81
+ - 写入:`exp/exp2/data/morehopqa.jsonl`。
82
+
83
+ - **归因阶段(加载缓存优先)**
84
+ - 加载:`run_exp.py` 优先 `load_cached`(JSONL → `CachedExample`),否则回退原始结构并在线生成 `target`。
85
+ - 使用:忠实度(token-level RISE/MAS)直接用缓存的 `target`;`ifr_multi_hop` 在有 `sink_span`/`thinking_span` 时限定答案/CoT,否则视整个生成为 sink。
86
+
87
+ ---
88
+
89
+ ## RULER 热点问答(`hotpotqa_long`)
90
+ - **原始样本结构(`RulerAttributionDataset` → `CachedExample`)**
91
+ ```json
92
+ {
93
+ "prompt": "<input> + <answer_prefix>",
94
+ "target": "<answer_prefix + sep + ', '.join(outputs)>",
95
+ "indices_to_explain": [0],
96
+ "attr_mask_indices": [<句子索引>...] | null,
97
+ "sink_span": null,
98
+ "thinking_span": null,
99
+ "metadata": {
100
+ "dataset": "ruler",
101
+ "length": <int>,
102
+ "length_w_model_temp": <any>,
103
+ "outputs": [...],
104
+ "answer_prefix": "<str>",
105
+ "token_position_answer": <any>,
106
+ "needle_spans": [
107
+ {
108
+ "title": "<str>",
109
+ "doc_index": <int>,
110
+ "document_number": <int>,
111
+ "sentence_index": <int>,
112
+ "sentence": "<str>",
113
+ "context_span": [start, end],
114
+ "span": [start, end],
115
+ "snippet": "<str>"
116
+ },
117
+ ...
118
+ ],
119
+ "prompt_sentence_count": <int>,
120
+ "reference_answer": "<在 loader 中补充,来自 outputs 或 target>"
121
+ }
122
+ }
123
+ ```
124
+ - 加载时机:`DatasetLoader.load_raw("hotpotqa_long")` 在采样阶段、归因阶段(无缓存时)都会产出 `CachedExample`。
125
+
126
+ - **采样阶段(生成 & 过滤后写缓存)**
127
+ ```json
128
+ {
129
+ "prompt": "<同上>",
130
+ "target": "<生成的 CoT + 最终答案文本(已去掉 box 包裹)>",
131
+ "indices_to_explain": [-2],
132
+ "attr_mask_indices": [<句子索引>...] | null,
133
+ "sink_span": [start_tok, end_tok] | null,
134
+ "thinking_span": [start_tok, end_tok] | null,
135
+ "metadata": {
136
+ "dataset": "ruler",
137
+ "length": <int>,
138
+ "length_w_model_temp": <any>,
139
+ "outputs": [...],
140
+ "answer_prefix": "<str>",
141
+ "token_position_answer": <any>,
142
+ "needle_spans": [...],
143
+ "prompt_sentence_count": <int>,
144
+ "reference_answer": "<outputs 拼接或 target>",
145
+ "judge_response": "<True/False 文本>",
146
+ "boxed_answer": "<可选>"
147
+ }
148
+ }
149
+ ```
150
+ - `attr_mask_indices` 保留原值;`indices_to_explain` 统一为末句 `[-2]`(最后一个非 EOS 生成句);`sink_span`/`thinking_span` 仅在成功解析 `\\box{}` 时填充;`target` 为「思考 + 最终答案文本」的裁剪版。
151
+ - 写入:`exp/exp2/data/hotpotqa_long.jsonl`。
152
+
153
+ - **归因阶段(加载缓存优先)**
154
+ - 加载:优先 `load_cached`(JSONL → `CachedExample`),否则回退原始解析。
155
+ - 使用:覆盖率使用 `attr_mask_indices`;忠实度与 `ifr_multi_hop` 利用缓存的 `sink_span`/`thinking_span` 定位答案/CoT,若缺失则视整个生成为 sink。
156
+
157
+ ---
158
+
159
+ ## RULER NIAH / Variable Tracking(`niah_*`, `vt_*`)
160
+ - **原始样本结构(同 RULER 通用)**
161
+ ```json
162
+ {
163
+ "prompt": "<input> + <answer_prefix>",
164
+ "target": "<answer_prefix + sep + ', '.join(outputs)>",
165
+ "indices_to_explain": [0],
166
+ "attr_mask_indices": [<句子索引>...] | null,
167
+ "sink_span": null,
168
+ "thinking_span": null,
169
+ "metadata": {
170
+ "dataset": "ruler",
171
+ "length": <int>,
172
+ "length_w_model_temp": <any>,
173
+ "outputs": [...],
174
+ "answer_prefix": "<str>",
175
+ "token_position_answer": <any>,
176
+ "needle_spans": [...],
177
+ "prompt_sentence_count": <int>,
178
+ "reference_answer": "<在 loader 中补充>"
179
+ }
180
+ }
181
+ ```
182
+ - 加载时机:`DatasetLoader.load_raw("<niah_* 或 vt_*>")` 在采样阶段、归因阶段(无缓存时)使用。
183
+
184
+ - **采样阶段(生成 & 过滤后写缓存)**
185
+ ```json
186
+ {
187
+ "prompt": "<同上>",
188
+ "target": "<思考 + 最终答案文本(无 box),无其他尾巴>",
189
+ "indices_to_explain": [start_tok, end_tok],
190
+ "attr_mask_indices": [<句子索引>...] | null,
191
+ "sink_span": [start_tok, end_tok] | null,
192
+ "thinking_span": [start_tok, end_tok] | null,
193
+ "metadata": {
194
+ "dataset": "ruler",
195
+ "length": <int>,
196
+ "length_w_model_temp": <any>,
197
+ "outputs": [...],
198
+ "answer_prefix": "<str>",
199
+ "token_position_answer": <any>,
200
+ "needle_spans": [...],
201
+ "prompt_sentence_count": <int>,
202
+ "reference_answer": "<outputs 拼接或 target>",
203
+ "judge_response": "<True/False 文本>",
204
+ "boxed_answer": "<可选>"
205
+ }
206
+ }
207
+ ```
208
+ - 生成/判定流程与 `hotpotqa_long` 相同;`target` 是裁剪后的「思考 + 最终答案文本」。
209
+ - 写入:`exp/exp2/data/<dataset>.jsonl`(例如 `niah_mq_q2.jsonl`, `vt_h6_c1.jsonl`)。
210
+
211
+ - **归因阶段(加载缓存优先)**
212
+ - 与 `hotpotqa_long` 相同:优先缓存,否则原始;恢复率(`recovery_ruler`)使用 `metadata.needle_spans`(映射到 prompt tokens);多跳 IFR 在有 `sink_span`/`thinking_span` 时作用于答案/CoT。
213
+
214
+ ---
215
+
216
+ ## `indices_to_explain` 约定
217
+ - token-level:`indices_to_explain = [start_tok, end_tok]`(闭区间),坐标系为 `tokenizer(target, add_special_tokens=False)` 的 generation token indices。
218
+ - exp2 推荐:`indices_to_explain == sink_span`,即 boxed 内文(最终答案)在 `target` 中对应的 token span。
219
+
220
+ ---
221
+
222
+ ## 自定义 RULER JSONL 路径
223
+ - 若 `--dataset` 传入存在的 JSONL 路径,`dataset_from_name` 按 RULER 文件解析,字段与流程同 RULER 系列。
224
+ - 采样、归因阶段行为与上文 RULER 描述一致,只是文件名由显式路径决定。
225
+
226
+ ---
227
+
228
+ ## 归因阶段加载优先级与效果
229
+ - `run_exp.py` 加载顺序:`exp/exp2/data/<name>.jsonl` 缓存 > 显式给定的 JSONL 路径 > 原始解析(MoreHopQA 或 RULER)
230
+ - 恢复率 (`mode=recovery_ruler`) 仅支持 RULER(要求 `metadata.needle_spans`),否则拒绝
231
+ - 忠实度 (`mode=faithfulness_gen`) 使用生成文本;`ifr_multi_hop` 在有 `sink_span`/`thinking_span` 时才对答案/CoT 做多跳,否则退化为整段生成
exp/exp2/README.md ADDED
@@ -0,0 +1,106 @@
1
+ # FlashTrace Experiment 2 (faithfulness under multi-step reasoning)
2
+
3
+ This directory provides the tooling for the "11 datasets × 9 methods × 3 metrics" experiment, **skipping AT2** and **skipping math**. The workflow has two steps: first sample and filter high-quality CoT + boxed generations, then run attribution evaluation on the filtered results.
4
+
5
+ Supported datasets: MoreHopQA, HotpotQA (RULER hotpotqa_long), RULER niah (niah_*), RULER variable tracking (vt_*). RULER paths are discovered automatically under `data/ruler_multihop/<len>/.../validation.jsonl`.
6
+
7
+ Main files:
8
+ - `sample_and_filter.py`: sampling + judge agreement, writes to `exp/exp2/data/`
9
+ - `run_exp.py`: attribution evaluation, writes to `exp/exp2/output/`
10
+ - `dataset_utils.py`: data loading and answer-span parsing
11
+
12
+ Datasets supported by the sampling script
13
+ - `morehopqa` (local `data/with_human_verification.json`)
14
+ - `hotpotqa_long` (auto-discovered at `data/ruler_multihop/<len>/hotpotqa_long/validation.jsonl`)
15
+ - `niah_*` (RULER niah variants, auto-discovered as above)
16
+ - `vt_*` (RULER variable tracking variants, auto-discovered as above)
17
+ - a RULER JSONL path passed directly (treated as the dataset name); other dataset types are not supported
18
+
19
+ Attribution evaluation support
20
+ - Datasets: prefer the `exp/exp2/data/<name>.jsonl` cache; if absent, load with the same parsing rules as sampling; math is explicitly rejected.
21
+ - Metrics:
22
+ - `faithfulness_gen` (generation side): runs on any loaded example (other than math).
23
+ - `recovery_ruler` (recovery, RULER only): Recall@10% (ranking is done over prompt tokens only, gold comes from `needle_spans`); see the sketch below this list.
24
+ - Methods (`--attr_funcs`): `IG`, `perturbation_all`, `perturbation_CLP`, `perturbation_REAGENT`, `attention` (fuses IG internally), `ifr_all_positions`, `ifr_multi_hop`, `attnlrp`, `ft_attnlrp`, `basic`. AT2 is not provided.
25
+
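+ A minimal sketch of the Recall@10% computation as described above (the exact tie-breaking in `run_exp.py` may differ):
+ ```python
+ import numpy as np
+
+ def recall_at_10pct(attr_weights, gold_token_indices):
+     """attr_weights: one score per prompt token; gold_token_indices: tokens covered by needle_spans."""
+     k = max(1, int(0.1 * len(attr_weights)))
+     top_k = set(np.argsort(-np.asarray(attr_weights))[:k].tolist())
+     gold = set(int(i) for i in gold_token_indices)
+     return len(gold & top_k) / max(1, len(gold))
+ ```
+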
26
+ ---
27
+
28
+ ## Data sampling
29
+
30
+ Implementation
31
+ - Unified data loading: `DatasetLoader` reads MoreHopQA / HotpotQA / RULER niah / RULER vt; a custom RULER JSONL can also be passed directly.
32
+ - Generation model: `qwen3-235b-a22b-2507` (English system prompt), asked to "think briefly first, then wrap the final answer in `\box{}` and append nothing after it"; the user prompt is the original question, with no extra template.
33
+ - Judge model: `deepseek-v3-1-terminus` (English system prompt), outputs only True/False on whether the `\box{}` content matches the reference answer.
34
+ - Filtering: keep only samples that are "thinking + trailing boxed answer" and judged True; `target` is rebuilt from the extracted thinking segment plus the **final answer with the box wrapper removed**, together with token-level `sink_span`/`thinking_span`, `reference_answer`, and `judge_response` (`candidate_answer` is no longer stored); `indices_to_explain` is always written as `sink_span` (the generation token span of the boxed content inside `target`, [start_tok, end_tok]).
35
+ - Sampling walks the source examples in order and drops a sample as soon as the judge rejects it; it stops early once `--max_examples` accepted samples are collected (fewer if the source runs out), and tqdm shows attempted and accepted counts separately.
36
+
37
+ Usage
38
+ ```bash
39
+ export FLASHTRACE_API_KEY=sk-yaojia-get-ccfa # or OPENAI_API_KEY
40
+
41
+ # Example: keep at most 100 samples judged True (shown here on the MoreHopQA file; pass hotpotqa_long etc. for RULER)
42
+ python exp/exp2/sample_and_filter.py \
43
+ --dataset data/with_human_verification.json \
44
+ --max_examples 100 \
45
+ --api_key sk-yaojia-get-ccfa \
46
+ --tokenizer_model /opt/share/models/Qwen/Qwen3-8B > exp/exp2/out.log
47
+ ```
48
+ Common arguments:
49
+ - `--dataset`: morehopqa | hotpotqa_long | niah_* | vt_* (or a JSONL path directly)
50
+ - `--max_examples`: number of accepted samples to keep; sampling stops once this is reached (fewer if the source is smaller)
51
+ - `--tokenizer_model`: tokenizer used for span detection (defaults to the generation model)
52
+ - `--api_base`/`--api_key`: endpoint and key (default local http://localhost:4000/v1)
53
+ - `--request_interval` / `--judge_interval`: throttling between generation / judge calls (default 1s)
54
+ - `--rate_limit_delay`: seconds to wait on HTTP 429 (default 5s); sleeps automatically before retrying
55
+ Output: `exp/exp2/data/<dataset>.jsonl`
56
+
57
+ ---
58
+
59
+ ## Attribution evaluation
60
+
61
+ Implementation
62
+ - Input: prefer `exp/exp2/data/<dataset>.jsonl` (the filtered cache); if it does not exist, fall back to parsing the raw data.
63
+ - Methods: faithfulness (token-level RISE/MAS) follows the logic of `evaluations/faithfulness.py` (AT2 not implemented); math is rejected automatically.
64
+ - Multi-hop FlashTrace: when the cache carries `sink_span`/`thinking_span` they drive the multi-hop IFR, otherwise the whole answer sentence is used as the sink.
65
+ - One run can evaluate several metrics: `--mode` accepts multiple values and comma-separated lists (e.g. `--mode faithfulness_gen,recovery_ruler` or `--mode faithfulness_gen, recovery_ruler`), running attribution only once on the same batch of samples.
66
+ - Optional per-sample traces: `--save_hop_traces` saves attribution vectors and per-sample metrics for **all methods and all samples** under `exp/exp2/output/traces/...`; for multi-hop methods the per-hop token-level vector `V_h` (a single `vh`, i.e. the vector actually propagated across hops) is saved as well, and the manifest records settings such as `attnlrp_neg_handling/attnlrp_norm_mode`. A small inspection sketch follows this list.
67
+ - Known compatibility issue: some tokenizers merge tokens at chat-template boundaries, so locating the user prompt via a token-id subsequence on the evaluation side can fail; exp2 now reuses the `user_prompt_indices` computed during attribution for perturbation localisation.
68
+ - Batch-size estimate: keeps the original script's conservative estimate `(max_input_len-100)/len(tokenizer(format_prompt(prompt)+target))` (at least 1). `max_input_len` comes from a built-in mapping keyed on the `--model` string; unmatched names, or passing only `--model_path`, default to 2000. To get the mapped value while loading from a local path, also pass the corresponding `--model` name.
69
+ - Timing: attribution (recovery/faithfulness) is timed per sample; an `Avg Sample Time (s)` row is appended to the CSV and the average is printed to the console.
70
+ - Output: `exp/exp2/output/faithfulness/...`, `exp/exp2/output/recovery/...`, and (optionally) `exp/exp2/output/traces/...`, organised by dataset and model.
71
+
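+ A minimal sketch for peeking at saved traces (the array names inside each `ex_*.npz` depend on the method and are not documented here, so this simply lists whatever is stored):
+ ```python
+ import numpy as np
+ from pathlib import Path
+
+ dataset, model, run_tag = "morehopqa", "qwen-8B", "<run_tag>"  # fill in a real run directory
+ run_dir = Path("exp/exp2/output/traces") / dataset / model / run_tag
+ for npz_path in sorted(run_dir.glob("ex_*.npz"))[:3]:
+     with np.load(npz_path) as data:
+         print(npz_path.name, {key: data[key].shape for key in data.files})
+ ```
+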
72
+ Usage
73
+ ```bash
74
+ # Generation-side RISE/MAS faithfulness; available methods: perturbation_all_fast,perturbation_CLP_fast,perturbation_REAGENT_fast,ifr_multi_hop_stop_words,ifr_multi_hop_both,ifr_multi_hop_split_hop,ft_attnlrp,ifr_multi_hop,attnlrp,ifr_all_positions,perturbation_all,perturbation_REAGENT,perturbation_CLP,IG,attention
75
+ python exp/exp2/run_exp.py \
76
+ --datasets exp/exp2/data/math.jsonl \
77
+ --attr_funcs IG,attention \
78
+ --model qwen-8B \
79
+ --model_path /opt/share/models/Qwen/Qwen3-8B/ \
80
+ --cuda 2,3,4,5,6,7 \
81
+ --num_examples 100 \
82
+ --mode faithfulness_gen \
83
+ --n_hops 1 \
84
+ --save_hop_traces \
85
+ && python exp/exp2/run_exp.py \
86
+ --datasets exp/exp2/data/morehopqa.jsonl \
87
+ --attr_funcs IG,attention \
88
+ --model qwen-8B \
89
+ --model_path /opt/share/models/Qwen/Qwen3-8B/ \
90
+ --cuda 2,3,4,5,6,7 \
91
+ --num_examples 100 \
92
+ --mode faithfulness_gen \
93
+ --n_hops 1 \
94
+ --save_hop_traces
95
+
96
+ # --attnlrp_neg_handling drop \
97
+ # --attnlrp_norm_mode norm
98
+ ```
99
+ Common arguments:
100
+ - `--datasets`: comma-separated dataset names; if `exp/exp2/data/<name>.jsonl` already exists it is used directly.
101
+ - `--attr_funcs`: comma-separated methods (no AT2); `ifr_multi_hop` and `ft_attnlrp` support multi-hop (controlled by `--n_hops`).
102
+ - `--attnlrp_neg_handling`: per-hop handling of negative values in FT-AttnLRP (`drop`/`abs`).
103
+ - `--attnlrp_norm_mode`: FT-AttnLRP normalisation and hop-ratio switch (`norm`/`no_norm`).
104
+ - `--data_root`/`--output_root`: cache and result directories (default `exp/exp2/data` / `exp/exp2/output`).
105
+ - `--mode`: `faithfulness_gen`, `recovery_ruler`; multiple values / comma-separated lists are allowed (one attribution pass yields several metrics); `--num_examples` controls how many examples are evaluated. math is rejected.
106
+ - `--save_hop_traces`: save per-sample traces to `exp/exp2/output/traces/<dataset>/<model>/<run_tag>/` (one `ex_*.npz` per sample + `manifest.jsonl`).
exp/exp2/dataset_utils.py ADDED
@@ -0,0 +1,386 @@
1
+ """Dataset helpers for Experiment 2 (CoT / multi-hop faithfulness).
2
+
3
+ Named dataset_utils to avoid collision with the HF `datasets` package.
4
+ """
5
+
6
+ from __future__ import annotations
7
+
8
+ import json
9
+ import random
10
+ import re
11
+ from dataclasses import dataclass
12
+ from pathlib import Path
13
+ from typing import Any, Dict, Iterable, List, Optional
14
+
15
+ from attribution_datasets import (
16
+ AttributionExample,
17
+ MoreHopQAAttributionDataset,
18
+ RulerAttributionDataset,
19
+ )
20
+
21
+
22
+ @dataclass
23
+ class CachedExample:
24
+ prompt: str
25
+ target: Optional[str]
26
+ indices_to_explain: Optional[List[int]]
27
+ attr_mask_indices: Optional[List[int]]
28
+ sink_span: Optional[List[int]]
29
+ thinking_span: Optional[List[int]]
30
+ metadata: Dict[str, Any]
31
+
32
+
33
+ def read_cached_jsonl(path: Path) -> List[CachedExample]:
34
+ examples: List[CachedExample] = []
35
+ with path.open("r", encoding="utf-8") as f:
36
+ for line in f:
37
+ if not line.strip():
38
+ continue
39
+ obj = json.loads(line)
40
+ examples.append(
41
+ CachedExample(
42
+ prompt=obj["prompt"],
43
+ target=obj.get("target"),
44
+ indices_to_explain=obj.get("indices_to_explain"),
45
+ attr_mask_indices=obj.get("attr_mask_indices"),
46
+ sink_span=obj.get("sink_span"),
47
+ thinking_span=obj.get("thinking_span"),
48
+ metadata=obj.get("metadata", {}),
49
+ )
50
+ )
51
+ return examples
52
+
53
+
54
+ def load_cached(path: Path, sample: Optional[int] = None, seed: int = 42) -> List[CachedExample]:
55
+ ex = read_cached_jsonl(path)
56
+ if sample is not None and sample < len(ex):
57
+ random.Random(seed).shuffle(ex)
58
+ ex = ex[:sample]
59
+ return ex
60
+
61
+
62
+ def load_ruler(path: Path, sample: Optional[int] = None, seed: int = 42) -> List[CachedExample]:
63
+ ds = RulerAttributionDataset(path)
64
+ examples: List[CachedExample] = []
65
+ ex_iter: Iterable[AttributionExample] = ds
66
+ if sample is not None and sample < len(ds):
67
+ ex_iter = list(ds)
68
+ random.Random(seed).shuffle(ex_iter)
69
+ ex_iter = ex_iter[:sample]
70
+ for ex in ex_iter:
71
+ examples.append(
72
+ CachedExample(
73
+ prompt=ex.prompt,
74
+ target=ex.target,
75
+ indices_to_explain=ex.indices_to_explain,
76
+ attr_mask_indices=ex.attr_mask_indices,
77
+ sink_span=None,
78
+ thinking_span=None,
79
+ metadata=ex.metadata,
80
+ )
81
+ )
82
+ return examples
83
+
84
+
85
+ def load_morehopqa(
86
+ path: str | Path = "./data/with_human_verification.json", sample: Optional[int] = None, seed: int = 42
87
+ ) -> List[CachedExample]:
88
+ ds = MoreHopQAAttributionDataset(path)
89
+ ex_iter: Iterable[AttributionExample] = ds
90
+ if sample is not None and sample < len(ds):
91
+ ex_iter = list(ds)
92
+ random.Random(seed).shuffle(ex_iter)
93
+ ex_iter = ex_iter[:sample]
94
+ examples: List[CachedExample] = []
95
+ for ex in ex_iter:
96
+ examples.append(
97
+ CachedExample(
98
+ prompt=ex.prompt,
99
+ target=None,
100
+ indices_to_explain=ex.indices_to_explain,
101
+ attr_mask_indices=ex.attr_mask_indices,
102
+ sink_span=None,
103
+ thinking_span=None,
104
+ metadata=ex.metadata,
105
+ )
106
+ )
107
+ return examples
108
+
109
+
110
+ def auto_find_ruler(task: str) -> Optional[Path]:
111
+ length_dirs = ["4096", "8192", "16384", "32768", "65536", "131072"]
112
+ base = Path("data/ruler_multihop")
113
+ for ld in length_dirs:
114
+ cand = base / ld / task / "validation.jsonl"
115
+ if cand.exists():
116
+ return cand
117
+ return None
118
+
119
+
120
+ def dataset_from_name(name: str) -> Optional[Path]:
121
+ if name == "hotpotqa_long":
122
+ return auto_find_ruler("hotpotqa_long")
123
+ if name.startswith("vt_"):
124
+ return auto_find_ruler(name)
125
+ if name.startswith("niah"):
126
+ return auto_find_ruler(name)
127
+ p = Path(name)
128
+ if p.exists():
129
+ return p
130
+ return None
131
+
132
+
133
+ _BOX_PATTERN = re.compile(r"\\box(?:ed)?\s*[\{{](.*?)[\}}]", flags=re.DOTALL)
134
+
135
+
136
+ def _find_box_span(text: str) -> Optional[tuple[int, int, str]]:
137
+ """Return (start_char, end_char, answer_text) for the last \\boxed block."""
138
+ matches = list(_BOX_PATTERN.finditer(text))
139
+ if not matches:
140
+ return None
141
+ m = matches[-1]
142
+ return m.start(0), m.end(0), m.group(1).strip()
143
+
144
+
145
+ def extract_boxed_answer(text: str) -> Optional[str]:
146
+ """Extract the answer string inside the last \\boxed{} block."""
147
+ match = _find_box_span(text)
148
+ return match[2] if match else None
149
+
150
+
151
+ def _find_answer_span(text: str, answer: str) -> Optional[tuple[int, int]]:
152
+ """Return (start_char, end_char) for the last occurrence of `answer` in text."""
153
+ if not answer or not text:
154
+ return None
155
+ start = text.rfind(answer)
156
+ if start == -1:
157
+ return None
158
+ return start, start + len(answer)
159
+
160
+
161
+ def split_boxed_generation(text: str) -> Optional[tuple[str, str, str]]:
162
+ """Return (thinking_text, boxed_segment, boxed_answer) if format matches."""
163
+ if not text:
164
+ return None
165
+ match = _find_box_span(text)
166
+ if not match:
167
+ return None
168
+
169
+ start_char, end_char, boxed_inner = match
170
+ boxed_segment = text[start_char:end_char].strip()
171
+ thinking_text = text[:start_char].strip()
172
+ trailing = text[end_char:].strip()
173
+
174
+ if not boxed_inner or not boxed_segment:
175
+ return None
176
+ if trailing:
177
+ return None
178
+ if not thinking_text:
179
+ return None
180
+
181
+ return thinking_text, boxed_segment, boxed_inner
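+ # Worked examples (illustrative):
+ #   split_boxed_generation("First find the year, then add 2. \\box{1947}")
+ #   -> ("First find the year, then add 2.", "\\box{1947}", "1947")
+ #   split_boxed_generation("\\box{1947}") -> None            (no thinking text before the box)
+ #   split_boxed_generation("... \\box{1947} Done.") -> None  (trailing text after the box)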
182
+
183
+
184
+ def attach_spans_from_answer(
185
+ example: CachedExample, tokenizer, answer_text: Optional[str] = None
186
+ ) -> CachedExample:
187
+ """Attach sink/thinking spans by locating the (plain) answer in `target`.
188
+
189
+ `answer_text` should be the extracted boxed answer; falls back to metadata or
190
+ parsing the target when omitted. Works even when the target no longer keeps
191
+ the \\box{} wrapper.
192
+ """
193
+ tgt = example.target or ""
194
+ answer = (answer_text or "").strip()
195
+ if not answer:
196
+ answer = (example.metadata.get("boxed_answer") or extract_boxed_answer(tgt) or "").strip()
197
+
198
+ metadata = dict(example.metadata)
199
+ if answer:
200
+ metadata.setdefault("boxed_answer", answer)
201
+
202
+ if tokenizer is None or not tgt or not answer:
203
+ return CachedExample(
204
+ prompt=example.prompt,
205
+ target=example.target,
206
+ indices_to_explain=example.indices_to_explain,
207
+ attr_mask_indices=example.attr_mask_indices,
208
+ sink_span=example.sink_span,
209
+ thinking_span=example.thinking_span,
210
+ metadata=metadata,
211
+ )
212
+
213
+ span = _find_answer_span(tgt, answer)
214
+ if span is None:
215
+ return CachedExample(
216
+ prompt=example.prompt,
217
+ target=example.target,
218
+ indices_to_explain=example.indices_to_explain,
219
+ attr_mask_indices=example.attr_mask_indices,
220
+ sink_span=example.sink_span,
221
+ thinking_span=example.thinking_span,
222
+ metadata=metadata,
223
+ )
224
+
225
+ span_start_char, span_end_char = span
226
+ gen_ids = tokenizer(tgt, add_special_tokens=False, return_offsets_mapping=True)
227
+ sink_tokens: List[int] = []
228
+ for idx, (s, e) in enumerate(gen_ids["offset_mapping"]):
229
+ # include tokens that overlap the answer span
230
+ if s < span_end_char and e > span_start_char:
231
+ sink_tokens.append(idx)
232
+ if not sink_tokens:
233
+ return CachedExample(
234
+ prompt=example.prompt,
235
+ target=example.target,
236
+ indices_to_explain=example.indices_to_explain,
237
+ attr_mask_indices=example.attr_mask_indices,
238
+ sink_span=example.sink_span,
239
+ thinking_span=example.thinking_span,
240
+ metadata=metadata,
241
+ )
242
+
243
+ sink_span = [min(sink_tokens), max(sink_tokens)]
244
+ thinking_end = max(0, sink_span[0] - 1)
245
+ thinking_span = [0, thinking_end] if thinking_end >= 0 else sink_span
246
+
247
+ return CachedExample(
248
+ prompt=example.prompt,
249
+ target=example.target,
250
+ indices_to_explain=example.indices_to_explain,
251
+ attr_mask_indices=example.attr_mask_indices,
252
+ sink_span=example.sink_span or sink_span,
253
+ thinking_span=example.thinking_span or thinking_span,
254
+ metadata=metadata,
255
+ )
256
+
257
+
258
+ def attach_spans_from_boxed(example: CachedExample, tokenizer) -> CachedExample:
259
+ """Backward-compatible wrapper that first looks for \\box{} then falls back to answer text."""
260
+ tgt = example.target
261
+ match = _find_box_span(tgt) if tgt else None
262
+ boxed_answer = match[2] if match else None
263
+ return attach_spans_from_answer(example, tokenizer, boxed_answer)
264
+
265
+
266
+ def ruler_gold_prompt_token_indices(example: CachedExample, tokenizer) -> List[int]:
267
+ """Return token indices (prompt-side) that overlap RULER `needle_spans` in metadata.
268
+
269
+ The returned indices are with respect to `tokenizer(" " + example.prompt, add_special_tokens=False)`,
270
+ matching the attribution pipeline's leading-space convention.
271
+ """
272
+ needle_spans = (example.metadata or {}).get("needle_spans") or []
273
+ if not isinstance(needle_spans, list) or not needle_spans:
274
+ return []
275
+
276
+ prompt_text = " " + (example.prompt or "")
277
+ enc = tokenizer(prompt_text, add_special_tokens=False, return_offsets_mapping=True)
278
+ offsets = enc.get("offset_mapping")
279
+ if offsets is None:
280
+ raise ValueError("Tokenizer does not provide offset_mapping; cannot map needle_spans to tokens.")
281
+
282
+ spans: List[tuple[int, int]] = []
283
+ for item in needle_spans:
284
+ if not isinstance(item, dict):
285
+ continue
286
+ raw = item.get("span")
287
+ if not (isinstance(raw, list) and len(raw) == 2):
288
+ continue
289
+ try:
290
+ start = int(raw[0]) + 1 # shift for leading space in prompt_text
291
+ end = int(raw[1]) + 1
292
+ except Exception:
293
+ continue
294
+ if end > start:
295
+ spans.append((start, end))
296
+
297
+ if not spans:
298
+ return []
299
+
300
+ gold: set[int] = set()
301
+ for tok_idx, off in enumerate(offsets):
302
+ if off is None:
303
+ continue
304
+ try:
305
+ s, e = int(off[0]), int(off[1])
306
+ except Exception:
307
+ continue
308
+ if e <= s:
309
+ continue
310
+ for span_start, span_end in spans:
311
+ if s < span_end and e > span_start:
312
+ gold.add(tok_idx)
313
+ break
314
+
315
+ return sorted(gold)
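The `+1` shift and the overlap test used above can be illustrated with made-up offsets (in practice the offsets come from the fast tokenizer's `offset_mapping` over `" " + prompt`):

```python
# Made-up per-token (start, end) character offsets over " " + prompt.
offsets = [(0, 4), (4, 10), (10, 15), (15, 22)]
needle_span = (3, 11)                            # raw char span from metadata["needle_spans"]
span = (needle_span[0] + 1, needle_span[1] + 1)  # shift for the leading space

gold = [i for i, (s, e) in enumerate(offsets) if s < span[1] and e > span[0]]
assert gold == [1, 2]                            # tokens overlapping chars [4, 12)
```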
316
+
317
+
318
+ class DatasetLoader:
319
+ """Thin loader that resolves and samples datasets for exp2."""
320
+
321
+ def __init__(self, seed: int = 42, data_root: Path | str = Path("exp/exp2/data")) -> None:
322
+ self.seed = seed
323
+ self.data_root = Path(data_root)
324
+
325
+ def _sample(self, items: List[CachedExample], sample: Optional[int]) -> List[CachedExample]:
326
+ if sample is not None and sample < len(items):
327
+ rnd = random.Random(self.seed)
328
+ rnd.shuffle(items)
329
+ items = items[:sample]
330
+ return items
331
+
332
+ def _cached_path(self, name: str) -> Optional[Path]:
333
+ path = self.data_root / f"{name}.jsonl"
334
+ return path if path.exists() else None
335
+
336
+ def load(self, name: str, sample: Optional[int] = None) -> List[CachedExample]:
337
+ # 1) Prefer prepared cache under exp/exp2/data
338
+ cached_path = self._cached_path(name)
339
+ if cached_path:
340
+ return self._sample(load_cached(cached_path), sample)
341
+
342
+ return self.load_raw(name, sample=sample)
343
+
344
+ def load_raw(self, name: str, sample: Optional[int] = None) -> List[CachedExample]:
345
+ def _looks_like_json_array(path: Path) -> bool:
346
+ try:
347
+ with path.open("r", encoding="utf-8") as f:
348
+ while True:
349
+ ch = f.read(1)
350
+ if not ch:
351
+ return False
352
+ if ch.isspace():
353
+ continue
354
+ return ch == "["
355
+ except OSError:
356
+ return False
357
+
358
+ # MoreHopQA
359
+ if name == "morehopqa":
360
+ ex = load_morehopqa()
361
+ for item in ex:
362
+ if "answer" in item.metadata:
363
+ item.metadata.setdefault("reference_answer", item.metadata["answer"])
364
+ return self._sample(ex, sample)
365
+
366
+ # Allow passing the raw MoreHopQA JSON path directly.
367
+ p = Path(name)
368
+ if p.exists() and _looks_like_json_array(p):
369
+ ex = load_morehopqa(p)
370
+ for item in ex:
371
+ if "answer" in item.metadata:
372
+ item.metadata.setdefault("reference_answer", item.metadata["answer"])
373
+ return self._sample(ex, sample)
374
+
375
+ # RULER / HotpotQA / niah / vt (all go through RulerAttributionDataset)
376
+ resolved = dataset_from_name(name)
377
+ if resolved is None:
378
+ raise FileNotFoundError(f"Could not resolve dataset {name}")
379
+ ex = load_ruler(resolved)
380
+ for item in ex:
381
+ outputs = item.metadata.get("outputs") or []
382
+ if outputs:
383
+ item.metadata.setdefault("reference_answer", ", ".join(outputs))
384
+ if item.target and "reference_answer" not in item.metadata:
385
+ item.metadata["reference_answer"] = item.target
386
+ return self._sample(ex, sample)
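A hedged usage sketch of the loader (the dataset name, sample size, and tokenizer are assumptions; any HuggingFace fast tokenizer that provides offset mappings should work):

```python
from transformers import AutoTokenizer

from exp.exp2.dataset_utils import DatasetLoader, ruler_gold_prompt_token_indices

loader = DatasetLoader(seed=42, data_root="exp/exp2/data")
examples = loader.load("hotpotqa_long", sample=8)  # prefers exp/exp2/data/hotpotqa_long.jsonl if present

tok = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-7B-Instruct")  # assumed tokenizer
for ex in examples:
    gold = ruler_gold_prompt_token_indices(ex, tok)  # prompt-side token indices of the needles
    print(len(gold), ex.metadata.get("reference_answer"))
```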
exp/exp2/map_math_mine_to_exp2_cache.py ADDED
@@ -0,0 +1,584 @@
1
+ #!/usr/bin/env python3
2
+ """Prepare data/math_mine.json into an exp2 cached JSONL dataset.
3
+
4
+ This script supports two modes:
5
+
6
+ - map (offline): convert GSM8K-style math examples:
7
+
8
+ {"question": "...", "answer": "... #### 18"}
9
+
10
+ into exp2's cached JSONL format (one JSON object per line).
11
+
12
+ - resample (online): resample targets like exp/exp2/sample_and_filter.py:
13
+ call a chat completion API to generate "<thinking> + final \\box{} answer",
14
+ judge the boxed answer against the reference answer extracted from the raw
15
+ GSM8K-style entry, and write only judge=True samples.
16
+
17
+ In both modes, exp2 expects token-level spans (NOT character spans):
18
+
19
+ - indices_to_explain: [start_tok, end_tok] (generation-token indices, closed interval)
20
+ - sink_span/thinking_span: token spans over tokenizer(target, add_special_tokens=False)
21
+ """
22
+
23
+ from __future__ import annotations
24
+
25
+ import argparse
26
+ import json
27
+ import os
28
+ import sys
29
+ import time
30
+ import urllib.error
31
+ import urllib.request
32
+ from dataclasses import asdict
33
+ from pathlib import Path
34
+ from typing import Any, Dict, List, Optional, Tuple
35
+
36
+ from transformers import AutoTokenizer
37
+ from tqdm import tqdm
38
+
39
+ REPO_ROOT = Path(__file__).resolve().parents[2]
40
+ if str(REPO_ROOT) not in sys.path:
41
+ sys.path.insert(0, str(REPO_ROOT))
42
+
43
+ from exp.exp2.dataset_utils import CachedExample, attach_spans_from_answer, split_boxed_generation # noqa: E402
44
+
45
+
46
+ class RateLimitError(RuntimeError):
47
+ """Raised when API returns 429; carries a suggested wait time."""
48
+
49
+ def __init__(self, wait_seconds: float, detail: str) -> None:
50
+ super().__init__(detail)
51
+ self.wait_seconds = wait_seconds
52
+
53
+
54
+ GEN_SYSTEM_PROMPT = (
55
+ "You are a reasoning assistant. "
56
+ "Before answering, engage in a chain of thought. "
57
+ "Process this freely and naturally without using specific headers or strict formatting. "
58
+ "When you reach the conclusion, wrap the entire final sentence containing the answer inside \\box{}. "
59
+ "Ensure the box wraps the **sentence** that naturally delivers the answer. DO NOT rewrite the answer word for the box separately."
60
+ )
61
+
62
+ JUDGE_SYSTEM_PROMPT = (
63
+ "You verify whether the model's boxed answer matches the reference answer. "
64
+ "Reply strictly with True or False and nothing else."
65
+ )
66
+
67
+
68
+ def call_chat_api(
69
+ api_base: str,
70
+ api_key: str,
71
+ model: str,
72
+ messages: List[Dict[str, str]],
73
+ *,
74
+ timeout: int,
75
+ max_tokens: int,
76
+ temperature: float,
77
+ cache_ttl: int,
78
+ cache_namespace: Optional[str],
79
+ rate_limit_delay: Optional[float] = None,
80
+ ) -> str:
81
+ """Minimal OpenAI-compatible chat.completions client (no external deps)."""
82
+ url = api_base.rstrip("/") + "/chat/completions"
83
+ payload: Dict[str, Any] = {
84
+ "model": model,
85
+ "messages": messages,
86
+ "max_tokens": max_tokens,
87
+ "temperature": temperature,
88
+ }
89
+ if cache_ttl > 0:
90
+ cache_obj: Dict[str, Any] = {"ttl": cache_ttl}
91
+ if cache_namespace:
92
+ cache_obj["namespace"] = cache_namespace
93
+ payload["cache"] = cache_obj
94
+
95
+ data = json.dumps(payload).encode("utf-8")
96
+ headers = {"Content-Type": "application/json"}
97
+ if api_key:
98
+ headers["Authorization"] = f"Bearer {api_key}"
99
+
100
+ req = urllib.request.Request(url, data=data, headers=headers, method="POST")
101
+ opener = urllib.request.build_opener(urllib.request.ProxyHandler({}))
102
+ try:
103
+ with opener.open(req, timeout=timeout) as resp:
104
+ resp_bytes = resp.read()
105
+ except urllib.error.HTTPError as e:
106
+ detail = e.read().decode("utf-8", errors="ignore") if hasattr(e, "read") else ""
107
+ if e.code == 429:
108
+ retry_after = None
109
+ if hasattr(e, "headers") and e.headers:
110
+ retry_after_header = e.headers.get("Retry-After")
111
+ if retry_after_header:
112
+ try:
113
+ retry_after = float(retry_after_header)
114
+ except ValueError:
115
+ retry_after = None
116
+ wait = retry_after or rate_limit_delay or 5.0
117
+ raise RateLimitError(wait, f"API HTTP 429: {detail}") from e
118
+ raise RuntimeError(f"API HTTP error {e.code}: {detail}") from e
119
+ except urllib.error.URLError as e:
120
+ raise RuntimeError(f"API request failed: {e}") from e
121
+
122
+ try:
123
+ response = json.loads(resp_bytes.decode("utf-8"))
124
+ except json.JSONDecodeError as e:
125
+ raise RuntimeError(f"Failed to decode API response: {resp_bytes!r}") from e
126
+
127
+ choices = response.get("choices", [])
128
+ if not choices:
129
+ raise RuntimeError(f"Empty choices from API: {response}")
130
+ content = choices[0].get("message", {}).get("content", "")
131
+ if not content:
132
+ raise RuntimeError(f"Empty content from API: {response}")
133
+ return content.strip()
134
+
135
+
136
+ def build_gen_messages(prompt: str) -> List[Dict[str, str]]:
137
+ return [
138
+ {"role": "system", "content": GEN_SYSTEM_PROMPT},
139
+ {"role": "user", "content": prompt},
140
+ ]
141
+
142
+
143
+ def build_judge_messages(reference_answer: str, candidate_answer: str) -> List[Dict[str, str]]:
144
+ user = (
145
+ "Decide if the model's boxed answer matches the reference answer.\n"
146
+ f"Reference answer: {reference_answer}\n"
147
+ f"Model boxed answer (only the content inside \\box{{}}): {candidate_answer}\n"
148
+ "Output only True if they are semantically consistent; otherwise output False."
149
+ )
150
+ return [
151
+ {"role": "system", "content": JUDGE_SYSTEM_PROMPT},
152
+ {"role": "user", "content": user},
153
+ ]
154
+
155
+
156
+ def parse_bool(text: str) -> bool:
157
+ first = text.strip().splitlines()[0].strip().lower()
158
+ if first in {"true", "yes"}:
159
+ return True
160
+ if first in {"false", "no"}:
161
+ return False
162
+ # fallback: check substring
163
+ if "true" in first and "false" not in first:
164
+ return True
165
+ if "false" in first:
166
+ return False
167
+ raise ValueError(f"Cannot parse boolean from: {text!r}")
168
+
169
+
170
+ def _load_tokenizer(tokenizer_model: str):
171
+ tok_path = Path(tokenizer_model)
172
+ if tok_path.exists():
173
+ tokenizer = AutoTokenizer.from_pretrained(tok_path.as_posix(), local_files_only=True)
174
+ else:
175
+ tokenizer = AutoTokenizer.from_pretrained(tokenizer_model)
176
+ if tokenizer.pad_token is None and tokenizer.eos_token is not None:
177
+ tokenizer.pad_token = tokenizer.eos_token
178
+ return tokenizer
179
+
180
+
181
+ def _split_gsm8k_answer(answer: str) -> Optional[Tuple[str, str]]:
182
+ """Return (thinking_text, final_answer) parsed from GSM8K `answer`."""
183
+ text = (answer or "").strip()
184
+ if not text:
185
+ return None
186
+ if "####" not in text:
187
+ return None
188
+ thinking, final = text.rsplit("####", 1)
189
+ thinking = thinking.strip()
190
+ final = final.strip()
191
+ if not final:
192
+ return None
193
+ return thinking, final
194
+
195
+
196
+ def _is_token_span(span: Any) -> bool:
197
+ return isinstance(span, list) and len(span) == 2 and all(isinstance(x, int) for x in span)
198
+
199
+
200
+ def _build_cached_example(
201
+ *,
202
+ question: str,
203
+ answer: str,
204
+ tokenizer,
205
+ example_idx: int,
206
+ source_path: str,
207
+ ) -> Optional[CachedExample]:
208
+ parsed = _split_gsm8k_answer(answer)
209
+ if parsed is None:
210
+ return None
211
+ thinking_text, final_answer = parsed
212
+
213
+ prompt = question.strip()
214
+ target = f"{thinking_text}\n{final_answer}" if thinking_text else final_answer
215
+
216
+ example = CachedExample(
217
+ prompt=prompt,
218
+ target=target,
219
+ indices_to_explain=None,
220
+ attr_mask_indices=None,
221
+ sink_span=None,
222
+ thinking_span=None,
223
+ metadata={
224
+ "dataset": "math_mine",
225
+ "source_path": source_path,
226
+ "example_idx": int(example_idx),
227
+ "raw_question": question,
228
+ "raw_answer": answer,
229
+ "reference_answer": final_answer,
230
+ "boxed_answer": final_answer,
231
+ },
232
+ )
233
+ example = attach_spans_from_answer(example, tokenizer, final_answer)
234
+ if not _is_token_span(example.sink_span):
235
+ return None
236
+
237
+ # exp2 requires token-level indices_to_explain=[start_tok,end_tok] (closed interval).
238
+ indices_to_explain = list(example.sink_span)
239
+ thinking_span = example.thinking_span
240
+ if thinking_span is not None and _is_token_span(thinking_span) and indices_to_explain[0] == 0:
241
+ # No room for "thinking" tokens; avoid overlapping spans.
242
+ thinking_span = None
243
+
244
+ return CachedExample(
245
+ prompt=example.prompt,
246
+ target=example.target,
247
+ indices_to_explain=indices_to_explain,
248
+ attr_mask_indices=example.attr_mask_indices,
249
+ sink_span=indices_to_explain,
250
+ thinking_span=thinking_span,
251
+ metadata=example.metadata,
252
+ )
253
+
254
+
255
+ def _build_resampled_example(
256
+ *,
257
+ question: str,
258
+ raw_answer: str,
259
+ reference_answer: str,
260
+ generation: str,
261
+ tokenizer,
262
+ example_idx: int,
263
+ source_path: str,
264
+ judge_response: str,
265
+ generator_model: str,
266
+ judge_model: str,
267
+ ) -> Optional[CachedExample]:
268
+ parsed = split_boxed_generation(generation)
269
+ if not parsed:
270
+ return None
271
+
272
+ thinking_text, _boxed_segment, boxed_answer = parsed
273
+ target_text = f"{thinking_text}\n{boxed_answer}" if thinking_text else boxed_answer
274
+
275
+ example = CachedExample(
276
+ prompt=question.strip(),
277
+ target=target_text,
278
+ indices_to_explain=None,
279
+ attr_mask_indices=None,
280
+ sink_span=None,
281
+ thinking_span=None,
282
+ metadata={
283
+ "dataset": "math_mine",
284
+ "source_path": source_path,
285
+ "example_idx": int(example_idx),
286
+ "raw_question": question,
287
+ "raw_answer": raw_answer,
288
+ "reference_answer": reference_answer,
289
+ "judge_response": judge_response,
290
+ "generator_model": generator_model,
291
+ "judge_model": judge_model,
292
+ },
293
+ )
294
+ example = attach_spans_from_answer(example, tokenizer, boxed_answer)
295
+ if not _is_token_span(example.sink_span):
296
+ return None
297
+
298
+ indices_to_explain = list(example.sink_span)
299
+ return CachedExample(
300
+ prompt=example.prompt,
301
+ target=example.target,
302
+ indices_to_explain=indices_to_explain,
303
+ attr_mask_indices=example.attr_mask_indices,
304
+ sink_span=indices_to_explain,
305
+ thinking_span=example.thinking_span,
306
+ metadata=example.metadata,
307
+ )
308
+
309
+
310
+ def _write_jsonl(path: Path, *, examples) -> int:
311
+ path.parent.mkdir(parents=True, exist_ok=True)
312
+ count = 0
313
+ with path.open("w", encoding="utf-8") as f:
314
+ for ex in examples:
315
+ f.write(json.dumps(asdict(ex), ensure_ascii=False) + "\n")
316
+ count += 1
317
+ return count
318
+
319
+
320
+ def main() -> None:
321
+ ap = argparse.ArgumentParser("Prepare data/math_mine.json for exp2 cached JSONL.")
322
+ ap.add_argument("--in_json", type=str, default="data/math_mine.json")
323
+ ap.add_argument("--out_jsonl", type=str, default="exp/exp2/data/math.jsonl")
324
+ ap.add_argument(
325
+ "--tokenizer_model",
326
+ type=str,
327
+ required=True,
328
+ help="Tokenizer name or local path; must match the tokenizer used in exp2 attribution.",
329
+ )
330
+ ap.add_argument(
331
+ "--mode",
332
+ type=str,
333
+ choices=["map", "resample"],
334
+ default="map",
335
+ help="map=offline mapping from GSM8K answers; resample=generate+judge like exp/exp2/sample_and_filter.py.",
336
+ )
337
+
338
+ # Resample (online) options (kept compatible with exp/exp2/sample_and_filter.py).
339
+ ap.add_argument("--max_examples", type=int, default=100, help="Number of judge=True examples to keep (resample mode).")
340
+ ap.add_argument("--seed", type=int, default=42, help="Shuffle seed (only used with --shuffle).")
341
+ ap.add_argument("--shuffle", action="store_true", help="Shuffle examples before attempting (resample mode).")
342
+ ap.add_argument("--api_base", type=str, default="http://localhost:4000/v1", help="Chat API base URL.")
343
+ ap.add_argument("--api_key", type=str, default=None, help="API key; defaults to FLASHTRACE_API_KEY/OPENAI_API_KEY.")
344
+ ap.add_argument("--generator_model", type=str, default="qwen3-235b-a22b-2507")
345
+ ap.add_argument("--judge_model", type=str, default="deepseek-v3-1-terminus")
346
+ ap.add_argument("--api_timeout", type=int, default=300)
347
+ ap.add_argument("--api_max_tokens", type=int, default=8192)
348
+ ap.add_argument("--api_temperature", type=float, default=0.0)
349
+ ap.add_argument("--api_cache_ttl", type=int, default=600)
350
+ ap.add_argument("--api_cache_namespace", type=str, default="flashtrace-exp2")
351
+ ap.add_argument("--retry_delay", type=float, default=2.0)
352
+ ap.add_argument("--retries", type=int, default=2, help="Additional retries on API failure.")
353
+ ap.add_argument("--request_interval", type=float, default=1.0, help="Sleep seconds between generation calls.")
354
+ ap.add_argument("--judge_interval", type=float, default=1.0, help="Sleep seconds between judge calls.")
355
+ ap.add_argument("--rate_limit_delay", type=float, default=5.0, help="Seconds to wait on HTTP 429 before retrying.")
356
+ args = ap.parse_args()
357
+
358
+ in_path = Path(args.in_json)
359
+ out_path = Path(args.out_jsonl)
360
+ tokenizer = _load_tokenizer(args.tokenizer_model)
361
+
362
+ raw = json.loads(in_path.read_text(encoding="utf-8"))
363
+ if not isinstance(raw, list):
364
+ raise SystemExit(f"Expected a JSON array in {in_path}, got {type(raw).__name__}.")
365
+
366
+ source_total = len(raw)
367
+ total = 0
368
+ kept = 0
369
+ skipped_empty_q = 0
370
+ skipped_empty_a = 0
371
+ skipped_parse = 0
372
+ skipped_span = 0
373
+
374
+ examples = []
375
+ if args.mode == "map":
376
+ attempted = None
377
+ skipped_format = None
378
+ judged_false = None
379
+ for idx, item in enumerate(raw):
380
+ total += 1
381
+ if not isinstance(item, dict):
382
+ skipped_parse += 1
383
+ continue
384
+
385
+ question = str(item.get("question") or "")
386
+ answer = str(item.get("answer") or "")
387
+ if not question.strip():
388
+ skipped_empty_q += 1
389
+ continue
390
+ if not answer.strip():
391
+ skipped_empty_a += 1
392
+ continue
393
+
394
+ ex = _build_cached_example(
395
+ question=question,
396
+ answer=answer,
397
+ tokenizer=tokenizer,
398
+ example_idx=idx,
399
+ source_path=str(in_path),
400
+ )
401
+ if ex is None:
402
+ # distinguish parse-vs-span failure
403
+ parsed = _split_gsm8k_answer(answer)
404
+ if parsed is None:
405
+ skipped_parse += 1
406
+ else:
407
+ skipped_span += 1
408
+ continue
409
+
410
+ examples.append(ex)
411
+ kept += 1
412
+ else:
413
+ api_key = args.api_key or os.environ.get("FLASHTRACE_API_KEY") or os.environ.get("OPENAI_API_KEY")
414
+ if not api_key:
415
+ raise SystemExit("resample mode requires --api_key or FLASHTRACE_API_KEY/OPENAI_API_KEY.")
416
+
417
+ attempted = 0
418
+ skipped_format = 0
419
+ judged_false = 0
420
+
421
+ indices = list(range(len(raw)))
422
+ if bool(args.shuffle):
423
+ import random
424
+
425
+ rnd = random.Random(int(args.seed))
426
+ rnd.shuffle(indices)
427
+
428
+ kept_bar = tqdm(total=int(args.max_examples), desc="Kept (judge=True)", position=1, leave=False)
429
+ for loop_idx in tqdm(indices, total=len(indices), desc="Resampling"):
430
+ if kept >= int(args.max_examples):
431
+ break
432
+
433
+ total += 1
434
+ item = raw[loop_idx]
435
+ if not isinstance(item, dict):
436
+ skipped_parse += 1
437
+ continue
438
+
439
+ question = str(item.get("question") or "")
440
+ answer = str(item.get("answer") or "")
441
+ if not question.strip():
442
+ skipped_empty_q += 1
443
+ continue
444
+ if not answer.strip():
445
+ skipped_empty_a += 1
446
+ continue
447
+
448
+ parsed = _split_gsm8k_answer(answer)
449
+ if parsed is None:
450
+ skipped_parse += 1
451
+ continue
452
+ _ref_thinking, reference_answer = parsed
453
+
454
+ attempted += 1
455
+ gen_messages = build_gen_messages(question.strip())
456
+
457
+ # Step 1: generation
458
+ for attempt in range(int(args.retries) + 1):
459
+ try:
460
+ generation = call_chat_api(
461
+ str(args.api_base),
462
+ str(api_key),
463
+ str(args.generator_model),
464
+ gen_messages,
465
+ timeout=int(args.api_timeout),
466
+ max_tokens=int(args.api_max_tokens),
467
+ temperature=float(args.api_temperature),
468
+ cache_ttl=int(args.api_cache_ttl),
469
+ cache_namespace=str(args.api_cache_namespace) if args.api_cache_namespace else None,
470
+ rate_limit_delay=float(args.rate_limit_delay) if args.rate_limit_delay is not None else None,
471
+ )
472
+ break
473
+ except RateLimitError as e:
474
+ if attempt >= int(args.retries):
475
+ raise
476
+ time.sleep(float(e.wait_seconds))
477
+ except Exception: # noqa: BLE001
478
+ if attempt >= int(args.retries):
479
+ raise
480
+ time.sleep(float(args.retry_delay))
481
+ if float(args.request_interval) > 0:
482
+ time.sleep(float(args.request_interval))
483
+
484
+ parsed_gen = split_boxed_generation(generation)
485
+ if not parsed_gen:
486
+ skipped_format += 1
487
+ print(f"[attempt={attempted}] skipped=format")
488
+ continue
489
+
490
+ thinking_text, _boxed_segment, boxed_answer = parsed_gen
491
+ judge_messages = build_judge_messages(reference_answer, boxed_answer)
492
+
493
+ ok = False
494
+ judge_resp = ""
495
+ for attempt in range(int(args.retries) + 1):
496
+ try:
497
+ judge_resp = call_chat_api(
498
+ str(args.api_base),
499
+ str(api_key),
500
+ str(args.judge_model),
501
+ judge_messages,
502
+ timeout=int(args.api_timeout),
503
+ max_tokens=64,
504
+ temperature=0.0,
505
+ cache_ttl=int(args.api_cache_ttl),
506
+ cache_namespace=str(args.api_cache_namespace) if args.api_cache_namespace else None,
507
+ rate_limit_delay=float(args.rate_limit_delay) if args.rate_limit_delay is not None else None,
508
+ )
509
+ ok = parse_bool(judge_resp)
510
+ break
511
+ except RateLimitError as e:
512
+ if attempt >= int(args.retries):
513
+ raise
514
+ time.sleep(float(e.wait_seconds))
515
+ except Exception: # noqa: BLE001
516
+ if attempt >= int(args.retries):
517
+ raise
518
+ time.sleep(float(args.retry_delay))
519
+ if float(args.judge_interval) > 0:
520
+ time.sleep(float(args.judge_interval))
521
+
522
+ if not ok:
523
+ judged_false += 1
524
+ print(f"[attempt={attempted}] judge=filtered")
525
+ continue
526
+
527
+ ex = _build_resampled_example(
528
+ question=question,
529
+ raw_answer=answer,
530
+ reference_answer=reference_answer,
531
+ generation=generation,
532
+ tokenizer=tokenizer,
533
+ example_idx=int(loop_idx),
534
+ source_path=str(in_path),
535
+ judge_response=judge_resp,
536
+ generator_model=str(args.generator_model),
537
+ judge_model=str(args.judge_model),
538
+ )
539
+ if ex is None:
540
+ skipped_span += 1
541
+ print(f"[attempt={attempted}] skipped=span")
542
+ continue
543
+
544
+ examples.append(ex)
545
+ kept += 1
546
+ kept_bar.update(1)
547
+ print(f"[attempt={attempted}] judge=kept")
548
+
549
+ kept_bar.close()
550
+
551
+ written = _write_jsonl(out_path, examples=examples)
552
+ if written != kept:
553
+ raise SystemExit(f"Internal error: written={written} != kept={kept}")
554
+
555
+ print(
556
+ json.dumps(
557
+ {
558
+ "in_json": str(in_path),
559
+ "out_jsonl": str(out_path),
560
+ "tokenizer_model": args.tokenizer_model,
561
+ "mode": str(args.mode),
562
+ "source_total": int(source_total),
563
+ "visited": total,
564
+ "kept": kept,
565
+ "skipped_empty_question": skipped_empty_q,
566
+ "skipped_empty_answer": skipped_empty_a,
567
+ "skipped_parse": skipped_parse,
568
+ "skipped_span": skipped_span,
569
+ "attempted": attempted,
570
+ "skipped_format": skipped_format,
571
+ "judged_false": judged_false,
572
+ "max_examples": int(args.max_examples) if str(args.mode) == "resample" else None,
573
+ "api_base": str(args.api_base) if str(args.mode) == "resample" else None,
574
+ "generator_model": str(args.generator_model) if str(args.mode) == "resample" else None,
575
+ "judge_model": str(args.judge_model) if str(args.mode) == "resample" else None,
576
+ },
577
+ ensure_ascii=False,
578
+ indent=2,
579
+ )
580
+ )
581
+
582
+
583
+ if __name__ == "__main__":
584
+ main()
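An example invocation and the shape of one output record, both hypothetical (the tokenizer name and all field values are made up; `metadata` is abridged):

```python
# python exp/exp2/map_math_mine_to_exp2_cache.py \
#   --in_json data/math_mine.json \
#   --out_jsonl exp/exp2/data/math.jsonl \
#   --tokenizer_model Qwen/Qwen2.5-7B-Instruct \
#   --mode map
#
# Each JSONL line is a CachedExample serialized with dataclasses.asdict().
record = {
    "prompt": "Natalia sold clips to 48 of her friends ...",
    "target": "Natalia sold 48 clips in April ...\n72",
    "indices_to_explain": [57, 57],   # closed token interval, copied from sink_span
    "attr_mask_indices": None,
    "sink_span": [57, 57],            # tokens of the final answer under tokenizer(target)
    "thinking_span": [0, 56],         # everything before the answer
    "metadata": {"dataset": "math_mine", "reference_answer": "72", "boxed_answer": "72"},
}
```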
exp/exp2/migrate_indices_to_explain_token_span.py ADDED
@@ -0,0 +1,129 @@
1
+ #!/usr/bin/env python3
2
+ """Migrate exp2 cached JSONL to token-span `indices_to_explain`.
3
+
4
+ This converts legacy caches that used sentence indices (e.g. `[-2]`) into the
5
+ token-span format:
6
+
7
+ indices_to_explain = [start_tok, end_tok]
8
+
9
+ Where the span points to the boxed-inner (final answer) token span in `target`
10
+ under `tokenizer(target, add_special_tokens=False)`.
11
+
12
+ Rule:
13
+ 1) If `sink_span` exists and looks valid -> copy it to `indices_to_explain`
14
+ 2) Else try to recompute spans from `target` + `metadata.boxed_answer` using
15
+ `exp/exp2/dataset_utils.attach_spans_from_answer`
16
+ """
17
+
18
+ from __future__ import annotations
19
+
20
+ import argparse
21
+ import json
22
+ import sys
23
+ from pathlib import Path
24
+ from typing import Any, Dict, Optional
25
+
26
+ from transformers import AutoTokenizer
27
+
28
+
29
+ def _ensure_repo_root_on_path() -> None:
30
+ repo_root = Path(__file__).resolve().parents[2]
31
+ if str(repo_root) not in sys.path:
32
+ sys.path.insert(0, str(repo_root))
33
+
34
+
35
+ def _is_token_span(span: Any) -> bool:
36
+ return isinstance(span, list) and len(span) == 2 and all(isinstance(x, int) for x in span)
37
+
38
+
39
+ def _load_tokenizer(tokenizer_model: str):
40
+ tok_path = Path(tokenizer_model)
41
+ if tok_path.exists():
42
+ return AutoTokenizer.from_pretrained(tok_path.as_posix(), local_files_only=True)
43
+ return AutoTokenizer.from_pretrained(tokenizer_model)
44
+
45
+
46
+ def _migrate_obj(obj: Dict[str, Any], tokenizer) -> tuple[Dict[str, Any], bool]:
47
+ sink_span = obj.get("sink_span")
48
+ if _is_token_span(sink_span):
49
+ obj["indices_to_explain"] = sink_span
50
+ return obj, True
51
+
52
+ _ensure_repo_root_on_path()
53
+ from exp.exp2.dataset_utils import CachedExample, attach_spans_from_answer # noqa: E402
54
+
55
+ example = CachedExample(
56
+ prompt=obj.get("prompt") or "",
57
+ target=obj.get("target"),
58
+ indices_to_explain=obj.get("indices_to_explain"),
59
+ attr_mask_indices=obj.get("attr_mask_indices"),
60
+ sink_span=obj.get("sink_span"),
61
+ thinking_span=obj.get("thinking_span"),
62
+ metadata=obj.get("metadata") or {},
63
+ )
64
+ answer_text = (example.metadata.get("boxed_answer") or "").strip() or None
65
+ migrated = attach_spans_from_answer(example, tokenizer, answer_text)
66
+ if not _is_token_span(migrated.sink_span):
67
+ return obj, False
68
+
69
+ obj["sink_span"] = migrated.sink_span
70
+ obj["thinking_span"] = migrated.thinking_span
71
+ obj["indices_to_explain"] = migrated.sink_span
72
+ obj["metadata"] = migrated.metadata
73
+ return obj, True
74
+
75
+
76
+ def main() -> None:
77
+ ap = argparse.ArgumentParser()
78
+ ap.add_argument("--in_jsonl", type=str, required=True)
79
+ ap.add_argument("--out_jsonl", type=str, required=True)
80
+ ap.add_argument("--tokenizer_model", type=str, required=True)
81
+ ap.add_argument("--strict", action="store_true", help="Fail on any line that cannot be migrated.")
82
+ args = ap.parse_args()
83
+
84
+ tokenizer = _load_tokenizer(args.tokenizer_model)
85
+
86
+ in_path = Path(args.in_jsonl)
87
+ out_path = Path(args.out_jsonl)
88
+
89
+ try:
90
+ same_path = in_path.resolve() == out_path.resolve()
91
+ except FileNotFoundError:
92
+ same_path = False
93
+
94
+ tmp_out_path = out_path
95
+ if same_path:
96
+ tmp_out_path = out_path.with_name(out_path.name + ".tmp")
97
+ if tmp_out_path.exists():
98
+ tmp_out_path.unlink()
99
+
100
+ tmp_out_path.parent.mkdir(parents=True, exist_ok=True)
101
+
102
+ total = 0
103
+ migrated_ok = 0
104
+ bad = 0
105
+
106
+ with in_path.open("r", encoding="utf-8") as fin, tmp_out_path.open("w", encoding="utf-8") as fout:
107
+ for line_no, line in enumerate(fin, start=1):
108
+ if not line.strip():
109
+ continue
110
+ total += 1
111
+ obj: Dict[str, Any] = json.loads(line)
112
+ new_obj, ok = _migrate_obj(obj, tokenizer)
113
+ if ok:
114
+ migrated_ok += 1
115
+ else:
116
+ bad += 1
117
+ if args.strict:
118
+ raise RuntimeError(f"cannot migrate line {line_no}: cannot resolve sink_span token span")
119
+ fout.write(json.dumps(new_obj, ensure_ascii=False) + "\n")
120
+
121
+ if same_path:
122
+ tmp_out_path.replace(out_path)
123
+ print(f"[done] total={total} migrated_ok={migrated_ok} bad={bad} wrote={out_path} (in-place)")
124
+ else:
125
+ print(f"[done] total={total} migrated_ok={migrated_ok} bad={bad} wrote={out_path}")
126
+
127
+
128
+ if __name__ == "__main__":
129
+ main()
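A hypothetical before/after for one migrated cache line (the token indices are made up):

```python
legacy = {
    "target": "step 1 ... step 2 ...\nParis",
    "indices_to_explain": [-2],       # legacy sentence index
    "sink_span": None,
    "thinking_span": None,
    "metadata": {"boxed_answer": "Paris"},
}
# With no valid sink_span, the script recomputes spans from metadata["boxed_answer"]
# via attach_spans_from_answer and copies the answer token span into indices_to_explain:
migrated = {
    **legacy,
    "sink_span": [41, 41],
    "thinking_span": [0, 40],
    "indices_to_explain": [41, 41],
}
```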
exp/exp2/out.log ADDED
@@ -0,0 +1,102 @@
1
+ [1/500] judge=kept
2
+ [2/500] judge=kept
3
+ [3/500] judge=kept
4
+ [4/500] judge=kept
5
+ [5/500] judge=kept
6
+ [6/500] judge=kept
7
+ [7/500] judge=kept
8
+ [8/500] judge=kept
9
+ [9/500] judge=kept
10
+ [10/500] judge=kept
11
+ [11/500] judge=kept
12
+ [12/500] judge=kept
13
+ [13/500] judge=kept
14
+ [14/500] judge=kept
15
+ [15/500] judge=kept
16
+ [16/500] judge=kept
17
+ [17/500] judge=kept
18
+ [18/500] judge=kept
19
+ [19/500] judge=kept
20
+ [20/500] judge=kept
21
+ [21/500] judge=kept
22
+ [22/500] judge=kept
23
+ [23/500] judge=kept
24
+ [24/500] judge=kept
25
+ [25/500] judge=kept
26
+ [26/500] judge=kept
27
+ [27/500] judge=kept
28
+ [28/500] judge=kept
29
+ [29/500] judge=kept
30
+ [30/500] judge=kept
31
+ [31/500] judge=kept
32
+ [32/500] judge=kept
33
+ [33/500] judge=kept
34
+ [34/500] judge=kept
35
+ [35/500] judge=kept
36
+ [36/500] judge=kept
37
+ [37/500] judge=kept
38
+ [38/500] judge=kept
39
+ [39/500] judge=kept
40
+ [40/500] judge=kept
41
+ [41/500] judge=kept
42
+ [42/500] judge=kept
43
+ [43/500] judge=kept
44
+ [44/500] judge=kept
45
+ [45/500] judge=kept
46
+ [46/500] judge=kept
47
+ [47/500] judge=kept
48
+ [48/500] judge=kept
49
+ [49/500] judge=kept
50
+ [50/500] judge=kept
51
+ [51/500] judge=kept
52
+ [52/500] judge=kept
53
+ [53/500] judge=kept
54
+ [54/500] judge=kept
55
+ [55/500] judge=kept
56
+ [56/500] judge=kept
57
+ [57/500] judge=kept
58
+ [58/500] judge=kept
59
+ [59/500] judge=kept
60
+ [60/500] judge=kept
61
+ [61/500] judge=kept
62
+ [62/500] judge=kept
63
+ [63/500] skipped=format
64
+ [64/500] judge=kept
65
+ [65/500] judge=kept
66
+ [66/500] judge=kept
67
+ [67/500] judge=kept
68
+ [68/500] judge=kept
69
+ [69/500] judge=kept
70
+ [70/500] judge=kept
71
+ [71/500] judge=kept
72
+ [72/500] judge=kept
73
+ [73/500] judge=kept
74
+ [74/500] judge=kept
75
+ [75/500] judge=kept
76
+ [76/500] judge=kept
77
+ [77/500] judge=kept
78
+ [78/500] judge=kept
79
+ [79/500] judge=kept
80
+ [80/500] judge=kept
81
+ [81/500] judge=kept
82
+ [82/500] judge=kept
83
+ [83/500] judge=kept
84
+ [84/500] judge=kept
85
+ [85/500] judge=kept
86
+ [86/500] judge=kept
87
+ [87/500] judge=kept
88
+ [88/500] judge=kept
89
+ [89/500] judge=kept
90
+ [90/500] judge=kept
91
+ [91/500] judge=kept
92
+ [92/500] judge=kept
93
+ [93/500] judge=kept
94
+ [94/500] judge=kept
95
+ [95/500] judge=kept
96
+ [96/500] judge=kept
97
+ [97/500] judge=kept
98
+ [98/500] judge=kept
99
+ [99/500] judge=kept
100
+ [100/500] judge=kept
101
+ [101/500] judge=kept
102
+ Kept 100 / target 100 (attempted 101 / 500) -> exp/exp2/data/data/ruler_multihop/1024/vt_h10_c1/validation.jsonl.jsonl
exp/exp2/run_exp.py ADDED
@@ -0,0 +1,1296 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ Experiment 2 runner: token-level faithfulness (generation perturbation).
4
+
5
+ AT2 is omitted.
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ import argparse
11
+ import hashlib
12
+ import json
13
+ import os
14
+ import sys
15
+ from itertools import islice
16
+ import math
17
+ import time
18
+ from pathlib import Path
19
+ from typing import Any, Dict, List, Optional, Tuple
20
+
21
+ # Early CUDA mask handling: set CUDA_VISIBLE_DEVICES before importing torch.
22
+ def _early_set_cuda_visible_devices():
23
+ parser = argparse.ArgumentParser(add_help=False)
24
+ parser.add_argument("--cuda", type=str, default=None)
25
+ # parse_known_args keeps the full argv for later parsing by the main parser
26
+ args, _ = parser.parse_known_args(sys.argv[1:])
27
+ if args.cuda and "," in args.cuda:
28
+ os.environ["CUDA_VISIBLE_DEVICES"] = args.cuda
29
+
30
+
31
+ _early_set_cuda_visible_devices()
32
+
33
+ import numpy as np
34
+ import torch
35
+ from transformers import AutoModelForCausalLM, AutoTokenizer, utils
36
+
37
+ sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
38
+
39
+ from pathlib import Path
40
+
41
+ # ensure repo root on path
42
+ REPO_ROOT = Path(__file__).resolve().parents[2]
43
+ if str(REPO_ROOT) not in sys.path:
44
+ sys.path.insert(0, str(REPO_ROOT))
45
+
46
+ import llm_attr
47
+ import llm_attr_eval
48
+ from attribution_datasets import AttributionExample
49
+ from exp.exp2 import dataset_utils as ds_utils
50
+
51
+ utils.logging.set_verbosity_error()
52
+
53
+
54
+ def _sha1_text(text: str) -> str:
55
+ return hashlib.sha1(text.encode("utf-8")).hexdigest()
56
+
57
+
58
+ def _infer_attnlrp_spans_from_hops(
59
+ raw_attributions: Any,
60
+ *,
61
+ gen_len: int,
62
+ ) -> Tuple[Tuple[int, int], Tuple[int, int]]:
63
+ if not raw_attributions:
64
+ return (0, max(0, gen_len - 1)), (0, max(0, gen_len - 1))
65
+ sink_span = tuple(int(x) for x in raw_attributions[0].sink_range)
66
+ if len(raw_attributions) >= 2:
67
+ thinking_span = tuple(int(x) for x in raw_attributions[1].sink_range)
68
+ else:
69
+ thinking_span = sink_span
70
+ return sink_span, thinking_span
71
+
72
+
73
+ def _build_hop_trace_payload(
74
+ attr_func: str,
75
+ attr: Any,
76
+ *,
77
+ indices_to_explain: List[int],
78
+ ) -> Optional[Dict[str, np.ndarray]]:
79
+ """Extract per-hop vectors (postprocessed) and minimal span metadata."""
80
+ prompt_len = int(len(getattr(attr, "prompt_tokens", []) or []))
81
+ gen_len = int(len(getattr(attr, "generation_tokens", []) or []))
82
+ total_len = prompt_len + gen_len
83
+ if total_len <= 0:
84
+ return None
85
+
86
+ hop_vectors: List[torch.Tensor] = []
87
+ sink_span_gen: Optional[Tuple[int, int]] = None
88
+ thinking_span_gen: Optional[Tuple[int, int]] = None
89
+ attnlrp_neg_handling: str = ""
90
+ attnlrp_norm_mode: str = ""
91
+ attnlrp_ratio_enabled: int = -1
92
+
93
+ # IFR multi-hop variants expose projected hop vectors via metadata["ifr"]["per_hop_projected"].
94
+ ifr_meta = (getattr(attr, "metadata", None) or {}).get("ifr") or {}
95
+ ifr_per_hop = ifr_meta.get("per_hop_projected") or []
96
+
97
+ if ifr_per_hop:
98
+ hop_vectors = [torch.as_tensor(v, dtype=torch.float32) for v in ifr_per_hop]
99
+ sink_span_gen = ifr_meta.get("sink_span_generation")
100
+ thinking_span_gen = ifr_meta.get("thinking_span_generation")
101
+ if sink_span_gen is not None:
102
+ sink_span_gen = tuple(int(x) for x in sink_span_gen)
103
+ if thinking_span_gen is not None:
104
+ thinking_span_gen = tuple(int(x) for x in thinking_span_gen)
105
+
106
+ elif attr_func in ("ft_attnlrp", "attnlrp_aggregated_multi_hop"):
107
+ meta = getattr(attr, "metadata", None) or {}
108
+ attnlrp_neg_handling = str(meta.get("neg_handling") or "")
109
+ attnlrp_norm_mode = str(meta.get("norm_mode") or "")
110
+ if meta.get("ratio_enabled") is not None:
111
+ attnlrp_ratio_enabled = int(bool(meta.get("ratio_enabled")))
112
+ multi_hop = meta.get("multi_hop_result")
113
+ if multi_hop is None:
114
+ return None
115
+ raw_attributions = getattr(multi_hop, "raw_attributions", None) or []
116
+ if not raw_attributions:
117
+ return None
118
+ hop_vectors = [
119
+ torch.as_tensor(getattr(hop, "token_importance_total"), dtype=torch.float32)
120
+ for hop in raw_attributions
121
+ ]
122
+ sink_span_gen, thinking_span_gen = _infer_attnlrp_spans_from_hops(raw_attributions, gen_len=gen_len)
123
+ sink_override = meta.get("sink_span")
124
+ thinking_override = meta.get("thinking_span")
125
+ if sink_override is not None:
126
+ sink_span_gen = tuple(int(x) for x in sink_override)
127
+ if thinking_override is not None:
128
+ thinking_span_gen = tuple(int(x) for x in thinking_override)
129
+
130
+ else:
131
+ return None
132
+
133
+ if sink_span_gen is None:
134
+ sink_span_gen = (0, max(0, gen_len - 1))
135
+ if thinking_span_gen is None:
136
+ thinking_span_gen = sink_span_gen
137
+
138
+ stacked = torch.stack([v.reshape(-1) for v in hop_vectors], dim=0)
139
+ if stacked.shape[1] != total_len:
140
+ raise ValueError(
141
+ f"Hop vector length mismatch for {attr_func}: expected T={total_len}, got {stacked.shape[1]}."
142
+ )
143
+
144
+ return {
145
+ "vh": stacked.detach().cpu().numpy().astype(np.float32, copy=False),
146
+ "prompt_len": np.asarray(prompt_len, dtype=np.int64),
147
+ "gen_len": np.asarray(gen_len, dtype=np.int64),
148
+ "sink_span_gen": np.asarray(sink_span_gen, dtype=np.int64),
149
+ "thinking_span_gen": np.asarray(thinking_span_gen, dtype=np.int64),
150
+ "indices_to_explain_gen": np.asarray(indices_to_explain, dtype=np.int64),
151
+ "attnlrp_neg_handling": np.asarray(attnlrp_neg_handling, dtype="U16"),
152
+ "attnlrp_norm_mode": np.asarray(attnlrp_norm_mode, dtype="U16"),
153
+ "attnlrp_ratio_enabled": np.asarray(attnlrp_ratio_enabled, dtype=np.int64),
154
+ }
155
+
156
+
157
+ def _write_hop_trace(
158
+ trace_dir: Path,
159
+ *,
160
+ example_idx: int,
161
+ attr_func: str,
162
+ prompt: str,
163
+ target: Optional[str],
164
+ payload: Dict[str, np.ndarray],
165
+ manifest_handle,
166
+ ) -> None:
167
+ trace_dir.mkdir(parents=True, exist_ok=True)
168
+ npz_name = f"ex_{example_idx:06d}.npz"
169
+ npz_path = trace_dir / npz_name
170
+ np.savez_compressed(npz_path, **payload)
171
+
172
+ record = {
173
+ "example_idx": int(example_idx),
174
+ "attr_func": attr_func,
175
+ "file": npz_name,
176
+ "prompt_sha1": _sha1_text(prompt),
177
+ "target_sha1": _sha1_text(target) if target is not None else None,
178
+ "prompt_len": int(payload["prompt_len"].item()),
179
+ "gen_len": int(payload["gen_len"].item()),
180
+ "n_hops_plus_one": int(payload["vh"].shape[0]),
181
+ "total_len": int(payload["vh"].shape[1]),
182
+ "sink_span_gen": payload["sink_span_gen"].tolist(),
183
+ "thinking_span_gen": payload["thinking_span_gen"].tolist(),
184
+ "indices_to_explain_gen": payload["indices_to_explain_gen"].tolist(),
185
+ "attnlrp_neg_handling": str(payload["attnlrp_neg_handling"].item()),
186
+ "attnlrp_norm_mode": str(payload["attnlrp_norm_mode"].item()),
187
+ "attnlrp_ratio_enabled": int(payload["attnlrp_ratio_enabled"].item()),
188
+ }
189
+ manifest_handle.write(json.dumps(record, ensure_ascii=False) + "\n")
190
+ manifest_handle.flush()
191
+
192
+
193
+ def _parse_modes(mode_args: Any) -> List[str]:
194
+ """Parse --mode which may be provided as multiple args and/or comma-separated."""
195
+ if mode_args is None:
196
+ raw_parts: List[str] = []
197
+ elif isinstance(mode_args, str):
198
+ raw_parts = [mode_args]
199
+ else:
200
+ raw_parts = [str(x) for x in mode_args]
201
+
202
+ modes: List[str] = []
203
+ for chunk in raw_parts:
204
+ for part in str(chunk).split(","):
205
+ m = part.strip()
206
+ if m:
207
+ modes.append(m)
208
+
209
+ # Default to faithfulness_gen for backward compatibility.
210
+ if not modes:
211
+ modes = ["faithfulness_gen"]
212
+
213
+ allowed = {"faithfulness_gen", "recovery_ruler"}
214
+ seen: set[str] = set()
215
+ unique: List[str] = []
216
+ for m in modes:
217
+ if m not in seen:
218
+ unique.append(m)
219
+ seen.add(m)
220
+
221
+ unknown = [m for m in unique if m not in allowed]
222
+ if unknown:
223
+ raise SystemExit(f"Unsupported --mode value(s): {unknown}. Allowed: {sorted(allowed)}.")
224
+
225
+ return unique
226
+
227
+
228
+ def _trace_run_tag(
229
+ testing_dict: Dict[str, Any],
230
+ *,
231
+ modes: List[str],
232
+ total: int,
233
+ ) -> str:
234
+ attr_func = str(testing_dict.get("attr_func") or "attr")
235
+ parts = [attr_func]
236
+
237
+ if attr_func in (
238
+ "ifr_multi_hop",
239
+ "ifr_in_all_gen",
240
+ "ifr_multi_hop_stop_words",
241
+ "ifr_multi_hop_both",
242
+ "ifr_multi_hop_split_hop",
243
+ "ft_attnlrp",
244
+ "attnlrp_aggregated_multi_hop",
245
+ ):
246
+ parts.append(f"n{int(testing_dict.get('n_hops', 0))}")
247
+
248
+ if attr_func in ("attnlrp", "ft_attnlrp", "attnlrp_aggregated_multi_hop"):
249
+ parts.append(f"neg{str(testing_dict.get('attnlrp_neg_handling', ''))}")
250
+ parts.append(f"norm{str(testing_dict.get('attnlrp_norm_mode', ''))}")
251
+
252
+ if modes:
253
+ parts.append("m" + "+".join(modes))
254
+
255
+ parts.append(f"{int(total)}ex")
256
+ return "_".join(parts)
257
+
258
+
259
+ def _token_importance_vector(attr: torch.Tensor) -> np.ndarray:
260
+ """Return token importance vector w = sum_rows(attr) in shape [P+G]."""
261
+ w = torch.nan_to_num(attr.sum(0).to(dtype=torch.float32), nan=0.0).clamp(min=0.0)
262
+ return w.detach().cpu().numpy().astype(np.float32, copy=False)
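A tiny torch illustration (made-up numbers) of the reduction performed here:

```python
import torch

# Sum the [G, P+G] attribution matrix over its rows (sink positions),
# then zero NaNs and clip negatives; note that a NaN anywhere in a column
# zeroes that column's total because it propagates through the sum.
attr = torch.tensor([[0.2, -0.5, float("nan")],
                     [0.3,  0.4, 0.1]])
w = torch.nan_to_num(attr.sum(0), nan=0.0).clamp(min=0.0)
assert torch.allclose(w, torch.tensor([0.5, 0.0, 0.0]))
```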
263
+
264
+
265
+ def _build_sample_trace_payload(
266
+ example: ds_utils.CachedExample,
267
+ *,
268
+ attr_list: List[torch.Tensor],
269
+ prompt_len: int,
270
+ user_prompt_indices: Optional[List[int]],
271
+ keep_prompt_token_indices: Optional[List[int]],
272
+ gold_prompt_token_indices: Optional[List[int]],
273
+ hop_payload: Optional[Dict[str, np.ndarray]],
274
+ faithfulness_scores: Optional[np.ndarray],
275
+ recovery_scores: Optional[np.ndarray],
276
+ time_attr_s: Optional[float],
277
+ time_faith_s: Optional[float],
278
+ time_recovery_s: Optional[float],
279
+ ) -> Dict[str, np.ndarray]:
280
+ seq_attr, row_attr, rec_attr = attr_list
281
+ gen_len = int(seq_attr.shape[0])
282
+
283
+ v_seq_all = _token_importance_vector(seq_attr)
284
+ v_row_all = _token_importance_vector(row_attr)
285
+ v_rec_all = _token_importance_vector(rec_attr)
286
+
287
+ payload: Dict[str, np.ndarray] = {
288
+ "v_seq_all": v_seq_all,
289
+ "v_row_all": v_row_all,
290
+ "v_rec_all": v_rec_all,
291
+ "v_seq_prompt": v_seq_all[:prompt_len],
292
+ "v_row_prompt": v_row_all[:prompt_len],
293
+ "v_rec_prompt": v_rec_all[:prompt_len],
294
+ "prompt_len": np.asarray(int(prompt_len), dtype=np.int64),
295
+ "gen_len": np.asarray(int(gen_len), dtype=np.int64),
296
+ "indices_to_explain_gen": np.asarray(list(example.indices_to_explain or []), dtype=np.int64),
297
+ }
298
+
299
+ if example.sink_span is not None:
300
+ payload["sink_span_gen"] = np.asarray(list(example.sink_span), dtype=np.int64)
301
+ if example.thinking_span is not None:
302
+ payload["thinking_span_gen"] = np.asarray(list(example.thinking_span), dtype=np.int64)
303
+
304
+ if user_prompt_indices is not None:
305
+ payload["user_prompt_indices"] = np.asarray(list(user_prompt_indices), dtype=np.int64)
306
+ if keep_prompt_token_indices is not None:
307
+ payload["keep_prompt_token_indices"] = np.asarray(list(keep_prompt_token_indices), dtype=np.int64)
308
+ if gold_prompt_token_indices is not None:
309
+ payload["gold_prompt_token_indices"] = np.asarray(list(gold_prompt_token_indices), dtype=np.int64)
310
+
311
+ if faithfulness_scores is not None:
312
+ payload["faithfulness_scores"] = np.asarray(faithfulness_scores, dtype=np.float64)
313
+ if recovery_scores is not None:
314
+ payload["recovery_scores"] = np.asarray(recovery_scores, dtype=np.float64)
315
+
316
+ if time_attr_s is not None:
317
+ payload["time_attr_s"] = np.asarray(float(time_attr_s), dtype=np.float64)
318
+ if time_faith_s is not None:
319
+ payload["time_faith_s"] = np.asarray(float(time_faith_s), dtype=np.float64)
320
+ if time_recovery_s is not None:
321
+ payload["time_recovery_s"] = np.asarray(float(time_recovery_s), dtype=np.float64)
322
+
323
+ if hop_payload is not None:
324
+ for k, v in hop_payload.items():
325
+ if k in payload:
326
+ continue
327
+ payload[k] = v
328
+
329
+ return payload
330
+
331
+
332
+ def _write_sample_trace(
333
+ trace_dir: Path,
334
+ *,
335
+ example_idx: int,
336
+ attr_func: str,
337
+ prompt: str,
338
+ target: Optional[str],
339
+ payload: Dict[str, np.ndarray],
340
+ manifest_handle,
341
+ recovery_skipped_reason: Optional[str],
342
+ ) -> None:
343
+ trace_dir.mkdir(parents=True, exist_ok=True)
344
+ npz_name = f"ex_{example_idx:06d}.npz"
345
+ npz_path = trace_dir / npz_name
346
+ np.savez_compressed(npz_path, **payload)
347
+
348
+ prompt_len = int(np.asarray(payload.get("prompt_len", 0)).item())
349
+ gen_len = int(np.asarray(payload.get("gen_len", 0)).item())
350
+ record: Dict[str, Any] = {
351
+ "example_idx": int(example_idx),
352
+ "attr_func": attr_func,
353
+ "file": npz_name,
354
+ "prompt_sha1": _sha1_text(prompt),
355
+ "target_sha1": _sha1_text(target) if target is not None else None,
356
+ "prompt_len": prompt_len,
357
+ "gen_len": gen_len,
358
+ "indices_to_explain_gen": payload.get("indices_to_explain_gen").tolist()
359
+ if payload.get("indices_to_explain_gen") is not None
360
+ else None,
361
+ "sink_span_gen": payload.get("sink_span_gen").tolist() if payload.get("sink_span_gen") is not None else None,
362
+ "thinking_span_gen": payload.get("thinking_span_gen").tolist()
363
+ if payload.get("thinking_span_gen") is not None
364
+ else None,
365
+ "faithfulness_scores": payload.get("faithfulness_scores").tolist()
366
+ if payload.get("faithfulness_scores") is not None
367
+ else None,
368
+ "recovery_scores": payload.get("recovery_scores").tolist() if payload.get("recovery_scores") is not None else None,
369
+ "recovery_skipped_reason": recovery_skipped_reason,
370
+ "time_attr_s": float(np.asarray(payload.get("time_attr_s")).item()) if payload.get("time_attr_s") is not None else None,
371
+ "time_faith_s": float(np.asarray(payload.get("time_faith_s")).item()) if payload.get("time_faith_s") is not None else None,
372
+ "time_recovery_s": float(np.asarray(payload.get("time_recovery_s")).item())
373
+ if payload.get("time_recovery_s") is not None
374
+ else None,
375
+ }
376
+
377
+ # Derived, sample-level bookkeeping (token lengths and per-sample MAS/RISE).
378
+ record["input_len"] = int(prompt_len)
379
+
380
+ sink_span = record.get("sink_span_gen")
381
+ if isinstance(sink_span, list) and len(sink_span) == 2:
382
+ try:
383
+ start = int(sink_span[0])
384
+ end = int(sink_span[1])
385
+ record["output_len"] = (end - start + 1) if end >= start else None
386
+ except Exception:
387
+ record["output_len"] = None
388
+ else:
389
+ record["output_len"] = None
390
+
391
+ thinking_span = record.get("thinking_span_gen")
392
+ if isinstance(thinking_span, list) and len(thinking_span) == 2:
393
+ try:
394
+ start = int(thinking_span[0])
395
+ end = int(thinking_span[1])
396
+ record["cot_len"] = (end - start + 1) if end >= start else None
397
+ except Exception:
398
+ record["cot_len"] = None
399
+ else:
400
+ record["cot_len"] = None
401
+
402
+ record["rise_seq"] = None
403
+ record["mas_seq"] = None
404
+ record["rise_row"] = None
405
+ record["mas_row"] = None
406
+ record["rise_rec"] = None
407
+ record["mas_rec"] = None
408
+ faith = record.get("faithfulness_scores")
409
+ if isinstance(faith, list) and len(faith) == 3:
410
+ try:
411
+ record["rise_seq"] = float(faith[0][0])
412
+ record["mas_seq"] = float(faith[0][1])
413
+ record["rise_row"] = float(faith[1][0])
414
+ record["mas_row"] = float(faith[1][1])
415
+ record["rise_rec"] = float(faith[2][0])
416
+ record["mas_rec"] = float(faith[2][1])
417
+ except Exception:
418
+ pass
419
+
420
+ if payload.get("vh") is not None:
421
+ vh = payload["vh"]
422
+ record["n_hops_plus_one"] = int(vh.shape[0])
423
+ record["total_len"] = int(vh.shape[1])
424
+ record["attnlrp_neg_handling"] = str(payload.get("attnlrp_neg_handling").item()) if payload.get("attnlrp_neg_handling") is not None else ""
425
+ record["attnlrp_norm_mode"] = str(payload.get("attnlrp_norm_mode").item()) if payload.get("attnlrp_norm_mode") is not None else ""
426
+ record["attnlrp_ratio_enabled"] = int(payload.get("attnlrp_ratio_enabled").item()) if payload.get("attnlrp_ratio_enabled") is not None else -1
427
+
428
+ manifest_handle.write(json.dumps(record, ensure_ascii=False) + "\n")
429
+ manifest_handle.flush()
430
+
431
+
432
+ def _compute_faithfulness_scores(
433
+ testing_dict: Dict[str, Any],
434
+ *,
435
+ attr_list: List[torch.Tensor],
436
+ prompt_len: int,
437
+ prompt: str,
438
+ generation: str,
439
+ llm_evaluator: llm_attr_eval.LLMAttributionEvaluator,
440
+ user_prompt_indices: Optional[List[int]],
441
+ keep_prompt_token_indices: Optional[List[int]],
442
+ ) -> np.ndarray:
443
+ attr_func = str(testing_dict.get("attr_func") or "")
444
+ results: List[Tuple[float, float, float]] = []
445
+ for attr in attr_list:
446
+ attr_prompt = attr[:, :prompt_len]
447
+ if attr_func in ("ifr_multi_hop_stop_words", "ifr_multi_hop_both") and keep_prompt_token_indices is not None:
448
+ import ft_ifr_improve
449
+
450
+ scores = ft_ifr_improve.faithfulness_test_skip_tokens(
451
+ llm_evaluator,
452
+ attr_prompt,
453
+ prompt,
454
+ generation,
455
+ keep_prompt_token_indices=keep_prompt_token_indices,
456
+ user_prompt_indices=user_prompt_indices,
457
+ )
458
+ elif user_prompt_indices is not None:
459
+ scores = _faithfulness_test_with_user_prompt_indices(
460
+ llm_evaluator,
461
+ attr_prompt,
462
+ prompt,
463
+ generation,
464
+ user_prompt_indices=user_prompt_indices,
465
+ )
466
+ else:
467
+ scores = llm_evaluator.faithfulness_test(attr_prompt, prompt, generation)
468
+ results.append(scores)
469
+ return np.asarray(results, dtype=np.float64)
470
+
471
+
472
+ def _compute_recovery_scores(
473
+ testing_dict: Dict[str, Any],
474
+ *,
475
+ attr_list: List[torch.Tensor],
476
+ prompt_len: int,
477
+ gold_prompt_token_indices: List[int],
478
+ llm_evaluator: llm_attr_eval.LLMAttributionEvaluator,
479
+ keep_prompt_token_indices: Optional[List[int]],
480
+ ) -> Tuple[Optional[np.ndarray], Optional[str]]:
481
+ attr_func = str(testing_dict.get("attr_func") or "")
482
+
483
+ if prompt_len <= 0:
484
+ return None, "empty_prompt_len"
485
+
486
+ gold_prompt = [int(x) for x in (gold_prompt_token_indices or [])]
487
+ if not gold_prompt:
488
+ return None, "empty_gold_prompt"
489
+
490
+ if attr_func in ("ifr_multi_hop_stop_words", "ifr_multi_hop_both") and keep_prompt_token_indices is not None:
491
+ import ft_ifr_improve
492
+
493
+ keep_set = {int(x) for x in keep_prompt_token_indices}
494
+ gold_filtered = [idx for idx in gold_prompt if int(idx) in keep_set]
495
+ if not gold_filtered:
496
+ return None, "empty_gold_after_keep_filter"
497
+
498
+ scores = [
499
+ ft_ifr_improve.evaluate_attr_recovery_skip_tokens(
500
+ attr[:, :prompt_len],
501
+ keep_prompt_token_indices=keep_prompt_token_indices,
502
+ gold_prompt_token_indices=gold_prompt,
503
+ top_fraction=0.1,
504
+ )
505
+ for attr in attr_list
506
+ ]
507
+ else:
508
+ scores = [
509
+ llm_evaluator.evaluate_attr_recovery(
510
+ attr,
511
+ prompt_len=prompt_len,
512
+ gold_prompt_token_indices=gold_prompt,
513
+ top_fraction=0.1,
514
+ )
515
+ for attr in attr_list
516
+ ]
517
+
518
+ return np.asarray(scores, dtype=np.float64), None
519
+
520
+
521
+ def evaluate_dataset_multi(
522
+ args,
523
+ dataset_name: str,
524
+ examples: List[ds_utils.CachedExample],
525
+ testing_dict: Dict[str, Any],
526
+ *,
527
+ modes: List[str],
528
+ ) -> Dict[str, Any]:
529
+ tokenizer = testing_dict["tokenizer"]
530
+ llm_evaluator = llm_attr_eval.LLMAttributionEvaluator(testing_dict["model"], tokenizer)
531
+
532
+ want_faith = "faithfulness_gen" in modes
533
+ want_recovery = "recovery_ruler" in modes
534
+
535
+ faith_results: List[np.ndarray] = []
536
+ faith_durations: List[float] = []
537
+
538
+ recovery_results: List[np.ndarray] = []
539
+ recovery_attr_durations: List[float] = []
540
+ recovery_skipped = 0
541
+
542
+ total = min(len(examples), args.num_examples)
543
+ iterator = islice(examples, total)
544
+
545
+ save_traces = bool(getattr(args, "save_hop_traces", False))
546
+ manifest_handle = None
547
+ trace_dir: Optional[Path] = None
548
+ if save_traces:
549
+ model_tag = str(testing_dict.get("model_tag", "model"))
550
+ run_tag = _trace_run_tag(testing_dict, modes=modes, total=total)
551
+ trace_dir = Path(args.output_root) / "traces" / dataset_name / model_tag / run_tag
552
+ trace_dir.mkdir(parents=True, exist_ok=True)
553
+ manifest_handle = open(trace_dir / "manifest.jsonl", "w", encoding="utf-8")
554
+
555
+ try:
556
+ for example_idx, ex in enumerate(iterator):
557
+ if want_recovery:
558
+ needle_spans = (ex.metadata or {}).get("needle_spans")
559
+ if not isinstance(needle_spans, list) or not needle_spans:
560
+ raise SystemExit(
561
+ "recovery_ruler requires RULER samples with metadata.needle_spans; "
562
+ f"dataset={dataset_name} has missing/empty needle_spans."
563
+ )
564
+ if ex.target is None:
565
+ raise SystemExit(
566
+ "recovery_ruler requires cached targets (CoT+answer) so row/rec attribution is well-defined. "
567
+ f"dataset={dataset_name} has target=None; run exp/exp2/sample_and_filter.py first."
568
+ )
569
+
570
+ # Determine generation/target once.
571
+ target = ex.target
572
+ if target is None:
573
+ generation, full_output = llm_evaluator.response(ex.prompt)
574
+ target = generation
575
+ response_len = len(tokenizer(full_output).input_ids)
576
+ else:
577
+ response_len = len(tokenizer(llm_evaluator.format_prompt(" " + ex.prompt) + target).input_ids)
578
+
579
+ testing_dict["batch_size"] = max(1, math.floor((testing_dict["max_input_len"] - 100) / max(1, response_len)))
580
+
581
+ gold_prompt: Optional[List[int]] = None
582
+ if want_recovery:
583
+ gold_prompt = ds_utils.ruler_gold_prompt_token_indices(ex, tokenizer)
584
+
585
+ if want_recovery and not want_faith and not save_traces:
586
+ # Preserve recovery-only fast path when not saving traces: skip samples with empty gold.
587
+ if not gold_prompt:
588
+ recovery_skipped += 1
589
+ continue
590
+
591
+ time_attr_s = None
592
+ time_faith_s = None
593
+ time_recovery_s = None
594
+
595
+ t0 = time.perf_counter()
596
+ attr_list, hop_payload, user_prompt_indices, keep_prompt_token_indices = run_attribution(testing_dict, ex, target)
597
+ time_attr_s = time.perf_counter() - t0
598
+
599
+ seq_attr = attr_list[0]
600
+ prompt_len = int(seq_attr.shape[1] - seq_attr.shape[0]) # cols=(P+G), rows=G
601
+
602
+ if want_recovery and gold_prompt:
603
+ recovery_attr_durations.append(float(time_attr_s))
604
+
605
+ faith_scores = None
606
+ if want_faith:
607
+ t1 = time.perf_counter()
608
+ faith_scores = _compute_faithfulness_scores(
609
+ testing_dict,
610
+ attr_list=attr_list,
611
+ prompt_len=prompt_len,
612
+ prompt=ex.prompt,
613
+ generation=target,
614
+ llm_evaluator=llm_evaluator,
615
+ user_prompt_indices=user_prompt_indices,
616
+ keep_prompt_token_indices=keep_prompt_token_indices,
617
+ )
618
+ time_faith_s = time.perf_counter() - t1
619
+ faith_results.append(faith_scores)
620
+ faith_durations.append(float(time_attr_s))
621
+
622
+ recovery_scores = None
623
+ recovery_skip_reason = None
624
+ if want_recovery:
625
+ if not gold_prompt:
626
+ recovery_skip_reason = "empty_gold_prompt"
627
+ recovery_skipped += 1
628
+ else:
629
+ t2 = time.perf_counter()
630
+ recovery_scores, recovery_skip_reason = _compute_recovery_scores(
631
+ testing_dict,
632
+ attr_list=attr_list,
633
+ prompt_len=prompt_len,
634
+ gold_prompt_token_indices=gold_prompt,
635
+ llm_evaluator=llm_evaluator,
636
+ keep_prompt_token_indices=keep_prompt_token_indices,
637
+ )
638
+ time_recovery_s = time.perf_counter() - t2
639
+ if recovery_scores is None:
640
+ recovery_skipped += 1
641
+ else:
642
+ recovery_results.append(recovery_scores)
643
+
644
+ if manifest_handle is not None and trace_dir is not None:
645
+ try:
646
+ payload = _build_sample_trace_payload(
647
+ ex,
648
+ attr_list=attr_list,
649
+ prompt_len=prompt_len,
650
+ user_prompt_indices=user_prompt_indices,
651
+ keep_prompt_token_indices=keep_prompt_token_indices,
652
+ gold_prompt_token_indices=gold_prompt,
653
+ hop_payload=hop_payload,
654
+ faithfulness_scores=faith_scores,
655
+ recovery_scores=recovery_scores,
656
+ time_attr_s=time_attr_s,
657
+ time_faith_s=time_faith_s,
658
+ time_recovery_s=time_recovery_s,
659
+ )
660
+ _write_sample_trace(
661
+ trace_dir,
662
+ example_idx=example_idx,
663
+ attr_func=str(testing_dict.get("attr_func") or ""),
664
+ prompt=ex.prompt,
665
+ target=target,
666
+ payload=payload,
667
+ manifest_handle=manifest_handle,
668
+ recovery_skipped_reason=recovery_skip_reason,
669
+ )
670
+ except Exception as exc:
671
+ print(f"[warn] sample trace save failed for {testing_dict.get('attr_func')} ex={example_idx}: {exc}")
672
+ finally:
673
+ if manifest_handle is not None:
674
+ try:
675
+ manifest_handle.close()
676
+ except Exception:
677
+ pass
678
+
679
+ out: Dict[str, Any] = {}
680
+ if want_faith:
681
+ if not faith_results:
682
+ out["faithfulness"] = None
683
+ else:
684
+ scores = np.stack(faith_results, axis=0) # [N, 3, 3]
685
+ out["faithfulness"] = {
686
+ "mean": scores.mean(0),
687
+ "std": scores.std(0),
688
+ "avg_time": float(np.mean(faith_durations)) if faith_durations else 0.0,
689
+ }
690
+ if want_recovery:
691
+ if not recovery_results:
692
+ out["recovery"] = None
693
+ else:
694
+ scores = np.stack(recovery_results, axis=0) # [N, 3]
695
+ out["recovery"] = {
696
+ "mean": scores.mean(0),
697
+ "std": scores.std(0),
698
+ "avg_time": float(np.mean(recovery_attr_durations)) if recovery_attr_durations else 0.0,
699
+ "used": int(scores.shape[0]),
700
+ "skipped": int(recovery_skipped),
701
+ }
702
+
703
+ return out
704
+
705
+
706
+ def _faithfulness_test_with_user_prompt_indices(
707
+ llm_evaluator: llm_attr_eval.LLMAttributionEvaluator,
708
+ attribution: torch.Tensor,
709
+ prompt: str,
710
+ generation: str,
711
+ *,
712
+ user_prompt_indices: List[int],
713
+ k: int = 20,  # number of MAS/RISE deletion steps per sample
714
+ ) -> Tuple[float, float, float]:
715
+ """Token-level MAS/RISE faithfulness via guided deletion in k perturbation steps using provided prompt indices.
716
+
717
+ This mirrors llm_attr_eval.LLMAttributionEvaluator.faithfulness_test, but avoids
718
+ locating the user prompt span via token-id subsequence matching (which may fail
719
+ for some tokenizers due to non-compositional BPE merges at template boundaries).
720
+ """
721
+
722
+ def auc(arr: np.ndarray) -> float:
723
+ return (arr.sum() - arr[0] / 2 - arr[-1] / 2) / max(1, (arr.shape[0] - 1))
724
+
725
+ pad_token_id = llm_evaluator._ensure_pad_token_id()
726
+
727
+ user_prompt = " " + prompt
728
+ formatted_prompt = llm_evaluator.format_prompt(user_prompt)
729
+ formatted_ids = llm_evaluator.tokenizer(formatted_prompt, return_tensors="pt", add_special_tokens=False).input_ids
730
+
731
+ prompt_ids = formatted_ids.to(llm_evaluator.device)
732
+ prompt_ids_perturbed = prompt_ids.clone()
733
+ generation_ids = llm_evaluator.tokenizer(
734
+ generation + llm_evaluator.tokenizer.eos_token,
735
+ return_tensors="pt",
736
+ add_special_tokens=False,
737
+ ).input_ids.to(llm_evaluator.device)
738
+
739
+ attr_cpu = attribution.detach().cpu()
740
+ w = attr_cpu.sum(0)
741
+ sorted_attr_indices = torch.argsort(w, descending=True)
742
+ attr_sum = float(w.sum().item())
743
+
744
+ P = int(w.numel())
745
+ if len(user_prompt_indices) != P:
746
+ raise ValueError(
747
+ "user_prompt_indices length does not match prompt-side attribution length: "
748
+ f"indices P={len(user_prompt_indices)}, attr P={P}."
749
+ )
750
+ if P == 0:
751
+ return 0.0, 0.0, 0.0
752
+
753
+ if max(user_prompt_indices) >= int(prompt_ids_perturbed.shape[1]):
754
+ raise ValueError("user_prompt_indices contains an out-of-bounds index for formatted prompt ids.")
755
+
756
+ if P > 0:
757
+ steps = int(k) if k is not None else 0
758
+ if steps <= 0:
759
+ steps = 1
760
+ steps = min(steps, P)
761
+ else:
762
+ steps = 0
763
+
764
+ scores = np.zeros(steps + 1, dtype=np.float64)
765
+ density = np.zeros(steps + 1, dtype=np.float64)
766
+
767
+ scores[0] = (
768
+ llm_evaluator.compute_logprob_response_given_prompt(prompt_ids_perturbed, generation_ids).sum().cpu().detach().item()
769
+ )
770
+ density[0] = 1.0
771
+
772
+ if attr_sum <= 0:
773
+ density = np.linspace(1.0, 0.0, steps + 1)
774
+
775
+ base = P // steps
776
+ remainder = P % steps
777
+ start = 0
778
+ for step in range(steps):
779
+ size = base + (1 if step < remainder else 0)
780
+ group = sorted_attr_indices[start : start + size]
781
+ start += size
782
+
783
+ for idx in group:
784
+ j = int(idx.item())
785
+ abs_pos = int(user_prompt_indices[j])
786
+ prompt_ids_perturbed[0, abs_pos] = pad_token_id
787
+ scores[step + 1] = (
788
+ llm_evaluator.compute_logprob_response_given_prompt(prompt_ids_perturbed, generation_ids).sum().cpu().detach().item()
789
+ )
790
+ if attr_sum > 0:
791
+ dec = float(w.index_select(0, group).sum().item()) / attr_sum
792
+ density[step + 1] = density[step] - dec
793
+
794
+ min_normalized_pred = 1.0
795
+ normalized_model_response = scores.copy()
796
+ for i in range(len(scores)):
797
+ normalized_pred = (normalized_model_response[i] - scores[-1]) / (abs(scores[0] - scores[-1]))
798
+ normalized_pred = np.clip(normalized_pred, 0.0, 1.0)
799
+ min_normalized_pred = min(min_normalized_pred, normalized_pred)
800
+ normalized_model_response[i] = min_normalized_pred
801
+
802
+ alignment_penalty = np.abs(normalized_model_response - density)
803
+ corrected_scores = normalized_model_response + alignment_penalty
804
+ corrected_scores = corrected_scores.clip(0.0, 1.0)
805
+ corrected_scores = (corrected_scores - np.min(corrected_scores)) / (np.max(corrected_scores) - np.min(corrected_scores))
806
+
807
+ if np.isnan(corrected_scores).any():
808
+ corrected_scores = np.linspace(1.0, 0.0, len(scores))
809
+
810
+ return auc(normalized_model_response), auc(corrected_scores), auc(normalized_model_response + alignment_penalty)
811
+
812
+
813
+ def load_model(model_name: str, device: str):
814
+ model = AutoModelForCausalLM.from_pretrained(
815
+ model_name,
816
+ device_map="auto" if device == "auto" else {"": int(device.split(":")[1])} if device.startswith("cuda:") else None,
817
+ torch_dtype=torch.float16,
818
+ attn_implementation="eager",
819
+ )
820
+ tokenizer = AutoTokenizer.from_pretrained(model_name)
821
+ tokenizer.pad_token = tokenizer.eos_token
822
+ model.eval()
823
+ return model, tokenizer
824
+
825
+
826
+ def resolve_device(args) -> str:
827
+ if args.cuda is not None and "," in args.cuda:
828
+ os.environ["CUDA_VISIBLE_DEVICES"] = args.cuda
829
+ return "auto"
830
+ if args.cuda is not None and args.cuda.strip():
831
+ return f"cuda:{args.cuda}" if torch.cuda.is_available() else "cpu"
832
+ return f"cuda:{args.cuda_num}" if torch.cuda.is_available() else "cpu"
833
+
834
+
835
+ def run_attribution(
836
+ testing_dict, example: ds_utils.CachedExample, target: Optional[str]
837
+ ) -> Tuple[List[torch.Tensor], Optional[Dict[str, np.ndarray]], Optional[List[int]], Optional[List[int]]]:
838
+ model = testing_dict["model"]
839
+ tokenizer = testing_dict["tokenizer"]
840
+ attr_func = testing_dict["attr_func"]
841
+
842
+ indices_to_explain = example.indices_to_explain
843
+ if not (isinstance(indices_to_explain, list) and len(indices_to_explain) == 2):
844
+ raise ValueError(
845
+ "exp2 requires token-span indices_to_explain=[start_tok,end_tok]. "
846
+ "Please re-sample or run exp/exp2/migrate_indices_to_explain_token_span.py on your cache."
847
+ )
848
+
849
+ llm_attributor = None
850
+ if "IG" in attr_func:
851
+ llm_attributor = llm_attr.LLMGradientAttribtion(model, tokenizer)
852
+ attr = llm_attributor.calculate_IG_per_generation(
853
+ example.prompt,
854
+ 20,
855
+ tokenizer.eos_token_id,
856
+ batch_size=testing_dict["batch_size"],
857
+ target=target,
858
+ )
859
+ elif "perturbation" in attr_func:
860
+ if attr_func in ("perturbation_all_fast", "perturbation_CLP_fast", "perturbation_REAGENT_fast"):
861
+ import perturbation_fast
862
+
863
+ llm_attributor = perturbation_fast.LLMPerturbationFastAttribution(model, tokenizer)
864
+ if attr_func == "perturbation_all_fast":
865
+ attr = llm_attributor.calculate_feature_ablation_segments(
866
+ example.prompt,
867
+ baseline=tokenizer.eos_token_id,
868
+ measure="log_loss",
869
+ target=target,
870
+ source_k=20,
871
+ )
872
+ elif attr_func == "perturbation_CLP_fast":
873
+ attr = llm_attributor.calculate_feature_ablation_segments(
874
+ example.prompt,
875
+ baseline=tokenizer.eos_token_id,
876
+ measure="KL",
877
+ target=target,
878
+ source_k=20,
879
+ )
880
+ else:
881
+ attr = llm_attributor.calculate_feature_ablation_segments_mlm(
882
+ example.prompt,
883
+ target=target,
884
+ source_k=20,
885
+ )
886
+ else:
887
+ llm_attributor = llm_attr.LLMPerturbationAttribution(model, tokenizer)
888
+ if attr_func == "perturbation_all":
889
+ attr = llm_attributor.calculate_feature_ablation_sentences(
890
+ example.prompt, baseline=tokenizer.eos_token_id, measure="log_loss", target=target
891
+ )
892
+ elif attr_func == "perturbation_CLP":
893
+ attr = llm_attributor.calculate_feature_ablation_sentences(
894
+ example.prompt, baseline=tokenizer.eos_token_id, measure="KL", target=target
895
+ )
896
+ elif attr_func == "perturbation_REAGENT":
897
+ attr = llm_attributor.calculate_feature_ablation_sentences_mlm(example.prompt, target=target)
898
+ else:
899
+ raise ValueError(f"Unsupported perturbation attr_func {attr_func}")
900
+ elif "attention" in attr_func:
901
+ llm_attributor = llm_attr.LLMAttentionAttribution(model, tokenizer)
902
+ llm_attributor_ig = llm_attr.LLMGradientAttribtion(model, tokenizer)
903
+ attr = llm_attributor.calculate_attention_attribution(example.prompt, target=target)
904
+ attr_b = llm_attributor_ig.calculate_IG_per_generation(
905
+ example.prompt, 20, tokenizer.eos_token_id, batch_size=testing_dict["batch_size"], target=target
906
+ )
907
+ attr.attribution_matrix = attr.attribution_matrix * attr_b.attribution_matrix
908
+ elif attr_func == "ifr_all_positions":
909
+ llm_attributor = llm_attr.LLMIFRAttribution(
910
+ model,
911
+ tokenizer,
912
+ chunk_tokens=testing_dict["chunk_tokens"],
913
+ sink_chunk_tokens=testing_dict["sink_chunk_tokens"],
914
+ )
915
+ attr = llm_attributor.calculate_ifr_for_all_positions(example.prompt, target=target)
916
+ elif attr_func == "ifr_all_positions_output_only":
917
+ llm_attributor = llm_attr.LLMIFRAttribution(
918
+ model,
919
+ tokenizer,
920
+ chunk_tokens=testing_dict["chunk_tokens"],
921
+ sink_chunk_tokens=testing_dict["sink_chunk_tokens"],
922
+ )
923
+ sink_span = tuple(example.sink_span) if example.sink_span else tuple(indices_to_explain)
924
+ attr = llm_attributor.calculate_ifr_for_all_positions_output_only(
925
+ example.prompt,
926
+ target=target,
927
+ sink_span=sink_span,
928
+ )
929
+ elif attr_func == "ifr_multi_hop":
930
+ llm_attributor = llm_attr.LLMIFRAttribution(
931
+ model,
932
+ tokenizer,
933
+ chunk_tokens=testing_dict["chunk_tokens"],
934
+ sink_chunk_tokens=testing_dict["sink_chunk_tokens"],
935
+ )
936
+ attr = llm_attributor.calculate_ifr_multi_hop(
937
+ example.prompt,
938
+ target=target,
939
+ sink_span=tuple(example.sink_span) if example.sink_span else None,
940
+ thinking_span=tuple(example.thinking_span) if example.thinking_span else None,
941
+ n_hops=testing_dict["n_hops"],
942
+ )
943
+ elif attr_func == "ifr_in_all_gen":
944
+ import ft_ifr_improve
945
+
946
+ llm_attributor = ft_ifr_improve.LLMIFRAttributionInAllGen(
947
+ model,
948
+ tokenizer,
949
+ chunk_tokens=testing_dict["chunk_tokens"],
950
+ sink_chunk_tokens=testing_dict["sink_chunk_tokens"],
951
+ )
952
+ attr = llm_attributor.calculate_ifr_in_all_gen(
953
+ example.prompt,
954
+ target=target,
955
+ sink_span=tuple(example.sink_span) if example.sink_span else None,
956
+ thinking_span=tuple(example.thinking_span) if example.thinking_span else None,
957
+ n_hops=testing_dict["n_hops"],
958
+ )
959
+ elif attr_func == "ifr_multi_hop_stop_words":
960
+ import ft_ifr_improve
961
+
962
+ llm_attributor = ft_ifr_improve.LLMIFRAttributionImproved(
963
+ model,
964
+ tokenizer,
965
+ chunk_tokens=testing_dict["chunk_tokens"],
966
+ sink_chunk_tokens=testing_dict["sink_chunk_tokens"],
967
+ )
968
+ attr = llm_attributor.calculate_ifr_multi_hop_stop_words(
969
+ example.prompt,
970
+ target=target,
971
+ sink_span=tuple(example.sink_span) if example.sink_span else None,
972
+ thinking_span=tuple(example.thinking_span) if example.thinking_span else None,
973
+ n_hops=testing_dict["n_hops"],
974
+ )
975
+ elif attr_func == "ifr_multi_hop_both":
976
+ import ft_ifr_improve
977
+
978
+ llm_attributor = ft_ifr_improve.LLMIFRAttributionBoth(
979
+ model,
980
+ tokenizer,
981
+ chunk_tokens=testing_dict["chunk_tokens"],
982
+ sink_chunk_tokens=testing_dict["sink_chunk_tokens"],
983
+ )
984
+ attr = llm_attributor.calculate_ifr_multi_hop_both(
985
+ example.prompt,
986
+ target=target,
987
+ sink_span=tuple(example.sink_span) if example.sink_span else None,
988
+ thinking_span=tuple(example.thinking_span) if example.thinking_span else None,
989
+ n_hops=testing_dict["n_hops"],
990
+ )
991
+ elif attr_func == "ifr_multi_hop_split_hop":
992
+ import ft_ifr_improve
993
+
994
+ llm_attributor = ft_ifr_improve.LLMIFRAttributionSplitHop(
995
+ model,
996
+ tokenizer,
997
+ chunk_tokens=testing_dict["chunk_tokens"],
998
+ sink_chunk_tokens=testing_dict["sink_chunk_tokens"],
999
+ )
1000
+ attr = llm_attributor.calculate_ifr_multi_hop_split_hop(
1001
+ example.prompt,
1002
+ target=target,
1003
+ sink_span=tuple(example.sink_span) if example.sink_span else None,
1004
+ thinking_span=tuple(example.thinking_span) if example.thinking_span else None,
1005
+ n_hops=testing_dict["n_hops"],
1006
+ )
1007
+ elif attr_func == "attnlrp":
1008
+ llm_attributor = llm_attr.LLMLRPAttribution(model, tokenizer)
1009
+ attr = llm_attributor.calculate_attnlrp_ft_hop0(
1010
+ example.prompt,
1011
+ target=target,
1012
+ sink_span=tuple(example.sink_span) if example.sink_span else None,
1013
+ thinking_span=tuple(example.thinking_span) if example.thinking_span else None,
1014
+ neg_handling=str(testing_dict.get("attnlrp_neg_handling", "drop")),
1015
+ norm_mode=str(testing_dict.get("attnlrp_norm_mode", "norm")),
1016
+ )
1017
+ elif attr_func in ("ft_attnlrp", "attnlrp_aggregated_multi_hop"):
1018
+ llm_attributor = llm_attr.LLMLRPAttribution(model, tokenizer)
1019
+ attr = llm_attributor.calculate_attnlrp_aggregated_multi_hop(
1020
+ example.prompt,
1021
+ target=target,
1022
+ sink_span=tuple(example.sink_span) if example.sink_span else None,
1023
+ thinking_span=tuple(example.thinking_span) if example.thinking_span else None,
1024
+ n_hops=testing_dict["n_hops"],
1025
+ neg_handling=str(testing_dict.get("attnlrp_neg_handling", "drop")),
1026
+ norm_mode=str(testing_dict.get("attnlrp_norm_mode", "norm")),
1027
+ )
1028
+ elif attr_func == "basic":
1029
+ llm_attributor = llm_attr.LLMBasicAttribution(model, tokenizer)
1030
+ attr = llm_attributor.calculate_basic_attribution(example.prompt, target=target)
1031
+ else:
1032
+ raise ValueError(f"Unsupported attr_func {attr_func}")
1033
+
1034
+ seq_attr, row_attr, rec_attr = attr.get_all_token_attrs(indices_to_explain)
1035
+ hop_payload = None
1036
+ if bool(testing_dict.get("save_hop_traces", False)):
1037
+ try:
1038
+ hop_payload = _build_hop_trace_payload(attr_func, attr, indices_to_explain=indices_to_explain)
1039
+ except Exception as exc:
1040
+ print(f"[warn] hop trace extraction failed for {attr_func}: {exc}")
1041
+ hop_payload = None
1042
+
1043
+ user_prompt_indices = getattr(llm_attributor, "user_prompt_indices", None)
1044
+ if isinstance(user_prompt_indices, list):
1045
+ user_prompt_indices = [int(x) for x in user_prompt_indices]
1046
+ else:
1047
+ user_prompt_indices = None
1048
+
1049
+ keep_prompt_token_indices = None
1050
+ if attr_func in ("ifr_multi_hop_stop_words", "ifr_multi_hop_both"):
1051
+ try:
1052
+ import ft_ifr_improve
1053
+
1054
+ keep_prompt_token_indices = ft_ifr_improve.keep_token_indices(list(attr.prompt_tokens))
1055
+ except Exception:
1056
+ keep_prompt_token_indices = None
1057
+
1058
+ return [seq_attr, row_attr, rec_attr], hop_payload, user_prompt_indices, keep_prompt_token_indices
1059
+
1060
+
1061
+ def faithfulness_generation(
1062
+ testing_dict, example: ds_utils.CachedExample, target: str, llm_evaluator
1063
+ ) -> Tuple[np.ndarray, Optional[Dict[str, np.ndarray]]]:
1064
+ prompt = example.prompt
1065
+ generation = target
1066
+
1067
+ attr_func = str(testing_dict.get("attr_func") or "")
1068
+ attr_list, hop_payload, user_prompt_indices, keep_prompt_token_indices = run_attribution(
1069
+ testing_dict, example, target
1070
+ )
1071
+ seq_attr = attr_list[0]
1072
+ prompt_len = int(seq_attr.shape[1] - seq_attr.shape[0]) # cols=(P+G), rows=G
1073
+
1074
+ results = []
1075
+ for attr in attr_list:
1076
+ # Only use prompt-side attribution, matching evaluations/faithfulness.py
1077
+ attr_prompt = attr[:, :prompt_len]
1078
+ if attr_func in ("ifr_multi_hop_stop_words", "ifr_multi_hop_both") and keep_prompt_token_indices is not None:
1079
+ import ft_ifr_improve
1080
+
1081
+ scores = ft_ifr_improve.faithfulness_test_skip_tokens(
1082
+ llm_evaluator,
1083
+ attr_prompt,
1084
+ prompt,
1085
+ generation,
1086
+ keep_prompt_token_indices=keep_prompt_token_indices,
1087
+ user_prompt_indices=user_prompt_indices,
1088
+ )
1089
+ elif user_prompt_indices is not None:
1090
+ scores = _faithfulness_test_with_user_prompt_indices(
1091
+ llm_evaluator,
1092
+ attr_prompt,
1093
+ prompt,
1094
+ generation,
1095
+ user_prompt_indices=user_prompt_indices,
1096
+ )
1097
+ else:
1098
+ scores = llm_evaluator.faithfulness_test(attr_prompt, prompt, generation)
1099
+ results.append(scores)
1100
+
1101
+ return np.array(results), hop_payload
1102
+
1103
+
1104
+ def evaluate_dataset(args, dataset_name: str, examples: List[ds_utils.CachedExample], testing_dict):
1105
+ out = evaluate_dataset_multi(args, dataset_name, examples, testing_dict, modes=["faithfulness_gen"])
1106
+ faith = out.get("faithfulness")
1107
+ if not faith:
1108
+ return None
1109
+ return faith["mean"], faith["std"], faith["avg_time"]
1110
+
1111
+
1112
+ def evaluate_dataset_recovery_ruler(args, dataset_name: str, examples: List[ds_utils.CachedExample], testing_dict):
1113
+ out = evaluate_dataset_multi(args, dataset_name, examples, testing_dict, modes=["recovery_ruler"])
1114
+ rec = out.get("recovery")
1115
+ if not rec:
1116
+ return None
1117
+ return rec["mean"], rec["std"], rec["avg_time"], rec["used"], rec["skipped"]
1118
+
1119
+
1120
+ def main():
1121
+ parser = argparse.ArgumentParser("Experiment 2 runner (math skipped, AT2 skipped).")
1122
+ parser.add_argument("--datasets", type=str, required=True, help="Comma-separated names or paths.")
1123
+ parser.add_argument("--attr_funcs", type=str, required=True, help="Comma-separated attr funcs (no AT2).")
1124
+ parser.add_argument("--model", type=str, default=None, help="HF repo id (required unless --model_path set).")
1125
+ parser.add_argument("--model_path", type=str, default=None, help="Local path; overrides --model for loading.")
1126
+ parser.add_argument("--cuda", type=str, default=None)
1127
+ parser.add_argument("--cuda_num", type=int, default=0)
1128
+ parser.add_argument("--num_examples", type=int, default=100)
1129
+ parser.add_argument(
1130
+ "--mode",
1131
+ type=str,
1132
+ nargs="+",
1133
+ default=["faithfulness_gen"],
1134
+ help=(
1135
+ "One or more of: faithfulness_gen, recovery_ruler. "
1136
+ "Accepts comma-separated values, e.g. '--mode faithfulness_gen,recovery_ruler' "
1137
+ "or '--mode faithfulness_gen, recovery_ruler'."
1138
+ ),
1139
+ )
1140
+ parser.add_argument("--sample", type=int, default=None, help="Optional subsample before num_examples.")
1141
+ parser.add_argument("--seed", type=int, default=42)
1142
+ parser.add_argument("--chunk_tokens", type=int, default=128)
1143
+ parser.add_argument("--sink_chunk_tokens", type=int, default=32)
1144
+ parser.add_argument("--n_hops", type=int, default=3)
1145
+ parser.add_argument(
1146
+ "--attnlrp_neg_handling",
1147
+ type=str,
1148
+ choices=["drop", "abs"],
1149
+ default="drop",
1150
+ help="FT-AttnLRP: how to handle negative values after each hop (drop=clamp>=0, abs=absolute value).",
1151
+ )
1152
+ parser.add_argument(
1153
+ "--attnlrp_norm_mode",
1154
+ type=str,
1155
+ choices=["norm", "no_norm"],
1156
+ default="norm",
1157
+ help="FT-AttnLRP: norm enables per-hop global+thinking normalization + ratios; no_norm disables all three.",
1158
+ )
1159
+ parser.add_argument("--data_root", type=str, default="exp/exp2/data", help="Filtered dataset cache directory.")
1160
+ parser.add_argument("--output_root", type=str, default="exp/exp2/output", help="Directory to store evaluation outputs.")
1161
+ parser.add_argument(
1162
+ "--save_hop_traces",
1163
+ action="store_true",
1164
+ help=(
1165
+ "Save per-sample trace artifacts (attribution vectors + per-sample metrics) under output_root/traces/. "
1166
+ "For multi-hop methods, also saves per-hop token vectors (vh)."
1167
+ ),
1168
+ )
1169
+ args = parser.parse_args()
1170
+ modes = _parse_modes(args.mode)
1171
+
1172
+ if args.model_path:
1173
+ model_name = args.model_path
1174
+ elif args.model:
1175
+ model_name = args.model
1176
+ else:
1177
+ raise SystemExit("Please set --model or --model_path.")
1178
+ model_tag = args.model if args.model else Path(args.model_path).name
1179
+
1180
+ datasets = [d.strip() for d in args.datasets.split(",") if d.strip()]
1181
+ attr_funcs = [a.strip() for a in args.attr_funcs.split(",") if a.strip()]
1182
+
1183
+ device = resolve_device(args)
1184
+ model, tokenizer = load_model(model_name, device)
1185
+
1186
+ max_input_len = {
1187
+ "llama-1B": 5500,
1188
+ "llama-3B": 4800,
1189
+ "llama-8B": 3500,
1190
+ "qwen-1.7B": 5500,
1191
+ "qwen-4B": 3500,
1192
+ "qwen-8B": 5000,
1193
+ "qwen-32B": 1500,
1194
+ "gemma-12B": 1500,
1195
+ "gemma-27B": 2000,
1196
+ }.get(args.model, 2000)
1197
+
1198
+ for ds_name in datasets:
1199
+ if "recovery_ruler" in modes and ds_name == "morehopqa":
1200
+ raise SystemExit("recovery_ruler only supports RULER datasets (with needle_spans), not morehopqa.")
1201
+ if "recovery_ruler" in modes and ds_name.startswith("math"):
1202
+ raise SystemExit("recovery_ruler only supports RULER datasets (with needle_spans), not math.")
1203
+
1204
+ # Resolve dataset (prefer prepared cache under data_root)
1205
+ cached_path = Path(args.data_root) / f"{ds_name}.jsonl"
1206
+ if cached_path.exists():
1207
+ examples = ds_utils.load_cached(cached_path, sample=args.sample, seed=args.seed)
1208
+ else:
1209
+ # allow direct cached path or raw loader
1210
+ p = Path(ds_name)
1211
+ if p.exists():
1212
+ examples = ds_utils.load_cached(p, sample=args.sample, seed=args.seed)
1213
+ else:
1214
+ hint = "please run exp/exp2/sample_and_filter.py first (or pass an explicit cached JSONL path)."
1215
+ if ds_name.startswith("math"):
1216
+ hint = "please run exp/exp2/map_math_mine_to_exp2_cache.py first (or pass an explicit cached JSONL path)."
1217
+ raise SystemExit(f"Missing exp2 cache for '{ds_name}'. Expected {cached_path}; {hint}")
1218
+
1219
+ for attr_func in attr_funcs:
1220
+ if attr_func.lower() == "at2":
1221
+ print("Skipping AT2 as requested.")
1222
+ continue
1223
+
1224
+ testing_dict: Dict[str, Any] = {
1225
+ "model": model,
1226
+ "model_tag": model_tag,
1227
+ "tokenizer": tokenizer,
1228
+ "attr_func": attr_func,
1229
+ "max_input_len": max_input_len,
1230
+ "chunk_tokens": args.chunk_tokens,
1231
+ "sink_chunk_tokens": args.sink_chunk_tokens,
1232
+ "n_hops": args.n_hops,
1233
+ "attnlrp_neg_handling": args.attnlrp_neg_handling,
1234
+ "attnlrp_norm_mode": args.attnlrp_norm_mode,
1235
+ "device": device,
1236
+ "batch_size": 1,
1237
+ "save_hop_traces": bool(args.save_hop_traces),
1238
+ }
1239
+ result = evaluate_dataset_multi(args, ds_name, examples, testing_dict, modes=modes)
1240
+
1241
+ if "faithfulness_gen" in modes:
1242
+ faith = result.get("faithfulness")
1243
+ if not faith:
1244
+ print(f"No faithfulness results for {ds_name} with {attr_func}.")
1245
+ else:
1246
+ mean = faith["mean"]
1247
+ std = faith["std"]
1248
+ avg_time = float(faith["avg_time"])
1249
+
1250
+ out_dir = Path(args.output_root) / "faithfulness" / ds_name / model_tag
1251
+ out_dir.mkdir(parents=True, exist_ok=True)
1252
+ filename = f"{attr_func}_{args.num_examples}_examples.csv"
1253
+ with open(out_dir / filename, "w") as f:
1254
+ f.write("Method,RISE,MAS,RISE+AP\n")
1255
+ f.write(",".join(["Seq Attr Scores Mean"] + [str(x) for x in mean[0].tolist()]) + "\n")
1256
+ f.write(",".join(["Row Attr Scores Mean"] + [str(x) for x in mean[1].tolist()]) + "\n")
1257
+ f.write(",".join(["Recursive Attr Scores Mean"] + [str(x) for x in mean[2].tolist()]) + "\n")
1258
+ f.write(",".join(["Seq Attr Scores Var"] + [str(x) for x in std[0].tolist()]) + "\n")
1259
+ f.write(",".join(["Row Attr Scores Var"] + [str(x) for x in std[1].tolist()]) + "\n")
1260
+ f.write(",".join(["Recursive Attr Scores Var"] + [str(x) for x in std[2].tolist()]) + "\n")
1261
+ f.write(f"Avg Sample Time (s),{avg_time}\n")
1262
+ print(f"[{ds_name}] {attr_func} -> {out_dir/filename} (avg sample time: {avg_time:.2f}s)")
1263
+
1264
+ if "recovery_ruler" in modes:
1265
+ rec = result.get("recovery")
1266
+ if not rec:
1267
+ print(f"No recovery results for {ds_name} with {attr_func}.")
1268
+ else:
1269
+ mean = rec["mean"]
1270
+ std = rec["std"]
1271
+ avg_time = float(rec["avg_time"])
1272
+ used = int(rec["used"])
1273
+ skipped = int(rec["skipped"])
1274
+
1275
+ out_dir = Path(args.output_root) / "recovery" / ds_name / model_tag
1276
+ out_dir.mkdir(parents=True, exist_ok=True)
1277
+ filename = f"{attr_func}_{args.num_examples}_examples.csv"
1278
+ with open(out_dir / filename, "w") as f:
1279
+ f.write("Method,Recovery@10%\n")
1280
+ f.write(f"Seq Attr Recovery Mean,{mean[0]}\n")
1281
+ f.write(f"Row Attr Recovery Mean,{mean[1]}\n")
1282
+ f.write(f"Recursive Attr Recovery Mean,{mean[2]}\n")
1283
+ f.write(f"Seq Attr Recovery Std,{std[0]}\n")
1284
+ f.write(f"Row Attr Recovery Std,{std[1]}\n")
1285
+ f.write(f"Recursive Attr Recovery Std,{std[2]}\n")
1286
+ f.write(f"Examples Used,{used}\n")
1287
+ f.write(f"Examples Skipped,{skipped}\n")
1288
+ f.write(f"Avg Sample Time (s),{avg_time}\n")
1289
+ print(
1290
+ f"[{ds_name}] {attr_func} -> {out_dir/filename} "
1291
+ f"(used={used} skipped={skipped} avg sample time: {avg_time:.2f}s)"
1292
+ )
1293
+
1294
+
1295
+ if __name__ == "__main__":
1296
+ main()
exp/exp2/sample_and_filter.py ADDED
@@ -0,0 +1,363 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ Dataset sampler for Experiment 2.
4
+
5
+ Steps:
6
+ - Load a dataset item (MoreHopQA / HotpotQA / RULER niah / RULER vt).
7
+ - Call the generation model (qwen3-235b-a22b-2507) with a system prompt that
8
+ asks for brief reasoning and a final answer wrapped in \\box{}.
9
+ - Enforce the output format: keep only generations that look like
10
+ "<reasoning text> + final \\box{} answer" with nothing after the box.
11
+ - Call the judge model (deepseek-v3-1-terminus) to check whether the boxed
12
+ answer matches the dataset reference answer; keep only judged True samples.
13
+ - Rebuild `target` as "<reasoning>\\n<answer text (no box)>" and store filtered
14
+ samples to exp/exp2/data/<dataset>.jsonl (or a custom path) with inferred spans.
15
+ """
16
+
17
+ from __future__ import annotations
18
+
19
+ import argparse
20
+ import json
21
+ import os
22
+ import sys
23
+ import time
24
+ import urllib.error
25
+ import urllib.request
26
+ from pathlib import Path
27
+ from typing import Any, Dict, Iterable, List, Optional
28
+
29
+ from transformers import AutoTokenizer
30
+ from tqdm import tqdm
31
+
32
+ REPO_ROOT = Path(__file__).resolve().parents[2]
33
+ if str(REPO_ROOT) not in sys.path:
34
+ sys.path.insert(0, str(REPO_ROOT))
35
+
36
+ from exp.exp2.dataset_utils import (
37
+ CachedExample,
38
+ DatasetLoader,
39
+ attach_spans_from_answer,
40
+ split_boxed_generation,
41
+ )
42
+
43
+
44
+ class RateLimitError(RuntimeError):
45
+ """Raised when API returns 429; carries a suggested wait time."""
46
+
47
+ def __init__(self, wait_seconds: float, detail: str) -> None:
48
+ super().__init__(detail)
49
+ self.wait_seconds = wait_seconds
50
+
51
+ # GEN_SYSTEM_PROMPT = (
52
+ # "You are a careful reasoning assistant. "
53
+ # "Before answering, engage in an extremely detailed and exhaustive chain of thought. **No fewer than 2k tokens.** "
54
+ # "Do not skip any logical steps, even if they seem obvious. "
55
+ # "Process this freely and naturally without using specific headers or strict formatting. "
56
+ # "When you reach the conclusion, wrap the entire final sentence containing the answer inside \\box{}. "
57
+ # "Ensure the box wraps the **sentence** that naturally delivers the answer. DO NOT rewrite the answer word for the box separately."
58
+ # )
59
+
60
+ GEN_SYSTEM_PROMPT = (
61
+ "You are a reasoning assistant. "
62
+ "Before answering, engage in an chain of thought. "
63
+ "Process this freely and naturally without using specific headers or strict formatting. "
64
+ "When you reach the conclusion, wrap the entire final sentence containing the answer inside \\box{}. "
65
+ "Ensure the box wraps the **sentence** that naturally delivers the answer. DO NOT rewrite the answer word for the box separately."
66
+ )
67
+
68
+ JUDGE_SYSTEM_PROMPT = (
69
+ "You verify whether the model's boxed answer matches the reference answer. "
70
+ "Reply strictly with True or False and nothing else."
71
+ )
72
+
73
+
74
+ def call_chat_api(
75
+ api_base: str,
76
+ api_key: str,
77
+ model: str,
78
+ messages: List[Dict[str, str]],
79
+ *,
80
+ timeout: int,
81
+ max_tokens: int,
82
+ temperature: float,
83
+ cache_ttl: int,
84
+ cache_namespace: Optional[str],
85
+ rate_limit_delay: Optional[float] = None,
86
+ ) -> str:
87
+ url = api_base.rstrip("/") + "/chat/completions"
88
+ payload: Dict[str, Any] = {
89
+ "model": model,
90
+ "messages": messages,
91
+ "max_tokens": max_tokens,
92
+ "temperature": temperature,
93
+ }
94
+ if cache_ttl > 0:
95
+ cache_obj: Dict[str, Any] = {"ttl": cache_ttl}
96
+ if cache_namespace:
97
+ cache_obj["namespace"] = cache_namespace
98
+ payload["cache"] = cache_obj
99
+
100
+ data = json.dumps(payload).encode("utf-8")
101
+ headers = {"Content-Type": "application/json"}
102
+ if api_key:
103
+ headers["Authorization"] = f"Bearer {api_key}"
104
+
105
+ req = urllib.request.Request(url, data=data, headers=headers, method="POST")
106
+ opener = urllib.request.build_opener(urllib.request.ProxyHandler({}))
107
+ try:
108
+ with opener.open(req, timeout=timeout) as resp:
109
+ resp_bytes = resp.read()
110
+ except urllib.error.HTTPError as e:
111
+ detail = e.read().decode("utf-8", errors="ignore") if hasattr(e, "read") else ""
112
+ if e.code == 429:
113
+ retry_after = None
114
+ if hasattr(e, "headers") and e.headers:
115
+ retry_after_header = e.headers.get("Retry-After")
116
+ if retry_after_header:
117
+ try:
118
+ retry_after = float(retry_after_header)
119
+ except ValueError:
120
+ retry_after = None
121
+ wait = retry_after or rate_limit_delay or 5.0
122
+ raise RateLimitError(wait, f"API HTTP 429: {detail}") from e
123
+ raise RuntimeError(f"API HTTP error {e.code}: {detail}") from e
124
+ except urllib.error.URLError as e:
125
+ raise RuntimeError(f"API request failed: {e}") from e
126
+
127
+ try:
128
+ response = json.loads(resp_bytes.decode("utf-8"))
129
+ except json.JSONDecodeError as e:
130
+ raise RuntimeError(f"Failed to decode API response: {resp_bytes!r}") from e
131
+
132
+ choices = response.get("choices", [])
133
+ if not choices:
134
+ raise RuntimeError(f"Empty choices from API: {response}")
135
+ content = choices[0].get("message", {}).get("content", "")
136
+ if not content:
137
+ raise RuntimeError(f"Empty content from API: {response}")
138
+ return content.strip()
139
+
140
+
141
+ def build_gen_messages(prompt: str) -> List[Dict[str, str]]:
142
+ return [
143
+ {"role": "system", "content": GEN_SYSTEM_PROMPT},
144
+ {"role": "user", "content": prompt},
145
+ ]
146
+
147
+
148
+ def build_judge_messages(reference_answer: str, candidate_answer: str) -> List[Dict[str, str]]:
149
+ user = (
150
+ "Decide if the model's boxed answer matches the reference answer.\n"
151
+ f"Reference answer: {reference_answer}\n"
152
+ f"Model boxed answer (only the content inside \\box{{}}): {candidate_answer}\n"
153
+ "Output only True if they are semantically consistent; otherwise output False."
154
+ )
155
+ return [
156
+ {"role": "system", "content": JUDGE_SYSTEM_PROMPT},
157
+ {"role": "user", "content": user},
158
+ ]
159
+
160
+
161
+ def parse_bool(text: str) -> bool:
162
+ first = (text.strip().splitlines() or [""])[0].strip().lower()
163
+ if first in {"true", "yes"}:
164
+ return True
165
+ if first in {"false", "no"}:
166
+ return False
167
+ # fallback: check substring
168
+ if "true" in first and "false" not in first:
169
+ return True
170
+ if "false" in first:
171
+ return False
172
+ raise ValueError(f"Cannot parse boolean from: {text!r}")
173
+
174
+
175
+ def write_cache(out_path: Path, examples: Iterable[CachedExample]) -> int:
176
+ out_path.parent.mkdir(parents=True, exist_ok=True)
177
+ count = 0
178
+ with out_path.open("w", encoding="utf-8") as f:
179
+ for ex in examples:
180
+ obj: Dict[str, Any] = {
181
+ "prompt": ex.prompt,
182
+ "target": ex.target,
183
+ "indices_to_explain": ex.indices_to_explain,
184
+ "attr_mask_indices": ex.attr_mask_indices,
185
+ "sink_span": ex.sink_span,
186
+ "thinking_span": ex.thinking_span,
187
+ "metadata": ex.metadata,
188
+ }
189
+ f.write(json.dumps(obj, ensure_ascii=False) + "\n")
190
+ count += 1
191
+ return count
192
+
193
+
194
+ def main():
195
+ parser = argparse.ArgumentParser("Sample and filter dataset examples for exp2.")
196
+ parser.add_argument(
197
+ "--dataset",
198
+ type=str,
199
+ required=True,
200
+ help="morehopqa | hotpotqa_long | niah_* | vt_* | <morehopqa_json_path> | <ruler_jsonl_path>",
201
+ )
202
+ parser.add_argument("--max_examples", type=int, default=100, help="Number of raw examples to sample before filtering.")
203
+ parser.add_argument("--seed", type=int, default=42)
204
+ parser.add_argument("--api_base", type=str, default="http://localhost:4000/v1", help="Chat API base URL.")
205
+ parser.add_argument("--api_key", type=str, default=None, help="API key; defaults to FLASHTRACE_API_KEY/OPENAI_API_KEY.")
206
+ parser.add_argument("--generator_model", type=str, default="qwen3-235b-a22b-2507")
207
+ parser.add_argument("--judge_model", type=str, default="deepseek-v3-1-terminus")
208
+ parser.add_argument("--api_timeout", type=int, default=300)
209
+ parser.add_argument("--api_max_tokens", type=int, default=8192)
210
+ parser.add_argument("--api_temperature", type=float, default=0.0)
211
+ parser.add_argument("--api_cache_ttl", type=int, default=600)
212
+ parser.add_argument("--api_cache_namespace", type=str, default="flashtrace-exp2")
213
+ parser.add_argument("--retry_delay", type=float, default=2.0)
214
+ parser.add_argument("--retries", type=int, default=2, help="Additional retries on API failure.")
215
+ parser.add_argument("--request_interval", type=float, default=1.0, help="Sleep seconds between generation calls.")
216
+ parser.add_argument("--judge_interval", type=float, default=1.0, help="Sleep seconds between judge calls.")
217
+ parser.add_argument("--tokenizer_model", type=str, default=None, help="Tokenizer path for span extraction (default: generator model).")
218
+ parser.add_argument("--data_root", type=str, default="exp/exp2/data", help="Output directory for filtered caches.")
219
+ parser.add_argument("--out", type=str, default=None, help="Optional explicit output path (JSONL).")
220
+ parser.add_argument("--rate_limit_delay", type=float, default=5.0, help="Seconds to wait on HTTP 429 before retrying.")
221
+ args = parser.parse_args()
222
+
223
+ api_key = args.api_key or os.environ.get("FLASHTRACE_API_KEY") or os.environ.get("OPENAI_API_KEY")
224
+ if not api_key:
225
+ raise SystemExit("Set --api_key or FLASHTRACE_API_KEY/OPENAI_API_KEY for API access.")
226
+
227
+ loader = DatasetLoader(seed=args.seed, data_root=args.data_root)
228
+ # Load full dataset; we will stop early once enough kept examples are collected.
229
+ raw_examples = loader.load_raw(args.dataset, sample=None)
230
+ if not raw_examples:
231
+ raise SystemExit("No examples loaded.")
232
+
233
+ tok_name = args.tokenizer_model or args.generator_model
234
+ tok_path = Path(tok_name)
235
+ if tok_path.exists():
236
+ tokenizer = AutoTokenizer.from_pretrained(tok_path.as_posix(), local_files_only=True)
237
+ else:
238
+ tokenizer = AutoTokenizer.from_pretrained(tok_name)
239
+ tokenizer.pad_token = tokenizer.eos_token
240
+
241
+ kept: List[CachedExample] = []
242
+ total = len(raw_examples)
243
+ kept_bar = tqdm(total=args.max_examples, desc="Kept (judge=True)", position=1, leave=False)
244
+ attempted = 0
245
+
246
+ for idx, ex in enumerate(tqdm(raw_examples, total=total, desc="Sampling"), 1):
247
+ if len(kept) >= args.max_examples:
248
+ break
249
+ reference_answer = ex.metadata.get("reference_answer") or ex.target or ""
250
+ gen_messages = build_gen_messages(ex.prompt)
251
+ attempted = idx
252
+
253
+ # Step 1: generation
254
+ for attempt in range(args.retries + 1):
255
+ try:
256
+ generation = call_chat_api(
257
+ args.api_base,
258
+ api_key,
259
+ args.generator_model,
260
+ gen_messages,
261
+ timeout=args.api_timeout,
262
+ max_tokens=args.api_max_tokens,
263
+ temperature=args.api_temperature,
264
+ cache_ttl=args.api_cache_ttl,
265
+ cache_namespace=args.api_cache_namespace,
266
+ rate_limit_delay=args.rate_limit_delay,
267
+ )
268
+ break
269
+ except RateLimitError as e:
270
+ if attempt >= args.retries:
271
+ raise
272
+ time.sleep(e.wait_seconds)
273
+ except Exception: # noqa: BLE001
274
+ if attempt >= args.retries:
275
+ raise
276
+ time.sleep(args.retry_delay)
277
+ if args.request_interval > 0:
278
+ time.sleep(args.request_interval)
279
+
280
+ parsed = split_boxed_generation(generation)
281
+ if not parsed:
282
+ print(f"[{idx}/{total}] skipped=format")
283
+ continue
284
+
285
+ thinking_text, boxed_segment, boxed_answer = parsed
286
+ target_text = f"{thinking_text}\n{boxed_answer}" if thinking_text else boxed_answer
287
+ judge_messages = build_judge_messages(reference_answer, boxed_answer)
288
+
289
+ ok = False
290
+ judge_resp = ""
291
+ for attempt in range(args.retries + 1):
292
+ try:
293
+ judge_resp = call_chat_api(
294
+ args.api_base,
295
+ api_key,
296
+ args.judge_model,
297
+ judge_messages,
298
+ timeout=args.api_timeout,
299
+ max_tokens=64,
300
+ temperature=0.0,
301
+ cache_ttl=args.api_cache_ttl,
302
+ cache_namespace=args.api_cache_namespace,
303
+ rate_limit_delay=args.rate_limit_delay,
304
+ )
305
+ ok = parse_bool(judge_resp)
306
+ break
307
+ except RateLimitError as e:
308
+ if attempt >= args.retries:
309
+ raise
310
+ time.sleep(e.wait_seconds)
311
+ except Exception: # noqa: BLE001
312
+ if attempt >= args.retries:
313
+ raise
314
+ time.sleep(args.retry_delay)
315
+ if args.judge_interval > 0:
316
+ time.sleep(args.judge_interval)
317
+
318
+ status = "kept" if ok else "filtered"
319
+ print(f"[{idx}/{total}] judge={status}")
320
+ if not ok:
321
+ continue
322
+
323
+ new_meta = dict(ex.metadata)
324
+ new_meta["reference_answer"] = reference_answer
325
+ new_meta["judge_response"] = judge_resp
326
+
327
+ new_ex = CachedExample(
328
+ prompt=ex.prompt,
329
+ target=target_text,
330
+ indices_to_explain=None,
331
+ attr_mask_indices=ex.attr_mask_indices,
332
+ sink_span=None,
333
+ thinking_span=None,
334
+ metadata=new_meta,
335
+ )
336
+ new_ex = attach_spans_from_answer(new_ex, tokenizer, boxed_answer)
337
+ if not (isinstance(new_ex.sink_span, list) and len(new_ex.sink_span) == 2):
338
+ print(f"[{idx}/{total}] skipped=span")
339
+ continue
340
+
341
+ # Token-level indices_to_explain: boxed-inner answer token span in target (closed interval).
342
+ new_ex = CachedExample(
343
+ prompt=new_ex.prompt,
344
+ target=new_ex.target,
345
+ indices_to_explain=new_ex.sink_span,
346
+ attr_mask_indices=new_ex.attr_mask_indices,
347
+ sink_span=new_ex.sink_span,
348
+ thinking_span=new_ex.thinking_span,
349
+ metadata=new_ex.metadata,
350
+ )
351
+ kept.append(new_ex)
352
+ kept_bar.update(1)
353
+
354
+ kept_bar.close()
355
+
356
+ out_path = Path(args.out) if args.out else Path(args.data_root) / f"{args.dataset}.jsonl"
357
+ written = write_cache(out_path, kept)
358
+ attempted_total = attempted or 0
359
+ print(f"Kept {written} / target {args.max_examples} (attempted {attempted_total} / {total}) -> {out_path}")
360
+
361
+
362
+ if __name__ == "__main__":
363
+ main()
exp/exp3/README.md ADDED
@@ -0,0 +1,50 @@
1
+ # FlashTrace Experiment 3: Long vs. Short CoT Comparison (case study)
2
+
3
+ This directory provides a minimal, reproducible "long vs. short CoT" experiment:
4
+ - From RULER `niah_mq_q2 (1024)`, filter two subsets:
5
+ - short-CoT: short reasoning + a final `\box{}` answer
6
+ - long-CoT: long reasoning + a final `\box{}` answer
7
+ - Run only `attnlrp` (hop0) and compute only token-level `recovery@10%` (gold comes from `needle_spans`).
8
+ - Write traces (npz + manifest) to `exp/exp3/output/`, following the trace layout of `exp/exp2/run_exp.py`.
9
+
10
+ ## 1) Sampling and filtering (generation + judge)
11
+
12
+ Reads by default:
13
+ `data/ruler_multihop/1024/niah_mq_q2/validation.jsonl`
14
+
15
+ Requires an OpenAI-compatible chat API (default `http://localhost:4000/v1`) and an API key.
16
+
17
+ ```bash
18
+ export FLASHTRACE_API_KEY=... # or OPENAI_API_KEY
19
+
20
+ python exp/exp3/sample_and_filter.py \
21
+ --tokenizer_model /opt/share/models/Qwen/Qwen3-8B/ \
22
+ --min_long_thinking_tokens 512 \
23
+ --max_short_thinking_tokens 256
24
+ ```
25
+
26
+ Outputs (defaults):
27
+ - `exp/exp3/data/niah_mq_q2_short_cot.jsonl`
28
+ - `exp/exp3/data/niah_mq_q2_long_cot.jsonl`
29
+
30
+ Notes:
31
+ - One example is sampled per subset by default; use `--max_short` / `--max_long` to set the counts separately (`--max_pairs` is a compatibility alias for both).
32
+
33
+ ## 2) Attribution and recovery (AttnLRP hop0)
34
+
35
+ ```bash
36
+ python exp/exp3/run_exp.py \
37
+ --model qwen-8B \
38
+ --model_path /opt/share/models/Qwen/Qwen3-8B/ \
39
+ --cuda 3,4,5,7
40
+ ```
41
+
42
+ Outputs:
43
+ - recovery CSV: `exp/exp3/output/recovery/<dataset>/<model>/attnlrp_1_examples.csv`
44
+ - traces: `exp/exp3/output/traces/<dataset>/<model>/<run_tag>/ex_*.npz` + `manifest.jsonl`
45
+ - summary JSON: `exp/exp3/output/recovery/summary_<model>.json`
46
+
47
+ Common options (see the combined example below):
48
+ - `--top_fraction`: top fraction used for recovery (default 0.1)
49
+ - `--attnlrp_neg_handling drop|abs`
50
+ - `--attnlrp_norm_mode norm|no_norm`
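+
+ For example, these options can be combined with the run command above (paths as in the earlier snippet; adjust to your local setup):
+
+ ```bash
+ python exp/exp3/run_exp.py \
+ --model qwen-8B \
+ --model_path /opt/share/models/Qwen/Qwen3-8B/ \
+ --top_fraction 0.1 \
+ --attnlrp_neg_handling abs \
+ --attnlrp_norm_mode no_norm
+ ```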
exp/exp3/extract_segment_weights.py ADDED
@@ -0,0 +1,250 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ Extract CoT/output segment attribution weights from exp3 trace artifacts.
4
+
5
+ Background
6
+ ----------
7
+ exp/exp3/run_exp.py saves per-sample trace npz files that contain token-level
8
+ importance vectors over the FULL (prompt + generation) token sequence:
9
+ - v_seq_all: sum over rows of seq attribution matrix (shape [P+G])
10
+ - v_row_all: row attribution vector for indices_to_explain (shape [P+G])
11
+ - v_rec_all: recursive attribution vector for indices_to_explain (shape [P+G])
12
+
13
+ For exp3 cached samples, we also have generation-token spans:
14
+ - thinking_span_gen: CoT span [start,end] in generation-token coordinates
15
+ - sink_span_gen: output span [start,end] in generation-token coordinates
16
+
17
+ This script slices v_*_all into:
18
+ - cot: tokens in thinking_span_gen (offset by prompt_len)
19
+ - output: tokens in sink_span_gen (offset by prompt_len)
20
+
21
+ and reports segment sums/fractions (and optionally writes a JSON summary).
22
+ """
23
+
24
+ from __future__ import annotations
25
+
26
+ import argparse
27
+ import json
28
+ from dataclasses import dataclass
29
+ from pathlib import Path
30
+ from typing import Any, Dict, List, Optional, Tuple
31
+
32
+ import numpy as np
33
+
34
+
35
+ @dataclass(frozen=True)
36
+ class TracePaths:
37
+ dataset: str
38
+ model_tag: str
39
+ run_tag: str
40
+ npz_path: Path
41
+
42
+
43
+ def _pick_latest_subdir(path: Path) -> Optional[Path]:
44
+ if not path.exists():
45
+ return None
46
+ subs = [p for p in path.iterdir() if p.is_dir()]
47
+ if not subs:
48
+ return None
49
+ subs.sort(key=lambda p: p.stat().st_mtime, reverse=True)
50
+ return subs[0]
51
+
52
+
53
+ def _resolve_trace_paths(
54
+ *,
55
+ output_root: Path,
56
+ dataset: str,
57
+ model_tag: Optional[str],
58
+ run_tag: Optional[str],
59
+ example_idx: int,
60
+ ) -> TracePaths:
61
+ base = output_root / "traces" / dataset
62
+ if not base.exists():
63
+ raise FileNotFoundError(f"Trace dataset dir not found: {base}")
64
+
65
+ if model_tag is None:
66
+ model_dirs = [p for p in base.iterdir() if p.is_dir()]
67
+ if not model_dirs:
68
+ raise FileNotFoundError(f"No model subdir under: {base}")
69
+ if len(model_dirs) != 1:
70
+ raise SystemExit(f"Multiple model dirs under {base}; pass --model_tag. Found: {[p.name for p in model_dirs]}")
71
+ model_dir = model_dirs[0]
72
+ model_tag = model_dir.name
73
+ else:
74
+ model_dir = base / model_tag
75
+ if not model_dir.exists():
76
+ raise FileNotFoundError(f"Trace model dir not found: {model_dir}")
77
+
78
+ if run_tag is None:
79
+ run_dir = _pick_latest_subdir(model_dir)
80
+ if run_dir is None:
81
+ raise FileNotFoundError(f"No run subdir under: {model_dir}")
82
+ run_tag = run_dir.name
83
+ else:
84
+ run_dir = model_dir / run_tag
85
+ if not run_dir.exists():
86
+ raise FileNotFoundError(f"Trace run dir not found: {run_dir}")
87
+
88
+ npz_name = f"ex_{int(example_idx):06d}.npz"
89
+ npz_path = run_dir / npz_name
90
+ if not npz_path.exists():
91
+ raise FileNotFoundError(f"Trace npz not found: {npz_path}")
92
+
93
+ return TracePaths(dataset=dataset, model_tag=model_tag, run_tag=run_tag, npz_path=npz_path)
94
+
95
+
96
+ def _as_span(arr: Any) -> Optional[Tuple[int, int]]:
97
+ if arr is None:
98
+ return None
99
+ try:
100
+ a = np.asarray(arr).reshape(-1).tolist()
101
+ except Exception:
102
+ return None
103
+ if len(a) != 2:
104
+ return None
105
+ try:
106
+ start = int(a[0])
107
+ end = int(a[1])
108
+ except Exception:
109
+ return None
110
+ if start < 0 or end < start:
111
+ return None
112
+ return start, end
113
+
114
+
115
+ def _segment_stats(v: np.ndarray, start: int, end: int) -> Dict[str, float]:
116
+ if end < start:
117
+ return {"sum": 0.0, "mean": 0.0, "max": 0.0}
118
+ seg = v[start : end + 1]
119
+ if seg.size == 0:
120
+ return {"sum": 0.0, "mean": 0.0, "max": 0.0}
121
+ return {
122
+ "sum": float(seg.sum()),
123
+ "mean": float(seg.mean()),
124
+ "max": float(seg.max()),
125
+ }
126
+
127
+
128
+ def _slice_segment(v: np.ndarray, start: int, end: int) -> List[float]:
129
+ if end < start:
130
+ return []
131
+ seg = v[start : end + 1]
132
+ return [float(x) for x in seg.tolist()]
133
+
134
+
135
+ def extract_one(npz_path: Path) -> Dict[str, Any]:
136
+ d = np.load(npz_path)
137
+ required = ["prompt_len", "gen_len", "v_seq_all", "v_row_all", "v_rec_all"]
138
+ for k in required:
139
+ if k not in d:
140
+ raise KeyError(f"Missing key in trace npz {npz_path}: {k}")
141
+
142
+ prompt_len = int(np.asarray(d["prompt_len"]).item())
143
+ gen_len = int(np.asarray(d["gen_len"]).item())
144
+ total_len = prompt_len + gen_len
145
+
146
+ v_seq_all = np.asarray(d["v_seq_all"], dtype=np.float64).reshape(-1)
147
+ v_row_all = np.asarray(d["v_row_all"], dtype=np.float64).reshape(-1)
148
+ v_rec_all = np.asarray(d["v_rec_all"], dtype=np.float64).reshape(-1)
149
+ for name, v in [("v_seq_all", v_seq_all), ("v_row_all", v_row_all), ("v_rec_all", v_rec_all)]:
150
+ if int(v.size) != int(total_len):
151
+ raise ValueError(f"{name} length mismatch: expected {total_len}, got {int(v.size)}")
152
+
153
+ sink_span_gen = _as_span(d.get("sink_span_gen"))
154
+ thinking_span_gen = _as_span(d.get("thinking_span_gen"))
155
+ if sink_span_gen is None:
156
+ raise KeyError("Trace missing sink_span_gen; cannot define output span.")
157
+ if thinking_span_gen is None:
158
+ # Best-effort: infer thinking span as [0, sink_start-1].
159
+ sink_start, _ = sink_span_gen
160
+ thinking_span_gen = (0, max(0, sink_start - 1))
161
+
162
+ think_start_g, think_end_g = thinking_span_gen
163
+ sink_start_g, sink_end_g = sink_span_gen
164
+
165
+ cot_start = prompt_len + think_start_g
166
+ cot_end = min(prompt_len + think_end_g, total_len - 1)
167
+ out_start = prompt_len + sink_start_g
168
+ out_end = min(prompt_len + sink_end_g, total_len - 1)
169
+
170
+ def pack(v: np.ndarray) -> Dict[str, Any]:
171
+ total = float(v.sum())
172
+ cot = _segment_stats(v, cot_start, cot_end)
173
+ out = _segment_stats(v, out_start, out_end)
174
+ denom = cot["sum"] + out["sum"]
175
+ return {
176
+ "total_sum": total,
177
+ "cot": {
178
+ "start_abs": int(cot_start),
179
+ "end_abs": int(cot_end),
180
+ "len": int(max(0, cot_end - cot_start + 1)),
181
+ **cot,
182
+ "fraction_of_total": float(cot["sum"] / total) if total > 0 else float("nan"),
183
+ "fraction_of_cot_plus_output": float(cot["sum"] / denom) if denom > 0 else float("nan"),
184
+ },
185
+ "output": {
186
+ "start_abs": int(out_start),
187
+ "end_abs": int(out_end),
188
+ "len": int(max(0, out_end - out_start + 1)),
189
+ **out,
190
+ "fraction_of_total": float(out["sum"] / total) if total > 0 else float("nan"),
191
+ "fraction_of_cot_plus_output": float(out["sum"] / denom) if denom > 0 else float("nan"),
192
+ },
193
+ "cot_weights": _slice_segment(v, cot_start, cot_end),
194
+ "output_weights": _slice_segment(v, out_start, out_end),
195
+ }
196
+
197
+ return {
198
+ "prompt_len": int(prompt_len),
199
+ "gen_len": int(gen_len),
200
+ "total_len": int(total_len),
201
+ "thinking_span_gen": [int(think_start_g), int(think_end_g)],
202
+ "sink_span_gen": [int(sink_start_g), int(sink_end_g)],
203
+ "seq": pack(v_seq_all),
204
+ "row": pack(v_row_all),
205
+ "rec": pack(v_rec_all),
206
+ }
207
+
208
+
209
+ def main() -> None:
210
+ parser = argparse.ArgumentParser("Extract CoT/output weights from exp3 traces.")
211
+ parser.add_argument("--output_root", type=str, default="exp/exp3/output")
212
+ parser.add_argument("--dataset_tag", type=str, default="niah_mq_q2")
213
+ parser.add_argument("--model_tag", type=str, default=None, help="If omitted, auto-detect when unique.")
214
+ parser.add_argument("--run_tag", type=str, default=None, help="If omitted, picks the latest run subdir.")
215
+ parser.add_argument("--example_idx", type=int, default=0)
216
+ parser.add_argument("--out", type=str, default=None, help="Optional JSON output path.")
217
+ args = parser.parse_args()
218
+
219
+ output_root = Path(args.output_root)
220
+ datasets = [f"{args.dataset_tag}_short_cot", f"{args.dataset_tag}_long_cot"]
221
+
222
+ results: List[Dict[str, Any]] = []
223
+ for ds_name in datasets:
224
+ paths = _resolve_trace_paths(
225
+ output_root=output_root,
226
+ dataset=ds_name,
227
+ model_tag=args.model_tag,
228
+ run_tag=args.run_tag,
229
+ example_idx=args.example_idx,
230
+ )
231
+ out = extract_one(paths.npz_path)
232
+ out["dataset"] = paths.dataset
233
+ out["model_tag"] = paths.model_tag
234
+ out["run_tag"] = paths.run_tag
235
+ out["npz_path"] = str(paths.npz_path)
236
+ results.append(out)
237
+
238
+ text = json.dumps(results, ensure_ascii=False, indent=2)
239
+ if args.out:
240
+ out_path = Path(args.out)
241
+ out_path.parent.mkdir(parents=True, exist_ok=True)
242
+ out_path.write_text(text + "\n", encoding="utf-8")
243
+ print(f"Wrote -> {out_path}")
244
+ else:
245
+ print(text)
246
+
247
+
248
+ if __name__ == "__main__":
249
+ main()
250
+
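`extract_segment_weights.py` above emits a JSON list with one entry per dataset; each entry carries the `seq`/`row`/`rec` blocks built by `pack()`, each with `cot` and `output` segment summaries. A minimal reading sketch (illustrative only, not part of the committed files; the `--out` path below is hypothetical):

```python
import json
from pathlib import Path

# Hypothetical path: whatever was passed to --out when running extract_segment_weights.py.
report_path = Path("exp/exp3/output/segment_weights.json")
entries = json.loads(report_path.read_text(encoding="utf-8"))

for entry in entries:
    print(entry["dataset"], entry["model_tag"], entry["run_tag"])
    for key in ("seq", "row", "rec"):
        block = entry[key]
        # fraction_of_cot_plus_output is NaN when both segment sums are zero.
        print(
            f"  {key}: cot share={block['cot']['fraction_of_cot_plus_output']:.3f}, "
            f"output share={block['output']['fraction_of_cot_plus_output']:.3f}"
        )
```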
exp/exp3/part_weights.py ADDED
@@ -0,0 +1,228 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ Compute attribution mass on (input, cot, output) segments from exp3 trace npz files.
4
+
5
+ Definitions (token-level, aligned with exp2/exp3 runners):
6
+ - input : prompt-side tokens (user prompt), indices [0, prompt_len)
7
+ - cot : generation tokens in thinking span, indices [prompt_len + t0, prompt_len + t1]
8
+ - output : generation tokens in sink span (answer), indices [prompt_len + s0, prompt_len + s1]
9
+
10
+ The trace stores token-importance vectors:
11
+ - v_seq_all, v_row_all, v_rec_all (length = prompt_len + gen_len)
12
+
13
+ This script sums those vectors over each segment and reports both absolute sums
14
+ and fractions of the total sum.
15
+ """
16
+
17
+ from __future__ import annotations
18
+
19
+ import argparse
20
+ import json
21
+ from dataclasses import dataclass
22
+ from pathlib import Path
23
+ from typing import Dict, Iterable, List, Optional, Tuple
24
+
25
+ import numpy as np
26
+
27
+
28
+ @dataclass(frozen=True)
29
+ class TraceRun:
30
+ dataset: str
31
+ model: str
32
+ run_dir: Path
33
+
34
+
35
+ def _pick_single_subdir(parent: Path) -> Path:
36
+ subdirs = [p for p in parent.iterdir() if p.is_dir()]
37
+ if not subdirs:
38
+ raise FileNotFoundError(f"No subdirectories found under {parent}")
39
+ if len(subdirs) == 1:
40
+ return subdirs[0]
41
+ subdirs.sort(key=lambda p: p.stat().st_mtime, reverse=True)
42
+ return subdirs[0]
43
+
44
+
45
+ def _resolve_run(
46
+ trace_root: Path,
47
+ *,
48
+ dataset: str,
49
+ model: Optional[str],
50
+ run_tag: Optional[str],
51
+ ) -> TraceRun:
52
+ ds_dir = trace_root / dataset
53
+ if not ds_dir.exists():
54
+ raise FileNotFoundError(f"Dataset trace directory not found: {ds_dir}")
55
+
56
+ if model is None:
57
+ model_dir = _pick_single_subdir(ds_dir)
58
+ else:
59
+ model_dir = ds_dir / model
60
+ if not model_dir.exists():
61
+ raise FileNotFoundError(f"Model trace directory not found: {model_dir}")
62
+
63
+ if run_tag is None:
64
+ run_dir = _pick_single_subdir(model_dir)
65
+ else:
66
+ run_dir = model_dir / run_tag
67
+ if not run_dir.exists():
68
+ raise FileNotFoundError(f"Run directory not found: {run_dir}")
69
+
70
+ return TraceRun(dataset=dataset, model=model_dir.name, run_dir=run_dir)
71
+
72
+
73
+ def _iter_manifest(run_dir: Path) -> Iterable[dict]:
74
+ manifest = run_dir / "manifest.jsonl"
75
+ if not manifest.exists():
76
+ raise FileNotFoundError(f"Missing manifest: {manifest}")
77
+ with manifest.open("r", encoding="utf-8") as f:
78
+ for line in f:
79
+ line = line.strip()
80
+ if line:
81
+ yield json.loads(line)
82
+
83
+
84
+ def _as_span(arr: np.ndarray, *, name: str) -> Tuple[int, int]:
85
+ if arr is None:
86
+ raise ValueError(f"Missing {name} in trace npz.")
87
+ a = np.asarray(arr).reshape(-1)
88
+ if a.size != 2:
89
+ raise ValueError(f"Expected {name} to have 2 ints, got shape {a.shape}.")
90
+ return int(a[0]), int(a[1])
91
+
92
+
93
+ def _segment_sums(
94
+ v: np.ndarray,
95
+ *,
96
+ prompt_len: int,
97
+ gen_len: int,
98
+ thinking_span_gen: Optional[Tuple[int, int]],
99
+ sink_span_gen: Optional[Tuple[int, int]],
100
+ ) -> Dict[str, float]:
101
+ total_len = int(prompt_len) + int(gen_len)
102
+ if int(v.shape[0]) != total_len:
103
+ raise ValueError(f"Vector length mismatch: len(v)={int(v.shape[0])} vs prompt_len+gen_len={total_len}.")
104
+
105
+ v = np.asarray(v, dtype=np.float64).reshape(-1)
106
+ prompt_len = int(prompt_len)
107
+ gen_len = int(gen_len)
108
+
109
+ # Default: no cot/output when spans missing (should not happen in exp3).
110
+ think_start, think_end = (0, -1) if thinking_span_gen is None else thinking_span_gen
111
+ sink_start, sink_end = (0, -1) if sink_span_gen is None else sink_span_gen
112
+
113
+ # Clamp spans into [0, gen_len-1].
114
+ def _clamp_span(a: int, b: int) -> Tuple[int, int]:
115
+ a = max(0, min(int(a), gen_len - 1))
116
+ b = max(0, min(int(b), gen_len - 1))
117
+ if b < a:
118
+ return 0, -1
119
+ return a, b
120
+
121
+ think_start, think_end = _clamp_span(think_start, think_end)
122
+ sink_start, sink_end = _clamp_span(sink_start, sink_end)
123
+
124
+ mask = np.zeros((total_len,), dtype=bool)
125
+ # input = all prompt tokens
126
+ input_slice = slice(0, prompt_len)
127
+ mask[input_slice] = True
128
+
129
+ cot_slice = slice(prompt_len + think_start, prompt_len + think_end + 1) if think_end >= think_start else slice(0, 0)
130
+ output_slice = slice(prompt_len + sink_start, prompt_len + sink_end + 1) if sink_end >= sink_start else slice(0, 0)
131
+ mask[cot_slice] = True
132
+ mask[output_slice] = True
133
+
134
+ input_sum = float(v[input_slice].sum())
135
+ cot_sum = float(v[cot_slice].sum()) if think_end >= think_start else 0.0
136
+ output_sum = float(v[output_slice].sum()) if sink_end >= sink_start else 0.0
137
+ other_sum = float(v[~mask].sum())
138
+ total_sum = float(v.sum())
139
+
140
+ return {
141
+ "total": total_sum,
142
+ "input": input_sum,
143
+ "cot": cot_sum,
144
+ "output": output_sum,
145
+ "other": other_sum,
146
+ }
147
+
148
+
149
+ def _with_fracs(sums: Dict[str, float]) -> Dict[str, float]:
150
+ total = float(sums.get("total") or 0.0)
151
+ if total <= 0.0:
152
+ return {**sums, "input_frac": float("nan"), "cot_frac": float("nan"), "output_frac": float("nan"), "other_frac": float("nan")}
153
+ return {
154
+ **sums,
155
+ "input_frac": float(sums["input"]) / total,
156
+ "cot_frac": float(sums["cot"]) / total,
157
+ "output_frac": float(sums["output"]) / total,
158
+ "other_frac": float(sums["other"]) / total,
159
+ }
160
+
161
+
162
+ def _analyze_npz(npz_path: Path) -> Dict[str, dict]:
163
+ d = np.load(npz_path)
164
+ prompt_len = int(np.asarray(d["prompt_len"]).item())
165
+ gen_len = int(np.asarray(d["gen_len"]).item())
166
+ thinking_span_gen = _as_span(d["thinking_span_gen"], name="thinking_span_gen") if "thinking_span_gen" in d.files else None
167
+ sink_span_gen = _as_span(d["sink_span_gen"], name="sink_span_gen") if "sink_span_gen" in d.files else None
168
+
169
+ out: Dict[str, dict] = {"prompt_len": prompt_len, "gen_len": gen_len}
170
+ for key in ("v_seq_all", "v_row_all", "v_rec_all"):
171
+ if key not in d.files:
172
+ raise ValueError(f"Missing {key} in trace npz: {npz_path}")
173
+ sums = _segment_sums(
174
+ d[key],
175
+ prompt_len=prompt_len,
176
+ gen_len=gen_len,
177
+ thinking_span_gen=thinking_span_gen,
178
+ sink_span_gen=sink_span_gen,
179
+ )
180
+ out[key] = _with_fracs(sums)
181
+ out["thinking_span_gen"] = list(thinking_span_gen) if thinking_span_gen is not None else None
182
+ out["sink_span_gen"] = list(sink_span_gen) if sink_span_gen is not None else None
183
+ return out
184
+
185
+
186
+ def main() -> None:
187
+ parser = argparse.ArgumentParser("Summarize input/cot/output attribution mass from exp3 traces.")
188
+ parser.add_argument("--trace_root", type=str, default="exp/exp3/output/traces")
189
+ parser.add_argument("--dataset_tag", type=str, default="niah_mq_q2", help="Base tag; expands to <tag>_short_cot and <tag>_long_cot.")
190
+ parser.add_argument("--datasets", type=str, default=None, help="Comma-separated dataset names (overrides --dataset_tag expansion).")
191
+ parser.add_argument("--model", type=str, default=None, help="Model directory name under traces (default: auto if single).")
192
+ parser.add_argument("--run_tag", type=str, default=None, help="Run tag directory (default: auto pick newest/single).")
193
+ args = parser.parse_args()
194
+
195
+ trace_root = Path(args.trace_root)
196
+ if not trace_root.exists():
197
+ raise SystemExit(f"trace_root not found: {trace_root}")
198
+
199
+ if args.datasets:
200
+ datasets = [x.strip() for x in str(args.datasets).split(",") if x.strip()]
201
+ else:
202
+ datasets = [f"{args.dataset_tag}_short_cot", f"{args.dataset_tag}_long_cot"]
203
+
204
+ for ds in datasets:
205
+ run = _resolve_run(trace_root, dataset=ds, model=args.model, run_tag=args.run_tag)
206
+ records = list(_iter_manifest(run.run_dir))
207
+ if not records:
208
+ raise SystemExit(f"Empty manifest: {run.run_dir/'manifest.jsonl'}")
209
+ for rec in records:
210
+ npz_path = run.run_dir / str(rec["file"])
211
+ analysis = _analyze_npz(npz_path)
212
+ print(
213
+ json.dumps(
214
+ {
215
+ "dataset": run.dataset,
216
+ "model": run.model,
217
+ "run_dir": str(run.run_dir),
218
+ "example_idx": int(rec.get("example_idx", -1)),
219
+ **analysis,
220
+ },
221
+ ensure_ascii=False,
222
+ )
223
+ )
224
+
225
+
226
+ if __name__ == "__main__":
227
+ main()
228
+
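`part_weights.py` above prints one JSON object per example to stdout. A minimal aggregation sketch (illustrative only, not part of the committed files), assuming the stdout was redirected to a hypothetical `part_weights.jsonl`:

```python
import json
from pathlib import Path

import numpy as np

# Hypothetical file: stdout of part_weights.py redirected via `... > part_weights.jsonl`.
rows = [
    json.loads(line)
    for line in Path("part_weights.jsonl").read_text(encoding="utf-8").splitlines()
    if line.strip()
]

by_dataset = {}
for row in rows:
    by_dataset.setdefault(row["dataset"], []).append(float(row["v_rec_all"]["cot_frac"]))

for dataset, fracs in sorted(by_dataset.items()):
    print(f"{dataset}: mean cot_frac={np.nanmean(fracs):.3f} over {len(fracs)} examples")
```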
exp/exp3/run_exp.py ADDED
@@ -0,0 +1,430 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ Experiment 3 runner: long-vs-short CoT case study (AttnLRP hop0, Recovery@10%).
4
+
5
+ This runner is intentionally minimal:
6
+ - Only reads the two sample caches produced by exp/exp3/sample_and_filter.py:
7
+ <dataset_tag>_short_cot.jsonl
8
+ <dataset_tag>_long_cot.jsonl
9
+ - Only runs attribution method: attnlrp (hop0 path, aligned with exp2).
10
+ - Only computes token-level recovery (Recall@10%) using RULER needle_spans.
11
+ - Always saves per-sample trace artifacts under exp/exp3/output/traces/.
12
+
13
+ All outputs are written under exp/exp3/output/ (configurable via --output_root).
14
+ """
15
+
16
+ from __future__ import annotations
17
+
18
+ import argparse
19
+ import hashlib
20
+ import json
21
+ import os
22
+ import sys
23
+ import time
24
+ from itertools import islice
25
+ from pathlib import Path
26
+ from typing import Any, Dict, List, Optional, Tuple
27
+
28
+
29
+ def _early_set_cuda_visible_devices() -> None:
30
+ parser = argparse.ArgumentParser(add_help=False)
31
+ parser.add_argument("--cuda", type=str, default=None)
32
+ args, _ = parser.parse_known_args(sys.argv[1:])
33
+ if args.cuda and "," in args.cuda:
34
+ os.environ["CUDA_VISIBLE_DEVICES"] = args.cuda
35
+
36
+
37
+ _early_set_cuda_visible_devices()
38
+
39
+ import numpy as np
40
+ import torch
41
+ from transformers import AutoModelForCausalLM, AutoTokenizer, utils
42
+
43
+ REPO_ROOT = Path(__file__).resolve().parents[2]
44
+ if str(REPO_ROOT) not in sys.path:
45
+ sys.path.insert(0, str(REPO_ROOT))
46
+
47
+ import llm_attr
48
+ import llm_attr_eval
49
+ from exp.exp2 import dataset_utils as ds_utils
50
+
51
+ utils.logging.set_verbosity_error()
52
+
53
+
54
+ def _sha1_text(text: str) -> str:
55
+ return hashlib.sha1(text.encode("utf-8")).hexdigest()
56
+
57
+
58
+ def _token_importance_vector(attr: torch.Tensor) -> np.ndarray:
59
+ w = torch.nan_to_num(attr.sum(0).to(dtype=torch.float32), nan=0.0).clamp(min=0.0)
60
+ return w.detach().cpu().numpy().astype(np.float32, copy=False)
61
+
62
+
63
+ def _trace_run_tag(*, neg_handling: str, norm_mode: str, total: int) -> str:
64
+ return f"attnlrp_neg{neg_handling}_norm{norm_mode}_recovery_{int(total)}ex"
65
+
66
+
67
+ def _build_sample_trace_payload(
68
+ example: ds_utils.CachedExample,
69
+ *,
70
+ seq_attr: torch.Tensor,
71
+ row_attr: torch.Tensor,
72
+ rec_attr: torch.Tensor,
73
+ prompt_len: int,
74
+ user_prompt_indices: Optional[List[int]],
75
+ gold_prompt_token_indices: Optional[List[int]],
76
+ recovery_scores: Optional[np.ndarray],
77
+ time_attr_s: Optional[float],
78
+ time_recovery_s: Optional[float],
79
+ ) -> Dict[str, np.ndarray]:
80
+ gen_len = int(seq_attr.shape[0])
81
+
82
+ v_seq_all = _token_importance_vector(seq_attr)
83
+ v_row_all = _token_importance_vector(row_attr)
84
+ v_rec_all = _token_importance_vector(rec_attr)
85
+
86
+ payload: Dict[str, np.ndarray] = {
87
+ "v_seq_all": v_seq_all,
88
+ "v_row_all": v_row_all,
89
+ "v_rec_all": v_rec_all,
90
+ "v_seq_prompt": v_seq_all[:prompt_len],
91
+ "v_row_prompt": v_row_all[:prompt_len],
92
+ "v_rec_prompt": v_rec_all[:prompt_len],
93
+ "prompt_len": np.asarray(int(prompt_len), dtype=np.int64),
94
+ "gen_len": np.asarray(int(gen_len), dtype=np.int64),
95
+ "indices_to_explain_gen": np.asarray(list(example.indices_to_explain or []), dtype=np.int64),
96
+ }
97
+
98
+ if example.sink_span is not None:
99
+ payload["sink_span_gen"] = np.asarray(list(example.sink_span), dtype=np.int64)
100
+ if example.thinking_span is not None:
101
+ payload["thinking_span_gen"] = np.asarray(list(example.thinking_span), dtype=np.int64)
102
+
103
+ if user_prompt_indices is not None:
104
+ payload["user_prompt_indices"] = np.asarray(list(user_prompt_indices), dtype=np.int64)
105
+ if gold_prompt_token_indices is not None:
106
+ payload["gold_prompt_token_indices"] = np.asarray(list(gold_prompt_token_indices), dtype=np.int64)
107
+
108
+ if recovery_scores is not None:
109
+ payload["recovery_scores"] = np.asarray(recovery_scores, dtype=np.float64)
110
+
111
+ if time_attr_s is not None:
112
+ payload["time_attr_s"] = np.asarray(float(time_attr_s), dtype=np.float64)
113
+ if time_recovery_s is not None:
114
+ payload["time_recovery_s"] = np.asarray(float(time_recovery_s), dtype=np.float64)
115
+
116
+ return payload
117
+
118
+
119
+ def _write_sample_trace(
120
+ trace_dir: Path,
121
+ *,
122
+ example_idx: int,
123
+ prompt: str,
124
+ target: str,
125
+ payload: Dict[str, np.ndarray],
126
+ manifest_handle,
127
+ neg_handling: str,
128
+ norm_mode: str,
129
+ recovery_skipped_reason: Optional[str],
130
+ ) -> None:
131
+ trace_dir.mkdir(parents=True, exist_ok=True)
132
+ npz_name = f"ex_{example_idx:06d}.npz"
133
+ npz_path = trace_dir / npz_name
134
+ np.savez_compressed(npz_path, **payload)
135
+
136
+ prompt_len = int(np.asarray(payload.get("prompt_len", 0)).item())
137
+ gen_len = int(np.asarray(payload.get("gen_len", 0)).item())
138
+ record: Dict[str, Any] = {
139
+ "example_idx": int(example_idx),
140
+ "attr_func": "attnlrp",
141
+ "file": npz_name,
142
+ "prompt_sha1": _sha1_text(prompt),
143
+ "target_sha1": _sha1_text(target),
144
+ "prompt_len": prompt_len,
145
+ "gen_len": gen_len,
146
+ "indices_to_explain_gen": payload.get("indices_to_explain_gen").tolist()
147
+ if payload.get("indices_to_explain_gen") is not None
148
+ else None,
149
+ "sink_span_gen": payload.get("sink_span_gen").tolist() if payload.get("sink_span_gen") is not None else None,
150
+ "thinking_span_gen": payload.get("thinking_span_gen").tolist()
151
+ if payload.get("thinking_span_gen") is not None
152
+ else None,
153
+ "gold_prompt_token_indices": payload.get("gold_prompt_token_indices").tolist()
154
+ if payload.get("gold_prompt_token_indices") is not None
155
+ else None,
156
+ "recovery_scores": payload.get("recovery_scores").tolist() if payload.get("recovery_scores") is not None else None,
157
+ "recovery_skipped_reason": recovery_skipped_reason,
158
+ "time_attr_s": float(np.asarray(payload.get("time_attr_s")).item()) if payload.get("time_attr_s") is not None else None,
159
+ "time_recovery_s": float(np.asarray(payload.get("time_recovery_s")).item())
160
+ if payload.get("time_recovery_s") is not None
161
+ else None,
162
+ "attnlrp_neg_handling": str(neg_handling),
163
+ "attnlrp_norm_mode": str(norm_mode),
164
+ }
165
+ manifest_handle.write(json.dumps(record, ensure_ascii=False) + "\n")
166
+ manifest_handle.flush()
167
+
168
+
169
+ def resolve_device(args) -> str:
170
+ if args.cuda is not None and "," in args.cuda:
171
+ os.environ["CUDA_VISIBLE_DEVICES"] = args.cuda
172
+ return "auto"
173
+ if args.cuda is not None and str(args.cuda).strip():
174
+ return f"cuda:{args.cuda}" if torch.cuda.is_available() else "cpu"
175
+ return f"cuda:{args.cuda_num}" if torch.cuda.is_available() else "cpu"
176
+
177
+
178
+ def load_model(model_name: str, device: str):
179
+ model = AutoModelForCausalLM.from_pretrained(
180
+ model_name,
181
+ device_map="auto" if device == "auto" else {"": int(device.split(":")[1])} if device.startswith("cuda:") else None,
182
+ torch_dtype=torch.float16,
183
+ attn_implementation="eager",
184
+ )
185
+ tokenizer = AutoTokenizer.from_pretrained(model_name)
186
+ tokenizer.pad_token = tokenizer.eos_token
187
+ model.eval()
188
+ return model, tokenizer
189
+
190
+
191
+ def _evaluate_one_dataset(
192
+ *,
193
+ dataset_name: str,
194
+ examples: List[ds_utils.CachedExample],
195
+ model,
196
+ tokenizer,
197
+ output_root: Path,
198
+ model_tag: str,
199
+ neg_handling: str,
200
+ norm_mode: str,
201
+ top_fraction: float,
202
+ num_examples: int,
203
+ ) -> Tuple[np.ndarray, np.ndarray, float, int, int]:
204
+ llm_evaluator = llm_attr_eval.LLMAttributionEvaluator(model, tokenizer)
205
+
206
+ results: List[np.ndarray] = []
207
+ durations: List[float] = []
208
+ skipped = 0
209
+
210
+ total = min(len(examples), int(num_examples))
211
+ iterator = islice(examples, total)
212
+
213
+ run_tag = _trace_run_tag(neg_handling=neg_handling, norm_mode=norm_mode, total=total)
214
+ trace_dir = output_root / "traces" / dataset_name / model_tag / run_tag
215
+ trace_dir.mkdir(parents=True, exist_ok=True)
216
+ manifest_handle = open(trace_dir / "manifest.jsonl", "w", encoding="utf-8")
217
+
218
+ try:
219
+ for example_idx, ex in enumerate(iterator):
220
+ time_recovery_s: Optional[float] = None
221
+ recovery_scores: Optional[np.ndarray] = None
222
+
223
+ needle_spans = (ex.metadata or {}).get("needle_spans")
224
+ if not isinstance(needle_spans, list) or not needle_spans:
225
+ raise SystemExit(
226
+ "exp3 recovery requires RULER samples with metadata.needle_spans; "
227
+ f"dataset={dataset_name} has missing/empty needle_spans."
228
+ )
229
+ if ex.target is None:
230
+ raise SystemExit(
231
+ "exp3 recovery requires cached targets (CoT+answer) so row/rec attribution is well-defined. "
232
+ f"dataset={dataset_name} has target=None; run exp/exp3/sample_and_filter.py first."
233
+ )
234
+ if not (isinstance(ex.indices_to_explain, list) and len(ex.indices_to_explain) == 2):
235
+ raise SystemExit(
236
+ "exp3 expects indices_to_explain=[start_tok,end_tok] in generation-token coordinates; "
237
+ f"dataset={dataset_name} has indices_to_explain={ex.indices_to_explain!r}; "
238
+ "run exp/exp3/sample_and_filter.py first."
239
+ )
240
+
241
+ gold_prompt = ds_utils.ruler_gold_prompt_token_indices(ex, tokenizer)
242
+ recovery_skip_reason: Optional[str] = None
243
+
244
+ sample_start = time.perf_counter()
245
+ llm_attributor = llm_attr.LLMLRPAttribution(model, tokenizer)
246
+ attr_result = llm_attributor.calculate_attnlrp_ft_hop0(
247
+ ex.prompt,
248
+ target=ex.target,
249
+ sink_span=tuple(ex.sink_span) if ex.sink_span else None,
250
+ thinking_span=tuple(ex.thinking_span) if ex.thinking_span else None,
251
+ neg_handling=str(neg_handling),
252
+ norm_mode=str(norm_mode),
253
+ )
254
+ seq_attr, row_attr, rec_attr = attr_result.get_all_token_attrs(list(ex.indices_to_explain))
255
+ time_attr_s = time.perf_counter() - sample_start
256
+ durations.append(float(time_attr_s))
257
+
258
+ prompt_len = int(seq_attr.shape[1] - seq_attr.shape[0])
259
+ if prompt_len <= 0:
260
+ recovery_skip_reason = "empty_prompt_len"
261
+ elif not gold_prompt:
262
+ recovery_skip_reason = "empty_gold_prompt"
263
+ else:
264
+ t2 = time.perf_counter()
265
+ recovery_scores = np.asarray(
266
+ [
267
+ llm_evaluator.evaluate_attr_recovery(
268
+ a,
269
+ prompt_len=prompt_len,
270
+ gold_prompt_token_indices=gold_prompt,
271
+ top_fraction=top_fraction,
272
+ )
273
+ for a in (seq_attr, row_attr, rec_attr)
274
+ ],
275
+ dtype=np.float64,
276
+ )
277
+ time_recovery_s = time.perf_counter() - t2
278
+ if np.isnan(recovery_scores).any():
279
+ recovery_scores = None
280
+ recovery_skip_reason = "nan_recovery"
281
+
282
+ if recovery_scores is None and recovery_skip_reason is not None:
283
+ skipped += 1
284
+ elif recovery_scores is not None:
285
+ results.append(recovery_scores)
286
+
287
+ payload = _build_sample_trace_payload(
288
+ ex,
289
+ seq_attr=seq_attr,
290
+ row_attr=row_attr,
291
+ rec_attr=rec_attr,
292
+ prompt_len=prompt_len,
293
+ user_prompt_indices=getattr(llm_attributor, "user_prompt_indices", None),
294
+ gold_prompt_token_indices=gold_prompt,
295
+ recovery_scores=recovery_scores,
296
+ time_attr_s=time_attr_s,
297
+ time_recovery_s=time_recovery_s,
298
+ )
299
+ _write_sample_trace(
300
+ trace_dir,
301
+ example_idx=example_idx,
302
+ prompt=ex.prompt,
303
+ target=str(ex.target),
304
+ payload=payload,
305
+ manifest_handle=manifest_handle,
306
+ neg_handling=str(neg_handling),
307
+ norm_mode=str(norm_mode),
308
+ recovery_skipped_reason=recovery_skip_reason,
309
+ )
310
+ finally:
311
+ try:
312
+ manifest_handle.close()
313
+ except Exception:
314
+ pass
315
+
316
+ scores = np.stack(results, axis=0) if results else np.zeros((0, 3), dtype=np.float64)
317
+ used = int(scores.shape[0])
318
+ mean = scores.mean(0) if used else np.full((3,), np.nan, dtype=np.float64)
319
+ std = scores.std(0) if used else np.full((3,), np.nan, dtype=np.float64)
320
+ avg_time = float(np.mean(durations)) if durations else 0.0
321
+ return mean, std, avg_time, used, int(skipped)
322
+
323
+
324
+ def main() -> None:
325
+ parser = argparse.ArgumentParser("Experiment 3 runner (attnlrp hop0, recovery only).")
326
+ parser.add_argument("--dataset_tag", type=str, default="niah_mq_q2", help="Base tag for exp3 caches.")
327
+ parser.add_argument("--data_root", type=str, default="exp/exp3/data")
328
+ parser.add_argument("--output_root", type=str, default="exp/exp3/output")
329
+ parser.add_argument("--num_examples", type=int, default=1, help="How many examples to evaluate per dataset (default 1).")
330
+ parser.add_argument("--seed", type=int, default=42)
331
+ parser.add_argument("--model", type=str, default=None, help="HF repo id (required unless --model_path set).")
332
+ parser.add_argument("--model_path", type=str, default=None, help="Local path; overrides --model for loading.")
333
+ parser.add_argument("--cuda_num", type=int, default=0)
334
+ parser.add_argument("--cuda", type=str, default=None)
335
+ parser.add_argument("--top_fraction", type=float, default=0.1, help="Top fraction of prompt tokens used for recovery.")
336
+ parser.add_argument(
337
+ "--attnlrp_neg_handling",
338
+ type=str,
339
+ choices=["drop", "abs"],
340
+ default="drop",
341
+ help="AttnLRP hop0: how to handle negative values (drop=clamp>=0, abs=absolute value).",
342
+ )
343
+ parser.add_argument(
344
+ "--attnlrp_norm_mode",
345
+ type=str,
346
+ choices=["norm", "no_norm"],
347
+ default="norm",
348
+ help="AttnLRP hop0: norm enables internal normalization; no_norm disables it.",
349
+ )
350
+ args = parser.parse_args()
351
+
352
+ if args.model_path:
353
+ model_name = args.model_path
354
+ elif args.model:
355
+ model_name = args.model
356
+ else:
357
+ raise SystemExit("Please set --model or --model_path.")
358
+ model_tag = args.model if args.model else Path(args.model_path).name
359
+
360
+ device = resolve_device(args)
361
+ model, tokenizer = load_model(model_name, device)
362
+
363
+ data_root = Path(args.data_root)
364
+ output_root = Path(args.output_root)
365
+ output_root.mkdir(parents=True, exist_ok=True)
366
+
367
+ short_name = f"{args.dataset_tag}_short_cot"
368
+ long_name = f"{args.dataset_tag}_long_cot"
369
+ dataset_names = [short_name, long_name]
370
+
371
+ summary_rows: List[Dict[str, Any]] = []
372
+
373
+ for ds_name in dataset_names:
374
+ cache_path = data_root / f"{ds_name}.jsonl"
375
+ if not cache_path.exists():
376
+ raise SystemExit(f"Missing exp3 cache: {cache_path}. Run exp/exp3/sample_and_filter.py first.")
377
+ examples = ds_utils.load_cached(cache_path, sample=None, seed=args.seed)
378
+
379
+ mean, std, avg_time, used, skipped = _evaluate_one_dataset(
380
+ dataset_name=ds_name,
381
+ examples=examples,
382
+ model=model,
383
+ tokenizer=tokenizer,
384
+ output_root=output_root,
385
+ model_tag=model_tag,
386
+ neg_handling=args.attnlrp_neg_handling,
387
+ norm_mode=args.attnlrp_norm_mode,
388
+ top_fraction=float(args.top_fraction),
389
+ num_examples=int(args.num_examples),
390
+ )
391
+
392
+ out_dir = output_root / "recovery" / ds_name / model_tag
393
+ out_dir.mkdir(parents=True, exist_ok=True)
394
+ filename = f"attnlrp_{int(args.num_examples)}_examples.csv"
395
+ with (out_dir / filename).open("w", encoding="utf-8") as f:
396
+ f.write("Method,Recovery@10%\n")
397
+ f.write(f"Seq Attr Recovery Mean,{mean[0]}\n")
398
+ f.write(f"Row Attr Recovery Mean,{mean[1]}\n")
399
+ f.write(f"Recursive Attr Recovery Mean,{mean[2]}\n")
400
+ f.write(f"Seq Attr Recovery Std,{std[0]}\n")
401
+ f.write(f"Row Attr Recovery Std,{std[1]}\n")
402
+ f.write(f"Recursive Attr Recovery Std,{std[2]}\n")
403
+ f.write(f"Examples Used,{used}\n")
404
+ f.write(f"Examples Skipped,{skipped}\n")
405
+ f.write(f"Avg Sample Time (s),{avg_time}\n")
406
+
407
+ print(f"[{ds_name}] attnlrp -> {out_dir/filename} (used={used} skipped={skipped} avg {avg_time:.2f}s)")
408
+ summary_rows.append(
409
+ {
410
+ "dataset": ds_name,
411
+ "model": model_tag,
412
+ "neg_handling": args.attnlrp_neg_handling,
413
+ "norm_mode": args.attnlrp_norm_mode,
414
+ "seq_recovery@10%": float(mean[0]) if used else float("nan"),
415
+ "row_recovery@10%": float(mean[1]) if used else float("nan"),
416
+ "rec_recovery@10%": float(mean[2]) if used else float("nan"),
417
+ "used": int(used),
418
+ "skipped": int(skipped),
419
+ }
420
+ )
421
+
422
+ # Lightweight combined summary for quick comparison.
423
+ summary_path = output_root / "recovery" / f"summary_{model_tag}.json"
424
+ summary_path.parent.mkdir(parents=True, exist_ok=True)
425
+ summary_path.write_text(json.dumps(summary_rows, ensure_ascii=False, indent=2) + "\n", encoding="utf-8")
426
+ print(f"Wrote summary -> {summary_path}")
427
+
428
+
429
+ if __name__ == "__main__":
430
+ main()
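Each example evaluated by the runner above leaves an `ex_XXXXXX.npz` trace plus a `manifest.jsonl` record under `exp/exp3/output/traces/<dataset>/<model_tag>/<run_tag>/`. A minimal inspection sketch (illustrative only, not part of the committed files; the concrete path is a hypothetical example):

```python
from pathlib import Path

import numpy as np

# Hypothetical trace path; substitute your dataset, model tag, and run tag.
npz_path = Path(
    "exp/exp3/output/traces/niah_mq_q2_long_cot/qwen-8B/"
    "attnlrp_negdrop_normnorm_recovery_1ex/ex_000000.npz"
)
trace = np.load(npz_path)

prompt_len = int(trace["prompt_len"])
gen_len = int(trace["gen_len"])
print(f"prompt_len={prompt_len} gen_len={gen_len} keys={sorted(trace.files)}")

# Each importance vector covers prompt + generation tokens.
for key in ("v_seq_all", "v_row_all", "v_rec_all"):
    v = np.asarray(trace[key], dtype=np.float64)
    assert v.shape[0] == prompt_len + gen_len
    prompt_share = float(v[:prompt_len].sum() / max(float(v.sum()), 1e-12))
    print(f"{key}: prompt mass share = {prompt_share:.3f}")
```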
exp/exp3/sample_and_filter.py ADDED
@@ -0,0 +1,628 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ Experiment 3 sampler: long-vs-short CoT case study (RULER niah_mq_q2, 1024).
4
+
5
+ This script scans the raw RULER JSONL and independently collects short-CoT and
+ long-CoT generations (not necessarily for the same prompt). Each kept generation must:
+   - follow the strict format: "<thinking text> + final \\box{...} answer" with
+     nothing after the box
+   - pass a judge model verifying the boxed answer matches the reference answer
+   - satisfy the length constraints (short <= max_short_thinking_tokens,
+     long >= min_long_thinking_tokens)
12
+
13
+ It writes two exp2-compatible cache JSONLs to exp/exp3/data/:
14
+ - <dataset_tag>_short_cot.jsonl
15
+ - <dataset_tag>_long_cot.jsonl
16
+
17
+ Each JSONL line matches exp/exp2/dataset_utils.CachedExample schema and keeps
18
+ RULER metadata. The output caches are intended to be consumed by exp/exp3/run_exp.py.
19
+ """
20
+
21
+ from __future__ import annotations
22
+
23
+ import argparse
24
+ import hashlib
25
+ import json
26
+ import os
27
+ import sys
28
+ import time
29
+ import urllib.error
30
+ import urllib.request
31
+ from dataclasses import dataclass
32
+ from pathlib import Path
33
+ from typing import Any, Dict, Iterable, List, Optional
34
+
35
+ from tqdm import tqdm
36
+ from transformers import AutoTokenizer
37
+
38
+ REPO_ROOT = Path(__file__).resolve().parents[2]
39
+ if str(REPO_ROOT) not in sys.path:
40
+ sys.path.insert(0, str(REPO_ROOT))
41
+
42
+ from exp.exp2 import dataset_utils as ds_utils
43
+ from exp.exp2.dataset_utils import CachedExample, attach_spans_from_answer, split_boxed_generation
44
+
45
+
46
+ class RateLimitError(RuntimeError):
47
+ """Raised when API returns 429; carries a suggested wait time."""
48
+
49
+ def __init__(self, wait_seconds: float, detail: str) -> None:
50
+ super().__init__(detail)
51
+ self.wait_seconds = wait_seconds
52
+
53
+
54
+ SHORT_COT_SYSTEM_PROMPT = (
55
+ "You are a reasoning assistant. "
56
+ "Before answering, engage in a brief chain of thought. "
57
+ "Process this freely and naturally without using specific headers or strict formatting. "
58
+ "When you reach the conclusion, wrap the entire final sentence containing the answer inside \\box{}. "
59
+ "Ensure the box wraps the **sentence** that naturally delivers the answer. "
60
+ "Do not add anything after the box."
61
+ )
62
+
63
+ LONG_COT_SYSTEM_PROMPT = (
64
+ "You are a careful reasoning assistant. "
65
+ "Before answering, engage in an extremely detailed and exhaustive chain of thought. "
66
+ "Do not skip any logical steps, even if they seem obvious. "
67
+ "Process this freely and naturally without using specific headers or strict formatting. "
68
+ "When you reach the conclusion, wrap the entire final sentence containing the answer inside \\box{}. "
69
+ "Ensure the box wraps the **sentence** that naturally delivers the answer. "
70
+ "Do not add anything after the box."
71
+ )
72
+
73
+ JUDGE_SYSTEM_PROMPT = (
74
+ "You verify whether the model's boxed answer matches the reference answer. "
75
+ "Reply strictly with True or False and nothing else."
76
+ )
77
+
78
+
79
+ def _sha1_text(text: str) -> str:
80
+ return hashlib.sha1(text.encode("utf-8")).hexdigest()
81
+
82
+
83
+ def call_chat_api(
84
+ api_base: str,
85
+ api_key: str,
86
+ model: str,
87
+ messages: List[Dict[str, str]],
88
+ *,
89
+ timeout: int,
90
+ max_tokens: int,
91
+ temperature: float,
92
+ cache_ttl: int,
93
+ cache_namespace: Optional[str],
94
+ rate_limit_delay: Optional[float] = None,
95
+ ) -> str:
96
+ url = api_base.rstrip("/") + "/chat/completions"
97
+ payload: Dict[str, Any] = {
98
+ "model": model,
99
+ "messages": messages,
100
+ "max_tokens": max_tokens,
101
+ "temperature": temperature,
102
+ }
103
+ if cache_ttl > 0:
104
+ cache_obj: Dict[str, Any] = {"ttl": cache_ttl}
105
+ if cache_namespace:
106
+ cache_obj["namespace"] = cache_namespace
107
+ payload["cache"] = cache_obj
108
+
109
+ data = json.dumps(payload).encode("utf-8")
110
+ headers = {"Content-Type": "application/json"}
111
+ if api_key:
112
+ headers["Authorization"] = f"Bearer {api_key}"
113
+
114
+ req = urllib.request.Request(url, data=data, headers=headers, method="POST")
115
+ opener = urllib.request.build_opener(urllib.request.ProxyHandler({}))
116
+ try:
117
+ with opener.open(req, timeout=timeout) as resp:
118
+ resp_bytes = resp.read()
119
+ except urllib.error.HTTPError as e:
120
+ detail = e.read().decode("utf-8", errors="ignore") if hasattr(e, "read") else ""
121
+ if e.code == 429:
122
+ retry_after = None
123
+ if hasattr(e, "headers") and e.headers:
124
+ retry_after_header = e.headers.get("Retry-After")
125
+ if retry_after_header:
126
+ try:
127
+ retry_after = float(retry_after_header)
128
+ except ValueError:
129
+ retry_after = None
130
+ wait = retry_after or rate_limit_delay or 5.0
131
+ raise RateLimitError(wait, f"API HTTP 429: {detail}") from e
132
+ raise RuntimeError(f"API HTTP error {e.code}: {detail}") from e
133
+ except urllib.error.URLError as e:
134
+ raise RuntimeError(f"API request failed: {e}") from e
135
+
136
+ try:
137
+ response = json.loads(resp_bytes.decode("utf-8"))
138
+ except json.JSONDecodeError as e:
139
+ raise RuntimeError(f"Failed to decode API response: {resp_bytes!r}") from e
140
+
141
+ choices = response.get("choices", [])
142
+ if not choices:
143
+ raise RuntimeError(f"Empty choices from API: {response}")
144
+ content = choices[0].get("message", {}).get("content", "")
145
+ if not content:
146
+ raise RuntimeError(f"Empty content from API: {response}")
147
+ return content.strip()
148
+
149
+
150
+ def _call_with_retries(
151
+ *,
152
+ api_base: str,
153
+ api_key: str,
154
+ model: str,
155
+ messages: List[Dict[str, str]],
156
+ timeout: int,
157
+ max_tokens: int,
158
+ temperature: float,
159
+ cache_ttl: int,
160
+ cache_namespace: Optional[str],
161
+ rate_limit_delay: float,
162
+ retries: int,
163
+ retry_delay: float,
164
+ ) -> str:
165
+ for attempt in range(retries + 1):
166
+ try:
167
+ return call_chat_api(
168
+ api_base,
169
+ api_key,
170
+ model,
171
+ messages,
172
+ timeout=timeout,
173
+ max_tokens=max_tokens,
174
+ temperature=temperature,
175
+ cache_ttl=cache_ttl,
176
+ cache_namespace=cache_namespace,
177
+ rate_limit_delay=rate_limit_delay,
178
+ )
179
+ except RateLimitError as e:
180
+ if attempt >= retries:
181
+ raise
182
+ time.sleep(e.wait_seconds)
183
+ except Exception: # noqa: BLE001
184
+ if attempt >= retries:
185
+ raise
186
+ time.sleep(retry_delay)
187
+ raise RuntimeError("Unreachable")
188
+
189
+
190
+ def build_gen_messages(prompt: str, system_prompt: str) -> List[Dict[str, str]]:
191
+ return [
192
+ {"role": "system", "content": system_prompt},
193
+ {"role": "user", "content": prompt},
194
+ ]
195
+
196
+
197
+ def build_judge_messages(reference_answer: str, candidate_answer: str) -> List[Dict[str, str]]:
198
+ user = (
199
+ "Decide if the model's boxed answer matches the reference answer.\n"
200
+ f"Reference answer: {reference_answer}\n"
201
+ f"Model boxed answer (only the content inside \\box{{}}): {candidate_answer}\n"
202
+ "Output only True if they are semantically consistent; otherwise output False."
203
+ )
204
+ return [
205
+ {"role": "system", "content": JUDGE_SYSTEM_PROMPT},
206
+ {"role": "user", "content": user},
207
+ ]
208
+
209
+
210
+ def parse_bool(text: str) -> bool:
211
+ first = text.strip().splitlines()[0].strip().lower()
212
+ if first in {"true", "yes"}:
213
+ return True
214
+ if first in {"false", "no"}:
215
+ return False
216
+ if "true" in first and "false" not in first:
217
+ return True
218
+ if "false" in first:
219
+ return False
220
+ raise ValueError(f"Cannot parse boolean from: {text!r}")
221
+
222
+
223
+ def write_cache(out_path: Path, examples: Iterable[CachedExample]) -> int:
224
+ out_path.parent.mkdir(parents=True, exist_ok=True)
225
+ count = 0
226
+ with out_path.open("w", encoding="utf-8") as f:
227
+ for ex in examples:
228
+ obj: Dict[str, Any] = {
229
+ "prompt": ex.prompt,
230
+ "target": ex.target,
231
+ "indices_to_explain": ex.indices_to_explain,
232
+ "attr_mask_indices": ex.attr_mask_indices,
233
+ "sink_span": ex.sink_span,
234
+ "thinking_span": ex.thinking_span,
235
+ "metadata": ex.metadata,
236
+ }
237
+ f.write(json.dumps(obj, ensure_ascii=False) + "\n")
238
+ count += 1
239
+ return count
240
+
241
+
242
+ @dataclass(frozen=True)
243
+ class AcceptedGeneration:
244
+ thinking_text: str
245
+ boxed_answer: str
246
+ target_text: str
247
+ thinking_tokens: int
248
+ generation_text: str
249
+ judge_response: str
250
+
251
+
252
+ def _infer_reference_answer(example: CachedExample) -> str:
253
+ meta = example.metadata or {}
254
+ ref = str(meta.get("reference_answer") or "").strip()
255
+ if ref:
256
+ return ref
257
+ outputs = meta.get("outputs") or []
258
+ if isinstance(outputs, list) and outputs:
259
+ return ", ".join(str(x) for x in outputs)
260
+ tgt = str(example.target or "").strip()
261
+ return tgt
262
+
263
+
264
+ def _infer_dataset_tag(dataset_path: Path) -> str:
265
+ if dataset_path.name.endswith(".jsonl") and dataset_path.name != "validation.jsonl":
266
+ return dataset_path.stem
267
+ if dataset_path.name == "validation.jsonl":
268
+ return dataset_path.parent.name
269
+ return dataset_path.stem
270
+
271
+
272
+ def _count_tokens(tokenizer, text: str) -> int:
273
+ return int(len(tokenizer(text, add_special_tokens=False).input_ids))
274
+
275
+
276
+ def _generate_one_style(
277
+ *,
278
+ prompt: str,
279
+ reference_answer: str,
280
+ tokenizer,
281
+ style: str,
282
+ system_prompt: str,
283
+ api_base: str,
284
+ api_key: str,
285
+ generator_model: str,
286
+ judge_model: str,
287
+ timeout: int,
288
+ max_tokens: int,
289
+ temperature: float,
290
+ cache_ttl: int,
291
+ cache_namespace: Optional[str],
292
+ rate_limit_delay: float,
293
+ retries: int,
294
+ retry_delay: float,
295
+ request_interval: float,
296
+ judge_interval: float,
297
+ min_long_thinking_tokens: int,
298
+ max_short_thinking_tokens: int,
299
+ ) -> Optional[AcceptedGeneration]:
300
+ gen_messages = build_gen_messages(prompt, system_prompt)
301
+ generation = _call_with_retries(
302
+ api_base=api_base,
303
+ api_key=api_key,
304
+ model=generator_model,
305
+ messages=gen_messages,
306
+ timeout=timeout,
307
+ max_tokens=max_tokens,
308
+ temperature=temperature,
309
+ cache_ttl=cache_ttl,
310
+ cache_namespace=cache_namespace,
311
+ rate_limit_delay=rate_limit_delay,
312
+ retries=retries,
313
+ retry_delay=retry_delay,
314
+ )
315
+ if request_interval > 0:
316
+ time.sleep(request_interval)
317
+
318
+ parsed = split_boxed_generation(generation)
319
+ if not parsed:
320
+ return None
321
+ thinking_text, _boxed_segment, boxed_answer = parsed
322
+ thinking_tokens = _count_tokens(tokenizer, thinking_text)
323
+
324
+ if style == "short":
325
+ if max_short_thinking_tokens > 0 and thinking_tokens > max_short_thinking_tokens:
326
+ return None
327
+ elif style == "long":
328
+ if min_long_thinking_tokens > 0 and thinking_tokens < min_long_thinking_tokens:
329
+ return None
330
+ else:
331
+ raise ValueError(f"Unsupported style: {style}")
332
+
333
+ judge_messages = build_judge_messages(reference_answer, boxed_answer)
334
+ judge_resp = _call_with_retries(
335
+ api_base=api_base,
336
+ api_key=api_key,
337
+ model=judge_model,
338
+ messages=judge_messages,
339
+ timeout=timeout,
340
+ max_tokens=64,
341
+ temperature=0.0,
342
+ cache_ttl=cache_ttl,
343
+ cache_namespace=cache_namespace,
344
+ rate_limit_delay=rate_limit_delay,
345
+ retries=retries,
346
+ retry_delay=retry_delay,
347
+ )
348
+ if judge_interval > 0:
349
+ time.sleep(judge_interval)
350
+ ok = parse_bool(judge_resp)
351
+ if not ok:
352
+ return None
353
+
354
+ target_text = f"{thinking_text}\n{boxed_answer}" if thinking_text else boxed_answer
355
+ return AcceptedGeneration(
356
+ thinking_text=thinking_text,
357
+ boxed_answer=boxed_answer,
358
+ target_text=target_text,
359
+ thinking_tokens=thinking_tokens,
360
+ generation_text=generation,
361
+ judge_response=judge_resp,
362
+ )
363
+
364
+
365
+ def main() -> None:
366
+ parser = argparse.ArgumentParser("Sample short-CoT and long-CoT cases for exp3 (independently).")
367
+ parser.add_argument(
368
+ "--dataset_path",
369
+ type=str,
370
+ default="data/ruler_multihop/1024/niah_mq_q2/validation.jsonl",
371
+ help="Raw RULER JSONL path (default: niah_mq_q2 1024 validation).",
372
+ )
373
+ parser.add_argument("--dataset_tag", type=str, default=None, help="Output tag; default inferred from dataset_path.")
374
+ parser.add_argument(
375
+ "--max_pairs",
376
+ type=int,
377
+ default=1,
378
+ help="Deprecated alias for --max_short and --max_long (kept for convenience).",
379
+ )
380
+ parser.add_argument("--max_short", type=int, default=None, help="How many short-CoT samples to keep (default: --max_pairs).")
381
+ parser.add_argument("--max_long", type=int, default=None, help="How many long-CoT samples to keep (default: --max_pairs).")
382
+ parser.add_argument("--max_raw_examples", type=int, default=None, help="Optional cap on raw examples to try.")
383
+ parser.add_argument("--seed", type=int, default=42)
384
+ parser.add_argument("--api_base", type=str, default="http://localhost:4000/v1", help="Chat API base URL.")
385
+ parser.add_argument("--api_key", type=str, default=None, help="API key; defaults to FLASHTRACE_API_KEY/OPENAI_API_KEY.")
386
+ parser.add_argument("--generator_model", type=str, default="qwen3-235b-a22b-2507")
387
+ parser.add_argument("--judge_model", type=str, default="deepseek-v3-1-terminus")
388
+ parser.add_argument("--api_timeout", type=int, default=300)
389
+ parser.add_argument("--api_temperature", type=float, default=0.0)
390
+ parser.add_argument("--api_cache_ttl", type=int, default=600)
391
+ parser.add_argument("--api_cache_namespace", type=str, default="flashtrace-exp3")
392
+ parser.add_argument("--retry_delay", type=float, default=2.0)
393
+ parser.add_argument("--retries", type=int, default=2, help="Additional retries on API failure.")
394
+ parser.add_argument("--request_interval", type=float, default=1.0, help="Sleep seconds between generation calls.")
395
+ parser.add_argument("--judge_interval", type=float, default=1.0, help="Sleep seconds between judge calls.")
396
+ parser.add_argument("--rate_limit_delay", type=float, default=5.0, help="Seconds to wait on HTTP 429 before retrying.")
397
+ parser.add_argument(
398
+ "--api_max_tokens_short",
399
+ type=int,
400
+ default=2048,
401
+ help="Max tokens for the short-CoT generation call.",
402
+ )
403
+ parser.add_argument(
404
+ "--api_max_tokens_long",
405
+ type=int,
406
+ default=8192,
407
+ help="Max tokens for the long-CoT generation call.",
408
+ )
409
+ parser.add_argument(
410
+ "--min_long_thinking_tokens",
411
+ type=int,
412
+ default=512,
413
+ help="Minimum tokenizer tokens required in the long-CoT thinking segment.",
414
+ )
415
+ parser.add_argument(
416
+ "--max_short_thinking_tokens",
417
+ type=int,
418
+ default=256,
419
+ help="Maximum tokenizer tokens allowed in the short-CoT thinking segment.",
420
+ )
421
+ parser.add_argument(
422
+ "--tokenizer_model",
423
+ type=str,
424
+ default=None,
425
+ help="Tokenizer path for span extraction & length constraints (default: generator_model).",
426
+ )
427
+ parser.add_argument("--data_root", type=str, default="exp/exp3/data", help="Output directory for exp3 caches.")
428
+ parser.add_argument("--out_short", type=str, default=None, help="Optional explicit output path (short JSONL).")
429
+ parser.add_argument("--out_long", type=str, default=None, help="Optional explicit output path (long JSONL).")
430
+ args = parser.parse_args()
431
+
432
+ api_key = args.api_key or os.environ.get("FLASHTRACE_API_KEY") or os.environ.get("OPENAI_API_KEY")
433
+ if not api_key:
434
+ raise SystemExit("Set --api_key or FLASHTRACE_API_KEY/OPENAI_API_KEY for API access.")
435
+
436
+ dataset_path = Path(args.dataset_path)
437
+ if not dataset_path.exists():
438
+ raise SystemExit(f"Dataset file not found: {dataset_path}")
439
+ dataset_tag = str(args.dataset_tag or _infer_dataset_tag(dataset_path))
440
+
441
+ tok_name = args.tokenizer_model or args.generator_model
442
+ tok_path = Path(tok_name)
443
+ if tok_path.exists():
444
+ tokenizer = AutoTokenizer.from_pretrained(tok_path.as_posix(), local_files_only=True)
445
+ else:
446
+ tokenizer = AutoTokenizer.from_pretrained(tok_name)
447
+ tokenizer.pad_token = tokenizer.eos_token
448
+
449
+ raw_examples = ds_utils.load_ruler(dataset_path, sample=None, seed=args.seed)
450
+ if not raw_examples:
451
+ raise SystemExit("No examples loaded from the RULER JSONL.")
452
+
453
+ max_short = int(args.max_short) if args.max_short is not None else int(args.max_pairs)
454
+ max_long = int(args.max_long) if args.max_long is not None else int(args.max_pairs)
455
+ if max_short < 0 or max_long < 0:
456
+ raise SystemExit("--max_short/--max_long must be >= 0.")
457
+
458
+ kept_short: List[CachedExample] = []
459
+ kept_long: List[CachedExample] = []
460
+
461
+ total = len(raw_examples)
462
+ attempted = 0
463
+
464
+ for idx, ex in enumerate(tqdm(raw_examples, total=total, desc="Scanning raw RULER"), 1):
465
+ attempted = idx
466
+ if args.max_raw_examples is not None and idx > int(args.max_raw_examples):
467
+ break
468
+ if len(kept_short) >= max_short and len(kept_long) >= max_long:
469
+ break
470
+
471
+ reference_answer = _infer_reference_answer(ex)
472
+ prompt = ex.prompt
473
+
474
+ sample_id = _sha1_text(prompt)
475
+ base_meta = dict(ex.metadata or {})
476
+ base_meta["reference_answer"] = reference_answer
477
+ base_meta["sample_id"] = sample_id
478
+ base_meta["pair_id"] = sample_id # backward-compatible name (may not be paired)
479
+ base_meta["source_dataset_path"] = str(dataset_path)
480
+ base_meta["prompt_sha1"] = sample_id
481
+
482
+ if len(kept_short) < max_short:
483
+ short_gen = _generate_one_style(
484
+ prompt=prompt,
485
+ reference_answer=reference_answer,
486
+ tokenizer=tokenizer,
487
+ style="short",
488
+ system_prompt=SHORT_COT_SYSTEM_PROMPT,
489
+ api_base=args.api_base,
490
+ api_key=api_key,
491
+ generator_model=args.generator_model,
492
+ judge_model=args.judge_model,
493
+ timeout=args.api_timeout,
494
+ max_tokens=args.api_max_tokens_short,
495
+ temperature=args.api_temperature,
496
+ cache_ttl=args.api_cache_ttl,
497
+ cache_namespace=args.api_cache_namespace,
498
+ rate_limit_delay=args.rate_limit_delay,
499
+ retries=args.retries,
500
+ retry_delay=args.retry_delay,
501
+ request_interval=args.request_interval,
502
+ judge_interval=args.judge_interval,
503
+ min_long_thinking_tokens=args.min_long_thinking_tokens,
504
+ max_short_thinking_tokens=args.max_short_thinking_tokens,
505
+ )
506
+ if short_gen is not None:
507
+ short_meta = dict(base_meta)
508
+ short_meta.update(
509
+ {
510
+ "cot_style": "short",
511
+ "generator_model": args.generator_model,
512
+ "judge_model": args.judge_model,
513
+ "judge_response": short_gen.judge_response,
514
+ "boxed_answer": short_gen.boxed_answer,
515
+ "thinking_tokens": int(short_gen.thinking_tokens),
516
+ }
517
+ )
518
+ short_ex = CachedExample(
519
+ prompt=prompt,
520
+ target=short_gen.target_text,
521
+ indices_to_explain=None,
522
+ attr_mask_indices=ex.attr_mask_indices,
523
+ sink_span=None,
524
+ thinking_span=None,
525
+ metadata=short_meta,
526
+ )
527
+ short_ex = attach_spans_from_answer(short_ex, tokenizer, short_gen.boxed_answer)
528
+ if isinstance(short_ex.sink_span, list) and len(short_ex.sink_span) == 2:
529
+ short_ex = CachedExample(
530
+ prompt=short_ex.prompt,
531
+ target=short_ex.target,
532
+ indices_to_explain=short_ex.sink_span,
533
+ attr_mask_indices=short_ex.attr_mask_indices,
534
+ sink_span=short_ex.sink_span,
535
+ thinking_span=short_ex.thinking_span,
536
+ metadata=short_ex.metadata,
537
+ )
538
+ kept_short.append(short_ex)
539
+ print(
540
+ f"[kept short] raw_idx={idx}/{total} thinking_tokens={short_gen.thinking_tokens} "
541
+ f"sample_id={sample_id[:8]} kept={len(kept_short)}/{max_short}"
542
+ )
543
+
544
+ if len(kept_long) < max_long:
545
+ long_gen = _generate_one_style(
546
+ prompt=prompt,
547
+ reference_answer=reference_answer,
548
+ tokenizer=tokenizer,
549
+ style="long",
550
+ system_prompt=LONG_COT_SYSTEM_PROMPT,
551
+ api_base=args.api_base,
552
+ api_key=api_key,
553
+ generator_model=args.generator_model,
554
+ judge_model=args.judge_model,
555
+ timeout=args.api_timeout,
556
+ max_tokens=args.api_max_tokens_long,
557
+ temperature=args.api_temperature,
558
+ cache_ttl=args.api_cache_ttl,
559
+ cache_namespace=args.api_cache_namespace,
560
+ rate_limit_delay=args.rate_limit_delay,
561
+ retries=args.retries,
562
+ retry_delay=args.retry_delay,
563
+ request_interval=args.request_interval,
564
+ judge_interval=args.judge_interval,
565
+ min_long_thinking_tokens=args.min_long_thinking_tokens,
566
+ max_short_thinking_tokens=args.max_short_thinking_tokens,
567
+ )
568
+ if long_gen is not None:
569
+ long_meta = dict(base_meta)
570
+ long_meta.update(
571
+ {
572
+ "cot_style": "long",
573
+ "generator_model": args.generator_model,
574
+ "judge_model": args.judge_model,
575
+ "judge_response": long_gen.judge_response,
576
+ "boxed_answer": long_gen.boxed_answer,
577
+ "thinking_tokens": int(long_gen.thinking_tokens),
578
+ }
579
+ )
580
+ long_ex = CachedExample(
581
+ prompt=prompt,
582
+ target=long_gen.target_text,
583
+ indices_to_explain=None,
584
+ attr_mask_indices=ex.attr_mask_indices,
585
+ sink_span=None,
586
+ thinking_span=None,
587
+ metadata=long_meta,
588
+ )
589
+ long_ex = attach_spans_from_answer(long_ex, tokenizer, long_gen.boxed_answer)
590
+ if isinstance(long_ex.sink_span, list) and len(long_ex.sink_span) == 2:
591
+ long_ex = CachedExample(
592
+ prompt=long_ex.prompt,
593
+ target=long_ex.target,
594
+ indices_to_explain=long_ex.sink_span,
595
+ attr_mask_indices=long_ex.attr_mask_indices,
596
+ sink_span=long_ex.sink_span,
597
+ thinking_span=long_ex.thinking_span,
598
+ metadata=long_ex.metadata,
599
+ )
600
+ kept_long.append(long_ex)
601
+ print(
602
+ f"[kept long] raw_idx={idx}/{total} thinking_tokens={long_gen.thinking_tokens} "
603
+ f"sample_id={sample_id[:8]} kept={len(kept_long)}/{max_long}"
604
+ )
605
+
606
+ data_root = Path(args.data_root)
607
+ out_short = Path(args.out_short) if args.out_short else data_root / f"{dataset_tag}_short_cot.jsonl"
608
+ out_long = Path(args.out_long) if args.out_long else data_root / f"{dataset_tag}_long_cot.jsonl"
609
+
610
+ n_short = write_cache(out_short, kept_short)
611
+ n_long = write_cache(out_long, kept_long)
612
+ print(
613
+ f"Wrote short={n_short} -> {out_short}\n"
614
+ f"Wrote long ={n_long} -> {out_long}\n"
615
+ f"Attempted {attempted} / {total}"
616
+ )
617
+
618
+ missing: List[str] = []
619
+ if len(kept_short) < max_short:
620
+ missing.append(f"short({len(kept_short)}/{max_short})")
621
+ if len(kept_long) < max_long:
622
+ missing.append(f"long({len(kept_long)}/{max_long})")
623
+ if missing:
624
+ raise SystemExit(f"Could not find enough samples: {', '.join(missing)} (attempted {attempted} / {total}).")
625
+
626
+
627
+ if __name__ == "__main__":
628
+ main()
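The caches written by `write_cache` above are plain JSONL, one `CachedExample` per line. A minimal sanity-check sketch (illustrative only, not part of the committed files), assuming the default `--data_root` and `--dataset_tag`:

```python
import json
from pathlib import Path

# Default output locations from sample_and_filter.py (niah_mq_q2 tag).
for name in ("niah_mq_q2_short_cot.jsonl", "niah_mq_q2_long_cot.jsonl"):
    path = Path("exp/exp3/data") / name
    for line in path.read_text(encoding="utf-8").splitlines():
        ex = json.loads(line)
        meta = ex.get("metadata") or {}
        print(
            f"{name}: cot_style={meta.get('cot_style')} "
            f"thinking_tokens={meta.get('thinking_tokens')} "
            f"sink_span={ex.get('sink_span')} thinking_span={ex.get('thinking_span')}"
        )
```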
exp/exp4/README.md ADDED
@@ -0,0 +1,85 @@
+ # FlashTrace Experiment 4 (Aider attribution faithfulness / row-only)
+
+ This directory provides tooling for token-level attribution faithfulness evaluation on the Aider dataset. It reports **only the row-side RISE/MAS scores** and does not save per-sample traces.
+
+ Evaluation scope (fixed):
+ - Dataset: `exp/exp4/data/aider.jsonl`
+ - Methods:
+   - `ifr_all_positions`
+   - `ifr_multi_hop_both` (FlashTrace)
+ - Metrics: `RISE`, `MAS` (row attribution only)
+
+ Main files:
+ - `run_exp.py`: attribution + faithfulness evaluation; outputs go to `exp/exp4/output/`
+
+ ---
+
+ ## Data format
+
+ Each line of `exp/exp4/data/aider.jsonl` is one JSON object describing one sample:
+ - `input`: the prompt (used directly as the user prompt content)
+ - `output`: the target (used directly as the model's generated text; the script internally appends an EOS for scoring)
+ - `length`: a field shipped with the data (the script does not rely on it; it is only passed through to metadata)
+
+ Note: an Aider `output` has the following shape (a minimal loading sketch follows this section):
+ 1) first line: `xxx.py`
+ 2) second line: the opening fence ```
+ 3) middle lines: the code
+ 4) last line: the closing fence ```
+
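A minimal loading sketch for this data format (illustrative only, not part of the shipped scripts):

```python
import json
from pathlib import Path

rows = [
    json.loads(line)
    for line in Path("exp/exp4/data/aider.jsonl").read_text(encoding="utf-8").splitlines()
    if line.strip()
]
sample = rows[0]
prompt, target = sample["input"], sample["output"]
print(len(rows), "examples; pass-through field:", sample.get("length"))
print(target.splitlines()[0])  # first line of the output: the file name, e.g. something.py
```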
+ ---
+
+ ## Attribution and sink selection
+
+ For every sample the script uses `input` as the `prompt` and `output` as the `target` (no re-generation), and selects different sinks on top of the attribution result (`indices_to_explain=[start_tok,end_tok]`, always token spans over `tokenizer(target, add_special_tokens=False)`; the appended EOS is excluded).
+
+ ### `ifr_all_positions` (reports two sinks)
+
+ - `last_line`: take the last line in `output` before the closing fence that is **non-empty and not a ``` fence**, and map that line's character span to a token span; if it cannot be parsed, fall back to `full_output`.
+ - `last_token`: the last token of `last_line` (single-point span `[end,end]`).
+
+ Note: the `ifr_all_positions` attribution matrix is computed only once per sample; row attribution is then taken for each of the two sinks and faithfulness is computed separately.
+
+ ### `ifr_multi_hop_both` (FlashTrace, reports one sink)
+
+ - `full_output`: use the entire `output` as the sink (token span `[0, n_tok-1]`).
+ - The perturbation side of the faithfulness evaluation follows the exp2 protocol: prompt-side stop tokens are skipped (as determined by the stop-token configuration in `ft_ifr_improve.py`).
+
+ ---
+
+ ## Metric output (row-only)
+
+ The output CSV contains only aggregate `RISE`/`MAS` statistics for the row attribution (a short reading sketch follows this section):
+ - `Method,Sink,Row_RISE_Mean,Row_RISE_Std,Row_MAS_Mean,Row_MAS_Std,Used,Skipped,Avg_Sample_Time_s`
+
+ Output path:
+ - `exp/exp4/output/faithfulness/aider/<model_tag>/row_only_<N>_examples.csv`
+
+ Here `<model_tag>` is taken from `--model` when provided, otherwise from the directory name of `--model_path`.
+
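A minimal sketch for reading that CSV back (illustrative only; the model tag and example count in the path are hypothetical):

```python
import csv
from pathlib import Path

# Hypothetical concrete values: model_tag "qwen-8B", 100 evaluated examples.
csv_path = Path("exp/exp4/output/faithfulness/aider/qwen-8B/row_only_100_examples.csv")
with csv_path.open(newline="", encoding="utf-8") as f:
    for row in csv.DictReader(f):
        print(row["Method"], row["Sink"], "RISE:", row["Row_RISE_Mean"], "MAS:", row["Row_MAS_Mean"])
```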
+ ---
+
+ ## Usage
+
+ Run from the repo root so that relative paths resolve correctly:
+
+ ```bash
+ python exp/exp4/run_exp.py \
+     --data_path exp/exp4/data/aider.jsonl \
+     --output_root exp/exp4/output \
+     --model qwen-8B \
+     --model_path /opt/share/models/Qwen/Qwen3-8B/ \
+     --cuda 2,3,4,5,6,7 \
+     --num_examples 100 \
+     --n_hops 1 \
+     --k 20
+ ```
+
+ Common arguments:
+ - `--model_path` / `--model`: local model path or HF repo id (at least one is required)
+ - `--tokenizer_path`: optional; defaults to reusing the model path/id
+ - `--cuda`: accepts `0` (single GPU) or `0,1,2` (multi-GPU; `CUDA_VISIBLE_DEVICES` is set internally and `device_map=auto` is used)
+ - `--num_examples`: evaluate the first N examples (in file order; `--seed` is reserved, no random sampling yet)
+ - `--n_hops`: number of hops for FlashTrace (`ifr_multi_hop_both`)
+ - `--k`: number of perturbation steps for MAS/RISE
+ - `--chunk_tokens` / `--sink_chunk_tokens`: chunking parameters for the IFR computation (usually keep the defaults)
exp/exp4/run_exp.py ADDED
@@ -0,0 +1,487 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ Experiment 4 runner: Aider token-level attribution faithfulness.
4
+
5
+ Evaluates only:
6
+ - IFR: ifr_all_positions
7
+ - sink = last meaningful code line (excluding fences)
8
+ - sink = last token of that code line
9
+ - FlashTrace: ifr_multi_hop_both
10
+ - sink = full output (excluding appended EOS)
11
+
12
+ Outputs only row-level faithfulness scores (RISE, MAS). No sample-level traces.
13
+ """
14
+
15
+ from __future__ import annotations
16
+
17
+ import argparse
18
+ import json
19
+ import os
20
+ import sys
21
+ import time
22
+ from dataclasses import dataclass
23
+ from itertools import islice
24
+ from pathlib import Path
25
+ from typing import Any, Dict, List, Optional, Sequence, Tuple
26
+
27
+
28
+ def _early_set_cuda_visible_devices() -> None:
29
+ parser = argparse.ArgumentParser(add_help=False)
30
+ parser.add_argument("--cuda", type=str, default=None)
31
+ args, _ = parser.parse_known_args(sys.argv[1:])
32
+ if args.cuda and "," in args.cuda:
33
+ os.environ["CUDA_VISIBLE_DEVICES"] = args.cuda
34
+
35
+
36
+ _early_set_cuda_visible_devices()
37
+
38
+ import numpy as np
39
+ import torch
40
+ from transformers import AutoModelForCausalLM, AutoTokenizer, utils
41
+
42
+ # Ensure repo root on path for `import llm_attr`, `import ft_ifr_improve`, etc.
43
+ REPO_ROOT = Path(__file__).resolve().parents[2]
44
+ if str(REPO_ROOT) not in sys.path:
45
+ sys.path.insert(0, str(REPO_ROOT))
46
+
47
+ import ft_ifr_improve
48
+ import llm_attr
49
+ import llm_attr_eval
50
+
51
+ utils.logging.set_verbosity_error()
52
+
53
+
54
+ @dataclass(frozen=True)
55
+ class AiderExample:
56
+ prompt: str
57
+ target: str
58
+ metadata: Dict[str, Any]
59
+
60
+
61
+ def _read_jsonl(path: Path) -> List[Dict[str, Any]]:
62
+ rows: List[Dict[str, Any]] = []
63
+ with path.open("r", encoding="utf-8") as f:
64
+ for line in f:
65
+ if not line.strip():
66
+ continue
67
+ rows.append(json.loads(line))
68
+ return rows
69
+
70
+
71
+ def load_aider(path: Path) -> List[AiderExample]:
72
+ rows = _read_jsonl(path)
73
+ examples: List[AiderExample] = []
74
+ for row in rows:
75
+ prompt = str(row.get("input") or "")
76
+ target = str(row.get("output") or "")
77
+ examples.append(AiderExample(prompt=prompt, target=target, metadata={"length": row.get("length")}))
78
+ return examples
79
+
80
+
81
+ def _token_span_full_output(tokenizer, target: str) -> List[int]:
82
+ ids = tokenizer(target, add_special_tokens=False).input_ids
83
+ if not ids:
84
+ return [0, 0]
85
+ return [0, int(len(ids) - 1)]
86
+
87
+
88
+ def _last_meaningful_code_line_char_span(target: str) -> Optional[Tuple[int, int]]:
89
+ lines = target.splitlines(keepends=True)
90
+ pos = 0
91
+ spans: List[Tuple[int, int, str]] = []
92
+ for line in lines:
93
+ start = pos
94
+ pos += len(line)
95
+ spans.append((start, pos, line))
96
+
97
+ for start, end, line in reversed(spans):
98
+ stripped = line.strip()
99
+ if not stripped:
100
+ continue
101
+ if stripped.startswith("```"):
102
+ continue
103
+ if start == 0 and stripped.endswith(".py"):
104
+ return None
105
+
106
+ line_no_nl = line.rstrip("\r\n")
107
+ end_no_nl = start + len(line_no_nl)
108
+ if end_no_nl <= start:
109
+ continue
110
+ return start, end_no_nl
111
+
112
+ return None
113
+
114
+
115
+ def _char_span_to_token_span(tokenizer, text: str, span: Tuple[int, int]) -> Optional[List[int]]:
116
+ start_char, end_char = int(span[0]), int(span[1])
117
+ if end_char <= start_char:
118
+ return None
119
+
120
+ enc = tokenizer(text, add_special_tokens=False, return_offsets_mapping=True)
121
+ offsets = enc.get("offset_mapping")
122
+ if offsets is None:
123
+ raise ValueError("Tokenizer does not provide offset_mapping; cannot map char spans to tokens.")
124
+
125
+ tok_indices: List[int] = []
126
+ for idx, off in enumerate(offsets):
127
+ if off is None:
128
+ continue
129
+ s, e = int(off[0]), int(off[1])
130
+ if s < end_char and e > start_char:
131
+ tok_indices.append(int(idx))
132
+ if not tok_indices:
133
+ return None
134
+ return [min(tok_indices), max(tok_indices)]
135
+
136
+
137
+ def _last_meaningful_code_line_token_span(tokenizer, target: str) -> List[int]:
138
+ full_span = _token_span_full_output(tokenizer, target)
139
+ span_chars = _last_meaningful_code_line_char_span(target)
140
+ if span_chars is None:
141
+ return full_span
142
+
143
+ span_toks = _char_span_to_token_span(tokenizer, target, span_chars)
144
+ if span_toks is None:
145
+ return full_span
146
+
147
+ span_toks[0] = max(int(span_toks[0]), int(full_span[0]))
148
+ span_toks[1] = min(int(span_toks[1]), int(full_span[1]))
149
+ if span_toks[1] < span_toks[0]:
150
+ return full_span
151
+ return span_toks
152
+
153
+
154
+ def _last_token_span(token_span: Sequence[int]) -> List[int]:
155
+ if not (isinstance(token_span, Sequence) and len(token_span) == 2):
156
+ return [0, 0]
157
+ end = int(token_span[1])
158
+ return [end, end]
159
+
160
+
161
+ def resolve_device(args) -> str:
162
+ if args.cuda is not None and "," in args.cuda:
163
+ os.environ["CUDA_VISIBLE_DEVICES"] = args.cuda
164
+ return "auto"
165
+ if args.cuda is not None and args.cuda.strip():
166
+ return f"cuda:{args.cuda}" if torch.cuda.is_available() else "cpu"
167
+ return f"cuda:{args.cuda_num}" if torch.cuda.is_available() else "cpu"
168
+
169
+
170
+ def load_model_and_tokenizer(args) -> tuple[Any, Any]:
171
+ model_id = args.model_path or args.model
172
+ if not model_id:
173
+ raise SystemExit("Provide --model_path (local) or --model (HF repo id).")
174
+
175
+ tokenizer_id = args.tokenizer_path or model_id
176
+ device = resolve_device(args)
177
+
178
+ model = AutoModelForCausalLM.from_pretrained(
179
+ model_id,
180
+ device_map="auto" if device == "auto" else {"": int(device.split(":")[1])} if device.startswith("cuda:") else None,
181
+ torch_dtype=torch.float16,
182
+ attn_implementation="eager",
183
+ )
184
+ tokenizer = AutoTokenizer.from_pretrained(tokenizer_id)
185
+ if tokenizer.pad_token_id is None and tokenizer.eos_token_id is not None:
186
+ tokenizer.pad_token = tokenizer.eos_token
187
+ model.eval()
188
+ return model, tokenizer
189
+
190
+
191
+ def _faithfulness_test_with_user_prompt_indices(
192
+ llm_evaluator: llm_attr_eval.LLMAttributionEvaluator,
193
+ attribution: torch.Tensor,
194
+ prompt: str,
195
+ generation: str,
196
+ *,
197
+ user_prompt_indices: List[int],
198
+ k: int = 20,
199
+ ) -> Tuple[float, float, float]:
200
+ def auc(arr: np.ndarray) -> float:
201
+ return (arr.sum() - arr[0] / 2 - arr[-1] / 2) / max(1, (arr.shape[0] - 1))
202
+
203
+ pad_token_id = llm_evaluator._ensure_pad_token_id()
204
+
205
+ user_prompt = " " + prompt
206
+ formatted_prompt = llm_evaluator.format_prompt(user_prompt)
207
+ formatted_ids = llm_evaluator.tokenizer(formatted_prompt, return_tensors="pt", add_special_tokens=False).input_ids
208
+
209
+ prompt_ids = formatted_ids.to(llm_evaluator.device)
210
+ prompt_ids_perturbed = prompt_ids.clone()
211
+ generation_ids = llm_evaluator.tokenizer(
212
+ generation + llm_evaluator.tokenizer.eos_token,
213
+ return_tensors="pt",
214
+ add_special_tokens=False,
215
+ ).input_ids.to(llm_evaluator.device)
216
+
217
+ attr_cpu = attribution.detach().cpu()
218
+ w = attr_cpu.sum(0)
219
+ sorted_attr_indices = torch.argsort(w, descending=True)
220
+ attr_sum = float(w.sum().item())
221
+
222
+ P = int(w.numel())
223
+ if len(user_prompt_indices) != P:
224
+ raise ValueError(
225
+ "user_prompt_indices length does not match prompt-side attribution length: "
226
+ f"indices P={len(user_prompt_indices)}, attr P={P}."
227
+ )
228
+ if P == 0:
229
+ return 0.0, 0.0, 0.0
230
+
231
+ if max(user_prompt_indices) >= int(prompt_ids_perturbed.shape[1]):
232
+ raise ValueError("user_prompt_indices contains an out-of-bounds index for formatted prompt ids.")
233
+
234
+ steps = int(k) if k is not None else 0
235
+ if steps <= 0:
236
+ steps = 1
237
+ steps = min(steps, P)
238
+
239
+ scores = np.zeros(steps + 1, dtype=np.float64)
240
+ density = np.zeros(steps + 1, dtype=np.float64)
241
+
242
+ scores[0] = (
243
+ llm_evaluator.compute_logprob_response_given_prompt(prompt_ids_perturbed, generation_ids).sum().cpu().detach().item()
244
+ )
245
+ density[0] = 1.0
246
+
247
+ if attr_sum <= 0:
248
+ density = np.linspace(1.0, 0.0, steps + 1)
249
+
250
+ base = P // steps
251
+ remainder = P % steps
252
+ start = 0
253
+ for step in range(steps):
254
+ size = base + (1 if step < remainder else 0)
255
+ group = sorted_attr_indices[start : start + size]
256
+ start += size
257
+
258
+ for idx in group:
259
+ j = int(idx.item())
260
+ abs_pos = int(user_prompt_indices[j])
261
+ prompt_ids_perturbed[0, abs_pos] = pad_token_id
262
+ scores[step + 1] = (
263
+ llm_evaluator.compute_logprob_response_given_prompt(prompt_ids_perturbed, generation_ids).sum().cpu().detach().item()
264
+ )
265
+ if attr_sum > 0:
266
+ dec = float(w.index_select(0, group).sum().item()) / attr_sum
267
+ density[step + 1] = density[step] - dec
268
+
269
+ min_normalized_pred = 1.0
270
+ normalized_model_response = scores.copy()
271
+ for i in range(len(scores)):
272
+ normalized_pred = (normalized_model_response[i] - scores[-1]) / (abs(scores[0] - scores[-1]))
273
+ normalized_pred = np.clip(normalized_pred, 0.0, 1.0)
274
+ min_normalized_pred = min(min_normalized_pred, normalized_pred)
275
+ normalized_model_response[i] = min_normalized_pred
276
+
277
+ alignment_penalty = np.abs(normalized_model_response - density)
278
+ corrected_scores = normalized_model_response + alignment_penalty
279
+ corrected_scores = corrected_scores.clip(0.0, 1.0)
280
+ corrected_scores = (corrected_scores - np.min(corrected_scores)) / (np.max(corrected_scores) - np.min(corrected_scores))
281
+
282
+ if np.isnan(corrected_scores).any():
283
+ corrected_scores = np.linspace(1.0, 0.0, len(scores))
284
+
285
+ return auc(normalized_model_response), auc(corrected_scores), auc(normalized_model_response + alignment_penalty)
286
+
287
+
288
+ def _row_faithfulness_scores(
289
+ *,
290
+ llm_evaluator: llm_attr_eval.LLMAttributionEvaluator,
291
+ attribution_prompt: torch.Tensor,
292
+ prompt: str,
293
+ generation: str,
294
+ user_prompt_indices: Optional[List[int]],
295
+ keep_prompt_token_indices: Optional[Sequence[int]] = None,
296
+ k: int = 20,
297
+ ) -> Tuple[float, float]:
298
+ if keep_prompt_token_indices is not None:
299
+ rise, mas, _ = ft_ifr_improve.faithfulness_test_skip_tokens(
300
+ llm_evaluator,
301
+ attribution_prompt,
302
+ prompt,
303
+ generation,
304
+ keep_prompt_token_indices=keep_prompt_token_indices,
305
+ user_prompt_indices=user_prompt_indices,
306
+ k=int(k),
307
+ )
308
+ return float(rise), float(mas)
309
+ if user_prompt_indices is not None:
310
+ rise, mas, _ = _faithfulness_test_with_user_prompt_indices(
311
+ llm_evaluator,
312
+ attribution_prompt,
313
+ prompt,
314
+ generation,
315
+ user_prompt_indices=user_prompt_indices,
316
+ k=int(k),
317
+ )
318
+ return float(rise), float(mas)
319
+
320
+ rise, mas, _ = llm_evaluator.faithfulness_test(attribution_prompt, prompt, generation, k=int(k))
321
+ return float(rise), float(mas)
322
+
323
+
324
+ def _model_tag(args) -> str:
325
+ if args.model:
326
+ return str(args.model)
327
+ if args.model_path:
328
+ return Path(args.model_path).name
329
+ return "model"
330
+
331
+
332
+ def main() -> None:
333
+ parser = argparse.ArgumentParser("Experiment 4 runner: aider faithfulness (row-only).")
334
+ parser.add_argument("--data_path", type=str, default="exp/exp4/data/aider.jsonl")
335
+ parser.add_argument("--output_root", type=str, default="exp/exp4/output")
336
+ parser.add_argument("--model", type=str, default=None, help="HF repo id (required unless --model_path set).")
337
+ parser.add_argument("--model_path", type=str, default=None, help="Local path; overrides --model for loading.")
338
+ parser.add_argument("--tokenizer_path", type=str, default=None, help="Optional tokenizer path/id (defaults to model).")
339
+ parser.add_argument("--cuda", type=str, default=None)
340
+ parser.add_argument("--cuda_num", type=int, default=0)
341
+ parser.add_argument("--num_examples", type=int, default=100)
342
+ parser.add_argument("--seed", type=int, default=42, help="Reserved for future use; exp4 runs in file order.")
343
+ parser.add_argument("--chunk_tokens", type=int, default=128)
344
+ parser.add_argument("--sink_chunk_tokens", type=int, default=32)
345
+ parser.add_argument("--n_hops", type=int, default=3)
346
+ parser.add_argument("--k", type=int, default=20, help="Perturbation steps for MAS/RISE.")
347
+ args = parser.parse_args()
348
+
349
+ data_path = Path(args.data_path)
350
+ if not data_path.exists():
351
+ raise SystemExit(f"Missing Aider JSONL: {data_path}")
352
+
353
+ model, tokenizer = load_model_and_tokenizer(args)
354
+ llm_evaluator = llm_attr_eval.LLMAttributionEvaluator(model, tokenizer)
355
+
356
+ examples = load_aider(data_path)
357
+ total = min(len(examples), int(args.num_examples))
358
+ iterator = islice(examples, total)
359
+
360
+ ifr = llm_attr.LLMIFRAttribution(
361
+ model,
362
+ tokenizer,
363
+ chunk_tokens=int(args.chunk_tokens),
364
+ sink_chunk_tokens=int(args.sink_chunk_tokens),
365
+ )
366
+ flashtrace = ft_ifr_improve.LLMIFRAttributionBoth(
367
+ model,
368
+ tokenizer,
369
+ chunk_tokens=int(args.chunk_tokens),
370
+ sink_chunk_tokens=int(args.sink_chunk_tokens),
371
+ )
372
+
373
+ results: Dict[Tuple[str, str], List[Tuple[float, float]]] = {
374
+ ("ifr_all_positions", "last_line"): [],
375
+ ("ifr_all_positions", "last_token"): [],
376
+ ("ifr_multi_hop_both", "full_output"): [],
377
+ }
378
+ skipped: Dict[Tuple[str, str], int] = {k: 0 for k in results}
379
+ sample_times: Dict[Tuple[str, str], List[float]] = {k: [] for k in results}
380
+
381
+ for example_idx, ex in enumerate(iterator):
382
+ prompt = ex.prompt
383
+ target = ex.target
384
+
385
+ full_span = _token_span_full_output(tokenizer, target)
386
+ last_line_span = _last_meaningful_code_line_token_span(tokenizer, target)
387
+ last_token_span = _last_token_span(last_line_span)
388
+
389
+ attr_all = None
390
+ attr_all_time_s = 0.0
391
+ user_prompt_indices_all: Optional[List[int]] = None
392
+ prompt_len_all = 0
393
+ try:
394
+ t_attr = time.perf_counter()
395
+ attr_all = ifr.calculate_ifr_for_all_positions(prompt, target=target)
396
+ attr_all_time_s = float(time.perf_counter() - t_attr)
397
+ user_prompt_indices_all = list(getattr(ifr, "user_prompt_indices", []) or [])
398
+ prompt_len_all = int(len(attr_all.prompt_tokens))
399
+ except Exception as exc:
400
+ skipped[("ifr_all_positions", "last_line")] += 1
401
+ skipped[("ifr_all_positions", "last_token")] += 1
402
+ print(f"[warn] ifr_all_positions attribution failed ex={example_idx}: {exc}")
403
+
404
+ if attr_all is not None and user_prompt_indices_all is not None and prompt_len_all >= 0:
405
+ for sink_name, span in (("last_line", last_line_span), ("last_token", last_token_span)):
406
+ key = ("ifr_all_positions", sink_name)
407
+ try:
408
+ t_faith = time.perf_counter()
409
+ row = attr_all.get_all_token_attrs(list(span))[1]
410
+ rise, mas = _row_faithfulness_scores(
411
+ llm_evaluator=llm_evaluator,
412
+ attribution_prompt=row[:, :prompt_len_all],
413
+ prompt=prompt,
414
+ generation=target,
415
+ user_prompt_indices=user_prompt_indices_all,
416
+ k=int(args.k),
417
+ )
418
+ faith_time_s = float(time.perf_counter() - t_faith)
419
+ results[key].append((rise, mas))
420
+ sample_times[key].append(attr_all_time_s + faith_time_s)
421
+ except Exception as exc:
422
+ skipped[key] += 1
423
+ print(f"[warn] ifr_all_positions {sink_name} failed ex={example_idx}: {exc}")
424
+
425
+ try:
426
+ t_attr = time.perf_counter()
427
+ attr_ft = flashtrace.calculate_ifr_multi_hop_both(
428
+ prompt,
429
+ target=target,
430
+ sink_span=None,
431
+ thinking_span=None,
432
+ n_hops=int(args.n_hops),
433
+ )
434
+ attr_ft_time_s = float(time.perf_counter() - t_attr)
435
+ user_prompt_indices_ft = list(getattr(flashtrace, "user_prompt_indices", []) or [])
436
+ prompt_len_ft = int(len(attr_ft.prompt_tokens))
437
+ keep_prompt_token_indices = ft_ifr_improve.keep_token_indices(list(attr_ft.prompt_tokens))
438
+
439
+ t_faith = time.perf_counter()
440
+ row_full = attr_ft.get_all_token_attrs(full_span)[1]
441
+ rise, mas = _row_faithfulness_scores(
442
+ llm_evaluator=llm_evaluator,
443
+ attribution_prompt=row_full[:, :prompt_len_ft],
444
+ prompt=prompt,
445
+ generation=target,
446
+ user_prompt_indices=user_prompt_indices_ft,
447
+ keep_prompt_token_indices=keep_prompt_token_indices,
448
+ k=int(args.k),
449
+ )
450
+ faith_time_s = float(time.perf_counter() - t_faith)
451
+ results[("ifr_multi_hop_both", "full_output")].append((rise, mas))
452
+ sample_times[("ifr_multi_hop_both", "full_output")].append(attr_ft_time_s + faith_time_s)
453
+ except Exception as exc:
454
+ skipped[("ifr_multi_hop_both", "full_output")] += 1
455
+ print(f"[warn] ifr_multi_hop_both failed ex={example_idx}: {exc}")
456
+
457
+ model_tag = _model_tag(args)
458
+ out_dir = Path(args.output_root) / "faithfulness" / "aider" / model_tag
459
+ out_dir.mkdir(parents=True, exist_ok=True)
460
+ out_path = out_dir / f"row_only_{total}_examples.csv"
461
+
462
+ with out_path.open("w", encoding="utf-8") as f:
463
+ f.write("Method,Sink,Row_RISE_Mean,Row_RISE_Std,Row_MAS_Mean,Row_MAS_Std,Used,Skipped,Avg_Sample_Time_s\n")
464
+ for (method, sink), vals in results.items():
465
+ arr = np.asarray(vals, dtype=np.float64)
466
+ used = int(arr.shape[0])
467
+ if used == 0:
468
+ rise_mean = float("nan")
469
+ rise_std = float("nan")
470
+ mas_mean = float("nan")
471
+ mas_std = float("nan")
472
+ else:
473
+ rise_mean = float(arr[:, 0].mean())
474
+ rise_std = float(arr[:, 0].std())
475
+ mas_mean = float(arr[:, 1].mean())
476
+ mas_std = float(arr[:, 1].std())
477
+ times = sample_times.get((method, sink)) or []
478
+ avg_time = float(np.mean(times)) if times else 0.0
479
+ f.write(
480
+ f"{method},{sink},{rise_mean},{rise_std},{mas_mean},{mas_std},{used},{int(skipped[(method, sink)])},{avg_time}\n"
481
+ )
482
+
483
+ print(f"[done] wrote {out_path}")
484
+
485
+
486
+ if __name__ == "__main__":
487
+ main()
exp/exp5/README.md ADDED
@@ -0,0 +1,119 @@
+ # FlashTrace Experiment 5: cross-model (Qwen → Llama) token-span mapping
+
+ ## Background: why the mapping is needed
+
+ Attribution and evaluation in `exp/exp2/run_exp.py` are strictly **token-level** and rely on token-span fields stored in the cached data:
+
+ - `indices_to_explain = [start_tok, end_tok]` (generation token indices, closed interval)
+ - `sink_span` / `thinking_span` (also generation token spans)
+
+ These spans were computed with a specific tokenizer (usually the `Qwen3-8B` tokenizer) when the caches were built (`exp/exp2/sample_and_filter.py`, `exp/exp2/map_math_mine_to_exp2_cache.py`) and written out as-is.
+
+ When you switch to a new model (e.g. `Llama-3.1-8B-Instruct`), the **tokenizer differs**, so the tokenization length and boundaries of `target` change. The old spans are then frequently out of range under the new tokenizer, and exp2 fails right away during attribution (`IndexError: end_tok out of range`).
+
+ ## Solution: the exp5 mapping script
+
+ `exp/exp5/map_exp2_cache_token_spans.py` maps the old token spans in the exp2 caches from the old tokenizer (default `Qwen3-8B`) to the new tokenizer (default `Llama-3.1-8B-Instruct`) and writes the result to:
+
+ `exp/exp5/data/<same dataset name>.jsonl`
+
+ Mapping strategy (default; a sketch follows below):
+ 1) Tokenize `target` with the old tokenizer using `return_offsets_mapping=True`
+ 2) Convert the old token span into a character span over `target`
+ 3) Tokenize the same `target` with the new tokenizer and map the character span back to a new token span
+
+ For edge cases (a cache that was not produced by the expected old tokenizer), enable `--allow_fallback_answer` to re-locate the span under the new tokenizer from `metadata.boxed_answer` (or `reference_answer`) as a fallback.
+
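+ A condensed sketch of the default strategy above (the real logic, including validation and fallbacks, lives in `map_exp2_cache_token_spans.py`); the tokenizer paths, the target text, and the old span are illustrative assumptions:
+
+ ```python
+ from transformers import AutoTokenizer
+
+ old_tok = AutoTokenizer.from_pretrained("/opt/share/models/Qwen/Qwen3-8B")
+ new_tok = AutoTokenizer.from_pretrained("/opt/share/models/meta-llama/Llama-3.1-8B-Instruct")
+
+ target = "Step 1: compute 6 * 7 = 42.\nStep 2: report it.\nThe answer is 42."
+ old_span = [3, 6]  # hypothetical [start_tok, end_tok] from the exp2 cache
+
+ # 1)+2) old token span -> character span over `target` (via old offsets)
+ old_off = old_tok(target, add_special_tokens=False, return_offsets_mapping=True)["offset_mapping"]
+ chunk = [off for off in old_off[old_span[0] : old_span[1] + 1] if off[1] > off[0]]
+ char_start, char_end = min(s for s, _ in chunk), max(e for _, e in chunk)
+
+ # 3) character span -> new token span, by overlap with the new offsets
+ new_off = new_tok(target, add_special_tokens=False, return_offsets_mapping=True)["offset_mapping"]
+ hit = [i for i, (s, e) in enumerate(new_off) if e > s and s < char_end and e > char_start]
+ new_span = [min(hit), max(hit)]
+ print(old_span, "->", new_span)
+ ```
+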
+ ---
+
+ ## Step 1: map the exp2 dataset caches into exp5/data
+
+ Using the repo venv is recommended:
+
+ ```bash
+ .venv/bin/python exp/exp5/map_exp2_cache_token_spans.py \
+     --in_jsonl exp/exp2/data/niah_mq_q2.jsonl \
+     --out_dir exp/exp5/data \
+     --old_tokenizer_model /opt/share/models/Qwen/Qwen3-8B \
+     --new_tokenizer_model /opt/share/models/meta-llama/Llama-3.1-8B-Instruct
+ ```
+
+ Map several datasets at once (example: RULER + math):
+
+ ```bash
+ .venv/bin/python exp/exp5/map_exp2_cache_token_spans.py \
+     --in_jsonl exp/exp2/data/niah_mq_q2.jsonl exp/exp2/data/math.jsonl \
+     --out_dir exp/exp5/data \
+     --old_tokenizer_model /opt/share/models/Qwen/Qwen3-8B \
+     --new_tokenizer_model /opt/share/models/meta-llama/Llama-3.1-8B-Instruct
+ ```
+
+ Add `--overwrite` if the output files already exist.
+
+ Default behaviour: samples that cannot be mapped are **dropped** and reported in the output statistics. For strict consistency add `--strict` (exit on the first failing sample). If you suspect the original cache was not built with `--old_tokenizer_model`, add `--allow_fallback_answer` to enable the fallback that locates the span via `metadata.boxed_answer`.
+
+ ---
+
+ ## Step 2: run the Llama attribution evaluation with exp2 directly (data and outputs both point to exp5)
+
+ Key points:
+ - **Data input**: use `--data_root exp/exp5/data` so exp2 reads the mapped caches
+ - **Result output**: use `--output_root exp/exp5/output` (avoid writing into `exp/exp2/output`)
+ - **Do not add** `--save_hop_traces` (avoid writing traces)
+
+ ### RULER (supports recovery + faithfulness)
+
+ ```bash
+ CUDA_VISIBLE_DEVICES=0 .venv/bin/python exp/exp2/run_exp.py \
+     --datasets niah_mq_q2 \
+     --data_root exp/exp5/data \
+     --output_root exp/exp5/output \
+     --attr_funcs ifr_all_positions,attnlrp,ifr_multi_hop_both \
+     --model_path /opt/share/models/meta-llama/Llama-3.1-8B-Instruct \
+     --cuda 0 \
+     --num_examples 100 \
+     --mode faithfulness_gen,recovery_ruler
+ ```
+
+ ### math (faithfulness only; recovery is explicitly rejected by exp2)
+
+ ```bash
+ CUDA_VISIBLE_DEVICES=0 .venv/bin/python exp/exp2/run_exp.py \
+     --datasets math \
+     --data_root exp/exp5/data \
+     --output_root exp/exp5/output \
+     --attr_funcs ifr_all_positions,attnlrp,ifr_multi_hop_both \
+     --model_path /opt/share/models/meta-llama/Llama-3.1-8B-Instruct \
+     --cuda 0 \
+     --num_examples 100 \
+     --mode faithfulness_gen
+ ```
+
+ ## Will this pollute the exp2 folder?
+
+ - **`exp/exp2/data/` is not touched**: the exp2 caches are not modified; outputs go to `exp/exp5/data/`.
+ - **Without `--save_hop_traces`, no traces are written.**
+ - Note, however, that `exp/exp2/run_exp.py` itself **always writes CSV metric files** to `--output_root` (that is how the code behaves; exp5 does not modify exp2). To keep the exp2 folder free of new files, point `--output_root` at `exp/exp5/output` (or another directory).
+
+ ```bash
+ python exp/exp2/run_exp.py \
+     --datasets niah_mq_q2 \
+     --data_root exp/exp5/data \
+     --output_root exp/exp5/output \
+     --attr_funcs ifr_all_positions,attnlrp,ifr_multi_hop_both \
+     --model_path /opt/share/models/meta-llama/Llama-3.1-8B-Instruct \
+     --cuda 2,3,4,5,6,7 \
+     --num_examples 100 \
+     --mode faithfulness_gen \
+     --n_hops 1 \
+ && python exp/exp2/run_exp.py \
+     --datasets math \
+     --data_root exp/exp5/data \
+     --output_root exp/exp5/output \
+     --attr_funcs ifr_all_positions,attnlrp,ifr_multi_hop_both \
+     --model_path /opt/share/models/meta-llama/Llama-3.1-8B-Instruct \
+     --cuda 2,3,4,5,6,7 \
+     --num_examples 100 \
+     --mode faithfulness_gen \
+     --n_hops 1
+ ```
exp/exp5/map_exp2_cache_token_spans.py ADDED
@@ -0,0 +1,407 @@
1
+ #!/usr/bin/env python3
2
+ """Map exp2 cached JSONL token spans across tokenizers (Qwen -> Llama).
3
+
4
+ Background
5
+ ----------
6
+ `exp/exp2/run_exp.py` expects cached datasets to provide token-level generation spans:
7
+
8
+ - indices_to_explain: [start_tok, end_tok] (generation-token indices; closed interval)
9
+ - sink_span / thinking_span: same tokenizer convention as indices_to_explain
10
+
11
+ These spans are computed under a specific tokenizer (often Qwen3-8B). When switching
12
+ to a different model/tokenizer (e.g., Llama-3.1-8B-Instruct), the stored spans can
13
+ become out-of-range and crash exp2 attribution (IndexError in token-span checks).
14
+
15
+ This script remaps spans by:
16
+ 1) Tokenizing `target` with the OLD tokenizer to obtain offset_mapping
17
+ 2) Converting the OLD token span into a character span in `target`
18
+ 3) Tokenizing `target` with the NEW tokenizer and mapping the character span back
19
+ into NEW token indices
20
+
21
+ Outputs are written under `exp/exp5/data/` by default, keeping `exp/exp2/` untouched.
22
+ """
23
+
24
+ from __future__ import annotations
25
+
26
+ import argparse
27
+ import json
28
+ import sys
29
+ from pathlib import Path
30
+ from typing import Any, Dict, Iterable, List, Optional, Tuple
31
+
32
+ from transformers import AutoTokenizer
33
+
34
+
35
+ REPO_ROOT = Path(__file__).resolve().parents[2]
36
+ if str(REPO_ROOT) not in sys.path:
37
+ sys.path.insert(0, str(REPO_ROOT))
38
+
39
+
40
+ def _split_args(values: Iterable[str]) -> List[str]:
41
+ out: List[str] = []
42
+ for v in values:
43
+ for part in str(v).split(","):
44
+ part = part.strip()
45
+ if part:
46
+ out.append(part)
47
+ return out
48
+
49
+
50
+ def _load_tokenizer(tokenizer_model: str):
51
+ path = Path(tokenizer_model)
52
+ if path.exists():
53
+ return AutoTokenizer.from_pretrained(path.as_posix(), local_files_only=True)
54
+ # May require network access; keep as fallback for environments that allow it.
55
+ return AutoTokenizer.from_pretrained(tokenizer_model)
56
+
57
+
58
+ def _is_token_span(span: Any) -> bool:
59
+ return (
60
+ isinstance(span, list)
61
+ and len(span) == 2
62
+ and all(isinstance(x, int) for x in span)
63
+ and span[0] >= 0
64
+ and span[1] >= span[0]
65
+ )
66
+
67
+
68
+ def _pick_old_span(obj: Dict[str, Any]) -> Optional[List[int]]:
69
+ span = obj.get("indices_to_explain")
70
+ if _is_token_span(span):
71
+ return list(span)
72
+ span = obj.get("sink_span")
73
+ if _is_token_span(span):
74
+ return list(span)
75
+ return None
76
+
77
+
78
+ def _offsets_to_char_span(offsets: Any, token_span: List[int]) -> Optional[Tuple[int, int]]:
79
+ """Convert a token span [start,end] to a character span [char_start,char_end) using offsets."""
80
+ if offsets is None:
81
+ return None
82
+ if not isinstance(offsets, list):
83
+ return None
84
+ start_tok, end_tok = token_span
85
+ if end_tok >= len(offsets):
86
+ return None
87
+
88
+ char_starts: List[int] = []
89
+ char_ends: List[int] = []
90
+ for idx in range(start_tok, end_tok + 1):
91
+ off = offsets[idx]
92
+ if off is None:
93
+ continue
94
+ if not (isinstance(off, (list, tuple)) and len(off) == 2):
95
+ continue
96
+ try:
97
+ s, e = int(off[0]), int(off[1])
98
+ except Exception:
99
+ continue
100
+ if e <= s:
101
+ continue
102
+ char_starts.append(s)
103
+ char_ends.append(e)
104
+
105
+ if not char_starts or not char_ends:
106
+ return None
107
+ return min(char_starts), max(char_ends)
108
+
109
+
110
+ def _char_span_to_token_span(offsets: Any, char_span: Tuple[int, int]) -> Optional[List[int]]:
111
+ """Convert a character span [char_start,char_end) to a token span [start,end] by overlap."""
112
+ if offsets is None:
113
+ return None
114
+ if not isinstance(offsets, list):
115
+ return None
116
+ char_start, char_end = int(char_span[0]), int(char_span[1])
117
+ if char_end <= char_start:
118
+ return None
119
+
120
+ hit: List[int] = []
121
+ for tok_idx, off in enumerate(offsets):
122
+ if off is None:
123
+ continue
124
+ if not (isinstance(off, (list, tuple)) and len(off) == 2):
125
+ continue
126
+ try:
127
+ s, e = int(off[0]), int(off[1])
128
+ except Exception:
129
+ continue
130
+ if e <= s:
131
+ continue
132
+ if s < char_end and e > char_start:
133
+ hit.append(int(tok_idx))
134
+
135
+ if not hit:
136
+ return None
137
+ return [min(hit), max(hit)]
138
+
139
+
140
+ def _validate_span_with_eos(tokenizer, target: str, token_span: List[int]) -> bool:
141
+ eos = tokenizer.eos_token or ""
142
+ gen_ids = tokenizer(target + eos, add_special_tokens=False).input_ids
143
+ gen_len = int(len(gen_ids))
144
+ return 0 <= token_span[0] <= token_span[1] < gen_len
145
+
146
+
147
+ def _guess_answer_text(obj: Dict[str, Any]) -> Optional[str]:
148
+ meta = obj.get("metadata") or {}
149
+ if isinstance(meta, dict):
150
+ boxed = (meta.get("boxed_answer") or "").strip()
151
+ if boxed:
152
+ return boxed
153
+ ref = (meta.get("reference_answer") or "").strip()
154
+ if ref:
155
+ return ref
156
+ tgt = obj.get("target")
157
+ if isinstance(tgt, str) and tgt.strip():
158
+ # Common exp2 cache convention: last line is the final answer.
159
+ last_line = tgt.strip().splitlines()[-1].strip()
160
+ return last_line or None
161
+ return None
162
+
163
+
164
+ def _fallback_map_via_answer_text(
165
+ obj: Dict[str, Any],
166
+ *,
167
+ new_tokenizer,
168
+ ) -> Optional[List[int]]:
169
+ tgt = obj.get("target")
170
+ if not isinstance(tgt, str) or not tgt:
171
+ return None
172
+
173
+ from exp.exp2.dataset_utils import CachedExample, attach_spans_from_answer # lazy import
174
+
175
+ answer_text = _guess_answer_text(obj)
176
+ ex = CachedExample(
177
+ prompt=str(obj.get("prompt") or ""),
178
+ target=tgt,
179
+ indices_to_explain=None,
180
+ attr_mask_indices=obj.get("attr_mask_indices"),
181
+ sink_span=None,
182
+ thinking_span=None,
183
+ metadata=obj.get("metadata") or {},
184
+ )
185
+ out = attach_spans_from_answer(ex, new_tokenizer, answer_text)
186
+ if out.sink_span is None:
187
+ return None
188
+ if not _is_token_span(out.sink_span):
189
+ return None
190
+ return list(out.sink_span)
191
+
192
+
193
+ def _map_one_obj(
194
+ obj: Dict[str, Any],
195
+ *,
196
+ old_tokenizer,
197
+ new_tokenizer,
198
+ allow_fallback_answer: bool,
199
+ ) -> Tuple[Optional[Dict[str, Any]], Optional[str]]:
200
+ target = obj.get("target")
201
+ if not isinstance(target, str) or not target:
202
+ return None, "missing_target"
203
+
204
+ old_span = _pick_old_span(obj)
205
+ if old_span is None:
206
+ return None, "missing_old_span"
207
+
208
+ # 1) Old token span -> char span in target.
209
+ old_enc = old_tokenizer(target, add_special_tokens=False, return_offsets_mapping=True)
210
+ old_offsets = old_enc.get("offset_mapping")
211
+ char_span = _offsets_to_char_span(old_offsets, old_span)
212
+ if char_span is None:
213
+ if not allow_fallback_answer:
214
+ return None, "old_span_to_char_failed"
215
+ new_span = _fallback_map_via_answer_text(obj, new_tokenizer=new_tokenizer)
216
+ if new_span is None:
217
+ return None, "fallback_answer_failed"
218
+ if not _validate_span_with_eos(new_tokenizer, target, new_span):
219
+ return None, "fallback_answer_span_invalid"
220
+ mapped = dict(obj)
221
+ mapped["indices_to_explain"] = new_span
222
+ mapped["sink_span"] = new_span
223
+ mapped["thinking_span"] = [0, new_span[0] - 1] if new_span[0] > 0 else None
224
+ meta = mapped.get("metadata")
225
+ if not isinstance(meta, dict):
226
+ meta = {}
227
+ meta = dict(meta)
228
+ meta["exp5_span_map_method"] = "answer_text"
229
+ mapped["metadata"] = meta
230
+ return mapped, None
231
+
232
+ # 2) Char span -> new token span.
233
+ new_enc = new_tokenizer(target, add_special_tokens=False, return_offsets_mapping=True)
234
+ new_offsets = new_enc.get("offset_mapping")
235
+ new_span = _char_span_to_token_span(new_offsets, char_span)
236
+ if new_span is None:
237
+ if not allow_fallback_answer:
238
+ return None, "char_to_new_span_failed"
239
+ new_span = _fallback_map_via_answer_text(obj, new_tokenizer=new_tokenizer)
240
+ if new_span is None:
241
+ return None, "fallback_answer_failed"
242
+
243
+ if not _validate_span_with_eos(new_tokenizer, target, new_span):
244
+ if not allow_fallback_answer:
245
+ return None, "new_span_invalid"
246
+ fb = _fallback_map_via_answer_text(obj, new_tokenizer=new_tokenizer)
247
+ if fb is None or not _validate_span_with_eos(new_tokenizer, target, fb):
248
+ return None, "fallback_answer_span_invalid"
249
+ new_span = fb
250
+
251
+ mapped = dict(obj)
252
+ mapped["indices_to_explain"] = new_span
253
+ mapped["sink_span"] = new_span
254
+ mapped["thinking_span"] = [0, new_span[0] - 1] if new_span[0] > 0 else None
255
+
256
+ meta = mapped.get("metadata")
257
+ if not isinstance(meta, dict):
258
+ meta = {}
259
+ meta = dict(meta)
260
+ meta["exp5_span_map_method"] = "token_span_char_align"
261
+ mapped["metadata"] = meta
262
+ return mapped, None
263
+
264
+
265
+ def _read_jsonl(path: Path) -> Iterable[Dict[str, Any]]:
266
+ with path.open("r", encoding="utf-8") as f:
267
+ for line_no, line in enumerate(f, start=1):
268
+ if not line.strip():
269
+ continue
270
+ try:
271
+ obj = json.loads(line)
272
+ except json.JSONDecodeError as exc: # pragma: no cover
273
+ raise RuntimeError(f"Invalid JSON at {path}:{line_no}: {exc}") from exc
274
+ if not isinstance(obj, dict):
275
+ raise RuntimeError(f"Expected JSON object per line at {path}:{line_no}.")
276
+ yield obj
277
+
278
+
279
+ def _write_jsonl(path: Path, rows: Iterable[Dict[str, Any]]) -> int:
280
+ path.parent.mkdir(parents=True, exist_ok=True)
281
+ count = 0
282
+ with path.open("w", encoding="utf-8") as f:
283
+ for obj in rows:
284
+ f.write(json.dumps(obj, ensure_ascii=False) + "\n")
285
+ count += 1
286
+ return count
287
+
288
+
289
+ def _default_old_tokenizer() -> str:
290
+ # Repo defaults used in exp2 README examples for span extraction.
291
+ return "/opt/share/models/Qwen/Qwen3-8B"
292
+
293
+
294
+ def _default_new_tokenizer() -> str:
295
+ return "/opt/share/models/meta-llama/Llama-3.1-8B-Instruct"
296
+
297
+
298
+ def main() -> None:
299
+ ap = argparse.ArgumentParser("Map exp2 cache token spans from an old tokenizer to a new tokenizer.")
300
+ ap.add_argument(
301
+ "--in_jsonl",
302
+ type=str,
303
+ nargs="+",
304
+ required=True,
305
+ help="One or more exp2 cached JSONL files (comma-separated also accepted).",
306
+ )
307
+ ap.add_argument(
308
+ "--out_dir",
309
+ type=str,
310
+ default="exp/exp5/data",
311
+ help="Output directory for mapped JSONL files.",
312
+ )
313
+ ap.add_argument(
314
+ "--old_tokenizer_model",
315
+ type=str,
316
+ default=_default_old_tokenizer(),
317
+ help="Tokenizer used to produce the original token spans (default: Qwen3-8B local path).",
318
+ )
319
+ ap.add_argument(
320
+ "--new_tokenizer_model",
321
+ type=str,
322
+ default=_default_new_tokenizer(),
323
+ help="Tokenizer to map spans into (default: Llama-3.1-8B-Instruct local path).",
324
+ )
325
+ ap.add_argument("--strict", action="store_true", help="Fail on the first example that cannot be mapped.")
326
+ ap.add_argument(
327
+ "--allow_fallback_answer",
328
+ action="store_true",
329
+ help=(
330
+ "If span alignment fails, try to recompute spans by locating metadata.boxed_answer in target "
331
+ "(useful when caches were not built with the assumed old tokenizer)."
332
+ ),
333
+ )
334
+ ap.add_argument(
335
+ "--overwrite",
336
+ action="store_true",
337
+ help="Overwrite output files if they already exist.",
338
+ )
339
+ args = ap.parse_args()
340
+
341
+ in_paths = [Path(p) for p in _split_args(args.in_jsonl)]
342
+ out_dir = Path(args.out_dir)
343
+
344
+ old_tok = _load_tokenizer(str(args.old_tokenizer_model))
345
+ new_tok = _load_tokenizer(str(args.new_tokenizer_model))
346
+
347
+ # exp2 convention: ensure a pad token exists for downstream perturbation.
348
+ if new_tok.pad_token is None and new_tok.eos_token is not None:
349
+ new_tok.pad_token = new_tok.eos_token
350
+
351
+ summary: Dict[str, Any] = {
352
+ "old_tokenizer_model": str(args.old_tokenizer_model),
353
+ "new_tokenizer_model": str(args.new_tokenizer_model),
354
+ "datasets": [],
355
+ }
356
+
357
+ for in_path in in_paths:
358
+ if not in_path.exists():
359
+ raise SystemExit(f"Missing input JSONL: {in_path}")
360
+ out_path = out_dir / in_path.name
361
+ if out_path.exists() and not bool(args.overwrite):
362
+ raise SystemExit(f"Refusing to overwrite existing output: {out_path} (use --overwrite)")
363
+
364
+ total = 0
365
+ mapped_ok = 0
366
+ dropped = 0
367
+ errors: Dict[str, int] = {}
368
+
369
+ mapped_rows: List[Dict[str, Any]] = []
370
+ for obj in _read_jsonl(in_path):
371
+ total += 1
372
+ mapped, err = _map_one_obj(
373
+ obj,
374
+ old_tokenizer=old_tok,
375
+ new_tokenizer=new_tok,
376
+ allow_fallback_answer=bool(args.allow_fallback_answer),
377
+ )
378
+ if err is not None or mapped is None:
379
+ errors[err or "unknown_error"] = errors.get(err or "unknown_error", 0) + 1
380
+ if bool(args.strict):
381
+ raise SystemExit(f"Failed to map {in_path} example #{total}: {err}")
382
+ dropped += 1
383
+ continue
384
+ mapped_ok += 1
385
+ mapped_rows.append(mapped)
386
+
387
+ written = _write_jsonl(out_path, mapped_rows)
388
+ if written != mapped_ok: # pragma: no cover
389
+ raise SystemExit(f"Internal error: written={written} != mapped_ok={mapped_ok}")
390
+
391
+ record = {
392
+ "in_jsonl": str(in_path),
393
+ "out_jsonl": str(out_path),
394
+ "total": int(total),
395
+ "mapped_ok": int(mapped_ok),
396
+ "dropped": int(dropped),
397
+ "errors": errors,
398
+ }
399
+ summary["datasets"].append(record)
400
+ print(json.dumps(record, ensure_ascii=False))
401
+
402
+ # Human-readable compact summary at end.
403
+ print(json.dumps(summary, ensure_ascii=False, indent=2))
404
+
405
+
406
+ if __name__ == "__main__":
407
+ main()
exp/proc/README.md ADDED
@@ -0,0 +1,98 @@
+ # exp/proc (exp2 trace mapping / external export)
+
+ This directory provides tooling that turns the trace output of `exp/exp2/run_exp.py --save_hop_traces` into compact per-sample `.npz` files intended for collaborators.
+
+ Main file:
+ - `exp/proc/map_exp2_traces_to_proc.py`: reads an exp2 trace run folder (`manifest.jsonl` + `ex_*.npz`) and writes the compact format to `exp/proc/output/`.
+
+ ---
+
+ ## Input requirements
+
+ You need to provide (or let the script infer):
+ - `--trace_dir`: an exp2 trace run folder, e.g.
+   - `exp/exp2/output/traces/exp/exp2/data/morehopqa.jsonl/qwen-8B/ifr_all_positions_mfaithfulness_gen_95ex/`
+ - `--dataset_jsonl`: the exp2 cached dataset that belongs to that trace run (must contain `prompt` + `target`), e.g.
+   - `exp/exp2/data/morehopqa.jsonl`
+ - `--tokenizer_model`: the same tokenizer used for exp2 attribution (local path or model name), e.g.
+   - `/opt/share/models/Qwen/Qwen3-8B/`
+
+ Notes (see the alignment sketch below):
+ - The script replicates exp2's token alignment exactly (leading space before the prompt; generation tokenized as `target + eos_token`, decoded, then re-tokenized with offset slicing), so the tokenizer must match the one used for exp2 attribution; otherwise the script errors out with a length mismatch.
+ - Samples are matched against `--dataset_jsonl` via the `prompt_sha1/target_sha1` fields in `manifest.jsonl`, so `--dataset_jsonl` must be the exact cache used by that trace run.
+
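+ A minimal sketch of the token-alignment rule described above (mirroring `_tokenize_for_exp2_alignment` in the script); the tokenizer path and texts are assumptions:
+
+ ```python
+ from transformers import AutoTokenizer
+
+ tok = AutoTokenizer.from_pretrained("/opt/share/models/Qwen/Qwen3-8B/")
+ prompt, target = "What is 6 * 7?", "6 * 7 = 42.\nThe answer is 42."
+
+ # Prompt side: exp2 prepends a single space before tokenizing.
+ prompt_ids = tok(" " + prompt, add_special_tokens=False).input_ids
+
+ # Generation side: tokenize target + EOS, decode, then re-tokenize with offsets
+ # so the decoded text can be sliced into per-token pieces.
+ gen_ids = tok(target + tok.eos_token, add_special_tokens=False).input_ids
+ gen_text = tok.decode(gen_ids, skip_special_tokens=False, clean_up_tokenization_spaces=False)
+ enc = tok(gen_text, add_special_tokens=False, return_offsets_mapping=True)
+ gen_pieces = [gen_text[s:e] for s, e in enc["offset_mapping"]]
+
+ # These are the lengths the proc script checks against prompt_len/gen_len in the trace npz.
+ print(len(prompt_ids), len(gen_pieces))
+ ```
+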
+ ---
+
+ ## Output location and naming
+
+ Default output location:
+ - `exp/proc/output/<same path as trace_dir after traces/>/`
+
+ For example, for the input
+ - `.../output/traces/exp/exp2/data/morehopqa.jsonl/qwen-8B/<run_tag>/`
+
+ the default output is
+ - `exp/proc/output/exp/exp2/data/morehopqa.jsonl/qwen-8B/<run_tag>/`
+
+ You can also set the output directory explicitly with `--out_dir`.
+
+ The output directory contains one file per sample: `ex_000000.npz`, `ex_000001.npz`, …
+
+ ---
+
+ ## Output `.npz` fields (compact; only what is needed)
+
+ Each output sample `.npz` contains **only** the following keys (a loading sketch follows the list):
+ - `attr`: `float32[L]`, row attribution vector; chat template and EOS removed, covering only the valid `input+cot+output` tokens.
+ - `hop`: `float32[H, L]` (optional, FT-IFR style methods only), per-hop vectors; EOS removed as well, aligned to the same length as `attr`.
+ - `tok`: `U[L]`, token text pieces strictly aligned with `attr/hop` (also without chat template and EOS).
+ - `span_in`: `int64[2]`, inclusive range of the input inside the vectors.
+ - `span_cot`: `int64[2]`, inclusive range of the CoT inside the vectors (`[-1, -1]` when there is no CoT).
+ - `span_out`: `int64[2]`, inclusive range of the output inside the vectors.
+ - `rise`: `float64`, row RISE (faithfulness).
+ - `mas`: `float64`, row MAS (faithfulness).
+ - `recovery`: `float64`, row Recovery@10% (NaN when recovery is unavailable).
+
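+ A minimal sketch for consuming one exported sample (the path is a placeholder for a real `ex_*.npz` under the output directory):
+
+ ```python
+ import numpy as np
+
+ with np.load("exp/proc/output/.../ex_000000.npz", allow_pickle=False) as f:
+     attr, tok = f["attr"], f["tok"]                    # float32[L], U[L], aligned
+     s_in, s_cot, s_out = f["span_in"], f["span_cot"], f["span_out"]
+     print("RISE", float(f["rise"]), "MAS", float(f["mas"]), "Recovery", float(f["recovery"]))
+
+     # Top-5 most attributed input tokens.
+     top = np.argsort(attr[int(s_in[0]) : int(s_in[1]) + 1])[::-1][:5] + int(s_in[0])
+     print([str(tok[i]) for i in top])
+
+     if "hop" in f.files:                               # per-hop vectors (FT-IFR methods only)
+         print("hop shape:", f["hop"].shape)
+ ```
+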
+ ---
+
+ ## Usage examples
+
+ Most common form (explicitly pass the dataset and tokenizer):
+ ```bash
+ python exp/proc/map_exp2_traces_to_proc.py \
+     --trace_dir exp/exp2/output/traces/exp/exp2/data/morehopqa.jsonl/qwen-8B/ifr_all_positions_mfaithfulness_gen_95ex \
+     --dataset_jsonl exp/exp2/data/morehopqa.jsonl \
+     --tokenizer_model /opt/share/models/Qwen/Qwen3-8B/
+ ```
+
+ Explicit output directory (instead of the default mirrored path):
+ ```bash
+ python exp/proc/map_exp2_traces_to_proc.py \
+     --trace_dir exp/exp2/output/traces/exp/exp2/data/math.jsonl/qwen-8B/ifr_multi_hop_both_n1_mfaithfulness_gen_100ex/ \
+     --dataset_jsonl exp/exp2/data/math.jsonl \
+     --tokenizer_model /opt/share/models/Qwen/Qwen3-8B/ \
+     --out_dir exp/proc/output/math_ifr_multi_hop_both
+ ```
+
+ Debugging: process only the first 5 samples and allow overwriting existing outputs:
+ ```bash
+ python exp/proc/map_exp2_traces_to_proc.py \
+     --trace_dir ... \
+     --dataset_jsonl ... \
+     --tokenizer_model ... \
+     --limit 5 \
+     --overwrite
+ ```
+
+ ---
+
+ ## FAQ
+
+ - Error "Prompt/Generation token length mismatch"
+   - Almost always a tokenizer mismatch; make sure `--tokenizer_model` is exactly the tokenizer used for exp2 attribution (simplest: reuse the same `--model_path`).
+ - Error "Failed to match manifest sha1 to dataset_jsonl"
+   - `--dataset_jsonl` is not the cache used by that trace run, or the cache has no `target`.
+ - FT-IFR methods missing `hop` in the output
+   - For `ifr_multi_hop_stop_words/ifr_multi_hop_both/ifr_multi_hop_split_hop/ifr_in_all_gen`, the exp2 trace must contain `vh`; if the trace is old, re-run exp2 (with `--save_hop_traces`).
+   - If you really need the output anyway, add `--allow_missing_ft_hops` (not recommended).
+
exp/proc/map_exp2_traces_to_proc.py ADDED
@@ -0,0 +1,411 @@
1
+ #!/usr/bin/env python3
2
+ """Map exp2 trace artifacts into a collaborator-friendly per-sample NPZ format.
3
+
4
+ Input: an exp2 trace run directory produced by `exp/exp2/run_exp.py --save_hop_traces`,
5
+ e.g.:
6
+
7
+ exp/exp2/output/traces/exp/exp2/data/morehopqa.jsonl/qwen-8B/ifr_all_positions_mfaithfulness_gen_95ex/
8
+
9
+ This directory contains:
10
+ - manifest.jsonl (one JSON object per sample)
11
+ - ex_*.npz (per-sample vectors and scores)
12
+
13
+ Output: per-sample NPZ files under `exp/proc/output/` (or a user-provided output path),
14
+ each containing only:
15
+ - attr: row attribution vector over [input + CoT + output] tokens, with chat template and EOS removed
16
+ - hop: per-hop vectors (FT-IFR only), aligned to attr (optional)
17
+ - tok: tokenized text pieces aligned to attr/hop (no chat template, no EOS)
18
+ - span_in/span_cot/span_out: inclusive ranges for input/CoT/output in the above vectors
19
+ - rise/mas: row faithfulness scores (RISE, MAS)
20
+ - recovery: row Recovery@10% score (NaN when unavailable)
21
+
22
+ This script is intentionally self-contained under exp/proc/ and does not modify exp2.
23
+ """
24
+
25
+ from __future__ import annotations
26
+
27
+ import argparse
28
+ import hashlib
29
+ import json
30
+ from dataclasses import dataclass
31
+ from pathlib import Path
32
+ from typing import Dict, List, Optional, Tuple
33
+
34
+ import numpy as np
35
+ from transformers import AutoTokenizer
36
+
37
+
38
+ FT_IFR_ATTR_FUNCS: set[str] = {
39
+ "ifr_in_all_gen",
40
+ "ifr_multi_hop_stop_words",
41
+ "ifr_multi_hop_both",
42
+ "ifr_multi_hop_split_hop",
43
+ }
44
+
45
+
46
+ def _sha1_text(text: str) -> str:
47
+ return hashlib.sha1(text.encode("utf-8")).hexdigest()
48
+
49
+
50
+ def _load_tokenizer(tokenizer_model: str):
51
+ tok_path = Path(tokenizer_model)
52
+ if tok_path.exists():
53
+ tokenizer = AutoTokenizer.from_pretrained(tok_path.as_posix(), local_files_only=True)
54
+ else:
55
+ tokenizer = AutoTokenizer.from_pretrained(tokenizer_model)
56
+ if tokenizer.eos_token is None:
57
+ raise SystemExit("Tokenizer is missing eos_token; cannot match exp2 generation tokenization.")
58
+ if tokenizer.pad_token is None and tokenizer.eos_token is not None:
59
+ tokenizer.pad_token = tokenizer.eos_token
60
+ return tokenizer
61
+
62
+
63
+ def _decode_text_into_tokens(tokenizer, text: str) -> List[str]:
64
+ """Mirror llm_attr.LLMAttribution.decode_text_into_tokens (offset-slice tokens)."""
65
+ enc = tokenizer(text, return_offsets_mapping=True, add_special_tokens=False)
66
+ ids = enc.get("input_ids")
67
+ offsets = enc.get("offset_mapping")
68
+ if ids is None or offsets is None:
69
+ raise ValueError("Tokenizer must provide input_ids and offset_mapping for exact exp2 token alignment.")
70
+ if len(ids) != len(offsets):
71
+ raise ValueError("Tokenizer returned mismatched input_ids vs offset_mapping lengths.")
72
+ tokens: List[str] = []
73
+ for start, end in offsets:
74
+ tokens.append(text[int(start) : int(end)])
75
+ return tokens
76
+
77
+
78
+ @dataclass(frozen=True)
79
+ class DatasetEntry:
80
+ prompt: str
81
+ target: str
82
+
83
+
84
+ def _index_dataset_by_sha1(dataset_jsonl: Path) -> Dict[Tuple[str, str], DatasetEntry]:
85
+ """Build (prompt_sha1, target_sha1) -> (prompt, target) for cache lookup."""
86
+ index: Dict[Tuple[str, str], DatasetEntry] = {}
87
+ collisions: Dict[Tuple[str, str], int] = {}
88
+
89
+ with dataset_jsonl.open("r", encoding="utf-8") as f:
90
+ for line_num, line in enumerate(f, start=1):
91
+ if not line.strip():
92
+ continue
93
+ obj = json.loads(line)
94
+ prompt = str(obj.get("prompt") or "")
95
+ target = obj.get("target")
96
+ if target is None:
97
+ # exp2 trace matching requires cached targets.
98
+ continue
99
+ target = str(target)
100
+
101
+ key = (_sha1_text(prompt), _sha1_text(target))
102
+ if key in index:
103
+ collisions[key] = collisions.get(key, 1) + 1
104
+ continue
105
+ index[key] = DatasetEntry(prompt=prompt, target=target)
106
+
107
+ if collisions:
108
+ raise SystemExit(
109
+ "Dataset cache contains duplicate (prompt,target) pairs; cannot uniquely match by sha1. "
110
+ f"Example collision count={next(iter(collisions.values()))}. "
111
+ f"dataset_jsonl={dataset_jsonl}"
112
+ )
113
+
114
+ if not index:
115
+ raise SystemExit(
116
+ "No usable (prompt,target) pairs found in dataset cache. "
117
+ "Ensure you pass the exp2 cached JSONL used for attribution (with target filled)."
118
+ )
119
+
120
+ return index
121
+
122
+
123
+ def _infer_trace_suffix(trace_dir: Path) -> Optional[Path]:
124
+ parts = list(trace_dir.parts)
125
+ if "traces" not in parts:
126
+ return None
127
+ idx = parts.index("traces")
128
+ suffix_parts = parts[idx + 1 :]
129
+ if not suffix_parts:
130
+ return None
131
+ return Path(*suffix_parts)
132
+
133
+
134
+ def _parse_manifest(manifest_path: Path) -> List[dict]:
135
+ records: List[dict] = []
136
+ with manifest_path.open("r", encoding="utf-8") as f:
137
+ for line in f:
138
+ if not line.strip():
139
+ continue
140
+ records.append(json.loads(line))
141
+ if not records:
142
+ raise SystemExit(f"Empty manifest.jsonl: {manifest_path}")
143
+ return records
144
+
145
+
146
+ def _read_span(npz: np.lib.npyio.NpzFile, key: str) -> Optional[Tuple[int, int]]:
147
+ if key not in npz.files:
148
+ return None
149
+ arr = npz[key]
150
+ if arr.shape != (2,):
151
+ raise ValueError(f"Expected {key} to have shape (2,), got {arr.shape}.")
152
+ return int(arr[0]), int(arr[1])
153
+
154
+
155
+ def _span_or_empty(span: Optional[Tuple[int, int]]) -> Tuple[int, int]:
156
+ if span is None:
157
+ return -1, -1
158
+ return int(span[0]), int(span[1])
159
+
160
+
161
+ def _tokenize_for_exp2_alignment(
162
+ tokenizer,
163
+ *,
164
+ prompt: str,
165
+ target: str,
166
+ expected_prompt_len: int,
167
+ expected_gen_len: int,
168
+ ) -> List[str]:
169
+ prompt_text = " " + (prompt or "")
170
+ prompt_tokens = _decode_text_into_tokens(tokenizer, prompt_text)
171
+ if len(prompt_tokens) != int(expected_prompt_len):
172
+ raise ValueError(f"Prompt token length mismatch: expected {expected_prompt_len}, got {len(prompt_tokens)}.")
173
+
174
+ gen_ids = tokenizer(target + tokenizer.eos_token, add_special_tokens=False).input_ids
175
+ gen_text = tokenizer.decode(gen_ids, skip_special_tokens=False, clean_up_tokenization_spaces=False)
176
+ gen_tokens = _decode_text_into_tokens(tokenizer, gen_text)
177
+ if len(gen_tokens) != int(expected_gen_len):
178
+ raise ValueError(f"Generation token length mismatch: expected {expected_gen_len}, got {len(gen_tokens)}.")
179
+
180
+ gen_tokens_no_eos = gen_tokens[:-1] if gen_tokens else []
181
+ return prompt_tokens + gen_tokens_no_eos
182
+
183
+
184
+ def _clamp_span(span: Optional[Tuple[int, int]], *, max_index: int) -> Optional[Tuple[int, int]]:
185
+ if span is None:
186
+ return None
187
+ start, end = int(span[0]), int(span[1])
188
+ if max_index < 0:
189
+ return None
190
+ if end < 0 or start > max_index:
191
+ return None
192
+ start = max(0, start)
193
+ end = min(max_index, end)
194
+ if end < start:
195
+ return None
196
+ return start, end
197
+
198
+
199
+ def _proc_one(
200
+ *,
201
+ trace_npz_path: Path,
202
+ record: dict,
203
+ dataset_index: Dict[Tuple[str, str], DatasetEntry],
204
+ tokenizer,
205
+ out_path: Path,
206
+ overwrite: bool,
207
+ allow_missing_ft_hops: bool,
208
+ ) -> None:
209
+ prompt_sha1 = str(record.get("prompt_sha1") or "")
210
+ target_sha1 = str(record.get("target_sha1") or "")
211
+ if not prompt_sha1 or not target_sha1:
212
+ raise ValueError("manifest record missing prompt_sha1/target_sha1; cannot match dataset.")
213
+
214
+ entry = dataset_index.get((prompt_sha1, target_sha1))
215
+ if entry is None:
216
+ raise ValueError(
217
+ "Failed to match manifest sha1 to dataset_jsonl. "
218
+ "Ensure --dataset_jsonl points to the exact cached JSONL used for this trace run."
219
+ )
220
+
221
+ if out_path.exists() and not overwrite:
222
+ raise FileExistsError(f"Refusing to overwrite existing file: {out_path} (use --overwrite).")
223
+ out_path.parent.mkdir(parents=True, exist_ok=True)
224
+
225
+ with np.load(trace_npz_path, allow_pickle=False) as f:
226
+ prompt_len = int(np.asarray(f.get("prompt_len")).item())
227
+ gen_len = int(np.asarray(f.get("gen_len")).item())
228
+ total_len = prompt_len + gen_len
229
+ gen_no_eos = max(0, gen_len - 1)
230
+ L = prompt_len + gen_no_eos
231
+
232
+ v_row_all = f.get("v_row_all")
233
+ if v_row_all is None:
234
+ raise ValueError("Missing v_row_all in trace npz; cannot build row attribution vector.")
235
+ v_row_all = np.asarray(v_row_all, dtype=np.float32)
236
+ if v_row_all.ndim != 1 or int(v_row_all.shape[0]) != int(total_len):
237
+ raise ValueError(f"v_row_all shape mismatch: expected ({total_len},), got {tuple(v_row_all.shape)}.")
238
+ attr = v_row_all[:L]
239
+
240
+ indices_to_explain = _read_span(f, "indices_to_explain_gen")
241
+ sink_span_gen = _read_span(f, "sink_span_gen") or indices_to_explain
242
+ if sink_span_gen is None:
243
+ raise ValueError("Missing sink_span_gen/indices_to_explain_gen; cannot define output span.")
244
+ thinking_span_gen = _read_span(f, "thinking_span_gen")
245
+ if thinking_span_gen is None:
246
+ sink_start = int(sink_span_gen[0])
247
+ think_end = sink_start - 1
248
+ thinking_span_gen = (0, think_end) if think_end >= 0 else None
249
+
250
+ sink_span_gen = _clamp_span(sink_span_gen, max_index=gen_no_eos - 1)
251
+ thinking_span_gen = _clamp_span(thinking_span_gen, max_index=gen_no_eos - 1)
252
+
253
+ span_in = (0, prompt_len - 1) if prompt_len > 0 else (-1, -1)
254
+ span_cot = (
255
+ (prompt_len + thinking_span_gen[0], prompt_len + thinking_span_gen[1])
256
+ if thinking_span_gen is not None
257
+ else (-1, -1)
258
+ )
259
+ span_out = (
260
+ (prompt_len + sink_span_gen[0], prompt_len + sink_span_gen[1]) if sink_span_gen is not None else (-1, -1)
261
+ )
262
+
263
+ tokens = _tokenize_for_exp2_alignment(
264
+ tokenizer,
265
+ prompt=entry.prompt,
266
+ target=entry.target,
267
+ expected_prompt_len=prompt_len,
268
+ expected_gen_len=gen_len,
269
+ )
270
+ if len(tokens) != int(L):
271
+ raise ValueError(f"Token length mismatch after EOS drop: expected {L}, got {len(tokens)}.")
272
+
273
+ # Scores: row = index 1.
274
+ rise = float("nan")
275
+ mas = float("nan")
276
+ faith = f.get("faithfulness_scores")
277
+ if faith is not None:
278
+ faith = np.asarray(faith, dtype=np.float64)
279
+ if faith.shape != (3, 3):
280
+ raise ValueError(f"faithfulness_scores shape mismatch: expected (3,3), got {tuple(faith.shape)}.")
281
+ rise = float(faith[1, 0])
282
+ mas = float(faith[1, 1])
283
+
284
+ recovery = float("nan")
285
+ rec = f.get("recovery_scores")
286
+ if rec is not None:
287
+ rec = np.asarray(rec, dtype=np.float64)
288
+ if rec.shape != (3,):
289
+ raise ValueError(f"recovery_scores shape mismatch: expected (3,), got {tuple(rec.shape)}.")
290
+ recovery = float(rec[1])
291
+
292
+ out_payload = {
293
+ "attr": np.asarray(attr, dtype=np.float32),
294
+ "tok": np.asarray(tokens, dtype=np.str_),
295
+ "span_in": np.asarray(span_in, dtype=np.int64),
296
+ "span_cot": np.asarray(span_cot, dtype=np.int64),
297
+ "span_out": np.asarray(span_out, dtype=np.int64),
298
+ "rise": np.asarray(rise, dtype=np.float64),
299
+ "mas": np.asarray(mas, dtype=np.float64),
300
+ "recovery": np.asarray(recovery, dtype=np.float64),
301
+ }
302
+
303
+ attr_func = str(record.get("attr_func") or "")
304
+ want_hops = attr_func in FT_IFR_ATTR_FUNCS
305
+ if want_hops:
306
+ vh = f.get("vh")
307
+ if vh is None:
308
+ if not allow_missing_ft_hops:
309
+ raise ValueError(
310
+ f"FT-IFR method '{attr_func}' requires per-hop vectors but trace npz is missing 'vh'. "
311
+ "Re-run exp2 with --save_hop_traces using the updated code."
312
+ )
313
+ else:
314
+ vh = np.asarray(vh, dtype=np.float32)
315
+ if vh.ndim != 2 or int(vh.shape[1]) != int(total_len):
316
+ raise ValueError(
317
+ f"vh shape mismatch: expected (H,{total_len}), got {tuple(vh.shape)} for {trace_npz_path}."
318
+ )
319
+ out_payload["hop"] = vh[:, :L]
320
+
321
+ np.savez_compressed(out_path, **out_payload)
322
+
323
+
324
+ def main() -> None:
325
+ ap = argparse.ArgumentParser("Map exp2 trace folder -> exp/proc/output per-sample npz files.")
326
+ ap.add_argument("--trace_dir", type=str, required=True, help="Path to an exp2 trace run directory (contains manifest.jsonl).")
327
+ ap.add_argument("--dataset_jsonl", type=str, default=None, help="Path to the exp2 cached dataset JSONL used for this trace.")
328
+ ap.add_argument(
329
+ "--tokenizer_model",
330
+ type=str,
331
+ required=True,
332
+ help="Tokenizer model name or local path (must match exp2 attribution tokenizer).",
333
+ )
334
+ ap.add_argument("--out_root", type=str, default="exp/proc/output", help="Root directory for proc outputs.")
335
+ ap.add_argument("--out_dir", type=str, default=None, help="Optional explicit output directory (overrides --out_root).")
336
+ ap.add_argument("--overwrite", action="store_true", help="Overwrite existing output files if present.")
337
+ ap.add_argument("--limit", type=int, default=None, help="Optional limit on number of samples to process (debug).")
338
+ ap.add_argument(
339
+ "--allow_missing_ft_hops",
340
+ action="store_true",
341
+ help="Allow producing FT-IFR outputs even when per-hop vectors (vh) are missing (not recommended).",
342
+ )
343
+ args = ap.parse_args()
344
+
345
+ trace_dir = Path(args.trace_dir)
346
+ if not trace_dir.exists() or not trace_dir.is_dir():
347
+ raise SystemExit(f"Missing trace_dir: {trace_dir}")
348
+ manifest_path = trace_dir / "manifest.jsonl"
349
+ if not manifest_path.exists():
350
+ raise SystemExit(f"Missing manifest.jsonl: {manifest_path}")
351
+
352
+ dataset_jsonl: Optional[Path] = Path(args.dataset_jsonl) if args.dataset_jsonl else None
353
+ if dataset_jsonl is None:
354
+ suffix = _infer_trace_suffix(trace_dir)
355
+ if suffix is not None and len(suffix.parts) >= 3:
356
+ # suffix = <dataset_name...>/<model_tag>/<run_tag>
357
+ inferred_dataset = Path(*suffix.parts[:-2])
358
+ if inferred_dataset.exists() and inferred_dataset.is_file():
359
+ dataset_jsonl = inferred_dataset
360
+ if dataset_jsonl is None:
361
+ raise SystemExit("Please pass --dataset_jsonl (could not infer it from --trace_dir).")
362
+ if not dataset_jsonl.exists():
363
+ raise SystemExit(f"Missing --dataset_jsonl: {dataset_jsonl}")
364
+
365
+ tokenizer = _load_tokenizer(str(args.tokenizer_model))
366
+ dataset_index = _index_dataset_by_sha1(dataset_jsonl)
367
+ records = _parse_manifest(manifest_path)
368
+
369
+ if args.out_dir:
370
+ out_dir = Path(args.out_dir)
371
+ else:
372
+ suffix = _infer_trace_suffix(trace_dir)
373
+ out_dir = Path(args.out_root) / suffix if suffix is not None else Path(args.out_root) / trace_dir.name
374
+ out_dir.mkdir(parents=True, exist_ok=True)
375
+
376
+ total = len(records)
377
+ limit = args.limit
378
+ if limit is not None:
379
+ if limit <= 0:
380
+ raise SystemExit("--limit must be a positive integer.")
381
+ total = min(total, int(limit))
382
+
383
+ processed = 0
384
+ for record in records[:total]:
385
+ file_name = str(record.get("file") or "")
386
+ if not file_name:
387
+ raise SystemExit("manifest record missing 'file' field.")
388
+ trace_npz_path = trace_dir / file_name
389
+ if not trace_npz_path.exists():
390
+ raise SystemExit(f"Missing trace npz referenced by manifest: {trace_npz_path}")
391
+
392
+ out_path = out_dir / file_name
393
+ try:
394
+ _proc_one(
395
+ trace_npz_path=trace_npz_path,
396
+ record=record,
397
+ dataset_index=dataset_index,
398
+ tokenizer=tokenizer,
399
+ out_path=out_path,
400
+ overwrite=bool(args.overwrite),
401
+ allow_missing_ft_hops=bool(args.allow_missing_ft_hops),
402
+ )
403
+ except Exception as exc:
404
+ raise SystemExit(f"Failed processing {trace_npz_path}: {exc}") from exc
405
+ processed += 1
406
+
407
+ print(f"Wrote {processed} proc samples -> {out_dir}")
408
+
409
+
410
+ if __name__ == "__main__":
411
+ main()
exp/proc_1/README.md ADDED
@@ -0,0 +1,72 @@
1
+ # exp/proc_1 (exp2 trace mapping / export for collaborators, v1)
2
+
3
+ This directory provides a tool (v1) that converts the trace results produced by `exp/exp2/run_exp.py --save_hop_traces` into compact per-sample `.npz` files intended for collaborators.
4
+
5
+ Differences from `exp/proc/`:
6
+ - Drops `tok` (the per-token text pieces).
7
+ - Adds `length` (token counts of the three segments): `[in, cot, out]`, guaranteed to stay aligned with `span_in/span_cot/span_out`.
8
+ - The `hop` field follows a "default policy": `hop` is written only when the trace sample contains `vh`; otherwise it is omitted without raising an error.
9
+ - Can process every run directory under `exp/exp2/output/traces/` (all dataset-method combinations) in one pass.
10
+
11
+ ---
12
+
13
+ ## Input layout (exp2 traces)
14
+
15
+ An `exp2` trace run directory looks like:
16
+ - `exp/exp2/output/traces/<dataset>/<model>/<run_tag>/`
17
+
18
+ Each run directory contains:
19
+ - `manifest.jsonl` (one sample record per line, including `file=ex_*.npz`)
20
+ - `ex_*.npz` (one npz per sample)
21
+
22
+ ---
23
+
24
+ ## Output location and naming
25
+
26
+ Outputs default to:
27
+ - `exp/proc_1/output/<the path suffix of trace_dir after traces/>/`
28
+
29
+ For example, given the input:
30
+ - `.../output/traces/exp/exp2/data/math.jsonl/qwen-8B/<run_tag>/`
31
+
32
+ the default output is:
33
+ - `exp/proc_1/output/exp/exp2/data/math.jsonl/qwen-8B/<run_tag>/`
34
+
35
+ ---
36
+
37
+ ## Output `.npz` fields
38
+
39
+ Each output sample `.npz` contains only the following keys (a loading sketch follows the list):
40
+ - `attr`: `float32[L]`, the row attribution vector; covers the valid `input+cot+output` tokens (the trailing EOS of the generation is removed).
41
+ - `hop`: `float32[H, L]` (optional), written when the trace npz contains `vh` (EOS removed as well, aligned to the same length as `attr`).
42
+ - `span_in`: `int64[2]`, the inclusive range of the input within the vectors.
43
+ - `span_cot`: `int64[2]`, the inclusive range of the CoT within the vectors (`[-1, -1]` when there is no CoT).
44
+ - `span_out`: `int64[2]`, the inclusive range of the output within the vectors.
45
+ - `length`: `int64[3]`, ordered `[in, cot, out]`; each entry corresponds strictly to the matching `span_*` (inclusive-range length `end-start+1`, empty spans have length 0).
46
+ - `rise`: `float64`, the row RISE (faithfulness) score.
47
+ - `mas`: `float64`, the row MAS (faithfulness) score.
48
+ - `recovery`: `float64`, the row Recovery@10% score (NaN when no recovery score is available).
49
+
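+ To make the field layout concrete, here is a minimal loading sketch. It is illustrative only: the file path is a placeholder, and it assumes `numpy` is installed.
+
+ ```python
+ import numpy as np
+
+ # Placeholder path: point this at one exported sample file.
+ with np.load("exp/proc_1/output/.../ex_000.npz") as z:
+     attr = z["attr"]                                   # float32[L]
+     span_in, span_cot, span_out = z["span_in"], z["span_cot"], z["span_out"]
+     length = z["length"]                               # int64[3] = [in, cot, out]
+
+     # Spans are inclusive: a non-empty span [s, e] has length e - s + 1; [-1, -1] means empty.
+     for span, n in zip((span_in, span_cot, span_out), length):
+         expected = 0 if span[0] < 0 or span[1] < span[0] else int(span[1] - span[0] + 1)
+         assert int(n) == expected
+
+     # Slice the CoT portion of the attribution vector (empty when there is no CoT).
+     cot_attr = attr[span_cot[0] : span_cot[1] + 1] if length[1] > 0 else attr[:0]
+
+     # `hop` is optional: present only when the trace carried per-hop vectors (vh).
+     hop = z["hop"] if "hop" in z.files else None       # float32[H, L] or None
+ ```
+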
50
+ ---
51
+
52
+ ## Usage examples
53
+
54
+ Process all runs under traces (recommended):
55
+ ```bash
56
+ python exp/proc_1/map_exp2_traces_to_proc_1.py \
57
+ --traces_root exp/exp2/output/traces
58
+ ```
59
+
60
+ Process a single run directory only:
61
+ ```bash
62
+ python exp/proc_1/map_exp2_traces_to_proc_1.py \
63
+ --trace_dir exp/exp2/output/traces/exp/exp2/data/math.jsonl/qwen-8B/ifr_multi_hop_both_n1_mfaithfulness_gen_100ex
64
+ ```
65
+
66
+ Debug: process only the first 5 samples of each run and allow overwriting outputs:
67
+ ```bash
68
+ python exp/proc_1/map_exp2_traces_to_proc_1.py \
69
+ --traces_root exp/exp2/output/traces \
70
+ --limit 5 \
71
+ --overwrite
72
+ ```
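+
+ As a quick sanity check after a run, the snippet below counts how many exported samples include the optional `hop` array. It is illustrative only; the output root shown is the default and may need adjusting.
+
+ ```python
+ from pathlib import Path
+ import numpy as np
+
+ files = sorted(Path("exp/proc_1/output").rglob("ex_*.npz"))
+ with_hop = 0
+ for p in files:
+     with np.load(p) as z:
+         with_hop += int("hop" in z.files)
+ print(f"{with_hop}/{len(files)} samples include per-hop vectors")
+ ```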
exp/proc_1/map_exp2_traces_to_proc_1.py ADDED
@@ -0,0 +1,338 @@
1
+ #!/usr/bin/env python3
2
+ """Map exp2 trace artifacts into a collaborator-friendly per-sample NPZ format (proc_1).
3
+
4
+ This is a lightweight variant of `exp/proc/map_exp2_traces_to_proc.py`:
5
+ - Removes `tok` (per-token text pieces).
6
+ - Adds `length` with three components [in, cot, out], aligned to span_in/span_cot/span_out.
7
+ - Saves `hop` only when the trace sample contains `vh` (default strategy).
8
+ - Can process a single exp2 trace run directory or all run directories under a traces root.
9
+
10
+ Input: an exp2 trace run directory produced by `exp/exp2/run_exp.py --save_hop_traces`, e.g.:
11
+
12
+ exp/exp2/output/traces/exp/exp2/data/math.jsonl/qwen-8B/ifr_multi_hop_both_n1_mfaithfulness_gen_100ex/
13
+
14
+ This directory contains:
15
+ - manifest.jsonl (one JSON object per sample)
16
+ - ex_*.npz (per-sample vectors and scores)
17
+
18
+ Output: per-sample NPZ files under `exp/proc_1/output/` (or a user-provided output path),
19
+ each containing only:
20
+ - attr: row attribution vector over [input + CoT + output] tokens, with EOS removed
21
+ - hop: per-hop vectors (optional; only if `vh` exists in the trace npz), aligned to attr
22
+ - span_in/span_cot/span_out: inclusive ranges for input/CoT/output in the above vectors
23
+ - length: int64[3] = [in, cot, out], derived strictly from spans
24
+ - rise/mas: row faithfulness scores (RISE, MAS)
25
+ - recovery: row Recovery@10% score (NaN when unavailable)
26
+
27
+ This script is intentionally self-contained under exp/proc_1/ and does not modify exp2.
28
+ """
29
+
30
+ from __future__ import annotations
31
+
32
+ import argparse
33
+ import json
34
+ from dataclasses import dataclass
35
+ from pathlib import Path
36
+ from typing import Iterable, List, Optional, Tuple
37
+
38
+ import numpy as np
39
+
40
+
41
+ def _infer_trace_suffix(trace_dir: Path) -> Optional[Path]:
42
+ parts = list(trace_dir.parts)
43
+ if "traces" not in parts:
44
+ return None
45
+ idx = parts.index("traces")
46
+ suffix_parts = parts[idx + 1 :]
47
+ if not suffix_parts:
48
+ return None
49
+ return Path(*suffix_parts)
50
+
51
+
52
+ def _iter_run_dirs(traces_root: Path) -> List[Path]:
53
+ runs = {p.parent for p in traces_root.rglob("manifest.jsonl") if p.is_file()}
54
+ return sorted(runs)
55
+
56
+
57
+ def _parse_manifest(manifest_path: Path) -> List[dict]:
58
+ records: List[dict] = []
59
+ with manifest_path.open("r", encoding="utf-8") as f:
60
+ for line in f:
61
+ if not line.strip():
62
+ continue
63
+ records.append(json.loads(line))
64
+ return records
65
+
66
+
67
+ def _read_span(npz: np.lib.npyio.NpzFile, key: str) -> Optional[Tuple[int, int]]:
68
+ if key not in npz.files:
69
+ return None
70
+ arr = npz[key]
71
+ if arr.shape != (2,):
72
+ raise ValueError(f"Expected {key} to have shape (2,), got {arr.shape}.")
73
+ return int(arr[0]), int(arr[1])
74
+
75
+
76
+ def _clamp_span(span: Optional[Tuple[int, int]], *, max_index: int) -> Optional[Tuple[int, int]]:
77
+ if span is None:
78
+ return None
79
+ start, end = int(span[0]), int(span[1])
80
+ if max_index < 0:
81
+ return None
82
+ if end < 0 or start > max_index:
83
+ return None
84
+ start = max(0, start)
85
+ end = min(max_index, end)
86
+ if end < start:
87
+ return None
88
+ return start, end
89
+
90
+
91
+ def _span_len(span: Tuple[int, int]) -> int:
92
+ start, end = int(span[0]), int(span[1])
93
+ if start < 0 or end < 0 or end < start:
94
+ return 0
95
+ return int(end - start + 1)
96
+
97
+
98
+ @dataclass(frozen=True)
99
+ class ProcOneResult:
100
+ wrote: bool
101
+ has_hop: bool
102
+
103
+
104
+ def _proc_one(
105
+ *,
106
+ trace_npz_path: Path,
107
+ record: dict,
108
+ out_path: Path,
109
+ overwrite: bool,
110
+ ) -> ProcOneResult:
111
+ if out_path.exists() and not overwrite:
112
+ raise FileExistsError(f"Refusing to overwrite existing file: {out_path} (use --overwrite).")
113
+ out_path.parent.mkdir(parents=True, exist_ok=True)
114
+
115
+ with np.load(trace_npz_path, allow_pickle=False) as f:
116
+ prompt_len = int(np.asarray(f.get("prompt_len")).item())
117
+ gen_len = int(np.asarray(f.get("gen_len")).item())
118
+ total_len = prompt_len + gen_len
119
+ gen_no_eos = max(0, gen_len - 1)
120
+ L = prompt_len + gen_no_eos
121
+
122
+ v_row_all = f.get("v_row_all")
123
+ if v_row_all is None:
124
+ raise ValueError("Missing v_row_all in trace npz; cannot build row attribution vector.")
125
+ v_row_all = np.asarray(v_row_all, dtype=np.float32)
126
+ if v_row_all.ndim != 1 or int(v_row_all.shape[0]) != int(total_len):
127
+ raise ValueError(f"v_row_all shape mismatch: expected ({total_len},), got {tuple(v_row_all.shape)}.")
128
+ attr = v_row_all[:L]
129
+
130
+ indices_to_explain = _read_span(f, "indices_to_explain_gen")
131
+ sink_span_gen = _read_span(f, "sink_span_gen") or indices_to_explain
132
+ if sink_span_gen is None:
133
+ raise ValueError("Missing sink_span_gen/indices_to_explain_gen; cannot define output span.")
134
+ thinking_span_gen = _read_span(f, "thinking_span_gen")
135
+ if thinking_span_gen is None:
136
+ sink_start = int(sink_span_gen[0])
137
+ think_end = sink_start - 1
138
+ thinking_span_gen = (0, think_end) if think_end >= 0 else None
139
+
140
+ sink_span_gen = _clamp_span(sink_span_gen, max_index=gen_no_eos - 1)
141
+ thinking_span_gen = _clamp_span(thinking_span_gen, max_index=gen_no_eos - 1)
142
+
143
+ # Build full-sequence spans: the input span covers the prompt, and the
+ # generation-relative CoT/output spans (already clamped to the EOS-stripped
+ # generation above) are shifted by prompt_len.
+ span_in = (0, prompt_len - 1) if prompt_len > 0 else (-1, -1)
144
+ span_cot = (
145
+ (prompt_len + thinking_span_gen[0], prompt_len + thinking_span_gen[1])
146
+ if thinking_span_gen is not None
147
+ else (-1, -1)
148
+ )
149
+ span_out = (
150
+ (prompt_len + sink_span_gen[0], prompt_len + sink_span_gen[1]) if sink_span_gen is not None else (-1, -1)
151
+ )
152
+
153
+ length = np.asarray([_span_len(span_in), _span_len(span_cot), _span_len(span_out)], dtype=np.int64)
154
+
155
+ rise = float("nan")
156
+ mas = float("nan")
157
+ faith = f.get("faithfulness_scores")
158
+ if faith is not None:
159
+ faith = np.asarray(faith, dtype=np.float64)
160
+ if faith.shape != (3, 3):
161
+ raise ValueError(f"faithfulness_scores shape mismatch: expected (3,3), got {tuple(faith.shape)}.")
162
+ rise = float(faith[1, 0])
163
+ mas = float(faith[1, 1])
164
+
165
+ recovery = float("nan")
166
+ rec = f.get("recovery_scores")
167
+ if rec is not None:
168
+ rec = np.asarray(rec, dtype=np.float64)
169
+ if rec.shape != (3,):
170
+ raise ValueError(f"recovery_scores shape mismatch: expected (3,), got {tuple(rec.shape)}.")
171
+ recovery = float(rec[1])
172
+
173
+ out_payload = {
174
+ "attr": np.asarray(attr, dtype=np.float32),
175
+ "span_in": np.asarray(span_in, dtype=np.int64),
176
+ "span_cot": np.asarray(span_cot, dtype=np.int64),
177
+ "span_out": np.asarray(span_out, dtype=np.int64),
178
+ "length": np.asarray(length, dtype=np.int64),
179
+ "rise": np.asarray(rise, dtype=np.float64),
180
+ "mas": np.asarray(mas, dtype=np.float64),
181
+ "recovery": np.asarray(recovery, dtype=np.float64),
182
+ }
183
+
184
+ has_hop = False
185
+ vh = f.get("vh")
186
+ if vh is not None:
187
+ vh = np.asarray(vh, dtype=np.float32)
188
+ if vh.ndim != 2 or int(vh.shape[1]) != int(total_len):
189
+ raise ValueError(f"vh shape mismatch: expected (H,{total_len}), got {tuple(vh.shape)} for {trace_npz_path}.")
190
+ out_payload["hop"] = vh[:, :L]
191
+ has_hop = True
192
+
193
+ np.savez_compressed(out_path, **out_payload)
194
+ _ = record
195
+ return ProcOneResult(wrote=True, has_hop=has_hop)
196
+
197
+
198
+ def _resolve_out_dir_for_trace_dir(*, trace_dir: Path, out_root: Path, out_dir: Optional[Path]) -> Path:
199
+ if out_dir is not None:
200
+ return out_dir
201
+ suffix = _infer_trace_suffix(trace_dir)
202
+ return (out_root / suffix) if suffix is not None else (out_root / trace_dir.name)
203
+
204
+
205
+ def _process_trace_dir(
206
+ *,
207
+ trace_dir: Path,
208
+ out_root: Path,
209
+ out_dir: Optional[Path],
210
+ overwrite: bool,
211
+ limit: Optional[int],
212
+ skip_empty_manifest: bool,
213
+ ) -> Tuple[int, int]:
214
+ manifest_path = trace_dir / "manifest.jsonl"
215
+ if not manifest_path.exists():
216
+ raise SystemExit(f"Missing manifest.jsonl: {manifest_path}")
217
+
218
+ records = _parse_manifest(manifest_path)
219
+ if not records:
220
+ if skip_empty_manifest:
221
+ print(f"[skip] empty manifest: {manifest_path}")
222
+ return 0, 0
223
+ raise SystemExit(f"Empty manifest.jsonl: {manifest_path}")
224
+
225
+ total = len(records)
226
+ if limit is not None:
227
+ if limit <= 0:
228
+ raise SystemExit("--limit must be a positive integer.")
229
+ total = min(total, int(limit))
230
+
231
+ resolved_out_dir = _resolve_out_dir_for_trace_dir(trace_dir=trace_dir, out_root=out_root, out_dir=out_dir)
232
+ resolved_out_dir.mkdir(parents=True, exist_ok=True)
233
+
234
+ wrote = 0
235
+ wrote_with_hop = 0
236
+ for record in records[:total]:
237
+ file_name = str(record.get("file") or "")
238
+ if not file_name:
239
+ raise SystemExit("manifest record missing 'file' field.")
240
+ trace_npz_path = trace_dir / file_name
241
+ if not trace_npz_path.exists():
242
+ raise SystemExit(f"Missing trace npz referenced by manifest: {trace_npz_path}")
243
+
244
+ out_path = resolved_out_dir / file_name
245
+ try:
246
+ res = _proc_one(trace_npz_path=trace_npz_path, record=record, out_path=out_path, overwrite=overwrite)
247
+ except Exception as exc:
248
+ raise SystemExit(f"Failed processing {trace_npz_path}: {exc}") from exc
249
+ wrote += int(res.wrote)
250
+ wrote_with_hop += int(res.has_hop)
251
+
252
+ print(f"[ok] wrote {wrote} samples ({wrote_with_hop} with hop) -> {resolved_out_dir}")
253
+ return wrote, wrote_with_hop
254
+
255
+
256
+ def main() -> None:
257
+ ap = argparse.ArgumentParser("Map exp2 trace folder(s) -> exp/proc_1/output per-sample npz files.")
258
+ ap.add_argument(
259
+ "--trace_dir",
260
+ type=str,
261
+ default=None,
262
+ help="Path to a single exp2 trace run directory (contains manifest.jsonl).",
263
+ )
264
+ ap.add_argument(
265
+ "--traces_root",
266
+ type=str,
267
+ default=None,
268
+ help="Path to traces root; processes all run dirs under it (each with a manifest.jsonl).",
269
+ )
270
+ ap.add_argument("--out_root", type=str, default="exp/proc_1/output", help="Root directory for proc_1 outputs.")
271
+ ap.add_argument(
272
+ "--out_dir",
273
+ type=str,
274
+ default=None,
275
+ help="Optional explicit output directory (only valid with --trace_dir; overrides --out_root).",
276
+ )
277
+ ap.add_argument("--overwrite", action="store_true", help="Overwrite existing output files if present.")
278
+ ap.add_argument("--limit", type=int, default=None, help="Optional limit on number of samples per run (debug).")
279
+ ap.add_argument(
280
+ "--fail_on_empty_manifest",
281
+ action="store_true",
282
+ help="Fail (instead of skipping) when encountering an empty manifest.jsonl.",
283
+ )
284
+ args = ap.parse_args()
285
+
286
+ trace_dir = Path(args.trace_dir) if args.trace_dir else None
287
+ traces_root = Path(args.traces_root) if args.traces_root else None
288
+ if (trace_dir is None) == (traces_root is None):
289
+ raise SystemExit("Please pass exactly one of --trace_dir or --traces_root.")
290
+
291
+ out_root = Path(args.out_root)
292
+ out_dir = Path(args.out_dir) if args.out_dir else None
293
+ if out_dir is not None and trace_dir is None:
294
+ raise SystemExit("--out_dir is only valid with --trace_dir (for --traces_root use --out_root).")
295
+
296
+ skip_empty_manifest = not bool(args.fail_on_empty_manifest)
297
+
298
+ if trace_dir is not None:
299
+ if not trace_dir.exists() or not trace_dir.is_dir():
300
+ raise SystemExit(f"Missing trace_dir: {trace_dir}")
301
+ _process_trace_dir(
302
+ trace_dir=trace_dir,
303
+ out_root=out_root,
304
+ out_dir=out_dir,
305
+ overwrite=bool(args.overwrite),
306
+ limit=args.limit,
307
+ skip_empty_manifest=skip_empty_manifest,
308
+ )
309
+ return
310
+
311
+ assert traces_root is not None
312
+ if not traces_root.exists() or not traces_root.is_dir():
313
+ raise SystemExit(f"Missing traces_root: {traces_root}")
314
+
315
+ run_dirs = _iter_run_dirs(traces_root)
316
+ if not run_dirs:
317
+ raise SystemExit(f"No run directories found under traces_root={traces_root} (expected manifest.jsonl).")
318
+
319
+ total_written = 0
320
+ total_with_hop = 0
321
+ for run_dir in run_dirs:
322
+ wrote, wrote_with_hop = _process_trace_dir(
323
+ trace_dir=run_dir,
324
+ out_root=out_root,
325
+ out_dir=None,
326
+ overwrite=bool(args.overwrite),
327
+ limit=args.limit,
328
+ skip_empty_manifest=skip_empty_manifest,
329
+ )
330
+ total_written += wrote
331
+ total_with_hop += wrote_with_hop
332
+
333
+ print(f"[done] total wrote {total_written} samples ({total_with_hop} with hop) under out_root={out_root}")
334
+
335
+
336
+ if __name__ == "__main__":
337
+ main()
338
+
flashtrace/__init__.py ADDED
@@ -0,0 +1,7 @@
1
+ """FlashTrace: efficient multi-token attribution for reasoning LLMs."""
2
+
3
+ from .model_io import load_model_and_tokenizer
4
+ from .result import TokenScore, TraceResult
5
+ from .tracer import FlashTrace
6
+
7
+ __all__ = ["FlashTrace", "TraceResult", "TokenScore", "load_model_and_tokenizer"]
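+
+ # NOTE (illustrative, not the documented API): the exports above suggest a usage
+ # shape like the following; the call signatures below are assumptions -- see
+ # examples/quickstart.py in this repository for the authoritative entry point.
+ #
+ #     from flashtrace import FlashTrace, load_model_and_tokenizer
+ #     model, tokenizer = load_model_and_tokenizer("<model-name-or-path>")  # assumed signature
+ #     tracer = FlashTrace(model, tokenizer)                                # assumed signature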
flashtrace/attribution.py ADDED
The diff for this file is too large to render. See raw diff
 
flashtrace/baselines/__init__.py ADDED
@@ -0,0 +1,5 @@
1
+ """Baseline attribution methods for FlashTrace."""
2
+
3
+ from .attnlrp import LLMLRPAttribution
4
+
5
+ __all__ = ["LLMLRPAttribution"]
flashtrace/baselines/attnlrp.py ADDED
@@ -0,0 +1,12 @@
1
+ """AttnLRP baseline API."""
2
+
3
+ from flashtrace.attribution import AttnLRPSpanAggregate, LLMLRPAttribution, MultiHopAttnLRPResult
4
+ from flashtrace.lrp_patches import detect_model_type, lrp_context
5
+
6
+ __all__ = [
7
+ "AttnLRPSpanAggregate",
8
+ "LLMLRPAttribution",
9
+ "MultiHopAttnLRPResult",
10
+ "detect_model_type",
11
+ "lrp_context",
12
+ ]