Spaces:

BART-ender
/

cot-anc

Sleeping

App Files Files Community

BART-ender commited on 4 days ago

Commit

fda8fb3

verified ·

1 Parent(s): fca1de6

Deploy Thought Anchors

Browse files

This view is limited to 50 files because it contains too many changes. See raw diff

Files changed (50) hide show

.dockerignore +10 -0
.env.example +13 -0
.gitignore +11 -0
Dockerfile +45 -0
README.md +80 -5
app/__init__.py +1 -0
app/analysis/__init__.py +1 -0
app/analysis/hooks.py +78 -0
app/analysis/sentence_split.py +87 -0
app/analysis/summaries.py +39 -0
app/analysis/suppression.py +161 -0
app/analysis/token_boundaries.py +75 -0
app/analysis/validation.py +120 -0
app/api/__init__.py +1 -0
app/api/auth.py +54 -0
app/api/main.py +310 -0
app/cli/__init__.py +1 -0
app/cli/run_api.py +19 -0
app/cli/run_prototype.py +91 -0
app/core/__init__.py +1 -0
app/core/config.py +62 -0
app/core/model_support.py +102 -0
app/core/runtime.py +93 -0
app/core/runtime_pipeline.py +175 -0
app/core/schemas.py +167 -0
app/frontend/app.js +340 -0
app/frontend/index.html +108 -0
app/frontend/styles.css +335 -0
app/generation/__init__.py +1 -0
app/generation/prompting.py +28 -0
app/generation/service.py +83 -0
app/services/__init__.py +1 -0
app/services/sessions.py +157 -0
app/storage/__init__.py +1 -0
app/storage/db.py +60 -0
app/storage/repository.py +182 -0
app/workers/__init__.py +1 -0
app/workers/jobs.py +20 -0
cot_anc.egg-info/PKG-INFO +147 -0
cot_anc.egg-info/SOURCES.txt +44 -0
cot_anc.egg-info/dependency_links.txt +1 -0
cot_anc.egg-info/requires.txt +15 -0
cot_anc.egg-info/top_level.txt +1 -0
docs/api.md +83 -0
docs/deploy-huggingface.md +69 -0
docs/notebook.md +38 -0
docs/runtime.md +77 -0
notebooks/hf_space_demo.ipynb +119 -0
pyproject.toml +36 -0
runpod_start.sh +11 -0

.dockerignore ADDED Viewed

	@@ -0,0 +1,10 @@

+.git
+.venv
+.pytest_cache
+__pycache__
+*.pyc
+*.pyo
+*.pyd
+outputs
+tests
+cot_anc.egg-info

.env.example ADDED Viewed

	@@ -0,0 +1,13 @@

+MODEL_NAME=deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B
+DEVICE_PREFERENCE=auto
+DTYPE_PREFERENCE=auto
+ATTN_IMPLEMENTATION=eager
+TRUST_REMOTE_CODE=true
+LOW_CPU_MEM_USAGE=true
+MAX_TRACE_TOKENS=256
+MAX_SENTENCES=16
+TAKE_LOG=true
+PRELOAD_MODEL=true
+REQUIRE_AUTH=true
+MAX_QUEUED_JOBS=8
+MAX_ACTIVE_JOBS_PER_USER=2

.gitignore ADDED Viewed

	@@ -0,0 +1,11 @@

+/.venv
+/.pytest_cache
+/__pycache__
+/data/app.db
+.DS_Store
+*.pyc
+*.pyo
+*.pyd
+cot_anc.egg-info/
+outputs/
+uv.lock

Dockerfile ADDED Viewed

	@@ -0,0 +1,45 @@

+FROM nvidia/cuda:12.4.1-cudnn-runtime-ubuntu22.04
+ENV DEBIAN_FRONTEND=noninteractive
+ENV PYTHONDONTWRITEBYTECODE=1
+ENV PYTHONUNBUFFERED=1
+ENV UV_LINK_MODE=copy
+ENV HOME=/home/user
+ENV PATH=/home/user/.local/bin:$PATH
+ENV HF_HOME=/home/user/.cache/huggingface
+ENV TRANSFORMERS_CACHE=/home/user/.cache/huggingface
+ENV API_HOST=0.0.0.0
+ENV API_PORT=7860
+ENV DEVICE_PREFERENCE=auto
+ENV DTYPE_PREFERENCE=auto
+ENV ATTN_IMPLEMENTATION=eager
+ENV LOW_CPU_MEM_USAGE=true
+ENV TRUST_REMOTE_CODE=true
+ENV PRELOAD_MODEL=true
+ENV REQUIRE_AUTH=true
+RUN apt-get update && apt-get install -y --no-install-recommends \
+    ca-certificates \
+    curl \
+    git \
+    python3 \
+    python3-pip \
+    python3-venv \
+    && rm -rf /var/lib/apt/lists/*
+RUN useradd -m -u 1000 user
+USER user
+WORKDIR $HOME/app
+RUN curl -LsSf https://astral.sh/uv/install.sh | sh
+COPY --chown=user pyproject.toml uv.lock README.md .env.example ./
+COPY --chown=user app ./app
+COPY --chown=user notebooks ./notebooks
+COPY --chown=user tests ./tests
+RUN uv sync --frozen
+EXPOSE 7860
+CMD ["uv", "run", "python", "-m", "app.cli.run_api"]

README.md CHANGED Viewed

@@ -1,10 +1,85 @@
 ---
-title: Cot Anc
-emoji: 👁
-colorFrom: indigo
-colorTo: green
 sdk: docker
 pinned: false
 ---
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

 ---
+title: Thought Anchors
+emoji: 🧠
+colorFrom: blue
+colorTo: red
 sdk: docker
+app_port: 7860
+hf_oauth: true
 pinned: false
+models:
+  - deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B
 ---
+# Thought Anchors
+Thought Anchors generates visible reasoning traces from open-weight models and
+computes sentence-to-sentence influence with gradient x attention attribution.
+Current product shape:
+- Hugging Face `Docker Space` first
+- Hugging Face OAuth sign-in
+- web UI + API
+- per-user ephemeral sessions
+- JSON / CSV export
+- adaptive CPU / MPS / CUDA loading
+## Quick Start
+Install deps:
+```bash
+uv sync --extra dev
+```
+Run API:
+```bash
+uv run python -m app.cli.run_api
+```
+Run CLI:
+```bash
+uv run python -m app.cli.run_prototype "Explain why the derivative of x^2 is 2x"
+```
+Run tests:
+```bash
+uv run python -m pytest -q
+```
+## Main Endpoints
+- `GET /healthz`
+- `GET /api/me`
+- `POST /api/warmup`
+- `POST /api/analyze`
+- `GET /api/sessions`
+- `POST /api/sessions`
+- `GET /api/sessions/{id}`
+- `GET /api/sessions/{id}/result`
+- `GET /api/sessions/{id}/export.json`
+- `GET /api/sessions/{id}/export.csv`
+## Docs
+- [Hugging Face deployment](./docs/deploy-huggingface.md)
+- [Runtime and model support](./docs/runtime.md)
+- [API and product behavior](./docs/api.md)
+- [Notebook usage](./docs/notebook.md)
+## Notebook
+Colab / Kaggle smoke-test notebook:
+- [notebooks/hf_space_demo.ipynb](/Users/vibhorkumar/Desktop/codes/cot-anc/notebooks/hf_space_demo.ipynb)
+## Key Constraints
+- Attribution needs `attn_implementation="eager"`.
+- Model must expose supported decoder layers and attention modules.
+- Long traces stay capped because analysis uses full backward pass.
+- Space disk is ephemeral; export results you want to keep.

app/__init__.py ADDED Viewed

	@@ -0,0 +1 @@


1	+ """Application package for chain-of-thought attribution analysis."""

app/analysis/__init__.py ADDED Viewed

	@@ -0,0 +1 @@


1	+ """Analysis utilities for attribution patching."""

app/analysis/hooks.py ADDED Viewed

	@@ -0,0 +1,78 @@

+from __future__ import annotations
+from collections.abc import Iterable
+from typing import Any
+import torch
+from app.core.model_support import get_decoder_layers
+_ATTENTION_STORE: dict[int, torch.Tensor] = {}
+def clear_stored_attentions() -> None:
+    _ATTENTION_STORE.clear()
+def get_stored_attentions() -> dict[int, torch.Tensor]:
+    return dict(_ATTENTION_STORE)
+def _extract_attention_tensor(output: Any) -> torch.Tensor | None:
+    if isinstance(output, torch.Tensor):
+        return output if output.dim() == 4 else None
+    if isinstance(output, dict):
+        for value in output.values():
+            if isinstance(value, torch.Tensor) and value.dim() == 4:
+                return value
+    if isinstance(output, Iterable) and not isinstance(output, (str, bytes)):
+        for item in output:
+            if isinstance(item, torch.Tensor) and item.dim() == 4:
+                return item
+    return None
+def _get_attention_impl(model: Any) -> str | None:
+    config = getattr(model, "config", None)
+    if config is None:
+        return None
+    return getattr(config, "_attn_implementation", None) or getattr(
+        config,
+        "attn_implementation",
+        None,
+    )
+def make_attn_hook(layer_idx: int):
+    def hook(_module: Any, _inputs: Any, output: Any) -> None:
+        attn = _extract_attention_tensor(output)
+        if attn is None:
+            return
+        if attn.dim() != 4:
+            raise RuntimeError(f"Expected 4D attention tensor at layer {layer_idx}, got {attn.shape}.")
+        attn.retain_grad()
+        _ATTENTION_STORE[layer_idx] = attn
+    return hook
+def register_hooks(model: Any) -> list[Any]:
+    clear_stored_attentions()
+    layers, _layer_path, attention_attr = get_decoder_layers(model)
+    handles: list[Any] = []
+    for layer_idx, layer in enumerate(layers):
+        self_attn = getattr(layer, attention_attr, None)
+        if self_attn is None:
+            raise RuntimeError(f"Layer {layer_idx} does not expose {attention_attr}.")
+        handles.append(self_attn.register_forward_hook(make_attn_hook(layer_idx)))
+    return handles
+def remove_hooks(handles: list[Any]) -> None:
+    for handle in handles:
+        handle.remove()
+    clear_stored_attentions()

app/analysis/sentence_split.py ADDED Viewed

	@@ -0,0 +1,87 @@

+from __future__ import annotations
+import re
+from dataclasses import dataclass
+THINK_TAG_RE = re.compile(r"</?think>", re.IGNORECASE)
+@dataclass(slots=True)
+class SentenceSpan:
+    text: str
+    start_char: int
+    end_char: int
+def normalize_trace_text(raw_trace_text: str) -> str:
+    return THINK_TAG_RE.sub("", raw_trace_text)
+def _non_whitespace_token_count(text: str) -> int:
+    return len(re.findall(r"\S+", text))
+def split_sentences(
+    text: str,
+    *,
+    min_token_like_units: int = 2,
+) -> list[SentenceSpan]:
+    if not text:
+        return []
+    raw_spans: list[tuple[int, int]] = []
+    start = 0
+    index = 0
+    text_length = len(text)
+    while index < text_length:
+        if text[index : index + 2] == "\n\n":
+            end = index + 2
+            raw_spans.append((start, end))
+            start = end
+            index = end
+            continue
+        if text[index] in ".!?":
+            end = index + 1
+            while end < text_length and text[end] in "\"')]}":
+                end += 1
+            while end < text_length and text[end].isspace() and text[end : end + 2] != "\n\n":
+                end += 1
+            raw_spans.append((start, end))
+            start = end
+            index = end
+            continue
+        index += 1
+    if start < text_length:
+        raw_spans.append((start, text_length))
+    merged: list[tuple[int, int]] = []
+    for span_start, span_end in raw_spans:
+        fragment = text[span_start:span_end]
+        if not fragment:
+            continue
+        if merged and _non_whitespace_token_count(fragment) < min_token_like_units:
+            previous_start, _ = merged[-1]
+            merged[-1] = (previous_start, span_end)
+            continue
+        merged.append((span_start, span_end))
+    if len(merged) > 1:
+        last_start, last_end = merged[-1]
+        if _non_whitespace_token_count(text[last_start:last_end]) < min_token_like_units:
+            prev_start, _ = merged[-2]
+            merged[-2] = (prev_start, last_end)
+            merged.pop()
+    return [
+        SentenceSpan(
+            text=text[span_start:span_end],
+            start_char=span_start,
+            end_char=span_end,
+        )
+        for span_start, span_end in merged
+        if text[span_start:span_end]
+    ]

app/analysis/summaries.py ADDED Viewed

	@@ -0,0 +1,39 @@

+from __future__ import annotations
+import numpy as np
+from app.core.schemas import TopEdge
+def compute_outgoing_importance(matrix: np.ndarray) -> list[float]:
+    sentence_count = matrix.shape[0]
+    scores: list[float] = []
+    for source_idx in range(sentence_count):
+        column = matrix[source_idx + 1 :, source_idx]
+        scores.append(float(column.mean()) if column.size else 0.0)
+    return scores
+def compute_incoming_importance(matrix: np.ndarray) -> list[float]:
+    sentence_count = matrix.shape[0]
+    scores: list[float] = []
+    for target_idx in range(sentence_count):
+        row = matrix[target_idx, :target_idx]
+        scores.append(float(row.mean()) if row.size else 0.0)
+    return scores
+def compute_top_edges(matrix: np.ndarray, top_k: int = 10) -> list[TopEdge]:
+    sentence_count = matrix.shape[0]
+    candidates: list[TopEdge] = []
+    for target_idx in range(sentence_count):
+        for source_idx in range(target_idx):
+            candidates.append(
+                TopEdge(
+                    source_sentence_idx=source_idx,
+                    target_sentence_idx=target_idx,
+                    score=float(matrix[target_idx, source_idx]),
+                )
+            )
+    candidates.sort(key=lambda edge: edge.score, reverse=True)
+    return candidates[:top_k]

app/analysis/suppression.py ADDED Viewed

	@@ -0,0 +1,161 @@

+from __future__ import annotations
+import time
+from dataclasses import asdict
+from dataclasses import dataclass
+from typing import Any
+import numpy as np
+import torch
+from app.analysis.hooks import get_stored_attentions, register_hooks, remove_hooks
+from app.core.model_support import describe_model_support
+from app.core.schemas import ModelCapability, RuntimeMetadata
+@dataclass(slots=True)
+class AttributionMatrixComputation:
+    matrix: np.ndarray
+    raw_matrix: np.ndarray
+    token_nll: np.ndarray
+    runtime_metadata: RuntimeMetadata
+def compute_self_token_nll(logits: torch.Tensor, input_ids: torch.Tensor) -> torch.Tensor:
+    if logits.ndim != 3 or input_ids.ndim != 2:
+        raise ValueError("Expected logits [batch, seq, vocab] and input_ids [batch, seq].")
+    if logits.shape[0] != 1 or input_ids.shape[0] != 1:
+        raise ValueError("Only batch size 1 is supported for the prototype.")
+    if input_ids.shape[1] < 2:
+        raise ValueError("Need at least two tokens to compute next-token loss.")
+    shifted_logits = logits[:, :-1, :]
+    shifted_targets = input_ids[:, 1:]
+    log_probs = torch.log_softmax(shifted_logits, dim=-1)
+    gathered = log_probs.gather(-1, shifted_targets.unsqueeze(-1)).squeeze(-1)
+    return -gathered[0]
+def _current_memory_mb(device: torch.device) -> float | None:
+    if device.type != "cuda":
+        return None
+    return float(torch.cuda.memory_allocated(device) / (1024 * 1024))
+def _build_presentation_matrix(raw_matrix: np.ndarray, take_log: bool) -> np.ndarray:
+    if not take_log:
+        return raw_matrix.copy()
+    presentation = np.zeros_like(raw_matrix)
+    positive = raw_matrix > 0
+    presentation[positive] = np.log(raw_matrix[positive] + 1e-9)
+    return presentation
+def compute_attribution_matrix(
+    input_ids: torch.Tensor,
+    token_ranges: list[tuple[int, int]],
+    model: Any,
+    take_log: bool = True,
+    max_trace_tokens: int = 0,
+    max_sentences: int = 0,
+) -> AttributionMatrixComputation:
+    device = input_ids.device
+    handles = register_hooks(model)
+    model.zero_grad(set_to_none=True)
+    forward_start = time.perf_counter()
+    memory_before_mb = _current_memory_mb(device)
+    try:
+        with torch.enable_grad():
+            outputs = model(
+                input_ids=input_ids,
+                output_attentions=True,
+                return_dict=True,
+            )
+            forward_pass_ms = (time.perf_counter() - forward_start) * 1000.0
+            logits = outputs.logits
+            token_nll = compute_self_token_nll(logits, input_ids)
+            loss = token_nll.sum()
+            backward_start = time.perf_counter()
+            loss.backward()
+            backward_pass_ms = (time.perf_counter() - backward_start) * 1000.0
+        attentions = get_stored_attentions()
+        if not attentions:
+            raise RuntimeError("No attention tensors were captured. Check eager attention mode.")
+        matrix_start = time.perf_counter()
+        sentence_count = len(token_ranges)
+        raw_matrix = np.zeros((sentence_count, sentence_count), dtype=np.float32)
+        ordered_layers = [attentions[layer_idx] for layer_idx in sorted(attentions)]
+        first_attention = ordered_layers[0]
+        num_layers = len(ordered_layers)
+        num_heads = int(first_attention.shape[1])
+        for source_idx, (source_start, source_end) in enumerate(token_ranges):
+            for target_idx, (target_start, target_end) in enumerate(token_ranges):
+                if target_idx <= source_idx:
+                    continue
+                total = 0.0
+                for attention in ordered_layers:
+                    grad = attention.grad
+                    if grad is None:
+                        raise RuntimeError("Attention gradient was not retained for one or more layers.")
+                    total += -(
+                        grad[0, :, target_start:target_end, source_start:source_end]
+                        * attention[0, :, target_start:target_end, source_start:source_end]
+                    ).sum().item()
+                denominator = max(1, target_end - target_start)
+                raw_matrix[target_idx, source_idx] = total / denominator
+        matrix_computation_ms = (time.perf_counter() - matrix_start) * 1000.0
+        total_analysis_ms = (
+            forward_pass_ms + backward_pass_ms + matrix_computation_ms
+        )
+        presentation_matrix = _build_presentation_matrix(raw_matrix, take_log)
+        attention_impl = getattr(model.config, "_attn_implementation", "unknown")
+        capability = describe_model_support(model)
+        runtime_metadata = RuntimeMetadata(
+            forward_pass_ms=forward_pass_ms,
+            backward_pass_ms=backward_pass_ms,
+            matrix_computation_ms=matrix_computation_ms,
+            total_analysis_ms=total_analysis_ms,
+            num_layers=num_layers,
+            num_heads=num_heads,
+            sequence_length_tokens=int(input_ids.shape[1]),
+            sentence_count=sentence_count,
+            device=str(device),
+            dtype=str(first_attention.dtype),
+            attention_impl=str(attention_impl),
+            max_trace_tokens=max_trace_tokens,
+            max_sentences=max_sentences,
+            capability=ModelCapability.model_validate(asdict(capability)),
+        )
+        memory_after_mb = _current_memory_mb(device)
+        if memory_before_mb is not None and memory_after_mb is not None:
+            runtime_metadata = runtime_metadata.model_copy(
+                update={
+                    "device": f"{runtime_metadata.device} (mem {memory_before_mb:.1f}->{memory_after_mb:.1f} MB)"
+                }
+            )
+        return AttributionMatrixComputation(
+            matrix=presentation_matrix,
+            raw_matrix=raw_matrix,
+            token_nll=token_nll.detach().cpu().numpy(),
+            runtime_metadata=runtime_metadata,
+        )
+    finally:
+        for attention in get_stored_attentions().values():
+            attention.grad = None
+        remove_hooks(handles)
+        model.zero_grad(set_to_none=True)
+        if device.type == "cuda":
+            torch.cuda.empty_cache()

app/analysis/token_boundaries.py ADDED Viewed

	@@ -0,0 +1,75 @@

+from __future__ import annotations
+from dataclasses import dataclass
+from typing import Any
+import torch
+from app.analysis.sentence_split import SentenceSpan
+@dataclass(slots=True)
+class TokenizedSentenceMapping:
+    input_ids: torch.Tensor
+    token_ranges: list[tuple[int, int]]
+    offsets: list[tuple[int, int]]
+    text: str
+def truncate_text_to_token_limit(text: str, tokenizer: Any, max_tokens: int) -> str:
+    if max_tokens <= 0:
+        raise ValueError("max_tokens must be positive.")
+    encoded = tokenizer(
+        text,
+        add_special_tokens=False,
+        return_offsets_mapping=True,
+    )
+    offsets = encoded["offset_mapping"]
+    if len(offsets) <= max_tokens:
+        return text
+    end_char = offsets[max_tokens - 1][1]
+    return text[:end_char]
+def tokenize_with_sentence_ranges(
+    text: str,
+    sentence_spans: list[SentenceSpan],
+    tokenizer: Any,
+) -> TokenizedSentenceMapping:
+    encoded = tokenizer(
+        text,
+        add_special_tokens=False,
+        return_offsets_mapping=True,
+        return_tensors="pt",
+    )
+    input_ids = encoded["input_ids"]
+    raw_offsets = encoded["offset_mapping"][0].tolist()
+    offsets = [(int(start), int(end)) for start, end in raw_offsets]
+    token_ranges: list[tuple[int, int]] = []
+    for span in sentence_spans:
+        overlapping = [
+            token_index
+            for token_index, (token_start, token_end) in enumerate(offsets)
+            if token_end > span.start_char and token_start < span.end_char
+        ]
+        if not overlapping:
+            raise ValueError(
+                f"Sentence span {span.start_char}:{span.end_char} mapped to zero tokens."
+            )
+        token_ranges.append((overlapping[0], overlapping[-1] + 1))
+    if token_ranges:
+        if token_ranges[0][0] != 0 or token_ranges[-1][1] != len(offsets):
+            raise ValueError("Sentence token ranges do not cover the full analyzed sequence.")
+        for previous, current in zip(token_ranges, token_ranges[1:]):
+            if previous[1] != current[0]:
+                raise ValueError("Sentence token ranges are not contiguous.")
+    return TokenizedSentenceMapping(
+        input_ids=input_ids,
+        token_ranges=token_ranges,
+        offsets=offsets,
+        text=text,
+    )

app/analysis/validation.py ADDED Viewed

	@@ -0,0 +1,120 @@

+from __future__ import annotations
+from typing import Any
+import numpy as np
+import torch
+from scipy.stats import pearsonr, spearmanr
+from app.analysis.suppression import compute_self_token_nll
+from app.core.schemas import TopEdge, ValidationMetadata
+def _nll_slice_for_token_range(token_range: tuple[int, int]) -> slice:
+    start, end = token_range
+    return slice(max(0, start - 1), max(0, end - 1))
+def build_exact_suppression_mask(
+    *,
+    sequence_length: int,
+    source_range: tuple[int, int],
+    target_range: tuple[int, int],
+    device: torch.device,
+    dtype: torch.dtype,
+) -> torch.Tensor:
+    fill_value = torch.finfo(dtype).min
+    mask = torch.zeros((sequence_length, sequence_length), device=device, dtype=dtype)
+    future_positions = torch.triu(
+        torch.ones((sequence_length, sequence_length), device=device, dtype=torch.bool),
+        diagonal=1,
+    )
+    mask = mask.masked_fill(future_positions, fill_value)
+    source_start, source_end = source_range
+    target_start, target_end = target_range
+    mask[target_start:target_end, source_start:source_end] = fill_value
+    return mask.unsqueeze(0).unsqueeze(0)
+def compute_exact_edge_score(
+    *,
+    model: Any,
+    input_ids: torch.Tensor,
+    source_range: tuple[int, int],
+    target_range: tuple[int, int],
+    baseline_token_nll: np.ndarray,
+) -> float:
+    model_dtype = next(model.parameters()).dtype
+    attention_mask = build_exact_suppression_mask(
+        sequence_length=int(input_ids.shape[1]),
+        source_range=source_range,
+        target_range=target_range,
+        device=input_ids.device,
+        dtype=model_dtype,
+    )
+    with torch.no_grad():
+        outputs = model(
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            output_attentions=False,
+            return_dict=True,
+        )
+    suppressed_nll = compute_self_token_nll(outputs.logits, input_ids).detach().cpu().numpy()
+    nll_slice = _nll_slice_for_token_range(target_range)
+    return float(suppressed_nll[nll_slice].sum() - baseline_token_nll[nll_slice].sum())
+def validate_top_edges(
+    *,
+    model: Any,
+    input_ids: torch.Tensor,
+    token_ranges: list[tuple[int, int]],
+    top_edges: list[TopEdge],
+    baseline_token_nll: np.ndarray,
+    top_k: int,
+) -> ValidationMetadata:
+    if top_k <= 0 or not top_edges:
+        return ValidationMetadata(enabled=False, top_k=0)
+    selected_edges = top_edges[:top_k]
+    exact_scores: list[float] = []
+    attributed_scores: list[float] = []
+    compared_edges: list[TopEdge] = []
+    try:
+        for edge in selected_edges:
+            exact_score = compute_exact_edge_score(
+                model=model,
+                input_ids=input_ids,
+                source_range=token_ranges[edge.source_sentence_idx],
+                target_range=token_ranges[edge.target_sentence_idx],
+                baseline_token_nll=baseline_token_nll,
+            )
+            exact_scores.append(exact_score)
+            attributed_scores.append(edge.score)
+            compared_edges.append(
+                TopEdge(
+                    source_sentence_idx=edge.source_sentence_idx,
+                    target_sentence_idx=edge.target_sentence_idx,
+                    score=exact_score,
+                )
+            )
+    except Exception as exc:  # pragma: no cover - environment/model dependent
+        return ValidationMetadata(
+            enabled=True,
+            top_k=top_k,
+            compared_edges=[],
+            notes=f"Exact suppression validation failed: {exc}",
+        )
+    pearson = float(pearsonr(attributed_scores, exact_scores).statistic) if len(exact_scores) >= 2 else None
+    spearman = float(spearmanr(attributed_scores, exact_scores).statistic) if len(exact_scores) >= 2 else None
+    return ValidationMetadata(
+        enabled=True,
+        top_k=top_k,
+        pearson=pearson,
+        spearman=spearman,
+        compared_edges=compared_edges,
+        notes="Exact suppression compares sentence-level NLL deltas for selected edges.",
+    )

app/api/__init__.py ADDED Viewed

	@@ -0,0 +1 @@


1	+ """API package for GPU-hosted analysis service."""

app/api/auth.py ADDED Viewed

	@@ -0,0 +1,54 @@

+from __future__ import annotations
+from dataclasses import dataclass
+from fastapi import HTTPException, Request
+from huggingface_hub import parse_huggingface_oauth
+from app.core.config import Settings
+@dataclass(frozen=True, slots=True)
+class UserContext:
+    id: str
+    username: str
+    display_name: str | None
+    avatar_url: str | None
+    authenticated: bool
+def get_optional_user(request: Request) -> UserContext | None:
+    if "session" not in request.scope:
+        return None
+    try:
+        oauth_info = parse_huggingface_oauth(request)
+    except AssertionError:
+        return None
+    if oauth_info is None:
+        return None
+    user_info = oauth_info.user_info
+    username = user_info.preferred_username or user_info.sub or "hf-user"
+    display_name = user_info.name or username
+    return UserContext(
+        id=user_info.sub or username,
+        username=username,
+        display_name=display_name,
+        avatar_url=user_info.picture,
+        authenticated=True,
+    )
+def require_user(request: Request, settings: Settings) -> UserContext:
+    user = get_optional_user(request)
+    if user is not None:
+        return user
+    if settings.require_auth:
+        raise HTTPException(status_code=401, detail="Sign in with Hugging Face to use this service.")
+    return UserContext(
+        id="anonymous",
+        username="anonymous",
+        display_name="Anonymous",
+        avatar_url=None,
+        authenticated=False,
+    )

app/api/main.py ADDED Viewed

	@@ -0,0 +1,310 @@

+from __future__ import annotations
+import logging
+from contextlib import asynccontextmanager
+from io import StringIO
+from pathlib import Path
+from dataclasses import asdict
+import os
+import torch
+from fastapi import FastAPI, HTTPException, Request
+from fastapi.responses import FileResponse
+from fastapi.responses import StreamingResponse
+from fastapi.staticfiles import StaticFiles
+from huggingface_hub import attach_huggingface_oauth
+from app.api.auth import get_optional_user, require_user
+from app.core.config import get_settings
+from app.core.runtime import load_model_bundle
+from app.core.runtime_pipeline import compute_attribution_analysis
+from app.core.schemas import (
+    AnalysisRequest,
+    AnalysisResult,
+    CurrentUserResponse,
+    HealthResponse,
+    SessionCreateRequest,
+    SessionResponse,
+    SessionResultResponse,
+    WarmupResponse,
+)
+from app.services.sessions import SessionAccessError, SessionLimitError, SessionService
+from app.storage.repository import SessionRepository
+from app.workers.jobs import build_job_runner
+logger = logging.getLogger(__name__)
+FRONTEND_DIR = Path(__file__).resolve().parents[1] / "frontend"
+@asynccontextmanager
+async def lifespan(_app: FastAPI):
+    settings = get_settings()
+    repository = SessionRepository(settings.database_path)
+    jobs = build_job_runner(settings.job_workers)
+    _app.state.repository = repository
+    _app.state.jobs = jobs
+    _app.state.session_service = SessionService(settings=settings, repository=repository, jobs=jobs)
+    if settings.preload_model:
+        logger.info(
+            "Preloading model '%s' on device preference '%s'.",
+            settings.model_name,
+            settings.device_preference,
+        )
+        load_model_bundle(
+            settings.model_name,
+            device_preference=settings.device_preference,
+            dtype_preference=settings.dtype_preference,
+            attn_implementation=settings.attn_implementation,
+            trust_remote_code=settings.trust_remote_code,
+            low_cpu_mem_usage=settings.low_cpu_mem_usage,
+        )
+    yield
+    jobs.shutdown()
+app = FastAPI(
+    title="CoT Attribution Analysis API",
+    version="0.1.0",
+    lifespan=lifespan,
+)
+if os.getenv("SPACE_ID") or os.getenv("HF_SPACE_ID"):
+    attach_huggingface_oauth(app)
+app.mount("/static", StaticFiles(directory=FRONTEND_DIR), name="static")
+def get_session_service() -> SessionService:
+    return app.state.session_service
+def _to_session_response(payload: dict) -> SessionResponse:
+    return SessionResponse(
+        id=payload["id"],
+        status=payload["status"],
+        question=payload["question"],
+        model_name=payload["model_name"],
+        error=payload.get("error"),
+        created_at=payload["created_at"],
+        updated_at=payload["updated_at"],
+        answer=payload.get("answer"),
+        raw_trace_text=payload.get("raw_trace_text"),
+        normalized_trace_text=payload.get("normalized_trace_text"),
+        sentences=payload.get("sentences"),
+        generation_metadata=payload.get("generation_metadata"),
+    )
+@app.get("/", include_in_schema=False)
+def index() -> FileResponse:
+    return FileResponse(FRONTEND_DIR / "index.html")
+@app.get("/healthz", response_model=HealthResponse)
+def healthz() -> HealthResponse:
+    settings = get_settings()
+    return HealthResponse(
+        status="ok",
+        model_name=settings.model_name,
+        device_preference=settings.device_preference,
+        dtype_preference=settings.dtype_preference,
+        preload_model=settings.preload_model,
+        cuda_available=torch.cuda.is_available(),
+        mps_available=torch.backends.mps.is_available(),
+        require_auth=settings.require_auth,
+        public_api_enabled=settings.public_api_enabled,
+        max_queued_jobs=settings.max_queued_jobs,
+        max_active_jobs_per_user=settings.max_active_jobs_per_user,
+    )
+@app.get("/api/me", response_model=CurrentUserResponse)
+def me(request: Request) -> CurrentUserResponse:
+    settings = get_settings()
+    user = get_optional_user(request)
+    return CurrentUserResponse(
+        authenticated=user is not None,
+        auth_required=settings.require_auth,
+        username=user.username if user else None,
+        full_name=user.display_name if user else None,
+        avatar_url=user.avatar_url if user else None,
+    )
+@app.post("/api/warmup", response_model=WarmupResponse)
+def warmup(model_name: str | None = None, device_preference: str | None = None) -> WarmupResponse:
+    settings = get_settings()
+    bundle = load_model_bundle(
+        model_name or settings.model_name,
+        device_preference=device_preference or settings.device_preference,
+        dtype_preference=settings.dtype_preference,
+        attn_implementation=settings.attn_implementation,
+        trust_remote_code=settings.trust_remote_code,
+        low_cpu_mem_usage=settings.low_cpu_mem_usage,
+    )
+    return WarmupResponse(
+        status="ready",
+        model_name=bundle.model_name,
+        device=str(bundle.device),
+        dtype=str(bundle.dtype),
+        capability=asdict(bundle.capability),
+    )
+@app.post("/api/analyze", response_model=AnalysisResult)
+def analyze(request: AnalysisRequest, http_request: Request) -> AnalysisResult:
+    settings = get_settings()
+    require_user(http_request, settings)
+    try:
+        return compute_attribution_analysis(
+            question=request.question,
+            model_name=request.model_name,
+            take_log=request.take_log,
+            max_sentences=request.max_sentences,
+            max_trace_tokens=request.max_trace_tokens,
+            validate_top_k=request.validate_top_k,
+            max_new_tokens=request.max_new_tokens,
+            temperature=request.temperature,
+            top_p=request.top_p,
+            device_preference=request.device_preference,
+            dtype_preference=request.dtype_preference,
+            attn_implementation=request.attn_implementation,
+            trust_remote_code=request.trust_remote_code,
+            low_cpu_mem_usage=request.low_cpu_mem_usage,
+        )
+    except Exception as exc:  # pragma: no cover - runtime path
+        logger.exception("Analysis request failed")
+        raise HTTPException(status_code=500, detail=str(exc)) from exc
+@app.get("/api/sessions", response_model=list[SessionResponse])
+def list_sessions(request: Request, limit: int = 20) -> list[SessionResponse]:
+    settings = get_settings()
+    user = require_user(request, settings)
+    service = get_session_service()
+    payloads = service.list_sessions(user.id, limit=limit)
+    return [_to_session_response(payload) for payload in payloads]
+@app.post("/api/sessions", response_model=SessionResponse)
+def create_session(request: SessionCreateRequest, http_request: Request) -> SessionResponse:
+    settings = get_settings()
+    user = require_user(http_request, settings)
+    service = get_session_service()
+    try:
+        session = service.create_session(
+            AnalysisRequest(
+                question=request.question,
+                model_name=request.model_name,
+                take_log=request.take_log,
+                max_sentences=request.max_sentences,
+                max_trace_tokens=request.max_trace_tokens,
+                validate_top_k=request.validate_top_k,
+                max_new_tokens=request.max_new_tokens,
+                temperature=request.temperature,
+                top_p=request.top_p,
+                device_preference=request.device_preference,
+                dtype_preference=request.dtype_preference,
+                attn_implementation=request.attn_implementation,
+                trust_remote_code=request.trust_remote_code,
+                low_cpu_mem_usage=request.low_cpu_mem_usage,
+            ),
+            owner_id=user.id,
+            owner_name=user.display_name,
+        )
+    except SessionLimitError as exc:
+        raise HTTPException(status_code=429, detail=str(exc)) from exc
+    payload = service.get_session_payload(session.id, owner_id=user.id)
+    return _to_session_response(payload)
+@app.get("/api/sessions/{session_id}", response_model=SessionResponse)
+def get_session(session_id: str, request: Request) -> SessionResponse:
+    settings = get_settings()
+    user = require_user(request, settings)
+    service = get_session_service()
+    try:
+        payload = service.get_session_payload(session_id, owner_id=user.id)
+    except KeyError as exc:
+        raise HTTPException(status_code=404, detail="Session not found") from exc
+    except SessionAccessError as exc:
+        raise HTTPException(status_code=403, detail=str(exc)) from exc
+    return _to_session_response(payload)
+@app.post("/api/sessions/{session_id}/analyze", response_model=SessionResponse)
+def analyze_session(session_id: str, request: Request) -> SessionResponse:
+    settings = get_settings()
+    user = require_user(request, settings)
+    service = get_session_service()
+    try:
+        session = service.start_analysis(session_id, owner_id=user.id)
+        payload = service.get_session_payload(session.id, owner_id=user.id)
+    except KeyError as exc:
+        raise HTTPException(status_code=404, detail="Session not found") from exc
+    except SessionAccessError as exc:
+        raise HTTPException(status_code=403, detail=str(exc)) from exc
+    return _to_session_response(payload)
+@app.get("/api/sessions/{session_id}/result", response_model=SessionResultResponse)
+def get_session_result(session_id: str, request: Request) -> SessionResultResponse:
+    settings = get_settings()
+    user = require_user(request, settings)
+    service = get_session_service()
+    try:
+        payload = service.get_session_payload(session_id, owner_id=user.id)
+    except KeyError as exc:
+        raise HTTPException(status_code=404, detail="Session not found") from exc
+    except SessionAccessError as exc:
+        raise HTTPException(status_code=403, detail=str(exc)) from exc
+    session_response = _to_session_response(payload)
+    analysis_payload = payload.get("analysis")
+    return SessionResultResponse(
+        session=session_response,
+        analysis=AnalysisResult.model_validate(analysis_payload) if analysis_payload else None,
+    )
+@app.get("/api/sessions/{session_id}/export.json")
+def export_session_json(session_id: str, request: Request) -> StreamingResponse:
+    settings = get_settings()
+    user = require_user(request, settings)
+    service = get_session_service()
+    try:
+        payload = service.get_session_payload(session_id, owner_id=user.id)
+    except KeyError as exc:
+        raise HTTPException(status_code=404, detail="Session not found") from exc
+    except SessionAccessError as exc:
+        raise HTTPException(status_code=403, detail=str(exc)) from exc
+    result = SessionResultResponse(
+        session=_to_session_response(payload),
+        analysis=AnalysisResult.model_validate(payload["analysis"]) if payload.get("analysis") else None,
+    )
+    return StreamingResponse(
+        iter([result.model_dump_json(indent=2)]),
+        media_type="application/json",
+        headers={"content-disposition": f'attachment; filename="{session_id}.json"'},
+    )
+@app.get("/api/sessions/{session_id}/export.csv")
+def export_session_csv(session_id: str, request: Request) -> StreamingResponse:
+    settings = get_settings()
+    user = require_user(request, settings)
+    service = get_session_service()
+    try:
+        result = service.get_analysis_result(session_id, owner_id=user.id)
+    except KeyError as exc:
+        raise HTTPException(status_code=404, detail="Analysis result not found") from exc
+    except SessionAccessError as exc:
+        raise HTTPException(status_code=403, detail=str(exc)) from exc
+    buffer = StringIO()
+    buffer.write("source_sentence_idx,target_sentence_idx,score\n")
+    for edge in result.top_edges:
+        buffer.write(f"{edge.source_sentence_idx},{edge.target_sentence_idx},{edge.score:.6f}\n")
+    return StreamingResponse(
+        iter([buffer.getvalue()]),
+        media_type="text/csv",
+        headers={"content-disposition": f'attachment; filename="{session_id}.csv"'},
+    )

app/cli/__init__.py ADDED Viewed

	@@ -0,0 +1 @@


1	+ """CLI entrypoints."""

app/cli/run_api.py ADDED Viewed

	@@ -0,0 +1,19 @@

+from __future__ import annotations
+import uvicorn
+from app.core.config import get_settings
+def main() -> None:
+    settings = get_settings()
+    uvicorn.run(
+        "app.api.main:app",
+        host=settings.api_host,
+        port=settings.api_port,
+        reload=False,
+    )
+if __name__ == "__main__":
+    main()

app/cli/run_prototype.py ADDED Viewed

	@@ -0,0 +1,91 @@

+from __future__ import annotations
+import json
+from datetime import UTC, datetime
+from pathlib import Path
+import typer
+from app.core.runtime_pipeline import compute_attribution_analysis
+cli = typer.Typer(add_completion=False)
+def _write_heatmap(path: Path, matrix: list[list[float]]) -> None:
+    try:
+        import matplotlib.pyplot as plt
+    except ImportError as exc:  # pragma: no cover - optional dependency
+        raise RuntimeError(
+            "Heatmap output requires the optional viz dependency: pip install .[viz]"
+        ) from exc
+    figure, axis = plt.subplots(figsize=(8, 6))
+    image = axis.imshow(matrix, aspect="auto", cmap="viridis")
+    axis.set_xlabel("Source sentence")
+    axis.set_ylabel("Target sentence")
+    axis.set_title("Sentence influence matrix")
+    figure.colorbar(image, ax=axis)
+    figure.tight_layout()
+    figure.savefig(path)
+    plt.close(figure)
+@cli.command()
+def main(
+    question: str,
+    model_name: str = "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B",
+    output_json: Path | None = None,
+    output_heatmap: Path | None = None,
+    max_new_tokens: int = 512,
+    max_trace_tokens: int = 1024,
+    max_sentences: int = 40,
+    take_log: bool = True,
+    validate_top_k: int = 3,
+    temperature: float = 0.6,
+    top_p: float = 0.95,
+    device_preference: str = "auto",
+    dtype_preference: str = "auto",
+    attn_implementation: str = "eager",
+    trust_remote_code: bool = True,
+    low_cpu_mem_usage: bool = True,
+) -> None:
+    result = compute_attribution_analysis(
+        question=question,
+        model_name=model_name,
+        take_log=take_log,
+        max_sentences=max_sentences,
+        max_trace_tokens=max_trace_tokens,
+        validate_top_k=validate_top_k,
+        max_new_tokens=max_new_tokens,
+        temperature=temperature,
+        top_p=top_p,
+        device_preference=device_preference,
+        dtype_preference=dtype_preference,
+        attn_implementation=attn_implementation,
+        trust_remote_code=trust_remote_code,
+        low_cpu_mem_usage=low_cpu_mem_usage,
+    )
+    output_dir = Path("outputs")
+    output_dir.mkdir(parents=True, exist_ok=True)
+    timestamp = datetime.now(UTC).strftime("%Y%m%dT%H%M%SZ")
+    json_path = output_json or output_dir / f"analysis_{timestamp}.json"
+    json_path.write_text(json.dumps(result.model_dump(), indent=2), encoding="utf-8")
+    if output_heatmap is not None:
+        _write_heatmap(output_heatmap, result.suppression_matrix)
+    typer.echo(f"Wrote analysis JSON to {json_path}")
+    typer.echo(f"Sentences: {len(result.sentences)}")
+    typer.echo(f"Top edge count: {len(result.top_edges)}")
+    if result.validation_metadata and result.validation_metadata.enabled:
+        typer.echo(
+            "Validation:"
+            f" pearson={result.validation_metadata.pearson}"
+            f" spearman={result.validation_metadata.spearman}"
+        )
+if __name__ == "__main__":
+    cli()

app/core/__init__.py ADDED Viewed

	@@ -0,0 +1 @@


1	+ """Core runtime, config, and schema definitions."""

app/core/config.py ADDED Viewed

	@@ -0,0 +1,62 @@

+from __future__ import annotations
+import os
+from functools import lru_cache
+from dataclasses import dataclass
+from typing import Literal
+@dataclass(frozen=True, slots=True)
+class Settings:
+    model_name: str = "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"
+    max_trace_tokens: int = 1024
+    max_sentences: int = 40
+    take_log: bool = True
+    device_preference: Literal["auto", "cuda", "cpu", "mps"] = "auto"
+    dtype_preference: Literal["auto", "float32", "float16", "bfloat16"] = "auto"
+    attn_implementation: str = "eager"
+    trust_remote_code: bool = True
+    low_cpu_mem_usage: bool = True
+    preload_model: bool = False
+    api_host: str = "0.0.0.0"
+    api_port: int = 7860
+    database_path: str = "data/app.db"
+    job_workers: int = 1
+    max_queued_jobs: int = 8
+    max_active_jobs_per_user: int = 2
+    require_auth: bool = True
+    public_api_enabled: bool = True
+DEFAULT_SETTINGS = Settings()
+@lru_cache(maxsize=1)
+def get_settings() -> Settings:
+    take_log = os.getenv("TAKE_LOG", "true").strip().lower() in {"1", "true", "yes", "on"}
+    trust_remote_code = os.getenv("TRUST_REMOTE_CODE", "true").strip().lower() in {"1", "true", "yes", "on"}
+    low_cpu_mem_usage = os.getenv("LOW_CPU_MEM_USAGE", "true").strip().lower() in {"1", "true", "yes", "on"}
+    require_auth = os.getenv("REQUIRE_AUTH", "true").strip().lower() in {"1", "true", "yes", "on"}
+    public_api_enabled = os.getenv("PUBLIC_API_ENABLED", "true").strip().lower() in {"1", "true", "yes", "on"}
+    return Settings(
+        model_name=os.getenv("MODEL_NAME", DEFAULT_SETTINGS.model_name),
+        max_trace_tokens=int(os.getenv("MAX_TRACE_TOKENS", DEFAULT_SETTINGS.max_trace_tokens)),
+        max_sentences=int(os.getenv("MAX_SENTENCES", DEFAULT_SETTINGS.max_sentences)),
+        take_log=take_log,
+        device_preference=os.getenv("DEVICE_PREFERENCE", DEFAULT_SETTINGS.device_preference),  # type: ignore[arg-type]
+        dtype_preference=os.getenv("DTYPE_PREFERENCE", DEFAULT_SETTINGS.dtype_preference),  # type: ignore[arg-type]
+        attn_implementation=os.getenv("ATTN_IMPLEMENTATION", DEFAULT_SETTINGS.attn_implementation),
+        trust_remote_code=trust_remote_code,
+        low_cpu_mem_usage=low_cpu_mem_usage,
+        preload_model=os.getenv("PRELOAD_MODEL", "false").strip().lower() in {"1", "true", "yes", "on"},
+        api_host=os.getenv("API_HOST", DEFAULT_SETTINGS.api_host),
+        api_port=int(os.getenv("API_PORT", DEFAULT_SETTINGS.api_port)),
+        database_path=os.getenv("DATABASE_PATH", DEFAULT_SETTINGS.database_path),
+        job_workers=int(os.getenv("JOB_WORKERS", DEFAULT_SETTINGS.job_workers)),
+        max_queued_jobs=int(os.getenv("MAX_QUEUED_JOBS", DEFAULT_SETTINGS.max_queued_jobs)),
+        max_active_jobs_per_user=int(
+            os.getenv("MAX_ACTIVE_JOBS_PER_USER", DEFAULT_SETTINGS.max_active_jobs_per_user)
+        ),
+        require_auth=require_auth,
+        public_api_enabled=public_api_enabled,
+    )

app/core/model_support.py ADDED Viewed

	@@ -0,0 +1,102 @@

+from __future__ import annotations
+from dataclasses import dataclass
+from typing import Any
+@dataclass(frozen=True, slots=True)
+class ModelSupport:
+    supports_attribution: bool
+    reason: str | None
+    layer_path: str | None
+    attention_attr: str | None
+    layer_count: int
+    attention_impl: str | None
+_LAYER_PATH_CANDIDATES: tuple[tuple[str, ...], ...] = (
+    ("model", "layers"),
+    ("model", "model", "layers"),
+    ("transformer", "h"),
+    ("gpt_neox", "layers"),
+)
+_ATTENTION_ATTR_CANDIDATES: tuple[str, ...] = ("self_attn", "attn", "attention")
+def _resolve_attr_chain(obj: Any, path: tuple[str, ...]) -> Any | None:
+    current = obj
+    for segment in path:
+        current = getattr(current, segment, None)
+        if current is None:
+            return None
+    return current
+def _get_attention_impl(model: Any) -> str | None:
+    config = getattr(model, "config", None)
+    if config is None:
+        return None
+    return getattr(config, "_attn_implementation", None) or getattr(config, "attn_implementation", None)
+def describe_model_support(model: Any) -> ModelSupport:
+    attn_impl = _get_attention_impl(model)
+    layers = None
+    layer_path = None
+    for candidate in _LAYER_PATH_CANDIDATES:
+        maybe_layers = _resolve_attr_chain(model, candidate)
+        if maybe_layers is not None:
+            layers = list(maybe_layers)
+            layer_path = ".".join(candidate)
+            break
+    if not layers:
+        return ModelSupport(
+            supports_attribution=False,
+            reason="Unsupported model structure: unable to locate decoder layers.",
+            layer_path=layer_path,
+            attention_attr=None,
+            layer_count=0,
+            attention_impl=attn_impl,
+        )
+    for attention_attr in _ATTENTION_ATTR_CANDIDATES:
+        if all(getattr(layer, attention_attr, None) is not None for layer in layers):
+            if attn_impl != "eager":
+                return ModelSupport(
+                    supports_attribution=False,
+                    reason="Attention gradients require attn_implementation='eager'.",
+                    layer_path=layer_path,
+                    attention_attr=attention_attr,
+                    layer_count=len(layers),
+                    attention_impl=attn_impl,
+                )
+            return ModelSupport(
+                supports_attribution=True,
+                reason=None,
+                layer_path=layer_path,
+                attention_attr=attention_attr,
+                layer_count=len(layers),
+                attention_impl=attn_impl,
+            )
+    return ModelSupport(
+        supports_attribution=False,
+        reason="Unsupported attention module layout: no known attention attribute found on decoder layers.",
+        layer_path=layer_path,
+        attention_attr=None,
+        layer_count=len(layers),
+        attention_impl=attn_impl,
+    )
+def get_decoder_layers(model: Any) -> tuple[list[Any], str, str]:
+    support = describe_model_support(model)
+    if not support.supports_attribution or support.layer_path is None or support.attention_attr is None:
+        reason = support.reason or "Model does not support attribution analysis."
+        raise RuntimeError(reason)
+    layers = _resolve_attr_chain(model, tuple(support.layer_path.split(".")))
+    if layers is None:
+        raise RuntimeError("Model support metadata became inconsistent while resolving layers.")
+    return list(layers), support.layer_path, support.attention_attr

app/core/runtime.py ADDED Viewed

	@@ -0,0 +1,93 @@

+from __future__ import annotations
+from dataclasses import dataclass
+from functools import lru_cache
+import torch
+from transformers import AutoModelForCausalLM, AutoTokenizer, PreTrainedModel, PreTrainedTokenizerBase
+from app.core.model_support import ModelSupport, describe_model_support
+@dataclass(slots=True)
+class ModelBundle:
+    model_name: str
+    model: PreTrainedModel
+    tokenizer: PreTrainedTokenizerBase
+    device: torch.device
+    dtype: torch.dtype
+    capability: ModelSupport
+def resolve_dtype(preference: str, device: torch.device) -> torch.dtype:
+    if preference == "float32":
+        return torch.float32
+    if preference == "float16":
+        return torch.float16
+    if preference == "bfloat16":
+        return torch.bfloat16
+    if device.type == "cuda":
+        return torch.bfloat16 if torch.cuda.is_bf16_supported() else torch.float16
+    if device.type == "mps":
+        return torch.float16
+    return torch.float32
+def resolve_device(preference: str = "auto") -> torch.device:
+    if preference == "cuda":
+        if not torch.cuda.is_available():
+            raise RuntimeError("CUDA requested but not available.")
+        return torch.device("cuda")
+    if preference == "mps":
+        if not torch.backends.mps.is_available():
+            raise RuntimeError("MPS requested but not available.")
+        return torch.device("mps")
+    if preference == "cpu":
+        return torch.device("cpu")
+    if torch.cuda.is_available():
+        return torch.device("cuda")
+    if torch.backends.mps.is_available():
+        return torch.device("mps")
+    return torch.device("cpu")
+@lru_cache(maxsize=2)
+def load_model_bundle(
+    model_name: str,
+    device_preference: str = "auto",
+    dtype_preference: str = "auto",
+    attn_implementation: str = "eager",
+    trust_remote_code: bool = True,
+    low_cpu_mem_usage: bool = True,
+) -> ModelBundle:
+    device = resolve_device(device_preference)
+    dtype = resolve_dtype(dtype_preference, device)
+    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=trust_remote_code)
+    if tokenizer.pad_token is None:
+        tokenizer.pad_token = tokenizer.eos_token
+    model = AutoModelForCausalLM.from_pretrained(
+        model_name,
+        trust_remote_code=trust_remote_code,
+        attn_implementation=attn_implementation,
+        torch_dtype=dtype,
+        low_cpu_mem_usage=low_cpu_mem_usage,
+    )
+    model.to(device)
+    model.eval()
+    capability = describe_model_support(model)
+    return ModelBundle(
+        model_name=model_name,
+        model=model,
+        tokenizer=tokenizer,
+        device=device,
+        dtype=dtype,
+        capability=capability,
+    )
+def compute_attribution_analysis(**kwargs):
+    from app.core.runtime_pipeline import compute_attribution_analysis as _compute_attribution_analysis
+    return _compute_attribution_analysis(**kwargs)

app/core/runtime_pipeline.py ADDED Viewed

	@@ -0,0 +1,175 @@

+from __future__ import annotations
+from typing import Any
+import torch
+from app.analysis.sentence_split import split_sentences
+from app.analysis.summaries import (
+    compute_incoming_importance,
+    compute_outgoing_importance,
+    compute_top_edges,
+)
+from app.analysis.suppression import compute_attribution_matrix
+from app.analysis.token_boundaries import tokenize_with_sentence_ranges, truncate_text_to_token_limit
+from app.analysis.validation import validate_top_edges
+from app.core.config import get_settings
+from app.core.runtime import load_model_bundle
+from app.core.schemas import AnalysisResult, GenerationResult
+from app.generation.service import generate_answer_and_trace
+def compute_attribution_analysis(
+    *,
+    question: str,
+    model_name: str | None = None,
+    take_log: bool | None = None,
+    max_sentences: int | None = None,
+    max_trace_tokens: int | None = None,
+    validate_top_k: int = 0,
+    max_new_tokens: int = 512,
+    temperature: float = 0.6,
+    top_p: float = 0.95,
+    device_preference: str | None = None,
+    dtype_preference: str | None = None,
+    attn_implementation: str | None = None,
+    trust_remote_code: bool | None = None,
+    low_cpu_mem_usage: bool | None = None,
+) -> AnalysisResult:
+    generation = None
+    return analyze_generation_result(
+        question=question,
+        generation=generation,
+        model_name=model_name,
+        take_log=take_log,
+        max_sentences=max_sentences,
+        max_trace_tokens=max_trace_tokens,
+        validate_top_k=validate_top_k,
+        max_new_tokens=max_new_tokens,
+        temperature=temperature,
+        top_p=top_p,
+        device_preference=device_preference,
+        dtype_preference=dtype_preference,
+        attn_implementation=attn_implementation,
+        trust_remote_code=trust_remote_code,
+        low_cpu_mem_usage=low_cpu_mem_usage,
+    )
+def analyze_generation_result(
+    *,
+    question: str,
+    generation: GenerationResult | None = None,
+    model_name: str | None = None,
+    take_log: bool | None = None,
+    max_sentences: int | None = None,
+    max_trace_tokens: int | None = None,
+    validate_top_k: int = 0,
+    max_new_tokens: int = 512,
+    temperature: float = 0.6,
+    top_p: float = 0.95,
+    device_preference: str | None = None,
+    dtype_preference: str | None = None,
+    attn_implementation: str | None = None,
+    trust_remote_code: bool | None = None,
+    low_cpu_mem_usage: bool | None = None,
+) -> AnalysisResult:
+    settings = get_settings()
+    resolved_model_name = model_name or settings.model_name
+    resolved_take_log = settings.take_log if take_log is None else take_log
+    resolved_max_sentences = max_sentences or settings.max_sentences
+    resolved_max_trace_tokens = max_trace_tokens or settings.max_trace_tokens
+    resolved_device = device_preference or settings.device_preference
+    resolved_dtype = dtype_preference or settings.dtype_preference
+    resolved_attn_implementation = attn_implementation or settings.attn_implementation
+    resolved_trust_remote_code = settings.trust_remote_code if trust_remote_code is None else trust_remote_code
+    resolved_low_cpu_mem_usage = (
+        settings.low_cpu_mem_usage if low_cpu_mem_usage is None else low_cpu_mem_usage
+    )
+    bundle = load_model_bundle(
+        resolved_model_name,
+        device_preference=resolved_device,
+        dtype_preference=resolved_dtype,
+        attn_implementation=resolved_attn_implementation,
+        trust_remote_code=resolved_trust_remote_code,
+        low_cpu_mem_usage=resolved_low_cpu_mem_usage,
+    )
+    if not bundle.capability.supports_attribution:
+        reason = bundle.capability.reason or "Model does not support attribution analysis."
+        raise RuntimeError(reason)
+    if generation is None:
+        generation = generate_answer_and_trace(
+            question=question,
+            model_name=resolved_model_name,
+            model=bundle.model,
+            tokenizer=bundle.tokenizer,
+            max_new_tokens=max_new_tokens,
+            temperature=temperature,
+            top_p=top_p,
+        )
+    truncated_text = truncate_text_to_token_limit(
+        generation.normalized_trace_text,
+        bundle.tokenizer,
+        resolved_max_trace_tokens,
+    )
+    sentence_spans = split_sentences(truncated_text)
+    if resolved_max_sentences > 0 and len(sentence_spans) > resolved_max_sentences:
+        sentence_spans = sentence_spans[:resolved_max_sentences]
+        truncated_text = truncated_text[: sentence_spans[-1].end_char]
+        sentence_spans = split_sentences(truncated_text)
+    if not sentence_spans:
+        raise RuntimeError("Trace normalization produced no analyzable sentences.")
+    mapping = tokenize_with_sentence_ranges(truncated_text, sentence_spans, bundle.tokenizer)
+    input_ids = mapping.input_ids.to(bundle.device)
+    computation = compute_attribution_matrix(
+        input_ids=input_ids,
+        token_ranges=mapping.token_ranges,
+        model=bundle.model,
+        take_log=resolved_take_log,
+        max_trace_tokens=resolved_max_trace_tokens,
+        max_sentences=resolved_max_sentences,
+    )
+    outgoing = compute_outgoing_importance(computation.raw_matrix)
+    incoming = compute_incoming_importance(computation.raw_matrix)
+    top_edges = compute_top_edges(computation.raw_matrix, top_k=10)
+    validation = validate_top_edges(
+        model=bundle.model,
+        input_ids=input_ids,
+        token_ranges=mapping.token_ranges,
+        top_edges=top_edges,
+        baseline_token_nll=computation.token_nll,
+        top_k=validate_top_k,
+    )
+    return AnalysisResult(
+        question=question,
+        model_name=resolved_model_name,
+        answer=generation.answer,
+        raw_trace_text=generation.raw_trace_text,
+        normalized_trace_text=truncated_text,
+        sentences=[span.text for span in sentence_spans],
+        sentence_token_ranges=mapping.token_ranges,
+        suppression_matrix=computation.matrix.tolist(),
+        raw_suppression_matrix=computation.raw_matrix.tolist(),
+        outgoing_importance=outgoing,
+        incoming_importance=incoming,
+        top_edges=top_edges,
+        runtime_metadata=computation.runtime_metadata,
+        validation_metadata=validation,
+        extra_metadata={
+            "raw_generation_text": generation.raw_generation_text,
+            "generation_metadata": generation.generation_metadata.model_dump(),
+            "effective_runtime": {
+                "device_preference": resolved_device,
+                "dtype_preference": resolved_dtype,
+                "attn_implementation": resolved_attn_implementation,
+                "trust_remote_code": resolved_trust_remote_code,
+                "low_cpu_mem_usage": resolved_low_cpu_mem_usage,
+            },
+        },
+    )

app/core/schemas.py ADDED Viewed

	@@ -0,0 +1,167 @@

+from __future__ import annotations
+from typing import Any
+from pydantic import BaseModel, Field
+class TopEdge(BaseModel):
+    source_sentence_idx: int
+    target_sentence_idx: int
+    score: float
+class ModelCapability(BaseModel):
+    supports_attribution: bool
+    reason: str | None = None
+    layer_path: str | None = None
+    attention_attr: str | None = None
+    layer_count: int = 0
+    attention_impl: str | None = None
+class RuntimeMetadata(BaseModel):
+    forward_pass_ms: float = 0.0
+    backward_pass_ms: float = 0.0
+    matrix_computation_ms: float = 0.0
+    total_analysis_ms: float = 0.0
+    num_layers: int = 0
+    num_heads: int = 0
+    sequence_length_tokens: int = 0
+    sentence_count: int = 0
+    device: str = "unknown"
+    dtype: str = "unknown"
+    attention_impl: str = "unknown"
+    max_trace_tokens: int = 0
+    max_sentences: int = 0
+    capability: ModelCapability = Field(default_factory=lambda: ModelCapability(supports_attribution=False))
+class ValidationMetadata(BaseModel):
+    enabled: bool = False
+    top_k: int = 0
+    pearson: float | None = None
+    spearman: float | None = None
+    compared_edges: list[TopEdge] = Field(default_factory=list)
+    notes: str | None = None
+class GenerationMetadata(BaseModel):
+    max_new_tokens: int
+    temperature: float
+    top_p: float
+    do_sample: bool
+class GenerationResult(BaseModel):
+    question: str
+    model_name: str
+    answer: str
+    raw_generation_text: str
+    raw_trace_text: str
+    normalized_trace_text: str
+    generation_metadata: GenerationMetadata
+class AnalysisResult(BaseModel):
+    question: str
+    model_name: str
+    answer: str
+    raw_trace_text: str
+    normalized_trace_text: str
+    sentences: list[str]
+    sentence_token_ranges: list[tuple[int, int]]
+    suppression_matrix: list[list[float]]
+    raw_suppression_matrix: list[list[float]] | None = None
+    outgoing_importance: list[float]
+    incoming_importance: list[float]
+    top_edges: list[TopEdge]
+    runtime_metadata: RuntimeMetadata
+    validation_metadata: ValidationMetadata | None = None
+    extra_metadata: dict[str, Any] = Field(default_factory=dict)
+class AnalysisRequest(BaseModel):
+    question: str
+    model_name: str | None = None
+    take_log: bool | None = None
+    max_sentences: int | None = None
+    max_trace_tokens: int | None = None
+    validate_top_k: int = 0
+    max_new_tokens: int = 256
+    temperature: float = 0.6
+    top_p: float = 0.95
+    device_preference: str | None = None
+    dtype_preference: str | None = None
+    attn_implementation: str | None = None
+    trust_remote_code: bool | None = None
+    low_cpu_mem_usage: bool | None = None
+class HealthResponse(BaseModel):
+    status: str
+    model_name: str
+    device_preference: str
+    dtype_preference: str
+    preload_model: bool
+    cuda_available: bool
+    mps_available: bool
+    require_auth: bool
+    public_api_enabled: bool
+    max_queued_jobs: int
+    max_active_jobs_per_user: int
+class WarmupResponse(BaseModel):
+    status: str
+    model_name: str
+    device: str
+    dtype: str
+    capability: ModelCapability
+class CurrentUserResponse(BaseModel):
+    authenticated: bool
+    auth_required: bool
+    username: str | None = None
+    full_name: str | None = None
+    avatar_url: str | None = None
+    login_url: str = "/oauth/huggingface/login"
+    logout_url: str = "/oauth/huggingface/logout"
+class SessionCreateRequest(BaseModel):
+    question: str
+    model_name: str | None = None
+    take_log: bool | None = None
+    max_sentences: int | None = None
+    max_trace_tokens: int | None = None
+    validate_top_k: int = 0
+    max_new_tokens: int = 256
+    temperature: float = 0.6
+    top_p: float = 0.95
+    device_preference: str | None = None
+    dtype_preference: str | None = None
+    attn_implementation: str | None = None
+    trust_remote_code: bool | None = None
+    low_cpu_mem_usage: bool | None = None
+class SessionResponse(BaseModel):
+    id: str
+    status: str
+    question: str
+    model_name: str
+    error: str | None = None
+    created_at: str
+    updated_at: str
+    answer: str | None = None
+    raw_trace_text: str | None = None
+    normalized_trace_text: str | None = None
+    sentences: list[str] | None = None
+    generation_metadata: dict[str, Any] | None = None
+class SessionResultResponse(BaseModel):
+    session: SessionResponse
+    analysis: AnalysisResult | None = None

app/frontend/app.js ADDED Viewed

	@@ -0,0 +1,340 @@

+const form = document.querySelector("#question-form");
+const submitButton = document.querySelector("#submit-button");
+const statusChip = document.querySelector("#status-chip");
+const statusDetail = document.querySelector("#status-detail");
+const authSummary = document.querySelector("#auth-summary");
+const loginLink = document.querySelector("#login-link");
+const logoutLink = document.querySelector("#logout-link");
+const recentSessions = document.querySelector("#recent-sessions");
+const answerOutput = document.querySelector("#answer-output");
+const sessionIdOutput = document.querySelector("#session-id");
+const sentenceList = document.querySelector("#sentence-list");
+const traceCount = document.querySelector("#trace-count");
+const heatmap = document.querySelector("#heatmap");
+const selectionDetails = document.querySelector("#selection-details");
+const metrics = document.querySelector("#metrics");
+const topEdges = document.querySelector("#top-edges");
+const exportJson = document.querySelector("#export-json");
+const exportCsv = document.querySelector("#export-csv");
+let activeSessionId = null;
+let pollHandle = null;
+let activePayload = null;
+let selectedSentenceIdx = null;
+let selectedCell = null;
+let currentUser = null;
+function currentSentences() {
+  return activePayload?.analysis?.sentences || activePayload?.session?.sentences || [];
+}
+function setStatus(label, detail) {
+  statusChip.textContent = label;
+  statusDetail.textContent = detail;
+}
+function clearSelection() {
+  selectedSentenceIdx = null;
+  selectedCell = null;
+  selectionDetails.textContent = "Choose a sentence or heatmap cell.";
+  metrics.innerHTML = "";
+}
+function setExportLinks(sessionId, enabled) {
+  exportJson.href = enabled ? `/api/sessions/${sessionId}/export.json` : "#";
+  exportCsv.href = enabled ? `/api/sessions/${sessionId}/export.csv` : "#";
+  exportJson.classList.toggle("is-disabled", !enabled);
+  exportCsv.classList.toggle("is-disabled", !enabled);
+  exportJson.setAttribute("aria-disabled", String(!enabled));
+  exportCsv.setAttribute("aria-disabled", String(!enabled));
+}
+function renderRecentSessions(sessions) {
+  recentSessions.innerHTML = "";
+  if (!currentUser?.authenticated) {
+    recentSessions.textContent = currentUser?.auth_required
+      ? "Sign in to view your jobs."
+      : "Anonymous mode is enabled.";
+    return;
+  }
+  if (!sessions.length) {
+    recentSessions.textContent = "No jobs yet on this running Space instance.";
+    return;
+  }
+  sessions.forEach((session) => {
+    const item = document.createElement("button");
+    item.type = "button";
+    item.className = "session-card";
+    if (session.id === activeSessionId) {
+      item.classList.add("is-active");
+    }
+    item.innerHTML = `<strong>${session.status}</strong>${session.question}`;
+    item.addEventListener("click", () => {
+      startPolling(session.id);
+    });
+    recentSessions.appendChild(item);
+  });
+}
+async function loadRecentSessions() {
+  if (!currentUser?.authenticated && currentUser?.auth_required) {
+    renderRecentSessions([]);
+    return;
+  }
+  const response = await fetch("/api/sessions?limit=8");
+  if (!response.ok) {
+    renderRecentSessions([]);
+    return;
+  }
+  renderRecentSessions(await response.json());
+}
+function renderAuth() {
+  if (!currentUser) {
+    authSummary.textContent = "Checking sign-in status.";
+    return;
+  }
+  loginLink.hidden = currentUser.authenticated;
+  logoutLink.hidden = !currentUser.authenticated;
+  if (currentUser.authenticated) {
+    authSummary.textContent = `Signed in as ${currentUser.full_name || currentUser.username}.`;
+    return;
+  }
+  authSummary.textContent = currentUser.auth_required
+    ? "Sign in with Hugging Face to run analysis jobs."
+    : "Anonymous access is enabled for this Space.";
+}
+function renderSession(session) {
+  sessionIdOutput.textContent = session.id ? `Session ${session.id.slice(0, 8)}` : "";
+  answerOutput.textContent = session.answer || "Waiting for generated answer.";
+  const sentences = currentSentences();
+  traceCount.textContent = sentences.length ? `${sentences.length} sentences` : "";
+  sentenceList.innerHTML = "";
+  sentences.forEach((sentence, index) => {
+    const item = document.createElement("li");
+    item.className = "sentence-item";
+    item.innerHTML = `<strong>${index}</strong> ${sentence}`;
+    item.addEventListener("click", () => {
+      selectedSentenceIdx = index;
+      selectedCell = null;
+      renderSelection();
+    });
+    sentenceList.appendChild(item);
+  });
+}
+function colorForValue(value, maxAbs) {
+  if (!maxAbs) {
+    return "rgba(224, 223, 218, 0.8)";
+  }
+  const normalized = Math.max(-1, Math.min(1, value / maxAbs));
+  if (normalized >= 0) {
+    const alpha = 0.12 + normalized * 0.88;
+    return `rgba(14, 90, 138, ${alpha.toFixed(3)})`;
+  }
+  const alpha = 0.12 + Math.abs(normalized) * 0.88;
+  return `rgba(215, 106, 52, ${alpha.toFixed(3)})`;
+}
+function renderHeatmap(result) {
+  const matrix = result?.suppression_matrix;
+  if (!matrix || !matrix.length) {
+    heatmap.className = "heatmap placeholder-box";
+    heatmap.textContent = "Analysis pending.";
+    return;
+  }
+  const flatValues = matrix.flat();
+  const maxAbs = Math.max(...flatValues.map((value) => Math.abs(value)));
+  heatmap.className = "heatmap";
+  heatmap.innerHTML = "";
+  const grid = document.createElement("div");
+  grid.className = "heatmap-grid";
+  grid.style.gridTemplateColumns = `repeat(${matrix.length}, 32px)`;
+  matrix.forEach((row, rowIndex) => {
+    row.forEach((value, colIndex) => {
+      const cell = document.createElement("button");
+      cell.type = "button";
+      cell.className = "heatmap-cell";
+      cell.style.background = colorForValue(value, maxAbs);
+      cell.title = `target ${rowIndex} ← source ${colIndex}: ${value.toFixed(4)}`;
+      if (selectedCell && selectedCell.row === rowIndex && selectedCell.col === colIndex) {
+        cell.classList.add("is-selected");
+      }
+      cell.addEventListener("click", () => {
+        selectedSentenceIdx = null;
+        selectedCell = { row: rowIndex, col: colIndex, value };
+        renderSelection();
+      });
+      grid.appendChild(cell);
+    });
+  });
+  heatmap.appendChild(grid);
+}
+function renderTopEdges(result) {
+  topEdges.innerHTML = "";
+  const edges = result?.top_edges || [];
+  if (!edges.length) {
+    return;
+  }
+  edges.slice(0, 5).forEach((edge) => {
+    const item = document.createElement("div");
+    item.className = "edge-card";
+    item.innerHTML = `<strong>${edge.source_sentence_idx} → ${edge.target_sentence_idx}</strong>${edge.score.toFixed(4)}`;
+    topEdges.appendChild(item);
+  });
+}
+function renderSelection() {
+  Array.from(sentenceList.children).forEach((item, index) => {
+    item.classList.toggle("is-active", selectedSentenceIdx === index);
+  });
+  const result = activePayload?.analysis;
+  const session = activePayload?.session;
+  if (!session) {
+    clearSelection();
+    return;
+  }
+  metrics.innerHTML = "";
+  if (selectedSentenceIdx != null && result) {
+    const outgoing = result.outgoing_importance[selectedSentenceIdx] ?? 0;
+    const incoming = result.incoming_importance[selectedSentenceIdx] ?? 0;
+    selectionDetails.innerHTML = `<strong>Sentence ${selectedSentenceIdx}</strong><br>${currentSentences()[selectedSentenceIdx] || ""}`;
+    metrics.innerHTML = `
+      <div class="metric-card"><strong>Outgoing impact</strong>${outgoing.toFixed(4)}</div>
+      <div class="metric-card"><strong>Incoming dependence</strong>${incoming.toFixed(4)}</div>
+    `;
+    return;
+  }
+  if (selectedCell) {
+    selectionDetails.innerHTML = `<strong>Edge ${selectedCell.col} → ${selectedCell.row}</strong><br>Influence score ${selectedCell.value.toFixed(4)}`;
+    metrics.innerHTML = `
+      <div class="metric-card"><strong>Source sentence</strong>${selectedCell.col}</div>
+      <div class="metric-card"><strong>Target sentence</strong>${selectedCell.row}</div>
+    `;
+    return;
+  }
+  selectionDetails.textContent = "Choose a sentence or heatmap cell.";
+}
+async function fetchSession(sessionId) {
+  const response = await fetch(`/api/sessions/${sessionId}/result`);
+  if (!response.ok) {
+    throw new Error(`Failed to fetch session ${sessionId}`);
+  }
+  return response.json();
+}
+function updateFromPayload(payload) {
+  activePayload = payload;
+  activeSessionId = payload.session.id;
+  renderSession(payload.session);
+  renderHeatmap(payload.analysis);
+  renderTopEdges(payload.analysis);
+  renderSelection();
+  setExportLinks(payload.session.id, payload.session.status === "completed");
+  const { status, error } = payload.session;
+  if (status === "queued") setStatus("Queued", "Waiting for a worker slot.");
+  if (status === "generating") setStatus("Generating", "The model is producing an answer and visible trace.");
+  if (status === "answer_ready") setStatus("Analysis pending", "Answer is ready. Attribution analysis is starting.");
+  if (status === "analyzing") setStatus("Analyzing", "Running forward and backward passes for sentence influence.");
+  if (status === "completed") setStatus("Completed", "Analysis finished.");
+  if (status === "failed") setStatus("Failed", error || "The session failed.");
+  if (["completed", "failed"].includes(status) && pollHandle) {
+    window.clearInterval(pollHandle);
+    pollHandle = null;
+  }
+  loadRecentSessions().catch(() => {});
+}
+async function startPolling(sessionId) {
+  activeSessionId = sessionId;
+  if (pollHandle) {
+    window.clearInterval(pollHandle);
+  }
+  const tick = async () => {
+    try {
+      const payload = await fetchSession(sessionId);
+      updateFromPayload(payload);
+    } catch (error) {
+      setStatus("Error", String(error));
+      window.clearInterval(pollHandle);
+      pollHandle = null;
+    }
+  };
+  await tick();
+  pollHandle = window.setInterval(tick, 2500);
+}
+form.addEventListener("submit", async (event) => {
+  event.preventDefault();
+  if (!currentUser?.authenticated && currentUser?.auth_required) {
+    window.location.href = loginLink.href;
+    return;
+  }
+  submitButton.disabled = true;
+  clearSelection();
+  sentenceList.innerHTML = "";
+  heatmap.className = "heatmap placeholder-box";
+  heatmap.textContent = "Analysis pending.";
+  topEdges.innerHTML = "";
+  answerOutput.textContent = "Waiting for generated answer.";
+  sessionIdOutput.textContent = "";
+  traceCount.textContent = "";
+  setExportLinks("", false);
+  setStatus("Submitting", "Creating session.");
+  const payload = {
+    question: document.querySelector("#question").value,
+    max_new_tokens: Number(document.querySelector("#max-new-tokens").value),
+    max_trace_tokens: Number(document.querySelector("#max-trace-tokens").value),
+    max_sentences: Number(document.querySelector("#max-sentences").value),
+  };
+  try {
+    const response = await fetch("/api/sessions", {
+      method: "POST",
+      headers: { "content-type": "application/json" },
+      body: JSON.stringify(payload),
+    });
+    if (!response.ok) {
+      throw new Error(await response.text());
+    }
+    const session = await response.json();
+    await startPolling(session.id);
+  } catch (error) {
+    setStatus("Error", String(error));
+  } finally {
+    submitButton.disabled = false;
+  }
+});
+async function initialize() {
+  setExportLinks("", false);
+  try {
+    const response = await fetch("/api/me");
+    currentUser = await response.json();
+  } catch (_error) {
+    currentUser = {
+      authenticated: false,
+      auth_required: true,
+      login_url: "/oauth/huggingface/login",
+      logout_url: "/oauth/huggingface/logout",
+    };
+  }
+  renderAuth();
+  await loadRecentSessions();
+}
+initialize();

app/frontend/index.html ADDED Viewed

	@@ -0,0 +1,108 @@

+<!DOCTYPE html>
+<html lang="en">
+  <head>
+    <meta charset="utf-8" />
+    <meta name="viewport" content="width=device-width, initial-scale=1" />
+    <title>Thought Anchors</title>
+    <link rel="stylesheet" href="/static/styles.css" />
+  </head>
+  <body>
+    <main class="page">
+      <section class="auth-strip panel">
+        <div>
+          <p class="eyebrow">Public Hugging Face Space</p>
+          <p id="auth-summary" class="lede compact">Checking sign-in status.</p>
+        </div>
+        <div class="auth-actions">
+          <a id="login-link" class="pill-link" href="/oauth/huggingface/login">Sign in with Hugging Face</a>
+          <a id="logout-link" class="pill-link secondary" href="/oauth/huggingface/logout">Sign out</a>
+        </div>
+      </section>
+      <section class="hero">
+        <p class="eyebrow">Online Chain-of-Thought Analysis</p>
+        <h1>Reasoning traces with white-box sentence influence.</h1>
+        <p class="lede">
+          Submit a question, watch the answer appear first, then inspect the sentence-to-sentence
+          attribution matrix once analysis completes. Results can be exported as JSON or CSV.
+        </p>
+      </section>
+      <section class="panel composer">
+        <form id="question-form">
+          <label class="label" for="question">Question</label>
+          <textarea id="question" name="question" rows="5" placeholder="Explain why the derivative of x^2 is 2x" required></textarea>
+          <div class="controls">
+            <label>
+              <span>Max new tokens</span>
+              <input id="max-new-tokens" type="number" min="16" max="512" value="128" />
+            </label>
+            <label>
+              <span>Max trace tokens</span>
+              <input id="max-trace-tokens" type="number" min="64" max="1024" value="256" />
+            </label>
+            <label>
+              <span>Max sentences</span>
+              <input id="max-sentences" type="number" min="4" max="40" value="16" />
+            </label>
+          </div>
+          <button id="submit-button" type="submit">Start analysis</button>
+        </form>
+      </section>
+      <section class="status-row">
+        <div class="status-chip" id="status-chip">Idle</div>
+        <div class="status-detail" id="status-detail">Submit a question to create a session.</div>
+      </section>
+      <section class="grid">
+        <article class="panel jobs-panel">
+          <div class="panel-heading">
+            <h2>Your Sessions</h2>
+            <span class="muted">Ephemeral instance history</span>
+          </div>
+          <div id="recent-sessions" class="recent-sessions placeholder">Sign in to view your jobs.</div>
+        </article>
+        <article class="panel answer-panel">
+          <div class="panel-heading">
+            <h2>Answer</h2>
+            <span id="session-id" class="muted"></span>
+          </div>
+          <p id="answer-output" class="placeholder">No answer yet.</p>
+        </article>
+        <article class="panel trace-panel">
+          <div class="panel-heading">
+            <h2>Reasoning Trace</h2>
+            <span class="muted" id="trace-count"></span>
+          </div>
+          <ol id="sentence-list" class="sentence-list"></ol>
+        </article>
+        <article class="panel heatmap-panel">
+          <div class="panel-heading">
+            <h2>Sentence Influence Matrix</h2>
+            <span class="muted">Rows: targets, columns: sources</span>
+          </div>
+          <div id="heatmap" class="heatmap placeholder-box">Analysis pending.</div>
+        </article>
+        <article class="panel details-panel">
+          <div class="panel-heading">
+            <h2>Selection</h2>
+          </div>
+          <div id="selection-details" class="placeholder">Choose a sentence or heatmap cell.</div>
+          <div class="metrics" id="metrics"></div>
+          <div class="edges" id="top-edges"></div>
+          <div class="export-actions">
+            <a id="export-json" class="pill-link is-disabled" href="#" aria-disabled="true">Export JSON</a>
+            <a id="export-csv" class="pill-link secondary is-disabled" href="#" aria-disabled="true">Export CSV</a>
+          </div>
+        </article>
+      </section>
+    </main>
+    <script src="/static/app.js" defer></script>
+  </body>
+  </html>

app/frontend/styles.css ADDED Viewed

	@@ -0,0 +1,335 @@

+:root {
+  --bg: #f3efe6;
+  --panel: rgba(255, 252, 247, 0.9);
+  --ink: #1e1c19;
+  --muted: #6f655c;
+  --accent: #0e5a8a;
+  --accent-soft: #d2ecff;
+  --line: rgba(30, 28, 25, 0.12);
+  --warm: #d76a34;
+  --shadow: 0 20px 60px rgba(34, 25, 16, 0.08);
+}
+* {
+  box-sizing: border-box;
+}
+body {
+  margin: 0;
+  font-family: "Iowan Old Style", "Palatino Linotype", serif;
+  color: var(--ink);
+  background:
+    radial-gradient(circle at top left, rgba(14, 90, 138, 0.12), transparent 32%),
+    radial-gradient(circle at top right, rgba(215, 106, 52, 0.18), transparent 26%),
+    linear-gradient(180deg, #f8f4ec 0%, var(--bg) 100%);
+}
+.page {
+  max-width: 1440px;
+  margin: 0 auto;
+  padding: 40px 24px 48px;
+}
+.hero {
+  max-width: 760px;
+  margin-bottom: 28px;
+}
+.compact {
+  margin: 0;
+}
+.auth-strip {
+  display: flex;
+  align-items: center;
+  justify-content: space-between;
+  gap: 16px;
+  margin-bottom: 18px;
+}
+.auth-actions,
+.export-actions {
+  display: flex;
+  flex-wrap: wrap;
+  gap: 10px;
+}
+.eyebrow {
+  margin: 0 0 8px;
+  text-transform: uppercase;
+  letter-spacing: 0.12em;
+  font-size: 0.78rem;
+  color: var(--accent);
+}
+h1, h2 {
+  margin: 0;
+  font-weight: 600;
+}
+h1 {
+  font-size: clamp(2.4rem, 5vw, 4.4rem);
+  line-height: 0.95;
+}
+.lede {
+  margin: 18px 0 0;
+  font-size: 1.08rem;
+  color: var(--muted);
+  max-width: 60ch;
+}
+.panel {
+  background: var(--panel);
+  border: 1px solid var(--line);
+  border-radius: 24px;
+  padding: 20px;
+  box-shadow: var(--shadow);
+  backdrop-filter: blur(18px);
+}
+.composer {
+  margin-bottom: 16px;
+}
+.label,
+.controls span {
+  display: block;
+  margin-bottom: 8px;
+  font-size: 0.92rem;
+  color: var(--muted);
+}
+textarea,
+input {
+  width: 100%;
+  border: 1px solid rgba(30, 28, 25, 0.16);
+  border-radius: 14px;
+  padding: 14px 16px;
+  font: inherit;
+  background: rgba(255, 255, 255, 0.72);
+}
+textarea {
+  resize: vertical;
+  min-height: 132px;
+}
+.controls {
+  display: grid;
+  grid-template-columns: repeat(3, minmax(0, 1fr));
+  gap: 12px;
+  margin: 16px 0;
+}
+button {
+  appearance: none;
+  border: none;
+  border-radius: 999px;
+  padding: 14px 20px;
+  font: inherit;
+  font-weight: 600;
+  color: white;
+  background: linear-gradient(135deg, var(--accent), #123954);
+  cursor: pointer;
+}
+.pill-link {
+  display: inline-flex;
+  align-items: center;
+  justify-content: center;
+  border-radius: 999px;
+  padding: 12px 18px;
+  text-decoration: none;
+  font-weight: 600;
+  color: white;
+  background: linear-gradient(135deg, var(--accent), #123954);
+}
+.pill-link.secondary {
+  color: var(--ink);
+  background: rgba(255, 255, 255, 0.78);
+  border: 1px solid var(--line);
+}
+.pill-link.is-disabled {
+  pointer-events: none;
+  opacity: 0.45;
+}
+button:disabled {
+  opacity: 0.5;
+  cursor: wait;
+}
+.status-row {
+  display: flex;
+  align-items: center;
+  gap: 12px;
+  margin-bottom: 16px;
+}
+.status-chip {
+  display: inline-flex;
+  align-items: center;
+  border-radius: 999px;
+  padding: 8px 14px;
+  background: var(--accent-soft);
+  color: var(--accent);
+  font-size: 0.9rem;
+}
+.status-detail,
+.muted,
+.placeholder {
+  color: var(--muted);
+}
+.grid {
+  display: grid;
+  grid-template-columns: 1.2fr 1fr;
+  gap: 16px;
+}
+.jobs-panel {
+  grid-column: 1 / -1;
+}
+.answer-panel,
+.trace-panel,
+.heatmap-panel,
+.details-panel {
+  min-height: 260px;
+}
+.answer-panel,
+.trace-panel {
+  grid-column: span 1;
+}
+.heatmap-panel,
+.details-panel {
+  grid-column: span 1;
+}
+.panel-heading {
+  display: flex;
+  justify-content: space-between;
+  gap: 12px;
+  align-items: baseline;
+  margin-bottom: 16px;
+}
+.sentence-list {
+  list-style: none;
+  margin: 0;
+  padding: 0;
+  max-height: 420px;
+  overflow: auto;
+}
+.sentence-item {
+  padding: 10px 12px;
+  border-radius: 14px;
+  border: 1px solid transparent;
+  margin-bottom: 8px;
+  background: rgba(255,255,255,0.45);
+  cursor: pointer;
+}
+.sentence-item:hover,
+.sentence-item.is-active {
+  border-color: rgba(14, 90, 138, 0.25);
+  background: rgba(210, 236, 255, 0.5);
+}
+.sentence-item strong {
+  display: inline-block;
+  min-width: 2.5rem;
+  color: var(--accent);
+}
+.heatmap {
+  display: grid;
+  gap: 4px;
+  overflow: auto;
+  min-height: 300px;
+}
+.heatmap-grid {
+  display: grid;
+  gap: 4px;
+}
+.heatmap-cell {
+  width: 32px;
+  height: 32px;
+  border-radius: 8px;
+  border: 1px solid rgba(255,255,255,0.4);
+  cursor: pointer;
+  position: relative;
+}
+.heatmap-cell.is-selected {
+  outline: 2px solid var(--ink);
+}
+.placeholder-box {
+  display: grid;
+  place-items: center;
+  border: 1px dashed var(--line);
+  border-radius: 18px;
+}
+.metric-card,
+.edge-card {
+  padding: 12px 14px;
+  border-radius: 16px;
+  background: rgba(255,255,255,0.45);
+  border: 1px solid var(--line);
+  margin-top: 10px;
+}
+.recent-sessions {
+  display: grid;
+  gap: 10px;
+}
+.session-card {
+  color: var(--ink);
+  border: 1px solid var(--line);
+  border-radius: 16px;
+  background: rgba(255, 255, 255, 0.48);
+  padding: 14px;
+  cursor: pointer;
+  text-align: left;
+}
+.session-card.is-active {
+  border-color: rgba(14, 90, 138, 0.25);
+  background: rgba(210, 236, 255, 0.42);
+}
+.session-card strong {
+  display: block;
+  margin-bottom: 4px;
+  color: var(--accent);
+}
+.metric-card strong,
+.edge-card strong {
+  display: block;
+  color: var(--accent);
+}
+@media (max-width: 960px) {
+  .auth-strip,
+  .grid,
+  .controls {
+    grid-template-columns: 1fr;
+  }
+  .auth-strip {
+    align-items: flex-start;
+  }
+}

app/generation/__init__.py ADDED Viewed

	@@ -0,0 +1 @@


1	+ """Generation utilities for trace-producing model calls."""

app/generation/prompting.py ADDED Viewed

	@@ -0,0 +1,28 @@

+from __future__ import annotations
+from typing import Any
+SYSTEM_PROMPT = (
+    "You are a careful reasoning assistant. Respond with your full reasoning inside "
+    "<think>...</think> and then provide a concise final answer after the closing tag."
+)
+def build_messages(question: str) -> list[dict[str, str]]:
+    return [
+        {"role": "system", "content": SYSTEM_PROMPT},
+        {"role": "user", "content": question},
+    ]
+def render_prompt(tokenizer: Any, question: str) -> str:
+    messages = build_messages(question)
+    if hasattr(tokenizer, "apply_chat_template"):
+        return tokenizer.apply_chat_template(
+            messages,
+            tokenize=False,
+            add_generation_prompt=True,
+        )
+    return f"{SYSTEM_PROMPT}\n\nQuestion: {question}\n"

app/generation/service.py ADDED Viewed

	@@ -0,0 +1,83 @@

+from __future__ import annotations
+import re
+from typing import Any
+import torch
+from app.analysis.sentence_split import normalize_trace_text
+from app.core.schemas import GenerationMetadata, GenerationResult
+from app.generation.prompting import render_prompt
+THINK_BLOCK_RE = re.compile(r"<think>(.*?)</think>", re.IGNORECASE | re.DOTALL)
+ANSWER_MARKER_RE = re.compile(r"(?:^|\n)(?:final answer|answer)\s*:\s*", re.IGNORECASE)
+def _extract_trace_and_answer(text: str) -> tuple[str, str]:
+    match = THINK_BLOCK_RE.search(text)
+    if match:
+        raw_trace = match.group(0)
+        answer = text[match.end() :].strip()
+        if not answer:
+            answer = match.group(1).strip()
+        return raw_trace, answer
+    raw_trace = text.strip()
+    answer_match = ANSWER_MARKER_RE.search(text)
+    if answer_match:
+        answer = text[answer_match.end() :].strip()
+    else:
+        paragraphs = [part.strip() for part in text.split("\n\n") if part.strip()]
+        answer = paragraphs[-1] if paragraphs else raw_trace
+    return raw_trace, answer
+def generate_answer_and_trace(
+    *,
+    question: str,
+    model_name: str,
+    model: Any,
+    tokenizer: Any,
+    max_new_tokens: int = 512,
+    temperature: float = 0.6,
+    top_p: float = 0.95,
+) -> GenerationResult:
+    prompt_text = render_prompt(tokenizer, question)
+    encoded = tokenizer(prompt_text, return_tensors="pt")
+    model_device = next(model.parameters()).device
+    encoded = {key: value.to(model_device) for key, value in encoded.items()}
+    input_length = int(encoded["input_ids"].shape[-1])
+    do_sample = temperature > 0.0
+    generation_kwargs: dict[str, Any] = {
+        "max_new_tokens": max_new_tokens,
+        "do_sample": do_sample,
+        "top_p": top_p,
+        "pad_token_id": tokenizer.pad_token_id,
+        "eos_token_id": tokenizer.eos_token_id,
+    }
+    if do_sample:
+        generation_kwargs["temperature"] = temperature
+    with torch.no_grad():
+        output_ids = model.generate(**encoded, **generation_kwargs)
+    generated_ids = output_ids[0, input_length:]
+    generated_text = tokenizer.decode(generated_ids, skip_special_tokens=False)
+    raw_trace_text, answer = _extract_trace_and_answer(generated_text)
+    normalized_trace_text = normalize_trace_text(raw_trace_text)
+    return GenerationResult(
+        question=question,
+        model_name=model_name,
+        answer=answer,
+        raw_generation_text=generated_text,
+        raw_trace_text=raw_trace_text,
+        normalized_trace_text=normalized_trace_text,
+        generation_metadata=GenerationMetadata(
+            max_new_tokens=max_new_tokens,
+            temperature=temperature,
+            top_p=top_p,
+            do_sample=do_sample,
+        ),
+    )

app/services/__init__.py ADDED Viewed

	@@ -0,0 +1 @@


1	+ """Application services for session orchestration."""

app/services/sessions.py ADDED Viewed

	@@ -0,0 +1,157 @@

+from __future__ import annotations
+from dataclasses import dataclass
+from app.analysis.sentence_split import split_sentences
+from app.core.config import Settings
+from app.core.runtime import load_model_bundle
+from app.core.runtime_pipeline import analyze_generation_result
+from app.core.schemas import AnalysisRequest, AnalysisResult, GenerationMetadata, GenerationResult
+from app.generation.service import generate_answer_and_trace
+from app.storage.repository import SessionRecord, SessionRepository
+from app.workers.jobs import JobRunner
+class SessionLimitError(RuntimeError):
+    pass
+class SessionAccessError(PermissionError):
+    pass
+@dataclass(slots=True)
+class SessionService:
+    settings: Settings
+    repository: SessionRepository
+    jobs: JobRunner
+    def create_session(self, request: AnalysisRequest, *, owner_id: str, owner_name: str | None) -> SessionRecord:
+        if self.repository.count_incomplete_sessions() >= self.settings.max_queued_jobs:
+            raise SessionLimitError("The service queue is full. Try again after a few minutes.")
+        if self.repository.count_incomplete_sessions_for_owner(owner_id) >= self.settings.max_active_jobs_per_user:
+            raise SessionLimitError("You already have the maximum number of active analysis jobs.")
+        model_name = request.model_name or self.settings.model_name
+        session = self.repository.create_session(
+            question=request.question,
+            model_name=model_name,
+            owner_id=owner_id,
+            owner_name=owner_name,
+        )
+        self.jobs.submit(self._run_session_pipeline, session.id, request)
+        return session
+    def start_analysis(
+        self,
+        session_id: str,
+        *,
+        owner_id: str,
+        request: AnalysisRequest | None = None,
+    ) -> SessionRecord:
+        session = self.repository.get_session(session_id)
+        self._assert_owner(session, owner_id)
+        effective_request = request or AnalysisRequest(question=session.question, model_name=session.model_name)
+        self.jobs.submit(self._run_analysis_only, session_id, effective_request)
+        return session
+    def get_session_payload(self, session_id: str, *, owner_id: str) -> dict:
+        session = self.repository.get_session(session_id)
+        self._assert_owner(session, owner_id)
+        return self.repository.list_session_payload(session_id)
+    def list_sessions(self, owner_id: str, *, limit: int = 20) -> list[dict]:
+        return self.repository.list_sessions_for_owner(owner_id, limit=limit)
+    def get_analysis_result(self, session_id: str, *, owner_id: str) -> AnalysisResult:
+        payload = self.get_session_payload(session_id, owner_id=owner_id)
+        analysis = payload.get("analysis")
+        if analysis is None:
+            raise KeyError(session_id)
+        return AnalysisResult.model_validate(analysis)
+    @staticmethod
+    def _assert_owner(session: SessionRecord, owner_id: str) -> None:
+        if session.owner_id != owner_id:
+            raise SessionAccessError("Session belongs to a different user.")
+    def _run_session_pipeline(self, session_id: str, request: AnalysisRequest) -> None:
+        try:
+            self.repository.update_status(session_id, status="generating")
+            bundle = load_model_bundle(
+                request.model_name or self.settings.model_name,
+                device_preference=request.device_preference or self.settings.device_preference,
+                dtype_preference=request.dtype_preference or self.settings.dtype_preference,
+                attn_implementation=request.attn_implementation or self.settings.attn_implementation,
+                trust_remote_code=(
+                    self.settings.trust_remote_code if request.trust_remote_code is None else request.trust_remote_code
+                ),
+                low_cpu_mem_usage=(
+                    self.settings.low_cpu_mem_usage
+                    if request.low_cpu_mem_usage is None
+                    else request.low_cpu_mem_usage
+                ),
+            )
+            generation = generate_answer_and_trace(
+                question=request.question,
+                model_name=bundle.model_name,
+                model=bundle.model,
+                tokenizer=bundle.tokenizer,
+                max_new_tokens=request.max_new_tokens,
+                temperature=request.temperature,
+                top_p=request.top_p,
+            )
+            sentences = [span.text for span in split_sentences(generation.normalized_trace_text)]
+            self.repository.save_generation_result(session_id, generation, sentences)
+            self.repository.update_status(session_id, status="answer_ready")
+            self._run_analysis_only(session_id, request, generation=generation)
+        except Exception as exc:
+            self.repository.update_status(session_id, status="failed", error=str(exc))
+    def _run_analysis_only(
+        self,
+        session_id: str,
+        request: AnalysisRequest,
+        *,
+        generation=None,
+    ) -> None:
+        try:
+            self.repository.update_status(session_id, status="analyzing")
+            if generation is None:
+                payload = self.repository.list_session_payload(session_id)
+                if payload.get("generation_metadata") is not None:
+                    generation = GenerationResult(
+                        question=payload["question"],
+                        model_name=payload["model_name"],
+                        answer=payload["answer"],
+                        raw_generation_text=payload.get("raw_generation_text", ""),
+                        raw_trace_text=payload["raw_trace_text"],
+                        normalized_trace_text=payload["normalized_trace_text"],
+                        generation_metadata=GenerationMetadata.model_validate(payload["generation_metadata"]),
+                    )
+            result = analyze_generation_result(
+                question=request.question,
+                generation=generation,
+                model_name=request.model_name or self.settings.model_name,
+                take_log=request.take_log,
+                max_sentences=request.max_sentences,
+                max_trace_tokens=request.max_trace_tokens,
+                validate_top_k=request.validate_top_k,
+                device_preference=request.device_preference or self.settings.device_preference,
+                dtype_preference=request.dtype_preference or self.settings.dtype_preference,
+                attn_implementation=request.attn_implementation or self.settings.attn_implementation,
+                trust_remote_code=(
+                    self.settings.trust_remote_code if request.trust_remote_code is None else request.trust_remote_code
+                ),
+                low_cpu_mem_usage=(
+                    self.settings.low_cpu_mem_usage
+                    if request.low_cpu_mem_usage is None
+                    else request.low_cpu_mem_usage
+                ),
+                max_new_tokens=request.max_new_tokens,
+                temperature=request.temperature,
+                top_p=request.top_p,
+            )
+            self.repository.save_analysis_result(session_id, result)
+            self.repository.update_status(session_id, status="completed")
+        except Exception as exc:
+            self.repository.update_status(session_id, status="failed", error=str(exc))

app/storage/__init__.py ADDED Viewed

	@@ -0,0 +1 @@


1	+ """Persistence layer for sessions and analysis results."""

app/storage/db.py ADDED Viewed

	@@ -0,0 +1,60 @@

+from __future__ import annotations
+import sqlite3
+from pathlib import Path
+def connect(database_path: str) -> sqlite3.Connection:
+    path = Path(database_path)
+    path.parent.mkdir(parents=True, exist_ok=True)
+    connection = sqlite3.connect(path, check_same_thread=False)
+    connection.row_factory = sqlite3.Row
+    connection.execute("PRAGMA journal_mode=WAL")
+    connection.execute("PRAGMA foreign_keys=ON")
+    return connection
+def initialize_schema(connection: sqlite3.Connection) -> None:
+    connection.executescript(
+        """
+        CREATE TABLE IF NOT EXISTS sessions (
+            id TEXT PRIMARY KEY,
+            status TEXT NOT NULL,
+            question TEXT NOT NULL,
+            model_name TEXT NOT NULL,
+            owner_id TEXT NOT NULL,
+            owner_name TEXT,
+            error TEXT,
+            created_at TEXT NOT NULL,
+            updated_at TEXT NOT NULL
+        );
+        CREATE TABLE IF NOT EXISTS generation_results (
+            session_id TEXT PRIMARY KEY REFERENCES sessions(id) ON DELETE CASCADE,
+            answer TEXT NOT NULL,
+            raw_generation_text TEXT NOT NULL,
+            raw_trace_text TEXT NOT NULL,
+            normalized_trace_text TEXT NOT NULL,
+            sentences_json TEXT NOT NULL,
+            generation_metadata_json TEXT NOT NULL
+        );
+        CREATE TABLE IF NOT EXISTS analysis_results (
+            session_id TEXT PRIMARY KEY REFERENCES sessions(id) ON DELETE CASCADE,
+            result_json TEXT NOT NULL
+        );
+        """
+    )
+    columns = {
+        row["name"] for row in connection.execute("PRAGMA table_info(generation_results)").fetchall()
+    }
+    if "raw_generation_text" not in columns:
+        connection.execute(
+            "ALTER TABLE generation_results ADD COLUMN raw_generation_text TEXT NOT NULL DEFAULT ''"
+        )
+    session_columns = {row["name"] for row in connection.execute("PRAGMA table_info(sessions)").fetchall()}
+    if "owner_id" not in session_columns:
+        connection.execute("ALTER TABLE sessions ADD COLUMN owner_id TEXT NOT NULL DEFAULT 'legacy-user'")
+    if "owner_name" not in session_columns:
+        connection.execute("ALTER TABLE sessions ADD COLUMN owner_name TEXT")
+    connection.commit()

app/storage/repository.py ADDED Viewed

	@@ -0,0 +1,182 @@

+from __future__ import annotations
+import json
+import threading
+import uuid
+from dataclasses import dataclass
+from datetime import UTC, datetime
+from typing import Any
+from app.core.schemas import AnalysisResult, GenerationResult
+from app.storage.db import connect, initialize_schema
+def _utc_now() -> str:
+    return datetime.now(UTC).isoformat()
+@dataclass(slots=True)
+class SessionRecord:
+    id: str
+    status: str
+    question: str
+    model_name: str
+    owner_id: str
+    owner_name: str | None
+    error: str | None
+    created_at: str
+    updated_at: str
+class SessionRepository:
+    def __init__(self, database_path: str) -> None:
+        self.connection = connect(database_path)
+        initialize_schema(self.connection)
+        self.lock = threading.Lock()
+    def create_session(self, *, question: str, model_name: str, owner_id: str, owner_name: str | None) -> SessionRecord:
+        session_id = str(uuid.uuid4())
+        now = _utc_now()
+        with self.lock:
+            self.connection.execute(
+                """
+                INSERT INTO sessions (id, status, question, model_name, owner_id, owner_name, error, created_at, updated_at)
+                VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)
+                """,
+                (session_id, "queued", question, model_name, owner_id, owner_name, None, now, now),
+            )
+            self.connection.commit()
+        return self.get_session(session_id)
+    def get_session(self, session_id: str) -> SessionRecord:
+        row = self.connection.execute(
+            "SELECT * FROM sessions WHERE id = ?",
+            (session_id,),
+        ).fetchone()
+        if row is None:
+            raise KeyError(session_id)
+        return SessionRecord(**dict(row))
+    def list_session_payload(self, session_id: str) -> dict[str, Any]:
+        session = self.get_session(session_id)
+        payload: dict[str, Any] = {
+            "id": session.id,
+            "status": session.status,
+            "question": session.question,
+            "model_name": session.model_name,
+            "owner_id": session.owner_id,
+            "owner_name": session.owner_name,
+            "error": session.error,
+            "created_at": session.created_at,
+            "updated_at": session.updated_at,
+        }
+        generation_row = self.connection.execute(
+            "SELECT * FROM generation_results WHERE session_id = ?",
+            (session_id,),
+        ).fetchone()
+        if generation_row is not None:
+            payload["answer"] = generation_row["answer"]
+            payload["raw_generation_text"] = generation_row["raw_generation_text"]
+            payload["raw_trace_text"] = generation_row["raw_trace_text"]
+            payload["normalized_trace_text"] = generation_row["normalized_trace_text"]
+            payload["sentences"] = json.loads(generation_row["sentences_json"])
+            payload["generation_metadata"] = json.loads(generation_row["generation_metadata_json"])
+        analysis_row = self.connection.execute(
+            "SELECT result_json FROM analysis_results WHERE session_id = ?",
+            (session_id,),
+        ).fetchone()
+        if analysis_row is not None:
+            payload["analysis"] = json.loads(analysis_row["result_json"])
+        return payload
+    def list_sessions_for_owner(self, owner_id: str, *, limit: int = 20) -> list[dict[str, Any]]:
+        rows = self.connection.execute(
+            """
+            SELECT id
+            FROM sessions
+            WHERE owner_id = ?
+            ORDER BY updated_at DESC
+            LIMIT ?
+            """,
+            (owner_id, limit),
+        ).fetchall()
+        return [self.list_session_payload(row["id"]) for row in rows]
+    def update_status(self, session_id: str, *, status: str, error: str | None = None) -> None:
+        now = _utc_now()
+        with self.lock:
+            self.connection.execute(
+                "UPDATE sessions SET status = ?, error = ?, updated_at = ? WHERE id = ?",
+                (status, error, now, session_id),
+            )
+            self.connection.commit()
+    def count_incomplete_sessions(self) -> int:
+        row = self.connection.execute(
+            "SELECT COUNT(*) AS count FROM sessions WHERE status NOT IN ('completed', 'failed')"
+        ).fetchone()
+        return int(row["count"])
+    def count_incomplete_sessions_for_owner(self, owner_id: str) -> int:
+        row = self.connection.execute(
+            """
+            SELECT COUNT(*) AS count
+            FROM sessions
+            WHERE owner_id = ? AND status NOT IN ('completed', 'failed')
+            """,
+            (owner_id,),
+        ).fetchone()
+        return int(row["count"])
+    def save_generation_result(self, session_id: str, generation: GenerationResult, sentences: list[str]) -> None:
+        with self.lock:
+            self.connection.execute(
+                """
+                INSERT INTO generation_results (
+                    session_id,
+                    answer,
+                    raw_generation_text,
+                    raw_trace_text,
+                    normalized_trace_text,
+                    sentences_json,
+                    generation_metadata_json
+                ) VALUES (?, ?, ?, ?, ?, ?, ?)
+                ON CONFLICT(session_id) DO UPDATE SET
+                    answer = excluded.answer,
+                    raw_generation_text = excluded.raw_generation_text,
+                    raw_trace_text = excluded.raw_trace_text,
+                    normalized_trace_text = excluded.normalized_trace_text,
+                    sentences_json = excluded.sentences_json,
+                    generation_metadata_json = excluded.generation_metadata_json
+                """,
+                (
+                    session_id,
+                    generation.answer,
+                    generation.raw_generation_text,
+                    generation.raw_trace_text,
+                    generation.normalized_trace_text,
+                    json.dumps(sentences),
+                    json.dumps(generation.generation_metadata.model_dump()),
+                ),
+            )
+            self.connection.execute(
+                "UPDATE sessions SET updated_at = ? WHERE id = ?",
+                (_utc_now(), session_id),
+            )
+            self.connection.commit()
+    def save_analysis_result(self, session_id: str, result: AnalysisResult) -> None:
+        with self.lock:
+            self.connection.execute(
+                """
+                INSERT INTO analysis_results (session_id, result_json)
+                VALUES (?, ?)
+                ON CONFLICT(session_id) DO UPDATE SET result_json = excluded.result_json
+                """,
+                (session_id, result.model_dump_json()),
+            )
+            self.connection.execute(
+                "UPDATE sessions SET updated_at = ? WHERE id = ?",
+                (_utc_now(), session_id),
+            )
+            self.connection.commit()

app/workers/__init__.py ADDED Viewed

	@@ -0,0 +1 @@


1	+ """Background job runner."""

app/workers/jobs.py ADDED Viewed

	@@ -0,0 +1,20 @@

+from __future__ import annotations
+from concurrent.futures import Future, ThreadPoolExecutor
+from dataclasses import dataclass
+from typing import Callable
+@dataclass(slots=True)
+class JobRunner:
+    executor: ThreadPoolExecutor
+    def submit(self, fn: Callable[..., None], *args, **kwargs) -> Future:
+        return self.executor.submit(fn, *args, **kwargs)
+    def shutdown(self) -> None:
+        self.executor.shutdown(wait=False, cancel_futures=True)
+def build_job_runner(max_workers: int) -> JobRunner:
+    return JobRunner(executor=ThreadPoolExecutor(max_workers=max_workers, thread_name_prefix="cot-anc"))

cot_anc.egg-info/PKG-INFO ADDED Viewed

	@@ -0,0 +1,147 @@

+Metadata-Version: 2.4
+Name: cot-anc
+Version: 0.1.0
+Summary: Online chain-of-thought analysis with attribution patching
+Requires-Python: >=3.11
+Description-Content-Type: text/markdown
+Requires-Dist: fastapi>=0.115.0
+Requires-Dist: huggingface_hub[oauth]>=0.33.0
+Requires-Dist: numpy>=2.0.0
+Requires-Dist: pydantic>=2.7.0
+Requires-Dist: scipy>=1.13.0
+Requires-Dist: torch>=2.2.0
+Requires-Dist: transformers>=4.44.0
+Requires-Dist: typer>=0.12.3
+Requires-Dist: uvicorn>=0.30.0
+Provides-Extra: dev
+Requires-Dist: pytest>=8.2.0; extra == "dev"
+Provides-Extra: viz
+Requires-Dist: matplotlib>=3.8.0; extra == "viz"
+---
+title: Thought Anchors
+emoji: 🧠
+colorFrom: blue
+colorTo: orange
+sdk: docker
+app_port: 7860
+hf_oauth: true
+pinned: false
+models:
+  - deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B
+---
+# Thought Anchors
+Public-facing FastAPI service for generating a visible reasoning trace and computing
+sentence-to-sentence attribution on open-weight reasoning models.
+The app is now shaped for deployment as a Hugging Face `Docker Space`:
+- Hugging Face OAuth sign-in for end users
+- browser UI plus programmatic API
+- per-user ephemeral sessions on the running instance
+- JSON and CSV export for completed analyses
+- adaptive device and dtype loading for CPU, MPS, CUDA, Colab, Kaggle, and cloud GPUs
+## What It Can Do
+- generate an answer plus visible `<think>...</think>` trace from a supported causal LM
+- normalize and split the trace into sentences
+- compute a sentence influence matrix with gradient x attention attribution
+- summarize incoming / outgoing importance and top edges
+- expose the workflow through:
+  - CLI
+  - FastAPI
+  - web UI
+  - async session queue
+## Current Deployment Target
+The primary deployment target is a Hugging Face `Docker Space` running on upgraded GPU
+hardware. The same app can also be run locally or on other cloud GPU hosts.
+Important runtime constraints:
+- attribution requires `attn_implementation="eager"`
+- the model must expose usable attention tensors and a supported decoder-layer layout
+- long traces are intentionally capped because the analysis path uses a full backward pass
+## Local Development
+Install dependencies:
+```bash
+uv sync
+```
+Run the API:
+```bash
+uv run python -m app.cli.run_api
+```
+Run the CLI:
+```bash
+uv run python -m app.cli.run_prototype "Explain why the derivative of x^2 is 2x"
+```
+## API
+Main endpoints:
+- `GET /healthz`
+- `GET /api/me`
+- `POST /api/warmup`
+- `POST /api/analyze`
+- `GET /api/sessions`
+- `POST /api/sessions`
+- `GET /api/sessions/{id}`
+- `GET /api/sessions/{id}/result`
+- `GET /api/sessions/{id}/export.json`
+- `GET /api/sessions/{id}/export.csv`
+Example:
+```bash
+curl -X POST http://localhost:7860/api/analyze \
+  -H 'content-type: application/json' \
+  -d '{
+    "question": "Explain why the derivative of x^2 is 2x",
+    "max_new_tokens": 128,
+    "max_trace_tokens": 256,
+    "max_sentences": 16,
+    "validate_top_k": 0
+  }'
+```
+## Hugging Face Space Setup
+Recommended environment variables:
+- `MODEL_NAME=deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B`
+- `DEVICE_PREFERENCE=auto`
+- `DTYPE_PREFERENCE=auto`
+- `ATTN_IMPLEMENTATION=eager`
+- `LOW_CPU_MEM_USAGE=true`
+- `TRUST_REMOTE_CODE=true`
+- `PRELOAD_MODEL=true`
+- `MAX_TRACE_TOKENS=256`
+- `MAX_SENTENCES=16`
+- `JOB_WORKERS=1`
+- `MAX_QUEUED_JOBS=8`
+- `MAX_ACTIVE_JOBS_PER_USER=2`
+- `REQUIRE_AUTH=true`
+Notes:
+- local disk is ephemeral; users should export results they want to keep
+- use upgraded GPU hardware for real attribution runs
+- keep trace limits conservative for public traffic
+## Notebook
+Use [notebooks/hf_space_demo.ipynb](/Users/vibhorkumar/Desktop/codes/cot-anc/notebooks/hf_space_demo.ipynb)
+for Colab or Kaggle smoke testing. It installs dependencies, warms the model, runs one
+short attribution analysis, prints the top edges, and renders a simple heatmap.

cot_anc.egg-info/SOURCES.txt ADDED Viewed

	@@ -0,0 +1,44 @@

+README.md
+pyproject.toml
+app/__init__.py
+app/analysis/__init__.py
+app/analysis/hooks.py
+app/analysis/sentence_split.py
+app/analysis/summaries.py
+app/analysis/suppression.py
+app/analysis/token_boundaries.py
+app/analysis/validation.py
+app/api/__init__.py
+app/api/auth.py
+app/api/main.py
+app/cli/__init__.py
+app/cli/run_api.py
+app/cli/run_prototype.py
+app/core/__init__.py
+app/core/config.py
+app/core/model_support.py
+app/core/runtime.py
+app/core/runtime_pipeline.py
+app/core/schemas.py
+app/generation/__init__.py
+app/generation/prompting.py
+app/generation/service.py
+app/services/__init__.py
+app/services/sessions.py
+app/storage/__init__.py
+app/storage/db.py
+app/storage/repository.py
+app/workers/__init__.py
+app/workers/jobs.py
+cot_anc.egg-info/PKG-INFO
+cot_anc.egg-info/SOURCES.txt
+cot_anc.egg-info/dependency_links.txt
+cot_anc.egg-info/requires.txt
+cot_anc.egg-info/top_level.txt
+tests/test_api.py
+tests/test_model_support.py
+tests/test_runtime_pipeline.py
+tests/test_sentence_split.py
+tests/test_summaries.py
+tests/test_suppression_shape.py
+tests/test_token_boundaries.py

cot_anc.egg-info/dependency_links.txt ADDED Viewed

	@@ -0,0 +1 @@


1	+

cot_anc.egg-info/requires.txt ADDED Viewed

	@@ -0,0 +1,15 @@

+fastapi>=0.115.0
+huggingface_hub[oauth]>=0.33.0
+numpy>=2.0.0
+pydantic>=2.7.0
+scipy>=1.13.0
+torch>=2.2.0
+transformers>=4.44.0
+typer>=0.12.3
+uvicorn>=0.30.0
+[dev]
+pytest>=8.2.0
+[viz]
+matplotlib>=3.8.0

cot_anc.egg-info/top_level.txt ADDED Viewed

	@@ -0,0 +1 @@


1	+ app

docs/api.md ADDED Viewed

	@@ -0,0 +1,83 @@

+# API And Product Behavior
+## Auth Model
+- If `REQUIRE_AUTH=true`, analysis endpoints require Hugging Face sign-in.
+- Sessions are scoped to user identity.
+- One user cannot fetch another user’s session or export.
+## Session Model
+Statuses:
+- `queued`
+- `generating`
+- `answer_ready`
+- `analyzing`
+- `completed`
+- `failed`
+Sessions are ephemeral for current running instance.
+## Endpoints
+### `GET /healthz`
+Returns:
+- model name
+- device preference
+- dtype preference
+- auth requirement
+- queue limits
+- CUDA / MPS availability
+### `GET /api/me`
+Returns current auth state plus login/logout URLs.
+### `POST /api/warmup`
+Loads model with current runtime policy and returns:
+- resolved device
+- resolved dtype
+- model attribution capability
+### `POST /api/analyze`
+Direct synchronous analysis call. Good for trusted programmatic use, not best path for UI.
+### `GET /api/sessions`
+List current user’s recent sessions on current instance.
+### `POST /api/sessions`
+Create async session job.
+Queue protections:
+- global queue cap
+- per-user active-job cap
+### `GET /api/sessions/{id}`
+Return session summary.
+### `GET /api/sessions/{id}/result`
+Return session summary + full analysis if ready.
+### Export
+- `GET /api/sessions/{id}/export.json`
+- `GET /api/sessions/{id}/export.csv`
+JSON contains session + analysis payload.
+CSV contains top edges:
+- `source_sentence_idx`
+- `target_sentence_idx`
+- `score`

docs/deploy-huggingface.md ADDED Viewed

	@@ -0,0 +1,69 @@

+# Hugging Face Deployment
+Primary target: Hugging Face `Docker Space` on upgraded GPU hardware.
+## What Gets Deployed
+- FastAPI backend
+- static web frontend
+- Hugging Face OAuth routes
+- ephemeral SQLite-backed session queue
+## Required Space Settings
+- SDK: `Docker`
+- Port: `7860`
+- OAuth: enabled via README metadata
+- Hardware: upgraded GPU recommended
+## Recommended Runtime Variables
+Core:
+- `MODEL_NAME=deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B`
+- `DEVICE_PREFERENCE=auto`
+- `DTYPE_PREFERENCE=auto`
+- `ATTN_IMPLEMENTATION=eager`
+- `LOW_CPU_MEM_USAGE=true`
+- `TRUST_REMOTE_CODE=true`
+- `PRELOAD_MODEL=true`
+Traffic limits:
+- `MAX_TRACE_TOKENS=256`
+- `MAX_SENTENCES=16`
+- `JOB_WORKERS=1`
+- `MAX_QUEUED_JOBS=8`
+- `MAX_ACTIVE_JOBS_PER_USER=2`
+- `REQUIRE_AUTH=true`
+## Deploy Flow
+1. Create new Hugging Face Space with `Docker` SDK.
+2. Push repo contents.
+3. Set runtime variables in Space settings.
+4. Upgrade hardware.
+5. Wait for build.
+6. Verify:
+   - `GET /healthz`
+   - sign-in works
+   - one short analysis completes
+   - JSON / CSV export works
+## Operational Notes
+- Local disk is ephemeral. Session history disappears on restart.
+- OAuth helper is mocked locally but real inside Space.
+- Keep public defaults conservative. Long traces can OOM small GPUs.
+- If queue pressure grows, lower token caps before increasing worker count.
+## Common Failure Modes
+- `attn_implementation` not eager:
+  - attribution disabled for model
+- unsupported model layout:
+  - generation may work, attribution fails early with clear error
+- OOM:
+  - reduce `MAX_TRACE_TOKENS`, `MAX_SENTENCES`, or choose larger GPU
+- cold start slow:
+  - keep `PRELOAD_MODEL=true`

docs/notebook.md ADDED Viewed

	@@ -0,0 +1,38 @@

+# Notebook Usage
+Notebook path:
+- [notebooks/hf_space_demo.ipynb](/Users/vibhorkumar/Desktop/codes/cot-anc/notebooks/hf_space_demo.ipynb)
+## Purpose
+Short smoke test for:
+- Colab GPU
+- Kaggle GPU
+- quick local validation
+## What Notebook Does
+1. Install runtime deps.
+2. Set conservative env vars.
+3. Import project pipeline.
+4. Run one short attribution job.
+5. Print answer, runtime metadata, top edges.
+6. Render heatmap.
+## Best Use
+- validate model availability
+- validate driver / torch / transformers stack
+- sanity-check latency before public deploy
+## If Notebook Fails
+Check:
+- GPU available
+- model access permissions
+- enough VRAM
+- eager attention enabled
+- trace limits not too high

docs/runtime.md ADDED Viewed

	@@ -0,0 +1,77 @@

+# Runtime And Model Support
+## Execution Model
+Pipeline:
+1. Generate visible reasoning trace.
+2. Normalize and split trace into sentences.
+3. Map sentence spans to token spans.
+4. Run forward + backward pass.
+5. Build sentence influence matrix from gradient x attention.
+6. Summarize top edges and importance scores.
+## Device And Dtype Policy
+Default policy:
+- CUDA:
+  - `bfloat16` if supported
+  - else `float16`
+- MPS:
+  - `float16`
+- CPU:
+  - `float32`
+Override with:
+- `DTYPE_PREFERENCE`
+- request `dtype_preference`
+## Model Requirements
+Model must support all of:
+- causal LM generation
+- `output_attentions=True`
+- eager attention
+- supported decoder layer layout
+- supported attention module attribute
+Supported layer paths:
+- `model.layers`
+- `model.model.layers`
+- `transformer.h`
+- `gpt_neox.layers`
+Supported attention attrs:
+- `self_attn`
+- `attn`
+- `attention`
+## Why Trace Limits Exist
+Attribution path uses full backward pass over attention tensors. Cost grows with:
+- sequence length
+- layer count
+- head count
+- sentence count
+Public defaults stay small to protect uptime.
+## Good First Runtime Settings
+For public demo:
+- `max_new_tokens=128`
+- `max_trace_tokens=256`
+- `max_sentences=16`
+- `validate_top_k=0`
+For deeper analysis on bigger GPU:
+- raise trace tokens slowly
+- watch latency and memory first

notebooks/hf_space_demo.ipynb ADDED Viewed

	@@ -0,0 +1,119 @@

+{
+  "cells": [
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "# Thought Anchors Demo\n",
+        "\n",
+        "Colab/Kaggle notebook for a short end-to-end attribution smoke test."
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {},
+      "outputs": [],
+      "source": [
+        "!pip install -q uv\n",
+        "!uv pip install --system fastapi huggingface_hub matplotlib numpy pydantic scipy torch transformers typer uvicorn"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {},
+      "outputs": [],
+      "source": [
+        "import os\n",
+        "from pathlib import Path\n",
+        "\n",
+        "if not Path('app').exists():\n",
+        "    raise RuntimeError('Upload or clone the repository before running the notebook.')\n",
+        "\n",
+        "os.environ.setdefault('MODEL_NAME', 'deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B')\n",
+        "os.environ.setdefault('DEVICE_PREFERENCE', 'auto')\n",
+        "os.environ.setdefault('DTYPE_PREFERENCE', 'auto')\n",
+        "os.environ.setdefault('ATTN_IMPLEMENTATION', 'eager')\n",
+        "os.environ.setdefault('LOW_CPU_MEM_USAGE', 'true')\n",
+        "os.environ.setdefault('TRUST_REMOTE_CODE', 'true')\n",
+        "os.environ.setdefault('MAX_TRACE_TOKENS', '256')\n",
+        "os.environ.setdefault('MAX_SENTENCES', '16')"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {},
+      "outputs": [],
+      "source": [
+        "from app.core.runtime_pipeline import compute_attribution_analysis"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {},
+      "outputs": [],
+      "source": [
+        "question = 'Explain why the derivative of x^2 is 2x.'\n",
+        "\n",
+        "result = compute_attribution_analysis(\n",
+        "    question=question,\n",
+        "    max_new_tokens=128,\n",
+        "    max_trace_tokens=256,\n",
+        "    max_sentences=16,\n",
+        "    validate_top_k=0,\n",
+        "    temperature=0.0,\n",
+        "    top_p=1.0,\n",
+        ")"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {},
+      "outputs": [],
+      "source": [
+        "print('Answer:\\n', result.answer)\n",
+        "print('\\nSentences:', len(result.sentences))\n",
+        "print('Runtime:', result.runtime_metadata.model_dump())\n",
+        "print('\\nTop edges:')\n",
+        "for edge in result.top_edges[:10]:\n",
+        "    print(edge)"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {},
+      "outputs": [],
+      "source": [
+        "import matplotlib.pyplot as plt\n",
+        "import numpy as np\n",
+        "\n",
+        "matrix = np.array(result.suppression_matrix)\n",
+        "figure, axis = plt.subplots(figsize=(7, 5))\n",
+        "image = axis.imshow(matrix, aspect='auto', cmap='viridis')\n",
+        "axis.set_title('Sentence Influence Matrix')\n",
+        "axis.set_xlabel('Source sentence')\n",
+        "axis.set_ylabel('Target sentence')\n",
+        "figure.colorbar(image, ax=axis)\n",
+        "plt.show()"
+      ]
+    }
+  ],
+  "metadata": {
+    "kernelspec": {
+      "display_name": "Python 3",
+      "language": "python",
+      "name": "python3"
+    },
+    "language_info": {
+      "name": "python",
+      "version": "3.11"
+    }
+  },
+  "nbformat": 4,
+  "nbformat_minor": 5
+}

pyproject.toml ADDED Viewed

	@@ -0,0 +1,36 @@

+[project]
+name = "cot-anc"
+version = "0.1.0"
+description = "Online chain-of-thought analysis with attribution patching"
+readme = "README.md"
+requires-python = ">=3.11"
+dependencies = [
+  "fastapi>=0.115.0",
+  "huggingface_hub[oauth]>=0.33.0",
+  "numpy>=2.0.0",
+  "pydantic>=2.7.0",
+  "scipy>=1.13.0",
+  "torch>=2.2.0",
+  "transformers>=4.44.0",
+  "typer>=0.12.3",
+  "uvicorn>=0.30.0",
+]
+[project.optional-dependencies]
+dev = [
+  "pytest>=8.2.0",
+]
+viz = [
+  "matplotlib>=3.8.0",
+]
+[build-system]
+requires = ["setuptools>=68.0"]
+build-backend = "setuptools.build_meta"
+[tool.setuptools.packages.find]
+include = ["app*"]
+[tool.pytest.ini_options]
+testpaths = ["tests"]
+pythonpath = ["."]

runpod_start.sh ADDED Viewed

	@@ -0,0 +1,11 @@

+#!/usr/bin/env bash
+set -euo pipefail
+export API_HOST="${API_HOST:-0.0.0.0}"
+export API_PORT="${API_PORT:-7860}"
+export DEVICE_PREFERENCE="${DEVICE_PREFERENCE:-auto}"
+export DTYPE_PREFERENCE="${DTYPE_PREFERENCE:-auto}"
+export ATTN_IMPLEMENTATION="${ATTN_IMPLEMENTATION:-eager}"
+export PRELOAD_MODEL="${PRELOAD_MODEL:-true}"
+exec uv run python -m app.cli.run_api